From e4c94330e3395ae87451bded2840a25d04f27902 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 22 Sep 2005 21:43:45 -0700
Subject: [PATCH] reboot: comment and factor the main reboot functions

In the lead up to 2.6.13 I fixed a large number of reboot problems by
making the calling conventions consistent.  Despite checking and double
checking my work it appears I missed an obvious one.

This first patch simply refactors the reboot routines so all of the
preparation for various kinds of reboots are in their own functions.
Making it very hard to get the various kinds of reboot out of sync.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index f723522e6986..2fa1ed18123c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -361,17 +361,35 @@ out_unlock:
 	return retval;
 }
 
+/**
+ *	emergency_restart - reboot the system
+ *
+ *	Without shutting down any hardware or taking any locks
+ *	reboot the system.  This is called when we know we are in
+ *	trouble so this is our best effort to reboot.  This is
+ *	safe to call in interrupt context.
+ */
 void emergency_restart(void)
 {
 	machine_emergency_restart();
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
 
-void kernel_restart(char *cmd)
+/**
+ *	kernel_restart - reboot the system
+ *
+ *	Shutdown everything and perform a clean reboot.
+ *	This is not safe to call in interrupt context.
+ */
+void kernel_restart_prepare(char *cmd)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
 	device_shutdown();
+}
+void kernel_restart(char *cmd)
+{
+	kernel_restart_prepare(cmd);
 	if (!cmd) {
 		printk(KERN_EMERG "Restarting system.\n");
 	} else {
@@ -382,6 +400,12 @@ void kernel_restart(char *cmd)
 }
 EXPORT_SYMBOL_GPL(kernel_restart);
 
+/**
+ *	kernel_kexec - reboot the system
+ *
+ *	Move into place and start executing a preloaded standalone
+ *	executable.  If nothing was preloaded return an error.
+ */
 void kernel_kexec(void)
 {
 #ifdef CONFIG_KEXEC
@@ -390,9 +414,7 @@ void kernel_kexec(void)
 	if (!image) {
 		return;
 	}
-	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-	system_state = SYSTEM_RESTART;
-	device_shutdown();
+	kernel_restart_prepare(NULL);
 	printk(KERN_EMERG "Starting new kernel\n");
 	machine_shutdown();
 	machine_kexec(image);
@@ -400,21 +422,39 @@ void kernel_kexec(void)
 }
 EXPORT_SYMBOL_GPL(kernel_kexec);
 
-void kernel_halt(void)
+/**
+ *	kernel_halt - halt the system
+ *
+ *	Shutdown everything and perform a clean system halt.
+ */
+void kernel_halt_prepare(void)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
 	system_state = SYSTEM_HALT;
 	device_shutdown();
+}
+void kernel_halt(void)
+{
+	kernel_halt_prepare();
 	printk(KERN_EMERG "System halted.\n");
 	machine_halt();
 }
 EXPORT_SYMBOL_GPL(kernel_halt);
 
-void kernel_power_off(void)
+/**
+ *	kernel_power_off - power_off the system
+ *
+ *	Shutdown everything and perform a clean system power_off.
+ */
+void kernel_power_off_prepare(void)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
 	system_state = SYSTEM_POWER_OFF;
 	device_shutdown();
+}
+void kernel_power_off(void)
+{
+	kernel_power_off_prepare();
 	printk(KERN_EMERG "Power down.\n");
 	machine_power_off();
 }
-- 
cgit v1.2.3


From 88d10bbaaec38856f913313b6c0858d9e9b7a066 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 22 Sep 2005 21:43:46 -0700
Subject: [PATCH] suspend: cleanup calling of power off methods.

In the lead up to 2.6.13 I fixed a large number of reboot problems by
making the calling conventions consistent.  Despite checking and double
checking my work it appears I missed an obvious one.

The S4 suspend code for PM_DISK_PLATFORM was also calling device_shutdown
without setting system_state, and was not calling the appropriate
reboot_notifier.

This patch fixes the bug by replacing the call of device_suspend with
kernel_poweroff_prepare.

Various forms of this failure have been fixed and tracked for a while.

Thanks for tracking this down go to: Alexey Starikovskiy, Meelis Roos
<mroos@linux.ee>, Nigel Cunningham <ncunningham@cyclades.com>, Pierre
Ossman <drzeus-list@drzeus.cx>

History of this bug is at:
http://bugme.osdl.org/show_bug.cgi?id=4320

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 2d8bf054d036..761956e813f5 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -17,12 +17,12 @@
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/mount.h>
+#include <linux/pm.h>
 
 #include "power.h"
 
 
 extern suspend_disk_method_t pm_disk_mode;
-extern struct pm_ops * pm_ops;
 
 extern int swsusp_suspend(void);
 extern int swsusp_write(void);
@@ -49,13 +49,11 @@ dev_t swsusp_resume_device;
 
 static void power_down(suspend_disk_method_t mode)
 {
-	unsigned long flags;
 	int error = 0;
 
-	local_irq_save(flags);
 	switch(mode) {
 	case PM_DISK_PLATFORM:
- 		device_shutdown();
+		kernel_power_off_prepare();
 		error = pm_ops->enter(PM_SUSPEND_DISK);
 		break;
 	case PM_DISK_SHUTDOWN:
-- 
cgit v1.2.3


From 720b9429e8f41f7c4ee9df293403650905042035 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Thu, 22 Sep 2005 21:43:56 -0700
Subject: [PATCH] SOFTWARE_SUSPEND needs HOTPLUG_CPU on SMP

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 396c7873e804..46a5e5acff97 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -29,7 +29,7 @@ config PM_DEBUG
 
 config SOFTWARE_SUSPEND
 	bool "Software Suspend"
-	depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP))
+	depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FVR || PPC32) && !SMP)
 	---help---
 	  Enable the possibility of suspending the machine.
 	  It doesn't need APM.
-- 
cgit v1.2.3


From 57487f4376e16558ccbe45a5b41d8cb5192309a4 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 22 Sep 2005 21:44:01 -0700
Subject: [PATCH] swsusp: do not trigger BUG_ON() if there is not enough memory

The following patch makes swsusp avoid triggering the BUG_ON() in
swsusp_suspend() if there is not enough memory for suspend.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index d967e875ee82..353f37f1ba28 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -918,6 +918,7 @@ static int swsusp_alloc(void)
 
 	pagedir_nosave = NULL;
 	nr_copy_pages = calc_nr(nr_copy_pages);
+	nr_copy_pages_check = nr_copy_pages;
 
 	pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
 		 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
@@ -940,7 +941,6 @@ static int swsusp_alloc(void)
 		return error;
 	}
 
-	nr_copy_pages_check = nr_copy_pages;
 	return 0;
 }
 
-- 
cgit v1.2.3


From 8686bcd0a5f5e3f599ed9f1028ec9e449e7b87e3 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Thu, 22 Sep 2005 21:44:11 -0700
Subject: [PATCH] swsusp: fix comments

Fix comments in swsusp.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h  |  2 +-
 kernel/power/swsusp.c | 10 +++++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index cd6a3493cc0d..9c9167d910dd 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,7 +1,7 @@
 #include <linux/suspend.h>
 #include <linux/utsname.h>
 
-/* With SUSPEND_CONSOLE defined, it suspend looks *really* cool, but
+/* With SUSPEND_CONSOLE defined suspend looks *really* cool, but
    we probably do not take enough locks for switching consoles, etc,
    so bad things might happen.
 */
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 353f37f1ba28..1cc9ff25e479 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -363,7 +363,7 @@ static void lock_swapdevices(void)
 }
 
 /**
- *	write_swap_page - Write one page to a fresh swap location.
+ *	write_page - Write one page to a fresh swap location.
  *	@addr:	Address we're writing.
  *	@loc:	Place to store the entry we used.
  *
@@ -863,6 +863,9 @@ static int alloc_image_pages(void)
 	return 0;
 }
 
+/* Free pages we allocated for suspend. Suspend pages are alocated
+ * before atomic copy, so we need to free them after resume.
+ */
 void swsusp_free(void)
 {
 	BUG_ON(PageNosave(virt_to_page(pagedir_save)));
@@ -1213,8 +1216,9 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 		free_pagedir(pblist);
 		free_eaten_memory();
 		pblist = NULL;
-	}
-	else
+		/* Is this even worth handling? It should never ever happen, and we
+		   have just lost user's state, anyway... */
+	} else
 		printk("swsusp: Relocated %d pages\n", rel);
 
 	return pblist;
-- 
cgit v1.2.3


From 188a1eafa03aaa5e5fe6f53e637e704cd2c31c7c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Fri, 23 Sep 2005 13:22:21 -0700
Subject: Make sure SIGKILL gets proper respect

Bhavesh P. Davda <bhavesh@avaya.com> noticed that SIGKILL wouldn't
properly kill a process under just the right cicumstances: a stopped
task that already had another signal queued would get the SIGKILL
queued onto the shared queue, and there it would remain until SIGCONT.

This simplifies the signal acceptance logic, and fixes the bug in the
process.

Losely based on an earlier patch by Bhavesh.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index b92c3c9f8b9a..5a274705ba19 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -936,34 +936,31 @@ force_sig_specific(int sig, struct task_struct *t)
  * as soon as they're available, so putting the signal on the shared queue
  * will be equivalent to sending it to one such thread.
  */
-#define wants_signal(sig, p, mask) 			\
-	(!sigismember(&(p)->blocked, sig)		\
-	 && !((p)->state & mask)			\
-	 && !((p)->flags & PF_EXITING)			\
-	 && (task_curr(p) || !signal_pending(p)))
-
+static inline int wants_signal(int sig, struct task_struct *p)
+{
+	if (sigismember(&p->blocked, sig))
+		return 0;
+	if (p->flags & PF_EXITING)
+		return 0;
+	if (sig == SIGKILL)
+		return 1;
+	if (p->state & (TASK_STOPPED | TASK_TRACED))
+		return 0;
+	return task_curr(p) || !signal_pending(p);
+}
 
 static void
 __group_complete_signal(int sig, struct task_struct *p)
 {
-	unsigned int mask;
 	struct task_struct *t;
 
-	/*
-	 * Don't bother traced and stopped tasks (but
-	 * SIGKILL will punch through that).
-	 */
-	mask = TASK_STOPPED | TASK_TRACED;
-	if (sig == SIGKILL)
-		mask = 0;
-
 	/*
 	 * Now find a thread we can wake up to take the signal off the queue.
 	 *
 	 * If the main thread wants the signal, it gets first crack.
 	 * Probably the least surprising to the average bear.
 	 */
-	if (wants_signal(sig, p, mask))
+	if (wants_signal(sig, p))
 		t = p;
 	else if (thread_group_empty(p))
 		/*
@@ -981,7 +978,7 @@ __group_complete_signal(int sig, struct task_struct *p)
 			t = p->signal->curr_target = p;
 		BUG_ON(t->tgid != p->tgid);
 
-		while (!wants_signal(sig, t, mask)) {
+		while (!wants_signal(sig, t)) {
 			t = next_thread(t);
 			if (t == p->signal->curr_target)
 				/*
-- 
cgit v1.2.3


From beeca08738c4c4024c81a591812bfe38f8c436c0 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 28 Sep 2005 20:29:44 +1000
Subject: Don't call a NULL ack function in the generic IRQ code.

Some IRQ controllers don't need an ack function (e.g. OpenPIC on
PPC platforms) and for them we'd rather not have the overhead
of doing an indirect call to a function that does nothing.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 kernel/irq/handle.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3ff7b925c387..51df337b37db 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -117,14 +117,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		desc->handler->ack(irq);
+		if (desc->handler->ack)
+			desc->handler->ack(irq);
 		action_ret = handle_IRQ_event(irq, regs, desc->action);
 		desc->handler->end(irq);
 		return 1;
 	}
 
 	spin_lock(&desc->lock);
-	desc->handler->ack(irq);
+	if (desc->handler->ack)
+		desc->handler->ack(irq);
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
-- 
cgit v1.2.3


From 254b54771cc4c00002f796be65d2885191dca9dc Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 27 Sep 2005 21:45:31 -0700
Subject: [PATCH] swsusp: remove wrong code from data_free

The following patch removes some wrong code from the data_free() function
in swsusp.

This function could only be called if there's an error while writing the
suspend image to swap, so it is not triggered easily.  However, if
triggered, it would probably corrupt some memory.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 1cc9ff25e479..8aef1b49150f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -402,15 +402,14 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
 static void data_free(void)
 {
 	swp_entry_t entry;
-	int i;
+	struct pbe * p;
 
-	for (i = 0; i < nr_copy_pages; i++) {
-		entry = (pagedir_nosave + i)->swap_address;
+	for_each_pbe(p, pagedir_nosave) {
+		entry = p->swap_address;
 		if (entry.val)
 			swap_free(entry);
 		else
 			break;
-		(pagedir_nosave + i)->swap_address = (swp_entry_t){0};
 	}
 }
 
-- 
cgit v1.2.3


From f2d613799af915da1fd78463ba8ec5086a0d6f92 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 27 Sep 2005 21:45:32 -0700
Subject: [PATCH] swsusp: prevent possible memory leak

Prevent swsusp from leaking some memory in case of an error in
read_pagedir().  It also prevents the BUG_ON() from triggering if there's
an error while reading swap.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 8aef1b49150f..0dfb24948907 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1437,9 +1437,9 @@ static int read_pagedir(struct pbe *pblist)
 	}
 
 	if (error)
-		free_page((unsigned long)pblist);
-
-	BUG_ON(i != swsusp_info.pagedir_pages);
+		free_pagedir(pblist);
+	else
+		BUG_ON(i != swsusp_info.pagedir_pages);
 
 	return error;
 }
-- 
cgit v1.2.3


From f36462f078403c1859a7e58177b28e01b3a179e4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 27 Sep 2005 21:45:34 -0700
Subject: [PATCH] Ignore trailing whitespace on kernel parameters correctly

Dave Jones says:

... if the modprobe.conf has trailing whitespace, modules fail to load
with the following helpful message..

	snd_intel8x0: Unknown parameter `'

Previous version truncated last argument.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Dave Jones <davej@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/params.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index fbf173215fd2..1a8614bac5d5 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -80,8 +80,6 @@ static char *next_arg(char *args, char **param, char **val)
 	int in_quote = 0, quoted = 0;
 	char *next;
 
-	/* Chew any extra spaces */
-	while (*args == ' ') args++;
 	if (*args == '"') {
 		args++;
 		in_quote = 1;
@@ -121,6 +119,10 @@ static char *next_arg(char *args, char **param, char **val)
 		next = args + i + 1;
 	} else
 		next = args + i;
+
+	/* Chew up trailing spaces. */
+	while (*next == ' ')
+		next++;
 	return next;
 }
 
@@ -135,6 +137,10 @@ int parse_args(const char *name,
 
 	DEBUGP("Parsing ARGS: %s\n", args);
 
+	/* Chew leading spaces */
+	while (*args == ' ')
+		args++;
+
 	while (*args) {
 		int ret;
 
-- 
cgit v1.2.3


From 0f7347c20c410c300be0db4c132945fd02e54110 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 27 Sep 2005 21:45:43 -0700
Subject: [PATCH] swsusp: avoid problems if there are too many pages to save

The following patch makes swsusp avoid problems during resume if there are
too many pages to save on suspend.  It adds a constant that allows us to
verify if we are going to save too many pages and implements the check
(this is done as early as we can tell that the check will trigger, which is
in swsusp_alloc()).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h  | 5 ++++-
 kernel/power/swsusp.c | 4 ++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9c9167d910dd..6748de23e83c 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -9,6 +9,9 @@
 #define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)
 #endif
 
+#define MAX_PBES	((PAGE_SIZE - sizeof(struct new_utsname) \
+			- 4 - 3*sizeof(unsigned long) - sizeof(int) \
+			- sizeof(void *)) / sizeof(swp_entry_t))
 
 struct swsusp_info {
 	struct new_utsname	uts;
@@ -18,7 +21,7 @@ struct swsusp_info {
 	unsigned long		image_pages;
 	unsigned long		pagedir_pages;
 	suspend_pagedir_t	* suspend_pagedir;
-	swp_entry_t		pagedir[768];
+	swp_entry_t		pagedir[MAX_PBES];
 } __attribute__((aligned(PAGE_SIZE)));
 
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 0dfb24948907..acf79ac1cb6d 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -931,6 +931,10 @@ static int swsusp_alloc(void)
 	if (!enough_swap())
 		return -ENOSPC;
 
+	if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE +
+	    !!(nr_copy_pages % PBES_PER_PAGE))
+		return -ENOSPC;
+
 	if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return -ENOMEM;
-- 
cgit v1.2.3


From 5134fc15b643dc36eb9aa77e4318b886844a9ac5 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 28 Sep 2005 06:42:24 -0700
Subject: [PATCH] cpuset read past eof memory leak fix

Don't leak a page of memory if user reads a cpuset file past eof.

Signed-off-by: KUROSAWA Takahiro <kurosawa@valinux.co.jp>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 79866bc6b3a1..6a6e87b2f2fd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -969,7 +969,7 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	ssize_t retval = 0;
 	char *s;
 	char *start;
-	size_t n;
+	ssize_t n;
 
 	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
 		return -ENOMEM;
@@ -999,12 +999,13 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	*s++ = '\n';
 	*s = '\0';
 
-	/* Do nothing if *ppos is at the eof or beyond the eof. */
-	if (s - page <= *ppos)
-		return 0;
-
 	start = page + *ppos;
 	n = s - start;
+
+	/* Do nothing if *ppos is at the eof or beyond the eof. */
+	if (n <= 0)
+		goto out;
+
 	retval = n - copy_to_user(buf, start, min(n, nbytes));
 	*ppos += retval;
 out:
-- 
cgit v1.2.3


From 5acbc5cb507e6c381b70093b1081854708e82b16 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Thu, 29 Sep 2005 14:54:42 -0700
Subject: [PATCH] Fix task state testing properly in do_signal_stop()

Any tests using < TASK_STOPPED or the like are left over from the time
when the TASK_ZOMBIE and TASK_DEAD bits were in the same word, and it
served to check for "stopped or dead".  I think this one in
do_signal_stop is the only such case.  It has been buggy ever since
exit_state was separated, and isn't testing the exit_state value.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 5a274705ba19..619b027e92b5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1763,7 +1763,8 @@ do_signal_stop(int signr)
 				 * stop is always done with the siglock held,
 				 * so this check has no races.
 				 */
-				if (t->state < TASK_STOPPED) {
+				if (!t->exit_state &&
+				    !(t->state & (TASK_STOPPED|TASK_TRACED))) {
 					stop_count++;
 					signal_wake_up(t, 0);
 				}
-- 
cgit v1.2.3


From eacaa1f5aa4a41a48349f55abcd9258506943e76 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Fri, 30 Sep 2005 03:26:43 +0100
Subject: [PATCH] cpuset crapectomy

Switched cpuset_common_file_read() to simple_read_from_buffer(), killed
a bunch of useless (and not quite correct - e.g.  min(size_t,ssize_t))
code.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6a6e87b2f2fd..45a5719a0104 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -968,8 +968,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	char *page;
 	ssize_t retval = 0;
 	char *s;
-	char *start;
-	ssize_t n;
 
 	if (!(page = (char *)__get_free_page(GFP_KERNEL)))
 		return -ENOMEM;
@@ -999,15 +997,7 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	*s++ = '\n';
 	*s = '\0';
 
-	start = page + *ppos;
-	n = s - start;
-
-	/* Do nothing if *ppos is at the eof or beyond the eof. */
-	if (n <= 0)
-		goto out;
-
-	retval = n - copy_to_user(buf, start, min(n, nbytes));
-	*ppos += retval;
+	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
 out:
 	free_page((unsigned long)page);
 	return retval;
-- 
cgit v1.2.3


From 14bf01bb0599c89fc7f426d20353b76e12555308 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sat, 1 Oct 2005 11:04:18 -0700
Subject: Fix inequality comparison against "task->state"

We should always use bitmask ops, rather than depend on some ordering of
the different states.  With the TASK_NONINTERACTIVE flag, the inequality
doesn't really work.

Oleg Nesterov argues (likely correctly) that this test is unnecessary in
the first place.  However, the minimal fix for now is to at least make
it work in the presense of TASK_NONINTERACTIVE.  Waiting for consensus
from Roland & co on potential bigger cleanups.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index ee6d8b8abef5..43077732619b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1203,7 +1203,7 @@ static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap,
 
 		exit_code = p->exit_code;
 		if (unlikely(!exit_code) ||
-		    unlikely(p->state > TASK_STOPPED))
+		    unlikely(p->state & TASK_TRACED))
 			goto bail_ref;
 		return wait_noreap_copyout(p, pid, uid,
 					   why, (exit_code << 8) | 0x7f,
-- 
cgit v1.2.3


From 788e05a67c343fa22f2ae1d3ca264e7f15c25eaf Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 7 Oct 2005 17:46:19 +0400
Subject: [PATCH] fix do_coredump() vs SIGSTOP race

Let's suppose we have 2 threads in thread group:
	A - does coredump
	B - has pending SIGSTOP

thread A						thread B

do_coredump:						get_signal_to_deliver:

  lock(->sighand)
  ->signal->flags = SIGNAL_GROUP_EXIT
  unlock(->sighand)

							lock(->sighand)
							signr = dequeue_signal()
								->signal->flags |= SIGNAL_STOP_DEQUEUED
								return SIGSTOP;

							do_signal_stop:
							    unlock(->sighand)

  coredump_wait:

      zap_threads:
          lock(tasklist_lock)
          send SIGKILL to B
              // signal_wake_up() does nothing
          unlock(tasklist_lock)

							    lock(tasklist_lock)
							    lock(->sighand)
							    re-check sig->flags & SIGNAL_STOP_DEQUEUED, yes
							    set_current_state(TASK_STOPPED);
							    finish_stop:
							        schedule();
							            // ->state == TASK_STOPPED

      wait_for_completion(&startup_done)
         // waits for complete() from B,
         // ->state == TASK_UNINTERRUPTIBLE

We can't wake up 'B' in any way:

	SIGCONT will be ignored because handle_stop_signal() sees
	->signal->flags & SIGNAL_GROUP_EXIT.

	sys_kill(SIGKILL)->__group_complete_signal() will choose
	uninterruptible 'A', so it can't help.

	sys_tkill(B, SIGKILL) will be ignored by specific_send_sig_info()
	because B already has pending SIGKILL.

This scenario is not possbile if 'A' does do_group_exit(), because
it sets sig->flags = SIGNAL_GROUP_EXIT and delivers SIGKILL to
subthreads atomically, holding both tasklist_lock and sighand->lock.
That means that do_signal_stop() will notice !SIGNAL_STOP_DEQUEUED
after re-locking ->sighand. And it is not possible to any other
thread to re-add SIGNAL_STOP_DEQUEUED later, because dequeue_signal()
can only return SIGKILL.

I think it is better to change do_coredump() to do sigaddset(SIGKILL)
and signal_wake_up() under sighand->lock, but this patch is much
simpler.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 619b027e92b5..c135f5aa2c2d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -578,7 +578,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
  		 * is to alert stop-signal processing code when another
  		 * processor has come along and cleared the flag.
  		 */
- 		tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ 		if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
+ 			tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
  	}
 	if ( signr &&
 	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
-- 
cgit v1.2.3


From dd0fc66fb33cd610bc1a5db8a5e232d34879b4d7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Fri, 7 Oct 2005 07:46:04 +0100
Subject: [PATCH] gfp flags annotations - part 1

 - added typedef unsigned int __nocast gfp_t;

 - replaced __nocast uses for gfp flags with gfp_t - it gives exactly
   the same warnings as far as sparse is concerned, doesn't change
   generated code (from gcc point of view we replaced unsigned int with
   typedef) and documents what's going on far better.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/cris/arch-v32/drivers/pci/dma.c |  2 +-
 arch/i386/kernel/pci-dma.c           |  2 +-
 arch/ppc64/kernel/bpa_iommu.c        |  2 +-
 arch/ppc64/kernel/dma.c              |  2 +-
 arch/ppc64/kernel/iommu.c            |  2 +-
 arch/ppc64/kernel/pci_direct_iommu.c |  2 +-
 arch/ppc64/kernel/pci_iommu.c        |  2 +-
 arch/ppc64/kernel/vio.c              |  2 +-
 drivers/atm/ambassador.c             |  2 +-
 drivers/atm/firestream.c             |  5 ++---
 drivers/atm/fore200e.c               |  2 +-
 drivers/base/dmapool.c               |  5 ++---
 drivers/block/pktcdvd.c              |  4 ++--
 drivers/bluetooth/bpa10x.c           |  2 +-
 drivers/bluetooth/hci_usb.c          |  2 +-
 drivers/connector/connector.c        |  3 +--
 drivers/ieee1394/raw1394.c           |  2 +-
 drivers/infiniband/core/mad.c        |  2 +-
 drivers/infiniband/core/sa_query.c   |  6 +++---
 drivers/md/dm-crypt.c                |  2 +-
 drivers/md/dm-io.c                   |  2 +-
 drivers/md/dm-raid1.c                |  2 +-
 drivers/md/multipath.c               |  2 +-
 drivers/md/raid1.c                   |  4 ++--
 drivers/md/raid10.c                  |  4 ++--
 drivers/net/bonding/bond_main.c      |  2 +-
 drivers/net/ns83820.c                |  2 +-
 drivers/net/sungem.h                 |  2 +-
 drivers/s390/scsi/zfcp_aux.c         |  2 +-
 fs/bio.c                             | 10 +++++-----
 fs/buffer.c                          |  2 +-
 fs/mpage.c                           |  2 +-
 fs/ntfs/malloc.h                     |  2 +-
 fs/posix_acl.c                       |  6 +++---
 fs/xfs/linux-2.6/kmem.c              | 10 +++++-----
 fs/xfs/linux-2.6/kmem.h              | 13 ++++++-------
 include/asm-generic/dma-mapping.h    |  4 ++--
 include/asm-i386/dma-mapping.h       |  2 +-
 include/asm-ppc/dma-mapping.h        |  2 +-
 include/asm-ppc64/dma-mapping.h      |  4 ++--
 include/asm-ppc64/iommu.h            |  2 +-
 include/linux/atmdev.h               |  2 +-
 include/linux/bio.h                  |  6 +++---
 include/linux/buffer_head.h          |  2 +-
 include/linux/connector.h            |  2 +-
 include/linux/cpuset.h               |  5 ++---
 include/linux/dmapool.h              |  2 +-
 include/linux/gfp.h                  | 14 +++++++-------
 include/linux/jbd.h                  |  2 +-
 include/linux/kfifo.h                |  4 ++--
 include/linux/mempool.h              |  9 ++++-----
 include/linux/netlink.h              |  2 +-
 include/linux/pagemap.h              |  2 +-
 include/linux/posix_acl.h            |  6 +++---
 include/linux/radix-tree.h           |  2 +-
 include/linux/security.h             |  6 ++----
 include/linux/skbuff.h               | 28 ++++++++++++++--------------
 include/linux/slab.h                 | 19 +++++++++----------
 include/linux/string.h               |  2 +-
 include/linux/swap.h                 |  2 +-
 include/linux/textsearch.h           |  2 +-
 include/linux/types.h                |  4 ++++
 include/linux/vmalloc.h              |  4 ++--
 include/net/bluetooth/bluetooth.h    |  2 +-
 include/net/bluetooth/rfcomm.h       |  2 +-
 include/net/dn_nsp.h                 |  8 ++++----
 include/net/dn_route.h               |  2 +-
 include/net/inet_connection_sock.h   |  2 +-
 include/net/ip_vs.h                  |  2 +-
 include/net/llc_conn.h               |  2 +-
 include/net/sctp/sctp.h              |  2 +-
 include/net/sctp/sm.h                | 10 +++++-----
 include/net/sctp/structs.h           | 24 ++++++++++++------------
 include/net/sctp/ulpevent.h          | 16 ++++++++--------
 include/net/sctp/ulpqueue.h          | 11 ++++-------
 include/net/sock.h                   | 16 ++++++++--------
 include/net/tcp.h                    |  3 +--
 include/net/xfrm.h                   |  2 +-
 include/rdma/ib_mad.h                |  2 +-
 include/rdma/ib_sa.h                 | 10 +++++-----
 include/rxrpc/call.h                 |  2 +-
 include/rxrpc/message.h              |  2 +-
 include/sound/core.h                 |  8 ++++----
 include/sound/driver.h               |  2 +-
 kernel/audit.c                       |  2 +-
 kernel/cpuset.c                      |  2 +-
 kernel/kfifo.c                       |  4 ++--
 kernel/signal.c                      |  2 +-
 lib/radix-tree.c                     |  2 +-
 lib/ts_bm.c                          |  2 +-
 lib/ts_fsm.c                         |  2 +-
 lib/ts_kmp.c                         |  2 +-
 mm/highmem.c                         |  2 +-
 mm/mempolicy.c                       |  8 ++++----
 mm/mempool.c                         |  6 +++---
 mm/nommu.c                           |  3 +--
 mm/oom_kill.c                        |  2 +-
 mm/page_alloc.c                      | 12 ++++++------
 mm/page_io.c                         |  2 +-
 mm/shmem.c                           |  3 +--
 mm/slab.c                            | 34 ++++++++++++++++------------------
 mm/swap_state.c                      |  2 +-
 mm/vmalloc.c                         |  4 ++--
 net/atm/atm_misc.c                   |  2 +-
 net/bluetooth/l2cap.c                |  2 +-
 net/bluetooth/rfcomm/core.c          |  2 +-
 net/bluetooth/rfcomm/sock.c          |  2 +-
 net/bluetooth/rfcomm/tty.c           |  2 +-
 net/bluetooth/sco.c                  |  2 +-
 net/core/dev.c                       |  2 +-
 net/core/skbuff.c                    | 14 +++++++-------
 net/core/sock.c                      | 10 +++++-----
 net/dccp/ackvec.c                    |  2 +-
 net/dccp/ackvec.h                    |  4 ++--
 net/dccp/ccids/lib/loss_interval.h   |  2 +-
 net/dccp/ccids/lib/packet_history.h  |  4 ++--
 net/decnet/af_decnet.c               |  6 ++----
 net/decnet/dn_nsp_out.c              | 20 +++++++++-----------
 net/ieee80211/ieee80211_tx.c         |  2 +-
 net/ipv4/inet_connection_sock.c      |  2 +-
 net/ipv4/ipvs/ip_vs_app.c            |  2 +-
 net/ipv4/tcp_output.c                |  2 +-
 net/key/af_key.c                     |  6 +++---
 net/llc/llc_conn.c                   |  3 +--
 net/netfilter/nfnetlink.c            |  3 +--
 net/netlink/af_netlink.c             |  4 ++--
 net/rxrpc/call.c                     |  2 +-
 net/rxrpc/connection.c               |  2 +-
 net/sctp/associola.c                 | 10 +++++-----
 net/sctp/bind_addr.c                 | 12 ++++++------
 net/sctp/chunk.c                     |  2 +-
 net/sctp/endpointola.c               |  5 ++---
 net/sctp/protocol.c                  |  2 +-
 net/sctp/sm_make_chunk.c             | 14 +++++++-------
 net/sctp/sm_sideeffect.c             | 12 ++++++------
 net/sctp/ssnmap.c                    |  2 +-
 net/sctp/transport.c                 |  4 ++--
 net/sctp/ulpevent.c                  | 18 +++++++++---------
 net/sctp/ulpqueue.c                  |  8 ++++----
 net/sunrpc/sched.c                   |  2 +-
 net/xfrm/xfrm_policy.c               |  2 +-
 sound/core/memalloc.c                |  2 +-
 sound/core/memory.c                  | 10 +++++-----
 sound/core/seq/instr/ainstr_iw.c     |  2 +-
 sound/core/wrappers.c                |  2 +-
 145 files changed, 340 insertions(+), 360 deletions(-)

(limited to 'kernel')

diff --git a/arch/cris/arch-v32/drivers/pci/dma.c b/arch/cris/arch-v32/drivers/pci/dma.c
index 10329306d23c..426b09878a05 100644
--- a/arch/cris/arch-v32/drivers/pci/dma.c
+++ b/arch/cris/arch-v32/drivers/pci/dma.c
@@ -24,7 +24,7 @@ struct dma_coherent_mem {
 };
 
 void *dma_alloc_coherent(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, unsigned int __nocast gfp)
+			   dma_addr_t *dma_handle, gfp_t gfp)
 {
 	void *ret;
 	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
diff --git a/arch/i386/kernel/pci-dma.c b/arch/i386/kernel/pci-dma.c
index 1e51427cc9eb..25fe66853934 100644
--- a/arch/i386/kernel/pci-dma.c
+++ b/arch/i386/kernel/pci-dma.c
@@ -23,7 +23,7 @@ struct dma_coherent_mem {
 };
 
 void *dma_alloc_coherent(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, unsigned int __nocast gfp)
+			   dma_addr_t *dma_handle, gfp_t gfp)
 {
 	void *ret;
 	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
diff --git a/arch/ppc64/kernel/bpa_iommu.c b/arch/ppc64/kernel/bpa_iommu.c
index 507eb9d0223f..5f2460090e03 100644
--- a/arch/ppc64/kernel/bpa_iommu.c
+++ b/arch/ppc64/kernel/bpa_iommu.c
@@ -310,7 +310,7 @@ static void bpa_map_iommu(void)
 
 
 static void *bpa_alloc_coherent(struct device *hwdev, size_t size,
-			   dma_addr_t *dma_handle, unsigned int __nocast flag)
+			   dma_addr_t *dma_handle, gfp_t flag)
 {
 	void *ret;
 
diff --git a/arch/ppc64/kernel/dma.c b/arch/ppc64/kernel/dma.c
index 4da8e31b2b61..7c3419656ccc 100644
--- a/arch/ppc64/kernel/dma.c
+++ b/arch/ppc64/kernel/dma.c
@@ -53,7 +53,7 @@ int dma_set_mask(struct device *dev, u64 dma_mask)
 EXPORT_SYMBOL(dma_set_mask);
 
 void *dma_alloc_coherent(struct device *dev, size_t size,
-		dma_addr_t *dma_handle, unsigned int __nocast flag)
+		dma_addr_t *dma_handle, gfp_t flag)
 {
 	struct dma_mapping_ops *dma_ops = get_dma_ops(dev);
 
diff --git a/arch/ppc64/kernel/iommu.c b/arch/ppc64/kernel/iommu.c
index 9032b6bfe036..4d9b4388918b 100644
--- a/arch/ppc64/kernel/iommu.c
+++ b/arch/ppc64/kernel/iommu.c
@@ -519,7 +519,7 @@ void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
  * to the dma address (mapping) of the first page.
  */
 void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
-		dma_addr_t *dma_handle, unsigned int __nocast flag)
+		dma_addr_t *dma_handle, gfp_t flag)
 {
 	void *ret = NULL;
 	dma_addr_t mapping;
diff --git a/arch/ppc64/kernel/pci_direct_iommu.c b/arch/ppc64/kernel/pci_direct_iommu.c
index b8f7f58824f4..54055c81017a 100644
--- a/arch/ppc64/kernel/pci_direct_iommu.c
+++ b/arch/ppc64/kernel/pci_direct_iommu.c
@@ -31,7 +31,7 @@
 #include "pci.h"
 
 static void *pci_direct_alloc_coherent(struct device *hwdev, size_t size,
-				   dma_addr_t *dma_handle, unsigned int __nocast flag)
+				   dma_addr_t *dma_handle, gfp_t flag)
 {
 	void *ret;
 
diff --git a/arch/ppc64/kernel/pci_iommu.c b/arch/ppc64/kernel/pci_iommu.c
index 14647e09c9cd..d9e33b7d4203 100644
--- a/arch/ppc64/kernel/pci_iommu.c
+++ b/arch/ppc64/kernel/pci_iommu.c
@@ -76,7 +76,7 @@ static inline struct iommu_table *devnode_table(struct device *dev)
  * to the dma address (mapping) of the first page.
  */
 static void *pci_iommu_alloc_coherent(struct device *hwdev, size_t size,
-			   dma_addr_t *dma_handle, unsigned int __nocast flag)
+			   dma_addr_t *dma_handle, gfp_t flag)
 {
 	return iommu_alloc_coherent(devnode_table(hwdev), size, dma_handle,
 			flag);
diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c
index c90e1dd875ce..0e555b7a6587 100644
--- a/arch/ppc64/kernel/vio.c
+++ b/arch/ppc64/kernel/vio.c
@@ -218,7 +218,7 @@ static void vio_unmap_sg(struct device *dev, struct scatterlist *sglist,
 }
 
 static void *vio_alloc_coherent(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, unsigned int __nocast flag)
+			   dma_addr_t *dma_handle, gfp_t flag)
 {
 	return iommu_alloc_coherent(to_vio_dev(dev)->iommu_table, size,
 			dma_handle, flag);
diff --git a/drivers/atm/ambassador.c b/drivers/atm/ambassador.c
index d74a7c5e75dd..4b6bf19c39c0 100644
--- a/drivers/atm/ambassador.c
+++ b/drivers/atm/ambassador.c
@@ -795,7 +795,7 @@ static void drain_rx_pools (amb_dev * dev) {
 }
 
 static inline void fill_rx_pool (amb_dev * dev, unsigned char pool,
-                                 unsigned int __nocast priority)
+                                 gfp_t priority)
 {
   rx_in rx;
   amb_rxq * rxq;
diff --git a/drivers/atm/firestream.c b/drivers/atm/firestream.c
index 58219744f5db..7f7ec288824d 100644
--- a/drivers/atm/firestream.c
+++ b/drivers/atm/firestream.c
@@ -1374,8 +1374,7 @@ static void reset_chip (struct fs_dev *dev)
 	}
 }
 
-static void __devinit *aligned_kmalloc (int size, unsigned int __nocast flags,
-					int alignment)
+static void __devinit *aligned_kmalloc (int size, gfp_t flags, int alignment)
 {
 	void  *t;
 
@@ -1466,7 +1465,7 @@ static inline int nr_buffers_in_freepool (struct fs_dev *dev, struct freepool *f
    working again after that...  -- REW */
 
 static void top_off_fp (struct fs_dev *dev, struct freepool *fp,
-			unsigned int __nocast gfp_flags)
+			gfp_t gfp_flags)
 {
 	struct FS_BPENTRY *qe, *ne;
 	struct sk_buff *skb;
diff --git a/drivers/atm/fore200e.c b/drivers/atm/fore200e.c
index 6f1a83c9d9e0..14f6a6201da3 100644
--- a/drivers/atm/fore200e.c
+++ b/drivers/atm/fore200e.c
@@ -178,7 +178,7 @@ fore200e_irq_itoa(int irq)
 
 
 static void*
-fore200e_kmalloc(int size, unsigned int __nocast flags)
+fore200e_kmalloc(int size, gfp_t flags)
 {
     void *chunk = kzalloc(size, flags);
 
diff --git a/drivers/base/dmapool.c b/drivers/base/dmapool.c
index 60a7ef6a201b..e2f64f91ed05 100644
--- a/drivers/base/dmapool.c
+++ b/drivers/base/dmapool.c
@@ -156,7 +156,7 @@ dma_pool_create (const char *name, struct device *dev,
 
 
 static struct dma_page *
-pool_alloc_page (struct dma_pool *pool, unsigned int __nocast mem_flags)
+pool_alloc_page (struct dma_pool *pool, gfp_t mem_flags)
 {
 	struct dma_page	*page;
 	int		mapsize;
@@ -262,8 +262,7 @@ dma_pool_destroy (struct dma_pool *pool)
  * If such a memory block can't be allocated, null is returned.
  */
 void *
-dma_pool_alloc (struct dma_pool *pool, unsigned int __nocast mem_flags,
-		dma_addr_t *handle)
+dma_pool_alloc (struct dma_pool *pool, gfp_t mem_flags, dma_addr_t *handle)
 {
 	unsigned long		flags;
 	struct dma_page		*page;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 7e22a58926b8..a280e679b1ca 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -229,7 +229,7 @@ static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets)
 	return 1;
 }
 
-static void *pkt_rb_alloc(unsigned int __nocast gfp_mask, void *data)
+static void *pkt_rb_alloc(gfp_t gfp_mask, void *data)
 {
 	return kmalloc(sizeof(struct pkt_rb_node), gfp_mask);
 }
@@ -2082,7 +2082,7 @@ static int pkt_close(struct inode *inode, struct file *file)
 }
 
 
-static void *psd_pool_alloc(unsigned int __nocast gfp_mask, void *data)
+static void *psd_pool_alloc(gfp_t gfp_mask, void *data)
 {
 	return kmalloc(sizeof(struct packet_stacked_data), gfp_mask);
 }
diff --git a/drivers/bluetooth/bpa10x.c b/drivers/bluetooth/bpa10x.c
index a1bf8f066c88..4fa85234d8b5 100644
--- a/drivers/bluetooth/bpa10x.c
+++ b/drivers/bluetooth/bpa10x.c
@@ -308,7 +308,7 @@ unlock:
 }
 
 static inline struct urb *bpa10x_alloc_urb(struct usb_device *udev, unsigned int pipe,
-					size_t size, unsigned int __nocast flags, void *data)
+					size_t size, gfp_t flags, void *data)
 {
 	struct urb *urb;
 	struct usb_ctrlrequest *cr;
diff --git a/drivers/bluetooth/hci_usb.c b/drivers/bluetooth/hci_usb.c
index 57c48bbf6fe6..6756cb20b753 100644
--- a/drivers/bluetooth/hci_usb.c
+++ b/drivers/bluetooth/hci_usb.c
@@ -132,7 +132,7 @@ static struct usb_device_id blacklist_ids[] = {
 	{ }	/* Terminating entry */
 };
 
-static struct _urb *_urb_alloc(int isoc, unsigned int __nocast gfp)
+static struct _urb *_urb_alloc(int isoc, gfp_t gfp)
 {
 	struct _urb *_urb = kmalloc(sizeof(struct _urb) +
 				sizeof(struct usb_iso_packet_descriptor) * isoc, gfp);
diff --git a/drivers/connector/connector.c b/drivers/connector/connector.c
index 1422285d537c..505677fb3157 100644
--- a/drivers/connector/connector.c
+++ b/drivers/connector/connector.c
@@ -69,8 +69,7 @@ int cn_already_initialized = 0;
  * a new message.
  *
  */
-int cn_netlink_send(struct cn_msg *msg, u32 __group,
-		    unsigned int __nocast gfp_mask)
+int cn_netlink_send(struct cn_msg *msg, u32 __group, gfp_t gfp_mask)
 {
 	struct cn_callback_entry *__cbq;
 	unsigned int size;
diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index 5fe4f2ba0979..315f5ca8bedb 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -98,7 +98,7 @@ static struct hpsb_address_ops arm_ops = {
 
 static void queue_complete_cb(struct pending_request *req);
 
-static struct pending_request *__alloc_pending_request(unsigned int __nocast flags)
+static struct pending_request *__alloc_pending_request(gfp_t flags)
 {
 	struct pending_request *req;
 
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index a4a4d9c1eef3..a14ca87fda18 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -783,7 +783,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 					    u32 remote_qpn, u16 pkey_index,
 					    struct ib_ah *ah, int rmpp_active,
 					    int hdr_len, int data_len,
-					    unsigned int __nocast gfp_mask)
+					    gfp_t gfp_mask)
 {
 	struct ib_mad_agent_private *mad_agent_priv;
 	struct ib_mad_send_buf *send_buf;
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 78de2dd1a4f2..262618210c1c 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -574,7 +574,7 @@ static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
 int ib_sa_path_rec_get(struct ib_device *device, u8 port_num,
 		       struct ib_sa_path_rec *rec,
 		       ib_sa_comp_mask comp_mask,
-		       int timeout_ms, unsigned int __nocast gfp_mask,
+		       int timeout_ms, gfp_t gfp_mask,
 		       void (*callback)(int status,
 					struct ib_sa_path_rec *resp,
 					void *context),
@@ -676,7 +676,7 @@ static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
 int ib_sa_service_rec_query(struct ib_device *device, u8 port_num, u8 method,
 			    struct ib_sa_service_rec *rec,
 			    ib_sa_comp_mask comp_mask,
-			    int timeout_ms, unsigned int __nocast gfp_mask,
+			    int timeout_ms, gfp_t gfp_mask,
 			    void (*callback)(int status,
 					     struct ib_sa_service_rec *resp,
 					     void *context),
@@ -759,7 +759,7 @@ int ib_sa_mcmember_rec_query(struct ib_device *device, u8 port_num,
 			     u8 method,
 			     struct ib_sa_mcmember_rec *rec,
 			     ib_sa_comp_mask comp_mask,
-			     int timeout_ms, unsigned int __nocast gfp_mask,
+			     int timeout_ms, gfp_t gfp_mask,
 			     void (*callback)(int status,
 					      struct ib_sa_mcmember_rec *resp,
 					      void *context),
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index b82bc3150476..b6148f6f7836 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -96,7 +96,7 @@ static kmem_cache_t *_crypt_io_pool;
 /*
  * Mempool alloc and free functions for the page
  */
-static void *mempool_alloc_page(unsigned int __nocast gfp_mask, void *data)
+static void *mempool_alloc_page(gfp_t gfp_mask, void *data)
 {
 	return alloc_page(gfp_mask);
 }
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 9de000131a8a..4809b209fbb1 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -32,7 +32,7 @@ struct io {
 static unsigned _num_ios;
 static mempool_t *_io_pool;
 
-static void *alloc_io(unsigned int __nocast gfp_mask, void *pool_data)
+static void *alloc_io(gfp_t gfp_mask, void *pool_data)
 {
 	return kmalloc(sizeof(struct io), gfp_mask);
 }
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 863282513753..2375709a392c 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -122,7 +122,7 @@ static inline sector_t region_to_sector(struct region_hash *rh, region_t region)
 /* FIXME move this */
 static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw);
 
-static void *region_alloc(unsigned int __nocast gfp_mask, void *pool_data)
+static void *region_alloc(gfp_t gfp_mask, void *pool_data)
 {
 	return kmalloc(sizeof(struct region), gfp_mask);
 }
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 286342375fb7..1151c3ed3006 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -38,7 +38,7 @@
 static mdk_personality_t multipath_personality;
 
 
-static void *mp_pool_alloc(unsigned int __nocast gfp_flags, void *data)
+static void *mp_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct multipath_bh *mpb;
 	mpb = kmalloc(sizeof(*mpb), gfp_flags);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a93ca478142a..0e1f148dd41d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,7 +52,7 @@ static mdk_personality_t raid1_personality;
 static void unplug_slaves(mddev_t *mddev);
 
 
-static void * r1bio_pool_alloc(unsigned int __nocast gfp_flags, void *data)
+static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
 	r1bio_t *r1_bio;
@@ -79,7 +79,7 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 #define RESYNC_WINDOW (2048*1024)
 
-static void * r1buf_pool_alloc(unsigned int __nocast gfp_flags, void *data)
+static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct pool_info *pi = data;
 	struct page *page;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 5bd1e9ec899d..28dd028415e4 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -47,7 +47,7 @@
 
 static void unplug_slaves(mddev_t *mddev);
 
-static void * r10bio_pool_alloc(unsigned int __nocast gfp_flags, void *data)
+static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
 	r10bio_t *r10_bio;
@@ -81,7 +81,7 @@ static void r10bio_pool_free(void *r10_bio, void *data)
  * one for write (we recover only one drive per r10buf)
  *
  */
-static void * r10buf_pool_alloc(unsigned int __nocast gfp_flags, void *data)
+static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	conf_t *conf = data;
 	struct page *page;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index f0a5b772a386..f264ff162979 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1290,7 +1290,7 @@ static void bond_mc_list_destroy(struct bonding *bond)
  * Copy all the Multicast addresses from src to the bonding device dst
  */
 static int bond_mc_list_copy(struct dev_mc_list *mc_list, struct bonding *bond,
-			     unsigned int __nocast gfp_flag)
+			     gfp_t gfp_flag)
 {
 	struct dev_mc_list *dmi, *new_dmi;
 
diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index 83334db2921c..e4811b42a6b7 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -584,7 +584,7 @@ static inline int ns83820_add_rx_skb(struct ns83820 *dev, struct sk_buff *skb)
 	return 0;
 }
 
-static inline int rx_refill(struct net_device *ndev, unsigned int __nocast gfp)
+static inline int rx_refill(struct net_device *ndev, gfp_t gfp)
 {
 	struct ns83820 *dev = PRIV(ndev);
 	unsigned i;
diff --git a/drivers/net/sungem.h b/drivers/net/sungem.h
index 16edbb1a4a7a..13006d759ad8 100644
--- a/drivers/net/sungem.h
+++ b/drivers/net/sungem.h
@@ -1036,7 +1036,7 @@ struct gem {
 #define ALIGNED_RX_SKB_ADDR(addr) \
         ((((unsigned long)(addr) + (64UL - 1UL)) & ~(64UL - 1UL)) - (unsigned long)(addr))
 static __inline__ struct sk_buff *gem_alloc_skb(int size,
-						unsigned int __nocast gfp_flags)
+						gfp_t gfp_flags)
 {
 	struct sk_buff *skb = alloc_skb(size + 64, gfp_flags);
 
diff --git a/drivers/s390/scsi/zfcp_aux.c b/drivers/s390/scsi/zfcp_aux.c
index 0b5087f7cabc..cab098556b44 100644
--- a/drivers/s390/scsi/zfcp_aux.c
+++ b/drivers/s390/scsi/zfcp_aux.c
@@ -833,7 +833,7 @@ zfcp_unit_dequeue(struct zfcp_unit *unit)
 }
 
 static void *
-zfcp_mempool_alloc(unsigned int __nocast gfp_mask, void *size)
+zfcp_mempool_alloc(gfp_t gfp_mask, void *size)
 {
 	return kmalloc((size_t) size, gfp_mask);
 }
diff --git a/fs/bio.c b/fs/bio.c
index 83a349574567..7d81a93afd48 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -75,7 +75,7 @@ struct bio_set {
  */
 static struct bio_set *fs_bio_set;
 
-static inline struct bio_vec *bvec_alloc_bs(unsigned int __nocast gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+static inline struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
 {
 	struct bio_vec *bvl;
 	struct biovec_slab *bp;
@@ -155,7 +155,7 @@ inline void bio_init(struct bio *bio)
  *   allocate bio and iovecs from the memory pools specified by the
  *   bio_set structure.
  **/
-struct bio *bio_alloc_bioset(unsigned int __nocast gfp_mask, int nr_iovecs, struct bio_set *bs)
+struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
 	struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask);
 
@@ -181,7 +181,7 @@ out:
 	return bio;
 }
 
-struct bio *bio_alloc(unsigned int __nocast gfp_mask, int nr_iovecs)
+struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs)
 {
 	struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, fs_bio_set);
 
@@ -277,7 +277,7 @@ inline void __bio_clone(struct bio *bio, struct bio *bio_src)
  *
  * 	Like __bio_clone, only also allocates the returned bio
  */
-struct bio *bio_clone(struct bio *bio, unsigned int __nocast gfp_mask)
+struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask)
 {
 	struct bio *b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, fs_bio_set);
 
@@ -1078,7 +1078,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	return bp;
 }
 
-static void *bio_pair_alloc(unsigned int __nocast gfp_flags, void *data)
+static void *bio_pair_alloc(gfp_t gfp_flags, void *data)
 {
 	return kmalloc(sizeof(struct bio_pair), gfp_flags);
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index 6cbfceabd95d..1216c0d3c8ce 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3045,7 +3045,7 @@ static void recalc_bh_state(void)
 	buffer_heads_over_limit = (tot > max_buffer_heads);
 }
 	
-struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags)
+struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
 {
 	struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
 	if (ret) {
diff --git a/fs/mpage.c b/fs/mpage.c
index bb9aebe93862..c5adcdddf3cc 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -102,7 +102,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 static struct bio *
 mpage_alloc(struct block_device *bdev,
 		sector_t first_sector, int nr_vecs,
-		unsigned int __nocast gfp_flags)
+		gfp_t gfp_flags)
 {
 	struct bio *bio;
 
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
index 006946efca8c..590887b943f5 100644
--- a/fs/ntfs/malloc.h
+++ b/fs/ntfs/malloc.h
@@ -40,7 +40,7 @@
  * Depending on @gfp_mask the allocation may be guaranteed to succeed.
  */
 static inline void *__ntfs_malloc(unsigned long size,
-		unsigned int __nocast gfp_mask)
+		gfp_t gfp_mask)
 {
 	if (likely(size <= PAGE_SIZE)) {
 		BUG_ON(!size);
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 296480e96dd5..6c8dcf7613fd 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -35,7 +35,7 @@ EXPORT_SYMBOL(posix_acl_permission);
  * Allocate a new ACL with the specified number of entries.
  */
 struct posix_acl *
-posix_acl_alloc(int count, unsigned int __nocast flags)
+posix_acl_alloc(int count, gfp_t flags)
 {
 	const size_t size = sizeof(struct posix_acl) +
 	                    count * sizeof(struct posix_acl_entry);
@@ -51,7 +51,7 @@ posix_acl_alloc(int count, unsigned int __nocast flags)
  * Clone an ACL.
  */
 struct posix_acl *
-posix_acl_clone(const struct posix_acl *acl, unsigned int __nocast flags)
+posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
 {
 	struct posix_acl *clone = NULL;
 
@@ -185,7 +185,7 @@ posix_acl_equiv_mode(const struct posix_acl *acl, mode_t *mode_p)
  * Create an ACL representing the file mode permission bits of an inode.
  */
 struct posix_acl *
-posix_acl_from_mode(mode_t mode, unsigned int __nocast flags)
+posix_acl_from_mode(mode_t mode, gfp_t flags)
 {
 	struct posix_acl *acl = posix_acl_alloc(3, flags);
 	if (!acl)
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index 4b184559f231..d2653b589b1c 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -45,7 +45,7 @@
 
 
 void *
-kmem_alloc(size_t size, unsigned int __nocast flags)
+kmem_alloc(size_t size, gfp_t flags)
 {
 	int		retries = 0;
 	unsigned int	lflags = kmem_flags_convert(flags);
@@ -67,7 +67,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 }
 
 void *
-kmem_zalloc(size_t size, unsigned int __nocast flags)
+kmem_zalloc(size_t size, gfp_t flags)
 {
 	void	*ptr;
 
@@ -90,7 +90,7 @@ kmem_free(void *ptr, size_t size)
 
 void *
 kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
-	     unsigned int __nocast flags)
+	     gfp_t flags)
 {
 	void	*new;
 
@@ -105,7 +105,7 @@ kmem_realloc(void *ptr, size_t newsize, size_t oldsize,
 }
 
 void *
-kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_alloc(kmem_zone_t *zone, gfp_t flags)
 {
 	int		retries = 0;
 	unsigned int	lflags = kmem_flags_convert(flags);
@@ -124,7 +124,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 }
 
 void *
-kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_zalloc(kmem_zone_t *zone, gfp_t flags)
 {
 	void	*ptr;
 
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
index 109fcf27e256..ee7010f085bc 100644
--- a/fs/xfs/linux-2.6/kmem.h
+++ b/fs/xfs/linux-2.6/kmem.h
@@ -81,7 +81,7 @@ typedef unsigned long xfs_pflags_t;
 	*(NSTATEP) = *(OSTATEP);	\
 } while (0)
 
-static __inline unsigned int kmem_flags_convert(unsigned int __nocast flags)
+static __inline unsigned int kmem_flags_convert(gfp_t flags)
 {
 	unsigned int	lflags = __GFP_NOWARN;	/* we'll report problems, if need be */
 
@@ -125,13 +125,12 @@ kmem_zone_destroy(kmem_zone_t *zone)
 		BUG();
 }
 
-extern void	    *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-extern void	    *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
+extern void	    *kmem_zone_zalloc(kmem_zone_t *, gfp_t);
+extern void	    *kmem_zone_alloc(kmem_zone_t *, gfp_t);
 
-extern void	    *kmem_alloc(size_t, unsigned int __nocast);
-extern void	    *kmem_realloc(void *, size_t, size_t,
-				  unsigned int __nocast);
-extern void	    *kmem_zalloc(size_t, unsigned int __nocast);
+extern void	    *kmem_alloc(size_t, gfp_t);
+extern void	    *kmem_realloc(void *, size_t, size_t, gfp_t);
+extern void	    *kmem_zalloc(size_t, gfp_t);
 extern void         kmem_free(void *, size_t);
 
 typedef struct shrinker *kmem_shaker_t;
diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index 8cef663c5cd9..747d790295f3 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -35,7 +35,7 @@ dma_set_mask(struct device *dev, u64 dma_mask)
 
 static inline void *
 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		   unsigned int __nocast flag)
+		   gfp_t flag)
 {
 	BUG_ON(dev->bus != &pci_bus_type);
 
@@ -168,7 +168,7 @@ dma_set_mask(struct device *dev, u64 dma_mask)
 
 static inline void *
 dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
-		   unsigned int __nocast flag)
+		   gfp_t flag)
 {
 	BUG();
 	return NULL;
diff --git a/include/asm-i386/dma-mapping.h b/include/asm-i386/dma-mapping.h
index 563964b2995b..e56c335f8ef9 100644
--- a/include/asm-i386/dma-mapping.h
+++ b/include/asm-i386/dma-mapping.h
@@ -11,7 +11,7 @@
 #define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
 
 void *dma_alloc_coherent(struct device *dev, size_t size,
-			   dma_addr_t *dma_handle, unsigned int __nocast flag);
+			   dma_addr_t *dma_handle, gfp_t flag);
 
 void dma_free_coherent(struct device *dev, size_t size,
 			 void *vaddr, dma_addr_t dma_handle);
diff --git a/include/asm-ppc/dma-mapping.h b/include/asm-ppc/dma-mapping.h
index 92b8ee78dcc2..061bfcac1bf1 100644
--- a/include/asm-ppc/dma-mapping.h
+++ b/include/asm-ppc/dma-mapping.h
@@ -61,7 +61,7 @@ static inline int dma_set_mask(struct device *dev, u64 dma_mask)
 
 static inline void *dma_alloc_coherent(struct device *dev, size_t size,
 				       dma_addr_t * dma_handle,
-				       unsigned int __nocast gfp)
+				       gfp_t gfp)
 {
 #ifdef CONFIG_NOT_COHERENT_CACHE
 	return __dma_alloc_coherent(size, dma_handle, gfp);
diff --git a/include/asm-ppc64/dma-mapping.h b/include/asm-ppc64/dma-mapping.h
index 9ad8adee0067..fb68fa23bea8 100644
--- a/include/asm-ppc64/dma-mapping.h
+++ b/include/asm-ppc64/dma-mapping.h
@@ -19,7 +19,7 @@
 extern int dma_supported(struct device *dev, u64 mask);
 extern int dma_set_mask(struct device *dev, u64 dma_mask);
 extern void *dma_alloc_coherent(struct device *dev, size_t size,
-		dma_addr_t *dma_handle, unsigned int __nocast flag);
+		dma_addr_t *dma_handle, gfp_t flag);
 extern void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
 		dma_addr_t dma_handle);
 extern dma_addr_t dma_map_single(struct device *dev, void *cpu_addr,
@@ -118,7 +118,7 @@ dma_cache_sync(void *vaddr, size_t size,
  */
 struct dma_mapping_ops {
 	void *		(*alloc_coherent)(struct device *dev, size_t size,
-				dma_addr_t *dma_handle, unsigned int __nocast flag);
+				dma_addr_t *dma_handle, gfp_t flag);
 	void		(*free_coherent)(struct device *dev, size_t size,
 				void *vaddr, dma_addr_t dma_handle);
 	dma_addr_t	(*map_single)(struct device *dev, void *ptr,
diff --git a/include/asm-ppc64/iommu.h b/include/asm-ppc64/iommu.h
index 72dcf8116b04..c2f3b6e8a42f 100644
--- a/include/asm-ppc64/iommu.h
+++ b/include/asm-ppc64/iommu.h
@@ -122,7 +122,7 @@ extern void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 		int nelems, enum dma_data_direction direction);
 
 extern void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
-		dma_addr_t *dma_handle, unsigned int __nocast flag);
+		dma_addr_t *dma_handle, gfp_t flag);
 extern void iommu_free_coherent(struct iommu_table *tbl, size_t size,
 		void *vaddr, dma_addr_t dma_handle);
 extern dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index aca9b344bd35..e7d0593bb576 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -467,7 +467,7 @@ static inline void atm_dev_put(struct atm_dev *dev)
 
 int atm_charge(struct atm_vcc *vcc,int truesize);
 struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size,
-    unsigned int __nocast gfp_flags);
+    gfp_t gfp_flags);
 int atm_pcr_goal(struct atm_trafprm *tp);
 
 void vcc_release_async(struct atm_vcc *vcc, int reply);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 6e1c79c8b6bf..3344b4e8e43a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -276,8 +276,8 @@ extern void bio_pair_release(struct bio_pair *dbio);
 extern struct bio_set *bioset_create(int, int, int);
 extern void bioset_free(struct bio_set *);
 
-extern struct bio *bio_alloc(unsigned int __nocast, int);
-extern struct bio *bio_alloc_bioset(unsigned int __nocast, int, struct bio_set *);
+extern struct bio *bio_alloc(gfp_t, int);
+extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
 extern void bio_put(struct bio *);
 extern void bio_free(struct bio *, struct bio_set *);
 
@@ -287,7 +287,7 @@ extern int bio_phys_segments(struct request_queue *, struct bio *);
 extern int bio_hw_segments(struct request_queue *, struct bio *);
 
 extern void __bio_clone(struct bio *, struct bio *);
-extern struct bio *bio_clone(struct bio *, unsigned int __nocast);
+extern struct bio *bio_clone(struct bio *, gfp_t);
 
 extern void bio_init(struct bio *);
 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 90828493791f..6a1d154c0825 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -172,7 +172,7 @@ void __brelse(struct buffer_head *);
 void __bforget(struct buffer_head *);
 void __breadahead(struct block_device *, sector_t block, int size);
 struct buffer_head *__bread(struct block_device *, sector_t block, int size);
-struct buffer_head *alloc_buffer_head(unsigned int __nocast gfp_flags);
+struct buffer_head *alloc_buffer_head(gfp_t gfp_flags);
 void free_buffer_head(struct buffer_head * bh);
 void FASTCALL(unlock_buffer(struct buffer_head *bh));
 void FASTCALL(__lock_buffer(struct buffer_head *bh));
diff --git a/include/linux/connector.h b/include/linux/connector.h
index 96582c9911ac..95952cc1f525 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -149,7 +149,7 @@ struct cn_dev {
 
 int cn_add_callback(struct cb_id *, char *, void (*callback) (void *));
 void cn_del_callback(struct cb_id *);
-int cn_netlink_send(struct cn_msg *, u32, unsigned int __nocast);
+int cn_netlink_send(struct cn_msg *, u32, gfp_t);
 
 int cn_queue_add_callback(struct cn_queue_dev *dev, char *name, struct cb_id *id, void (*callback)(void *));
 void cn_queue_del_callback(struct cn_queue_dev *dev, struct cb_id *id);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 24062a1dbf61..6e2deef96b34 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -23,7 +23,7 @@ void cpuset_init_current_mems_allowed(void);
 void cpuset_update_current_mems_allowed(void);
 void cpuset_restrict_to_mems_allowed(unsigned long *nodes);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
-extern int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask);
+extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
@@ -49,8 +49,7 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 1;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z,
-					unsigned int __nocast gfp_mask)
+static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	return 1;
 }
diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h
index 4932ee5c77f0..76f12f46db7f 100644
--- a/include/linux/dmapool.h
+++ b/include/linux/dmapool.h
@@ -19,7 +19,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
 
 void dma_pool_destroy(struct dma_pool *pool);
 
-void *dma_pool_alloc(struct dma_pool *pool, unsigned int __nocast mem_flags,
+void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
 		     dma_addr_t *handle);
 
 void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 4dc990f3b5cc..3010e172394d 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -85,9 +85,9 @@ static inline void arch_free_page(struct page *page, int order) { }
 #endif
 
 extern struct page *
-FASTCALL(__alloc_pages(unsigned int, unsigned int, struct zonelist *));
+FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
 
-static inline struct page *alloc_pages_node(int nid, unsigned int __nocast gfp_mask,
+static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
 	if (unlikely(order >= MAX_ORDER))
@@ -98,17 +98,17 @@ static inline struct page *alloc_pages_node(int nid, unsigned int __nocast gfp_m
 }
 
 #ifdef CONFIG_NUMA
-extern struct page *alloc_pages_current(unsigned int __nocast gfp_mask, unsigned order);
+extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order);
 
 static inline struct page *
-alloc_pages(unsigned int __nocast gfp_mask, unsigned int order)
+alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	if (unlikely(order >= MAX_ORDER))
 		return NULL;
 
 	return alloc_pages_current(gfp_mask, order);
 }
-extern struct page *alloc_page_vma(unsigned __nocast gfp_mask,
+extern struct page *alloc_page_vma(gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr);
 #else
 #define alloc_pages(gfp_mask, order) \
@@ -117,8 +117,8 @@ extern struct page *alloc_page_vma(unsigned __nocast gfp_mask,
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 
-extern unsigned long FASTCALL(__get_free_pages(unsigned int __nocast gfp_mask, unsigned int order));
-extern unsigned long FASTCALL(get_zeroed_page(unsigned int __nocast gfp_mask));
+extern unsigned long FASTCALL(__get_free_pages(gfp_t gfp_mask, unsigned int order));
+extern unsigned long FASTCALL(get_zeroed_page(gfp_t gfp_mask));
 
 #define __get_free_page(gfp_mask) \
 		__get_free_pages((gfp_mask),0)
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index de097269bd7f..ff853b3173c6 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -935,7 +935,7 @@ void journal_put_journal_head(struct journal_head *jh);
  */
 extern kmem_cache_t *jbd_handle_cache;
 
-static inline handle_t *jbd_alloc_handle(unsigned int __nocast gfp_flags)
+static inline handle_t *jbd_alloc_handle(gfp_t gfp_flags)
 {
 	return kmem_cache_alloc(jbd_handle_cache, gfp_flags);
 }
diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index c27cd428d269..48eccd865bd8 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -35,8 +35,8 @@ struct kfifo {
 };
 
 extern struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
-				unsigned int __nocast gfp_mask, spinlock_t *lock);
-extern struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask,
+				gfp_t gfp_mask, spinlock_t *lock);
+extern struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask,
 				 spinlock_t *lock);
 extern void kfifo_free(struct kfifo *fifo);
 extern unsigned int __kfifo_put(struct kfifo *fifo,
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 796220ce47cc..f2427d7394b0 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -6,7 +6,7 @@
 
 #include <linux/wait.h>
 
-typedef void * (mempool_alloc_t)(unsigned int __nocast gfp_mask, void *pool_data);
+typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data);
 typedef void (mempool_free_t)(void *element, void *pool_data);
 
 typedef struct mempool_s {
@@ -26,17 +26,16 @@ extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
 extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 			mempool_free_t *free_fn, void *pool_data, int nid);
 
-extern int mempool_resize(mempool_t *pool, int new_min_nr,
-			unsigned int __nocast gfp_mask);
+extern int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask);
 extern void mempool_destroy(mempool_t *pool);
-extern void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask);
+extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
 extern void mempool_free(void *element, mempool_t *pool);
 
 /*
  * A mempool_alloc_t and mempool_free_t that get the memory from
  * a slab that is passed in through pool_data.
  */
-void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data);
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
 
 #endif /* _LINUX_MEMPOOL_H */
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index bdebdc564506..ba25ca874c20 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -131,7 +131,7 @@ extern struct sock *netlink_kernel_create(int unit, unsigned int groups, void (*
 extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err);
 extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock);
 extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid,
-			     __u32 group, unsigned int __nocast allocation);
+			     __u32 group, gfp_t allocation);
 extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code);
 extern int netlink_register_notifier(struct notifier_block *nb);
 extern int netlink_unregister_notifier(struct notifier_block *nb);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index d9a25647a295..acbf31c154f8 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -19,7 +19,7 @@
 #define	AS_EIO		(__GFP_BITS_SHIFT + 0)	/* IO error on async write */
 #define AS_ENOSPC	(__GFP_BITS_SHIFT + 1)	/* ENOSPC on async write */
 
-static inline unsigned int __nocast mapping_gfp_mask(struct address_space * mapping)
+static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 {
 	return mapping->flags & __GFP_BITS_MASK;
 }
diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h
index 4caedddaa033..4bc241290c24 100644
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -71,11 +71,11 @@ posix_acl_release(struct posix_acl *acl)
 
 /* posix_acl.c */
 
-extern struct posix_acl *posix_acl_alloc(int, unsigned int __nocast);
-extern struct posix_acl *posix_acl_clone(const struct posix_acl *, unsigned int __nocast);
+extern struct posix_acl *posix_acl_alloc(int, gfp_t);
+extern struct posix_acl *posix_acl_clone(const struct posix_acl *, gfp_t);
 extern int posix_acl_valid(const struct posix_acl *);
 extern int posix_acl_permission(struct inode *, const struct posix_acl *, int);
-extern struct posix_acl *posix_acl_from_mode(mode_t, unsigned int __nocast);
+extern struct posix_acl *posix_acl_from_mode(mode_t, gfp_t);
 extern int posix_acl_equiv_mode(const struct posix_acl *, mode_t *);
 extern int posix_acl_create_masq(struct posix_acl *, mode_t *);
 extern int posix_acl_chmod_masq(struct posix_acl *, mode_t);
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 9c51917b1cce..045d4761febc 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -50,7 +50,7 @@ void *radix_tree_delete(struct radix_tree_root *, unsigned long);
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
-int radix_tree_preload(unsigned int __nocast gfp_mask);
+int radix_tree_preload(gfp_t gfp_mask);
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, int tag);
diff --git a/include/linux/security.h b/include/linux/security.h
index 0e43460d374e..627382e74057 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2634,8 +2634,7 @@ static inline int security_socket_getpeersec(struct socket *sock, char __user *o
 	return security_ops->socket_getpeersec(sock, optval, optlen, len);
 }
 
-static inline int security_sk_alloc(struct sock *sk, int family,
-				    unsigned int __nocast priority)
+static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
 {
 	return security_ops->sk_alloc_security(sk, family, priority);
 }
@@ -2752,8 +2751,7 @@ static inline int security_socket_getpeersec(struct socket *sock, char __user *o
 	return -ENOPROTOOPT;
 }
 
-static inline int security_sk_alloc(struct sock *sk, int family,
-				    unsigned int __nocast priority)
+static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority)
 {
 	return 0;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 466c879f82b8..8f5d9e7f8734 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -302,37 +302,37 @@ struct sk_buff {
 
 extern void	       __kfree_skb(struct sk_buff *skb);
 extern struct sk_buff *__alloc_skb(unsigned int size,
-				   unsigned int __nocast priority, int fclone);
+				   gfp_t priority, int fclone);
 static inline struct sk_buff *alloc_skb(unsigned int size,
-					unsigned int __nocast priority)
+					gfp_t priority)
 {
 	return __alloc_skb(size, priority, 0);
 }
 
 static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
-					       unsigned int __nocast priority)
+					       gfp_t priority)
 {
 	return __alloc_skb(size, priority, 1);
 }
 
 extern struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
 					    unsigned int size,
-					    unsigned int __nocast priority);
+					    gfp_t priority);
 extern void	       kfree_skbmem(struct sk_buff *skb);
 extern struct sk_buff *skb_clone(struct sk_buff *skb,
-				 unsigned int __nocast priority);
+				 gfp_t priority);
 extern struct sk_buff *skb_copy(const struct sk_buff *skb,
-				unsigned int __nocast priority);
+				gfp_t priority);
 extern struct sk_buff *pskb_copy(struct sk_buff *skb,
-				 unsigned int __nocast gfp_mask);
+				 gfp_t gfp_mask);
 extern int	       pskb_expand_head(struct sk_buff *skb,
 					int nhead, int ntail,
-					unsigned int __nocast gfp_mask);
+					gfp_t gfp_mask);
 extern struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
 					    unsigned int headroom);
 extern struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 				       int newheadroom, int newtailroom,
-				       unsigned int __nocast priority);
+				       gfp_t priority);
 extern struct sk_buff *		skb_pad(struct sk_buff *skb, int pad);
 #define dev_kfree_skb(a)	kfree_skb(a)
 extern void	      skb_over_panic(struct sk_buff *skb, int len,
@@ -484,7 +484,7 @@ static inline int skb_shared(const struct sk_buff *skb)
  *	NULL is returned on a memory allocation failure.
  */
 static inline struct sk_buff *skb_share_check(struct sk_buff *skb,
-					      unsigned int __nocast pri)
+					      gfp_t pri)
 {
 	might_sleep_if(pri & __GFP_WAIT);
 	if (skb_shared(skb)) {
@@ -516,7 +516,7 @@ static inline struct sk_buff *skb_share_check(struct sk_buff *skb,
  *	%NULL is returned on a memory allocation failure.
  */
 static inline struct sk_buff *skb_unshare(struct sk_buff *skb,
-					  unsigned int __nocast pri)
+					  gfp_t pri)
 {
 	might_sleep_if(pri & __GFP_WAIT);
 	if (skb_cloned(skb)) {
@@ -1017,7 +1017,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
  *	%NULL is returned in there is no free memory.
  */
 static inline struct sk_buff *__dev_alloc_skb(unsigned int length,
-					      unsigned int __nocast gfp_mask)
+					      gfp_t gfp_mask)
 {
 	struct sk_buff *skb = alloc_skb(length + 16, gfp_mask);
 	if (likely(skb))
@@ -1130,8 +1130,8 @@ static inline int skb_can_coalesce(struct sk_buff *skb, int i,
  *	If there is no free memory -ENOMEM is returned, otherwise zero
  *	is returned and the old skb data released.
  */
-extern int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp);
-static inline int skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp)
+extern int __skb_linearize(struct sk_buff *skb, gfp_t gfp);
+static inline int skb_linearize(struct sk_buff *skb, gfp_t gfp)
 {
 	return __skb_linearize(skb, gfp);
 }
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1f356f3bbc64..5fc04a16ecb0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -61,11 +61,11 @@ extern kmem_cache_t *kmem_cache_create(const char *, size_t, size_t, unsigned lo
 				       void (*)(void *, kmem_cache_t *, unsigned long));
 extern int kmem_cache_destroy(kmem_cache_t *);
 extern int kmem_cache_shrink(kmem_cache_t *);
-extern void *kmem_cache_alloc(kmem_cache_t *, unsigned int __nocast);
+extern void *kmem_cache_alloc(kmem_cache_t *, gfp_t);
 extern void kmem_cache_free(kmem_cache_t *, void *);
 extern unsigned int kmem_cache_size(kmem_cache_t *);
 extern const char *kmem_cache_name(kmem_cache_t *);
-extern kmem_cache_t *kmem_find_general_cachep(size_t size, unsigned int __nocast gfpflags);
+extern kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags);
 
 /* Size description struct for general caches. */
 struct cache_sizes {
@@ -74,9 +74,9 @@ struct cache_sizes {
 	kmem_cache_t	*cs_dmacachep;
 };
 extern struct cache_sizes malloc_sizes[];
-extern void *__kmalloc(size_t, unsigned int __nocast);
+extern void *__kmalloc(size_t, gfp_t);
 
-static inline void *kmalloc(size_t size, unsigned int __nocast flags)
+static inline void *kmalloc(size_t size, gfp_t flags)
 {
 	if (__builtin_constant_p(size)) {
 		int i = 0;
@@ -99,7 +99,7 @@ found:
 	return __kmalloc(size, flags);
 }
 
-extern void *kzalloc(size_t, unsigned int __nocast);
+extern void *kzalloc(size_t, gfp_t);
 
 /**
  * kcalloc - allocate memory for an array. The memory is set to zero.
@@ -107,7 +107,7 @@ extern void *kzalloc(size_t, unsigned int __nocast);
  * @size: element size.
  * @flags: the type of memory to allocate.
  */
-static inline void *kcalloc(size_t n, size_t size, unsigned int __nocast flags)
+static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 {
 	if (n != 0 && size > INT_MAX / n)
 		return NULL;
@@ -118,15 +118,14 @@ extern void kfree(const void *);
 extern unsigned int ksize(const void *);
 
 #ifdef CONFIG_NUMA
-extern void *kmem_cache_alloc_node(kmem_cache_t *,
-			unsigned int __nocast flags, int node);
-extern void *kmalloc_node(size_t size, unsigned int __nocast flags, int node);
+extern void *kmem_cache_alloc_node(kmem_cache_t *, gfp_t flags, int node);
+extern void *kmalloc_node(size_t size, gfp_t flags, int node);
 #else
 static inline void *kmem_cache_alloc_node(kmem_cache_t *cachep, int flags, int node)
 {
 	return kmem_cache_alloc(cachep, flags);
 }
-static inline void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
+static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	return kmalloc(size, flags);
 }
diff --git a/include/linux/string.h b/include/linux/string.h
index dab2652acbd8..369be3264a55 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -88,7 +88,7 @@ extern int memcmp(const void *,const void *,__kernel_size_t);
 extern void * memchr(const void *,int,__kernel_size_t);
 #endif
 
-extern char *kstrdup(const char *s, unsigned int __nocast gfp);
+extern char *kstrdup(const char *s, gfp_t gfp);
 
 #ifdef __cplusplus
 }
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3c9ff0048153..a7bf1a3b1496 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -147,7 +147,7 @@ struct swap_list_t {
 #define vm_swap_full() (nr_swap_pages*2 < total_swap_pages)
 
 /* linux/mm/oom_kill.c */
-extern void out_of_memory(unsigned int __nocast gfp_mask, int order);
+extern void out_of_memory(gfp_t gfp_mask, int order);
 
 /* linux/mm/memory.c */
 extern void swapin_readahead(swp_entry_t, unsigned long, struct vm_area_struct *);
diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h
index 1a4990e448e9..515046d1b2f4 100644
--- a/include/linux/textsearch.h
+++ b/include/linux/textsearch.h
@@ -159,7 +159,7 @@ extern unsigned int textsearch_find_continuous(struct ts_config *,
 #define TS_PRIV_ALIGN(len) (((len) + TS_PRIV_ALIGNTO-1) & ~(TS_PRIV_ALIGNTO-1))
 
 static inline struct ts_config *alloc_ts_config(size_t payload,
-						unsigned int __nocast gfp_mask)
+						gfp_t gfp_mask)
 {
 	struct ts_config *conf;
 
diff --git a/include/linux/types.h b/include/linux/types.h
index 2b678c22ca4a..0aee34f9da9f 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -165,6 +165,10 @@ typedef __u64 __bitwise __le64;
 typedef __u64 __bitwise __be64;
 #endif
 
+#ifdef __KERNEL__
+typedef unsigned __nocast gfp_t;
+#endif
+
 struct ustat {
 	__kernel_daddr_t	f_tfree;
 	__kernel_ino_t		f_tinode;
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index b244f69ef682..3701a0673d2c 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -34,8 +34,8 @@ struct vm_struct {
 extern void *vmalloc(unsigned long size);
 extern void *vmalloc_exec(unsigned long size);
 extern void *vmalloc_32(unsigned long size);
-extern void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t prot);
-extern void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgprot_t prot);
+extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
+extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot);
 extern void vfree(void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 6dfa4a61ffd0..210458624840 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -136,7 +136,7 @@ struct bt_skb_cb {
 };
 #define bt_cb(skb) ((struct bt_skb_cb *)(skb->cb)) 
 
-static inline struct sk_buff *bt_skb_alloc(unsigned int len, unsigned int __nocast how)
+static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how)
 {
 	struct sk_buff *skb;
 
diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h
index ffea9d54071f..fbe557f7ea1d 100644
--- a/include/net/bluetooth/rfcomm.h
+++ b/include/net/bluetooth/rfcomm.h
@@ -230,7 +230,7 @@ int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci,
 			u8 xon_char, u8 xoff_char, u16 param_mask);
 
 /* ---- RFCOMM DLCs (channels) ---- */
-struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio);
+struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio);
 void rfcomm_dlc_free(struct rfcomm_dlc *d);
 int  rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel);
 int  rfcomm_dlc_close(struct rfcomm_dlc *d, int reason);
diff --git a/include/net/dn_nsp.h b/include/net/dn_nsp.h
index 8a0891e2e888..1ba03be0af3a 100644
--- a/include/net/dn_nsp.h
+++ b/include/net/dn_nsp.h
@@ -19,9 +19,9 @@ extern void dn_nsp_send_data_ack(struct sock *sk);
 extern void dn_nsp_send_oth_ack(struct sock *sk);
 extern void dn_nsp_delayed_ack(struct sock *sk);
 extern void dn_send_conn_ack(struct sock *sk);
-extern void dn_send_conn_conf(struct sock *sk, unsigned int __nocast gfp);
+extern void dn_send_conn_conf(struct sock *sk, gfp_t gfp);
 extern void dn_nsp_send_disc(struct sock *sk, unsigned char type, 
-			unsigned short reason, unsigned int __nocast gfp);
+			unsigned short reason, gfp_t gfp);
 extern void dn_nsp_return_disc(struct sk_buff *skb, unsigned char type,
 				unsigned short reason);
 extern void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval);
@@ -29,14 +29,14 @@ extern void dn_nsp_send_conninit(struct sock *sk, unsigned char flags);
 
 extern void dn_nsp_output(struct sock *sk);
 extern int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *q, unsigned short acknum);
-extern void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, unsigned int __nocast gfp, int oob);
+extern void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, gfp_t gfp, int oob);
 extern unsigned long dn_nsp_persist(struct sock *sk);
 extern int dn_nsp_xmit_timeout(struct sock *sk);
 
 extern int dn_nsp_rx(struct sk_buff *);
 extern int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb);
 
-extern struct sk_buff *dn_alloc_skb(struct sock *sk, int size, unsigned int __nocast pri);
+extern struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri);
 extern struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int noblock, long timeo, int *err);
 
 #define NSP_REASON_OK 0		/* No error */
diff --git a/include/net/dn_route.h b/include/net/dn_route.h
index 11fe973cf383..5122da3f2eb3 100644
--- a/include/net/dn_route.h
+++ b/include/net/dn_route.h
@@ -15,7 +15,7 @@
     GNU General Public License for more details.
 *******************************************************************************/
 
-extern struct sk_buff *dn_alloc_skb(struct sock *sk, int size, unsigned int __nocast pri);
+extern struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri);
 extern int dn_route_output_sock(struct dst_entry **pprt, struct flowi *, struct sock *sk, int flags);
 extern int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb);
 extern int dn_cache_getroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg);
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 651f824c1008..b0c99060b78d 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -94,7 +94,7 @@ static inline void *inet_csk_ca(const struct sock *sk)
 
 extern struct sock *inet_csk_clone(struct sock *sk,
 				   const struct request_sock *req,
-				   const unsigned int __nocast priority);
+				   const gfp_t priority);
 
 enum inet_csk_ack_state_t {
 	ICSK_ACK_SCHED	= 1,
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index ecb2b061f597..3b5559a023a4 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -832,7 +832,7 @@ extern void ip_vs_app_inc_put(struct ip_vs_app *inc);
 
 extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff **pskb);
 extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff **pskb);
-extern int ip_vs_skb_replace(struct sk_buff *skb, unsigned int __nocast pri,
+extern int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
 			     char *o_buf, int o_len, char *n_buf, int n_len);
 extern int ip_vs_app_init(void);
 extern void ip_vs_app_cleanup(void);
diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h
index 54852ff6033b..00730d21b522 100644
--- a/include/net/llc_conn.h
+++ b/include/net/llc_conn.h
@@ -93,7 +93,7 @@ static __inline__ char llc_backlog_type(struct sk_buff *skb)
 	return skb->cb[sizeof(skb->cb) - 1];
 }
 
-extern struct sock *llc_sk_alloc(int family, unsigned int __nocast priority,
+extern struct sock *llc_sk_alloc(int family, gfp_t priority,
 				 struct proto *prot);
 extern void llc_sk_free(struct sock *sk);
 
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index e1d5ec1c23c0..8f241216f46b 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -125,7 +125,7 @@
  */
 extern struct sock *sctp_get_ctl_sock(void);
 extern int sctp_copy_local_addr_list(struct sctp_bind_addr *,
-				     sctp_scope_t, unsigned int __nocast gfp,
+				     sctp_scope_t, gfp_t gfp,
 				     int flags);
 extern struct sctp_pf *sctp_get_pf_specific(sa_family_t family);
 extern int sctp_register_pf(struct sctp_pf *, sa_family_t);
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 58462164d960..1eac3d0eb7a9 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -181,17 +181,17 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t,
 int sctp_chunk_iif(const struct sctp_chunk *);
 struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *,
 					     struct sctp_chunk *,
-					     unsigned int __nocast gfp);
+					     gfp_t gfp);
 __u32 sctp_generate_verification_tag(void);
 void sctp_populate_tie_tags(__u8 *cookie, __u32 curTag, __u32 hisTag);
 
 /* Prototypes for chunk-building functions.  */
 struct sctp_chunk *sctp_make_init(const struct sctp_association *,
 			     const struct sctp_bind_addr *,
-			     unsigned int __nocast gfp, int vparam_len);
+			     gfp_t gfp, int vparam_len);
 struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *,
 				 const struct sctp_chunk *,
-				 const unsigned int __nocast gfp,
+				 const gfp_t gfp,
 				 const int unkparam_len);
 struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *,
 				    const struct sctp_chunk *);
@@ -265,7 +265,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
                struct sctp_endpoint *,
                struct sctp_association *asoc,
                void *event_arg,
-	       unsigned int __nocast gfp);
+	       gfp_t gfp);
 
 /* 2nd level prototypes */
 void sctp_generate_t3_rtx_event(unsigned long peer);
@@ -276,7 +276,7 @@ void sctp_ootb_pkt_free(struct sctp_packet *);
 struct sctp_association *sctp_unpack_cookie(const struct sctp_endpoint *,
 				       const struct sctp_association *,
 				       struct sctp_chunk *,
-				       unsigned int __nocast gfp, int *err,
+				       gfp_t gfp, int *err,
 				       struct sctp_chunk **err_chk_p);
 int sctp_addip_addr_config(struct sctp_association *, sctp_param_t,
 			   struct sockaddr_storage*, int);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 994009bbe3b4..9c385b6417c7 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -446,7 +446,7 @@ struct sctp_ssnmap {
 };
 
 struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
-				    unsigned int __nocast gfp);
+				    gfp_t gfp);
 void sctp_ssnmap_free(struct sctp_ssnmap *map);
 void sctp_ssnmap_clear(struct sctp_ssnmap *map);
 
@@ -947,7 +947,7 @@ struct sctp_transport {
 };
 
 struct sctp_transport *sctp_transport_new(const union sctp_addr *,
-					  unsigned int __nocast);
+					  gfp_t);
 void sctp_transport_set_owner(struct sctp_transport *,
 			      struct sctp_association *);
 void sctp_transport_route(struct sctp_transport *, union sctp_addr *,
@@ -1095,10 +1095,10 @@ void sctp_bind_addr_init(struct sctp_bind_addr *, __u16 port);
 void sctp_bind_addr_free(struct sctp_bind_addr *);
 int sctp_bind_addr_copy(struct sctp_bind_addr *dest,
 			const struct sctp_bind_addr *src,
-			sctp_scope_t scope, unsigned int __nocast gfp,
+			sctp_scope_t scope, gfp_t gfp,
 			int flags);
 int sctp_add_bind_addr(struct sctp_bind_addr *, union sctp_addr *,
-		       unsigned int __nocast gfp);
+		       gfp_t gfp);
 int sctp_del_bind_addr(struct sctp_bind_addr *, union sctp_addr *);
 int sctp_bind_addr_match(struct sctp_bind_addr *, const union sctp_addr *,
 			 struct sctp_sock *);
@@ -1108,9 +1108,9 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr	*bp,
 					struct sctp_sock	*opt);
 union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
 					 int *addrs_len,
-					 unsigned int __nocast gfp);
+					 gfp_t gfp);
 int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw, int len,
-			   __u16 port, unsigned int __nocast gfp);
+			   __u16 port, gfp_t gfp);
 
 sctp_scope_t sctp_scope(const union sctp_addr *);
 int sctp_in_scope(const union sctp_addr *addr, const sctp_scope_t scope);
@@ -1239,7 +1239,7 @@ static inline struct sctp_endpoint *sctp_ep(struct sctp_ep_common *base)
 }
 
 /* These are function signatures for manipulating endpoints.  */
-struct sctp_endpoint *sctp_endpoint_new(struct sock *, unsigned int __nocast);
+struct sctp_endpoint *sctp_endpoint_new(struct sock *, gfp_t);
 void sctp_endpoint_free(struct sctp_endpoint *);
 void sctp_endpoint_put(struct sctp_endpoint *);
 void sctp_endpoint_hold(struct sctp_endpoint *);
@@ -1260,7 +1260,7 @@ int sctp_verify_init(const struct sctp_association *asoc, sctp_cid_t,
 		     struct sctp_chunk **err_chunk);
 int sctp_process_init(struct sctp_association *, sctp_cid_t cid,
 		      const union sctp_addr *peer,
-		      sctp_init_chunk_t *init, unsigned int __nocast gfp);
+		      sctp_init_chunk_t *init, gfp_t gfp);
 __u32 sctp_generate_tag(const struct sctp_endpoint *);
 __u32 sctp_generate_tsn(const struct sctp_endpoint *);
 
@@ -1723,7 +1723,7 @@ static inline struct sctp_association *sctp_assoc(struct sctp_ep_common *base)
 
 struct sctp_association *
 sctp_association_new(const struct sctp_endpoint *, const struct sock *,
-		     sctp_scope_t scope, unsigned int __nocast gfp);
+		     sctp_scope_t scope, gfp_t gfp);
 void sctp_association_free(struct sctp_association *);
 void sctp_association_put(struct sctp_association *);
 void sctp_association_hold(struct sctp_association *);
@@ -1739,7 +1739,7 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
 			    const union sctp_addr *laddr);
 struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *,
 				     const union sctp_addr *address,
-				     const unsigned int __nocast gfp,
+				     const gfp_t gfp,
 				     const int peer_state);
 void sctp_assoc_del_peer(struct sctp_association *asoc,
 			 const union sctp_addr *addr);
@@ -1764,10 +1764,10 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *, unsigned);
 void sctp_assoc_set_primary(struct sctp_association *,
 			    struct sctp_transport *);
 int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *,
-				     unsigned int __nocast);
+				     gfp_t);
 int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *,
 					 struct sctp_cookie*,
-					 unsigned int __nocast gfp);
+					 gfp_t gfp);
 
 int sctp_cmp_addr_exact(const union sctp_addr *ss1,
 			const union sctp_addr *ss2);
diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h
index 90fe4bf6754f..6c40cfc4832d 100644
--- a/include/net/sctp/ulpevent.h
+++ b/include/net/sctp/ulpevent.h
@@ -88,7 +88,7 @@ struct sctp_ulpevent *sctp_ulpevent_make_assoc_change(
 	__u16 error,
 	__u16 outbound,
 	__u16 inbound,
-	unsigned int __nocast gfp);
+	gfp_t gfp);
 
 struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
 	const struct sctp_association *asoc,
@@ -96,35 +96,35 @@ struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
 	int flags,
 	int state,
 	int error,
-	unsigned int __nocast gfp);
+	gfp_t gfp);
 
 struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
 	const struct sctp_association *asoc,
 	struct sctp_chunk *chunk,
 	__u16 flags,
-	unsigned int __nocast gfp);
+	gfp_t gfp);
 struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
 	const struct sctp_association *asoc,
 	struct sctp_chunk *chunk,
 	__u16 flags,
 	__u32 error,
-	unsigned int __nocast gfp);
+	gfp_t gfp);
 
 struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
 	const struct sctp_association *asoc,
 	__u16 flags,
-	unsigned int __nocast gfp);
+	gfp_t gfp);
 
 struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
 	const struct sctp_association *asoc,
-	__u32 indication, unsigned int __nocast gfp);
+	__u32 indication, gfp_t gfp);
 
 struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication(
-	const struct sctp_association *asoc, unsigned int __nocast gfp);
+	const struct sctp_association *asoc, gfp_t gfp);
 
 struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 	struct sctp_chunk *chunk,
-	unsigned int __nocast gfp);
+	gfp_t gfp);
 
 void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event,
 	struct msghdr *);
diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h
index 1a60c6d943c1..a43c8788b650 100644
--- a/include/net/sctp/ulpqueue.h
+++ b/include/net/sctp/ulpqueue.h
@@ -62,22 +62,19 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *,
 void sctp_ulpq_free(struct sctp_ulpq *);
 
 /* Add a new DATA chunk for processing. */
-int sctp_ulpq_tail_data(struct sctp_ulpq *, struct sctp_chunk *,
-			unsigned int __nocast);
+int sctp_ulpq_tail_data(struct sctp_ulpq *, struct sctp_chunk *, gfp_t);
 
 /* Add a new event for propagation to the ULP. */
 int sctp_ulpq_tail_event(struct sctp_ulpq *, struct sctp_ulpevent *ev);
 
 /* Renege previously received chunks.  */
-void sctp_ulpq_renege(struct sctp_ulpq *, struct sctp_chunk *,
-		      unsigned int __nocast);
+void sctp_ulpq_renege(struct sctp_ulpq *, struct sctp_chunk *, gfp_t);
 
 /* Perform partial delivery. */
-void sctp_ulpq_partial_delivery(struct sctp_ulpq *, struct sctp_chunk *,
-				unsigned int __nocast);
+void sctp_ulpq_partial_delivery(struct sctp_ulpq *, struct sctp_chunk *, gfp_t);
 
 /* Abort the partial delivery. */
-void sctp_ulpq_abort_pd(struct sctp_ulpq *, unsigned int __nocast);
+void sctp_ulpq_abort_pd(struct sctp_ulpq *, gfp_t);
 
 /* Clear the partial data delivery condition on this socket. */
 int sctp_clear_pd(struct sock *sk);
diff --git a/include/net/sock.h b/include/net/sock.h
index b6440805c420..ecb75526cba0 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -739,18 +739,18 @@ extern void FASTCALL(release_sock(struct sock *sk));
 #define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
 
 extern struct sock		*sk_alloc(int family,
-					  unsigned int __nocast priority,
+					  gfp_t priority,
 					  struct proto *prot, int zero_it);
 extern void			sk_free(struct sock *sk);
 extern struct sock		*sk_clone(const struct sock *sk,
-					  const unsigned int __nocast priority);
+					  const gfp_t priority);
 
 extern struct sk_buff		*sock_wmalloc(struct sock *sk,
 					      unsigned long size, int force,
-					      unsigned int __nocast priority);
+					      gfp_t priority);
 extern struct sk_buff		*sock_rmalloc(struct sock *sk,
 					      unsigned long size, int force,
-					      unsigned int __nocast priority);
+					      gfp_t priority);
 extern void			sock_wfree(struct sk_buff *skb);
 extern void			sock_rfree(struct sk_buff *skb);
 
@@ -766,7 +766,7 @@ extern struct sk_buff 		*sock_alloc_send_skb(struct sock *sk,
 						     int noblock,
 						     int *errcode);
 extern void *sock_kmalloc(struct sock *sk, int size,
-			  unsigned int __nocast priority);
+			  gfp_t priority);
 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
 extern void sk_send_sigurg(struct sock *sk);
 
@@ -1201,7 +1201,7 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
 
 static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
 						   int size, int mem,
-						   unsigned int __nocast gfp)
+						   gfp_t gfp)
 {
 	struct sk_buff *skb;
 	int hdr_len;
@@ -1224,7 +1224,7 @@ static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk,
 
 static inline struct sk_buff *sk_stream_alloc_skb(struct sock *sk,
 						  int size,
-						  unsigned int __nocast gfp)
+						  gfp_t gfp)
 {
 	return sk_stream_alloc_pskb(sk, size, 0, gfp);
 }
@@ -1255,7 +1255,7 @@ static inline int sock_writeable(const struct sock *sk)
 	return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf / 2);
 }
 
-static inline unsigned int __nocast gfp_any(void)
+static inline gfp_t gfp_any(void)
 {
 	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
 }
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 97af77c4d096..c24339c4e310 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -460,8 +460,7 @@ extern void tcp_send_probe0(struct sock *);
 extern void tcp_send_partial(struct sock *);
 extern int  tcp_write_wakeup(struct sock *);
 extern void tcp_send_fin(struct sock *sk);
-extern void tcp_send_active_reset(struct sock *sk,
-                                  unsigned int __nocast priority);
+extern void tcp_send_active_reset(struct sock *sk, gfp_t priority);
 extern int  tcp_send_synack(struct sock *);
 extern void tcp_push_one(struct sock *, unsigned int mss_now);
 extern void tcp_send_ack(struct sock *sk);
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index b6e72f890c6c..5beae1ccd574 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -875,7 +875,7 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig
 } 
 #endif
 
-struct xfrm_policy *xfrm_policy_alloc(unsigned int __nocast gfp);
+struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp);
 extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *);
 int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl);
 struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel,
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index 0e293fe733b0..4172e6841e3d 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -596,7 +596,7 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
 					    u32 remote_qpn, u16 pkey_index,
 					    struct ib_ah *ah, int rmpp_active,
 					    int hdr_len, int data_len,
-					    unsigned int __nocast gfp_mask);
+					    gfp_t gfp_mask);
 
 /**
  * ib_free_send_mad - Returns data buffers used to send a MAD.
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index a7555c800ecf..f404fe21cc21 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -285,7 +285,7 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query);
 int ib_sa_path_rec_get(struct ib_device *device, u8 port_num,
 		       struct ib_sa_path_rec *rec,
 		       ib_sa_comp_mask comp_mask,
-		       int timeout_ms, unsigned int __nocast gfp_mask,
+		       int timeout_ms, gfp_t gfp_mask,
 		       void (*callback)(int status,
 					struct ib_sa_path_rec *resp,
 					void *context),
@@ -296,7 +296,7 @@ int ib_sa_mcmember_rec_query(struct ib_device *device, u8 port_num,
 			     u8 method,
 			     struct ib_sa_mcmember_rec *rec,
 			     ib_sa_comp_mask comp_mask,
-			     int timeout_ms, unsigned int __nocast gfp_mask,
+			     int timeout_ms, gfp_t gfp_mask,
 			     void (*callback)(int status,
 					      struct ib_sa_mcmember_rec *resp,
 					      void *context),
@@ -307,7 +307,7 @@ int ib_sa_service_rec_query(struct ib_device *device, u8 port_num,
 			 u8 method,
 			 struct ib_sa_service_rec *rec,
 			 ib_sa_comp_mask comp_mask,
-			 int timeout_ms, unsigned int __nocast gfp_mask,
+			 int timeout_ms, gfp_t gfp_mask,
 			 void (*callback)(int status,
 					  struct ib_sa_service_rec *resp,
 					  void *context),
@@ -342,7 +342,7 @@ static inline int
 ib_sa_mcmember_rec_set(struct ib_device *device, u8 port_num,
 		       struct ib_sa_mcmember_rec *rec,
 		       ib_sa_comp_mask comp_mask,
-		       int timeout_ms, unsigned int __nocast gfp_mask,
+		       int timeout_ms, gfp_t gfp_mask,
 		       void (*callback)(int status,
 					struct ib_sa_mcmember_rec *resp,
 					void *context),
@@ -384,7 +384,7 @@ static inline int
 ib_sa_mcmember_rec_delete(struct ib_device *device, u8 port_num,
 			  struct ib_sa_mcmember_rec *rec,
 			  ib_sa_comp_mask comp_mask,
-			  int timeout_ms, unsigned int __nocast gfp_mask,
+			  int timeout_ms, gfp_t gfp_mask,
 			  void (*callback)(int status,
 					   struct ib_sa_mcmember_rec *resp,
 					   void *context),
diff --git a/include/rxrpc/call.h b/include/rxrpc/call.h
index 8118731e7d96..b86f83743510 100644
--- a/include/rxrpc/call.h
+++ b/include/rxrpc/call.h
@@ -203,7 +203,7 @@ extern int rxrpc_call_write_data(struct rxrpc_call *call,
 				 size_t sioc,
 				 struct kvec *siov,
 				 uint8_t rxhdr_flags,
-				 unsigned int __nocast alloc_flags,
+				 gfp_t alloc_flags,
 				 int dup_data,
 				 size_t *size_sent);
 
diff --git a/include/rxrpc/message.h b/include/rxrpc/message.h
index 983d9f9eee1a..b318f273d4f2 100644
--- a/include/rxrpc/message.h
+++ b/include/rxrpc/message.h
@@ -63,7 +63,7 @@ extern int rxrpc_conn_newmsg(struct rxrpc_connection *conn,
 			     uint8_t type,
 			     int count,
 			     struct kvec *diov,
-			     unsigned int __nocast alloc_flags,
+			     gfp_t alloc_flags,
 			     struct rxrpc_message **_msg);
 
 extern int rxrpc_conn_sendmsg(struct rxrpc_connection *conn, struct rxrpc_message *msg);
diff --git a/include/sound/core.h b/include/sound/core.h
index 26160adcdffc..6d971a4c4ca0 100644
--- a/include/sound/core.h
+++ b/include/sound/core.h
@@ -290,13 +290,13 @@ void snd_memory_init(void);
 void snd_memory_done(void);
 int snd_memory_info_init(void);
 int snd_memory_info_done(void);
-void *snd_hidden_kmalloc(size_t size, unsigned int __nocast flags);
-void *snd_hidden_kzalloc(size_t size, unsigned int __nocast flags);
-void *snd_hidden_kcalloc(size_t n, size_t size, unsigned int __nocast flags);
+void *snd_hidden_kmalloc(size_t size, gfp_t flags);
+void *snd_hidden_kzalloc(size_t size, gfp_t flags);
+void *snd_hidden_kcalloc(size_t n, size_t size, gfp_t flags);
 void snd_hidden_kfree(const void *obj);
 void *snd_hidden_vmalloc(unsigned long size);
 void snd_hidden_vfree(void *obj);
-char *snd_hidden_kstrdup(const char *s, unsigned int __nocast flags);
+char *snd_hidden_kstrdup(const char *s, gfp_t flags);
 #define kmalloc(size, flags) snd_hidden_kmalloc(size, flags)
 #define kzalloc(size, flags) snd_hidden_kzalloc(size, flags)
 #define kcalloc(n, size, flags) snd_hidden_kcalloc(n, size, flags)
diff --git a/include/sound/driver.h b/include/sound/driver.h
index 0d12456ec3ae..1ec2fae050a6 100644
--- a/include/sound/driver.h
+++ b/include/sound/driver.h
@@ -51,7 +51,7 @@
 #ifdef CONFIG_SND_DEBUG_MEMORY
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
-void *snd_wrapper_kmalloc(size_t, unsigned int __nocast);
+void *snd_wrapper_kmalloc(size_t, gfp_t);
 #undef kmalloc
 void snd_wrapper_kfree(const void *);
 #undef kfree
diff --git a/kernel/audit.c b/kernel/audit.c
index 83096b67510a..aefa73a8a586 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -560,7 +560,7 @@ static void audit_buffer_free(struct audit_buffer *ab)
 }
 
 static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
-						unsigned int __nocast gfp_mask, int type)
+						gfp_t gfp_mask, int type)
 {
 	unsigned long flags;
 	struct audit_buffer *ab = NULL;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 45a5719a0104..28176d083f7b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1670,7 +1670,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  **/
 
-int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask)
+int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 179baafcdd96..64ab045c3d9d 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -36,7 +36,7 @@
  * struct kfifo with kfree().
  */
 struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size,
-			 unsigned int __nocast gfp_mask, spinlock_t *lock)
+			 gfp_t gfp_mask, spinlock_t *lock)
 {
 	struct kfifo *fifo;
 
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(kfifo_init);
  *
  * The size will be rounded-up to a power of 2.
  */
-struct kfifo *kfifo_alloc(unsigned int size, unsigned int __nocast gfp_mask, spinlock_t *lock)
+struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
 {
 	unsigned char *buffer;
 	struct kfifo *ret;
diff --git a/kernel/signal.c b/kernel/signal.c
index c135f5aa2c2d..cba193ceda0d 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -262,7 +262,7 @@ next_signal(struct sigpending *pending, sigset_t *mask)
 	return sig;
 }
 
-static struct sigqueue *__sigqueue_alloc(struct task_struct *t, unsigned int __nocast flags,
+static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 					 int override_rlimit)
 {
 	struct sigqueue *q = NULL;
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 6a8bc6e06431..d1c057e71b68 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -110,7 +110,7 @@ radix_tree_node_free(struct radix_tree_node *node)
  * success, return zero, with preemption disabled.  On error, return -ENOMEM
  * with preemption not disabled.
  */
-int radix_tree_preload(unsigned int __nocast gfp_mask)
+int radix_tree_preload(gfp_t gfp_mask)
 {
 	struct radix_tree_preload *rtp;
 	struct radix_tree_node *node;
diff --git a/lib/ts_bm.c b/lib/ts_bm.c
index 1b61fceef777..8a8b3a16133e 100644
--- a/lib/ts_bm.c
+++ b/lib/ts_bm.c
@@ -127,7 +127,7 @@ static void compute_prefix_tbl(struct ts_bm *bm, const u8 *pattern,
 }
 
 static struct ts_config *bm_init(const void *pattern, unsigned int len,
-				 unsigned int __nocast gfp_mask)
+				 gfp_t gfp_mask)
 {
 	struct ts_config *conf;
 	struct ts_bm *bm;
diff --git a/lib/ts_fsm.c b/lib/ts_fsm.c
index ef9779e00506..ca3211206eef 100644
--- a/lib/ts_fsm.c
+++ b/lib/ts_fsm.c
@@ -258,7 +258,7 @@ found_match:
 }
 
 static struct ts_config *fsm_init(const void *pattern, unsigned int len,
-				     unsigned int __nocast gfp_mask)
+				     gfp_t gfp_mask)
 {
 	int i, err = -EINVAL;
 	struct ts_config *conf;
diff --git a/lib/ts_kmp.c b/lib/ts_kmp.c
index e45f0f0c2379..7fd45451b44a 100644
--- a/lib/ts_kmp.c
+++ b/lib/ts_kmp.c
@@ -87,7 +87,7 @@ static inline void compute_prefix_tbl(const u8 *pattern, unsigned int len,
 }
 
 static struct ts_config *kmp_init(const void *pattern, unsigned int len,
-				  unsigned int __nocast gfp_mask)
+				  gfp_t gfp_mask)
 {
 	struct ts_config *conf;
 	struct ts_kmp *kmp;
diff --git a/mm/highmem.c b/mm/highmem.c
index 400911599468..90e1861e2da0 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -30,7 +30,7 @@
 
 static mempool_t *page_pool, *isa_page_pool;
 
-static void *page_pool_alloc(unsigned int __nocast gfp_mask, void *data)
+static void *page_pool_alloc(gfp_t gfp_mask, void *data)
 {
 	unsigned int gfp = gfp_mask | (unsigned int) (long) data;
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9033f0859aa8..37af443eb094 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -687,7 +687,7 @@ get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned lo
 }
 
 /* Return a zonelist representing a mempolicy */
-static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
+static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
 	int nd;
 
@@ -751,7 +751,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
 
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
-static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
+static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid)
 {
 	struct zonelist *zl;
 	struct page *page;
@@ -789,7 +789,7 @@ static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned or
  *	Should be called with the mm_sem of the vma hold.
  */
 struct page *
-alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
+alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 
@@ -832,7 +832,7 @@ alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned l
  *	1) it's ok to take cpuset_sem (can WAIT), and
  *	2) allocating for current task (not interrupt).
  */
-struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
+struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 {
 	struct mempolicy *pol = current->mempolicy;
 
diff --git a/mm/mempool.c b/mm/mempool.c
index 65f2957b8d51..9e377ea700b2 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -112,7 +112,7 @@ EXPORT_SYMBOL(mempool_create_node);
  * while this function is running. mempool_alloc() & mempool_free()
  * might be called (eg. from IRQ contexts) while this function executes.
  */
-int mempool_resize(mempool_t *pool, int new_min_nr, unsigned int __nocast gfp_mask)
+int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask)
 {
 	void *element;
 	void **new_elements;
@@ -200,7 +200,7 @@ EXPORT_SYMBOL(mempool_destroy);
  * *never* fails when called from process contexts. (it might
  * fail if called from an IRQ context.)
  */
-void * mempool_alloc(mempool_t *pool, unsigned int __nocast gfp_mask)
+void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
 {
 	void *element;
 	unsigned long flags;
@@ -276,7 +276,7 @@ EXPORT_SYMBOL(mempool_free);
 /*
  * A commonly used alloc and free fn.
  */
-void *mempool_alloc_slab(unsigned int __nocast gfp_mask, void *pool_data)
+void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
 {
 	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
 	return kmem_cache_alloc(mem, gfp_mask);
diff --git a/mm/nommu.c b/mm/nommu.c
index 064d70442895..0ef241ae3763 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -157,8 +157,7 @@ void vfree(void *addr)
 	kfree(addr);
 }
 
-void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask,
-			pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 {
 	/*
 	 * kmalloc doesn't like __GFP_HIGHMEM for some reason
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index ac3bf33e5370..d348b9035955 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -263,7 +263,7 @@ static struct mm_struct *oom_kill_process(struct task_struct *p)
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(unsigned int __nocast gfp_mask, int order)
+void out_of_memory(gfp_t gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
 	task_t * p;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ae2903339e71..cc1fe2672a31 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -671,7 +671,7 @@ void fastcall free_cold_page(struct page *page)
 	free_hot_cold_page(page, 1);
 }
 
-static inline void prep_zero_page(struct page *page, int order, unsigned int __nocast gfp_flags)
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	int i;
 
@@ -686,7 +686,7 @@ static inline void prep_zero_page(struct page *page, int order, unsigned int __n
  * or two.
  */
 static struct page *
-buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags)
+buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
 	struct page *page = NULL;
@@ -761,7 +761,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 }
 
 static inline int
-should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
 {
 	if (!z->reclaim_pages)
 		return 0;
@@ -774,7 +774,7 @@ should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
  * This is the 'heart' of the zoned buddy allocator.
  */
 struct page * fastcall
-__alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
 		struct zonelist *zonelist)
 {
 	const int wait = gfp_mask & __GFP_WAIT;
@@ -977,7 +977,7 @@ EXPORT_SYMBOL(__alloc_pages);
 /*
  * Common helper functions.
  */
-fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned int order)
+fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
 {
 	struct page * page;
 	page = alloc_pages(gfp_mask, order);
@@ -988,7 +988,7 @@ fastcall unsigned long __get_free_pages(unsigned int __nocast gfp_mask, unsigned
 
 EXPORT_SYMBOL(__get_free_pages);
 
-fastcall unsigned long get_zeroed_page(unsigned int __nocast gfp_mask)
+fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
 {
 	struct page * page;
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 2e605a19ce57..330e00d6db00 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -19,7 +19,7 @@
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
-static struct bio *get_swap_bio(unsigned int __nocast gfp_flags, pgoff_t index,
+static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index,
 				struct page *page, bio_end_io_t end_io)
 {
 	struct bio *bio;
diff --git a/mm/shmem.c b/mm/shmem.c
index 1f7aeb210c7b..ea064d89cda9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -921,8 +921,7 @@ shmem_swapin(struct shmem_inode_info *info,swp_entry_t entry,unsigned long idx)
 }
 
 static inline struct page *
-shmem_alloc_page(unsigned int __nocast gfp,struct shmem_inode_info *info,
-				 unsigned long idx)
+shmem_alloc_page(gfp_t gfp,struct shmem_inode_info *info, unsigned long idx)
 {
 	return alloc_page(gfp | __GFP_ZERO);
 }
diff --git a/mm/slab.c b/mm/slab.c
index 5cbbdfa6dd0e..d05c678bceb3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -650,8 +650,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 	return cachep->array[smp_processor_id()];
 }
 
-static inline kmem_cache_t *__find_general_cachep(size_t size,
-						unsigned int __nocast gfpflags)
+static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
 {
 	struct cache_sizes *csizep = malloc_sizes;
 
@@ -675,8 +674,7 @@ static inline kmem_cache_t *__find_general_cachep(size_t size,
 	return csizep->cs_cachep;
 }
 
-kmem_cache_t *kmem_find_general_cachep(size_t size,
-		unsigned int __nocast gfpflags)
+kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
 {
 	return __find_general_cachep(size, gfpflags);
 }
@@ -1185,7 +1183,7 @@ __initcall(cpucache_init);
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 {
 	struct page *page;
 	void *addr;
@@ -2048,7 +2046,7 @@ EXPORT_SYMBOL(kmem_cache_destroy);
 
 /* Get the memory for a slab management obj. */
 static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
-			int colour_off, unsigned int __nocast local_flags)
+			int colour_off, gfp_t local_flags)
 {
 	struct slab *slabp;
 	
@@ -2149,7 +2147,7 @@ static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 {
 	struct slab	*slabp;
 	void		*objp;
@@ -2356,7 +2354,7 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags)
+static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
@@ -2456,7 +2454,7 @@ alloc_done:
 }
 
 static inline void
-cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
+cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
 {
 	might_sleep_if(flags & __GFP_WAIT);
 #if DEBUG
@@ -2467,7 +2465,7 @@ cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
 #if DEBUG
 static void *
 cache_alloc_debugcheck_after(kmem_cache_t *cachep,
-			unsigned int __nocast flags, void *objp, void *caller)
+			gfp_t flags, void *objp, void *caller)
 {
 	if (!objp)	
 		return objp;
@@ -2510,7 +2508,7 @@ cache_alloc_debugcheck_after(kmem_cache_t *cachep,
 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
 #endif
 
-static inline void *____cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
+static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 {
 	void* objp;
 	struct array_cache *ac;
@@ -2528,7 +2526,7 @@ static inline void *____cache_alloc(kmem_cache_t *cachep, unsigned int __nocast
 	return objp;
 }
 
-static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
+static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 {
 	unsigned long save_flags;
 	void* objp;
@@ -2787,7 +2785,7 @@ static inline void __cache_free(kmem_cache_t *cachep, void *objp)
  * Allocate an object from this cache.  The flags are only relevant
  * if the cache has no available objects.
  */
-void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
+void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags)
 {
 	return __cache_alloc(cachep, flags);
 }
@@ -2848,7 +2846,7 @@ out:
  * New and improved: it will now make sure that the object gets
  * put on the correct node list so that there is no false sharing.
  */
-void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
+void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
 {
 	unsigned long save_flags;
 	void *ptr;
@@ -2875,7 +2873,7 @@ void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, i
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
+void *kmalloc_node(size_t size, gfp_t flags, int node)
 {
 	kmem_cache_t *cachep;
 
@@ -2908,7 +2906,7 @@ EXPORT_SYMBOL(kmalloc_node);
  * platforms.  For example, on i386, it means that the memory must come
  * from the first 16MB.
  */
-void *__kmalloc(size_t size, unsigned int __nocast flags)
+void *__kmalloc(size_t size, gfp_t flags)
 {
 	kmem_cache_t *cachep;
 
@@ -2997,7 +2995,7 @@ EXPORT_SYMBOL(kmem_cache_free);
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
  */
-void *kzalloc(size_t size, unsigned int __nocast flags)
+void *kzalloc(size_t size, gfp_t flags)
 {
 	void *ret = kmalloc(size, flags);
 	if (ret)
@@ -3603,7 +3601,7 @@ unsigned int ksize(const void *objp)
  * @s: the string to duplicate
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  */
-char *kstrdup(const char *s, unsigned int __nocast gfp)
+char *kstrdup(const char *s, gfp_t gfp)
 {
 	size_t len;
 	char *buf;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index adbc2b426c2f..132164f7d0a7 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -68,7 +68,7 @@ void show_swap_cache_info(void)
  * but sets SwapCache flag and private instead of mapping and index.
  */
 static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
-			       unsigned int __nocast gfp_mask)
+			       gfp_t gfp_mask)
 {
 	int error;
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 13c3d82968ae..1150229b6366 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -395,7 +395,7 @@ void *vmap(struct page **pages, unsigned int count,
 
 EXPORT_SYMBOL(vmap);
 
-void *__vmalloc_area(struct vm_struct *area, unsigned int __nocast gfp_mask, pgprot_t prot)
+void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
 {
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
@@ -446,7 +446,7 @@ fail:
  *	allocator with @gfp_mask flags.  Map them into contiguous
  *	kernel virtual space, using a pagetable protection of @prot.
  */
-void *__vmalloc(unsigned long size, unsigned int __nocast gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 {
 	struct vm_struct *area;
 
diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c
index 71abc99ec815..223c7ad5bd0f 100644
--- a/net/atm/atm_misc.c
+++ b/net/atm/atm_misc.c
@@ -25,7 +25,7 @@ int atm_charge(struct atm_vcc *vcc,int truesize)
 
 
 struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size,
-    unsigned int __nocast gfp_flags)
+    gfp_t gfp_flags)
 {
 	struct sock *sk = sk_atm(vcc);
 	int guess = atm_guess_pdu2truesize(pdu_size);
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index d3d6bc547212..59b2dd36baa7 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -372,7 +372,7 @@ static struct proto l2cap_proto = {
 	.obj_size	= sizeof(struct l2cap_pinfo)
 };
 
-static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
+static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, gfp_t prio)
 {
 	struct sock *sk;
 
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 173f46e8cdae..35adce6482b6 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -229,7 +229,7 @@ static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d)
 	d->rx_credits = RFCOMM_DEFAULT_CREDITS;
 }
 
-struct rfcomm_dlc *rfcomm_dlc_alloc(unsigned int __nocast prio)
+struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio)
 {
 	struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio);
 	if (!d)
diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index f49e7e938bfb..a2b30f0aedb7 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -284,7 +284,7 @@ static struct proto rfcomm_proto = {
 	.obj_size	= sizeof(struct rfcomm_pinfo)
 };
 
-static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
+static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, gfp_t prio)
 {
 	struct rfcomm_dlc *d;
 	struct sock *sk;
diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c
index 1bca860a6109..158a9c46d863 100644
--- a/net/bluetooth/rfcomm/tty.c
+++ b/net/bluetooth/rfcomm/tty.c
@@ -286,7 +286,7 @@ static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *de
 	skb->destructor = rfcomm_wfree;
 }
 
-static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, unsigned int __nocast priority)
+static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, gfp_t priority)
 {
 	if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) {
 		struct sk_buff *skb = alloc_skb(size, priority);
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index ce7ab7dfa0b2..997e42df115c 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -418,7 +418,7 @@ static struct proto sco_proto = {
 	.obj_size	= sizeof(struct sco_pinfo)
 };
 
-static struct sock *sco_sock_alloc(struct socket *sock, int proto, unsigned int __nocast prio)
+static struct sock *sco_sock_alloc(struct socket *sock, int proto, gfp_t prio)
 {
 	struct sock *sk;
 
diff --git a/net/core/dev.c b/net/core/dev.c
index 9066c874e273..a44eeef24edf 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1132,7 +1132,7 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
 #endif
 
 /* Keep head the same: replace data */
-int __skb_linearize(struct sk_buff *skb, unsigned int __nocast gfp_mask)
+int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	unsigned int size;
 	u8 *data;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 0e9431b59fb2..af9b1516e21f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -130,7 +130,7 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here)
  *	Buffers may only be allocated from interrupts using a @gfp_mask of
  *	%GFP_ATOMIC.
  */
-struct sk_buff *__alloc_skb(unsigned int size, unsigned int __nocast gfp_mask,
+struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 			    int fclone)
 {
 	struct sk_buff *skb;
@@ -198,7 +198,7 @@ nodata:
  */
 struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp,
 				     unsigned int size,
-				     unsigned int __nocast gfp_mask)
+				     gfp_t gfp_mask)
 {
 	struct sk_buff *skb;
 	u8 *data;
@@ -361,7 +361,7 @@ void __kfree_skb(struct sk_buff *skb)
  *	%GFP_ATOMIC.
  */
 
-struct sk_buff *skb_clone(struct sk_buff *skb, unsigned int __nocast gfp_mask)
+struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	struct sk_buff *n;
 
@@ -500,7 +500,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
  *	header is going to be modified. Use pskb_copy() instead.
  */
 
-struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_mask)
+struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
 {
 	int headerlen = skb->data - skb->head;
 	/*
@@ -539,7 +539,7 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, unsigned int __nocast gfp_ma
  *	The returned buffer has a reference count of 1.
  */
 
-struct sk_buff *pskb_copy(struct sk_buff *skb, unsigned int __nocast gfp_mask)
+struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 {
 	/*
 	 *	Allocate the copy buffer
@@ -598,7 +598,7 @@ out:
  */
 
 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
-		     unsigned int __nocast gfp_mask)
+		     gfp_t gfp_mask)
 {
 	int i;
 	u8 *data;
@@ -689,7 +689,7 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
  */
 struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
 				int newheadroom, int newtailroom,
-				unsigned int __nocast gfp_mask)
+				gfp_t gfp_mask)
 {
 	/*
 	 *	Allocate the copy buffer
diff --git a/net/core/sock.c b/net/core/sock.c
index 928d2a1d6d8e..1c52fe809eda 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -637,7 +637,7 @@ lenout:
  *	@prot: struct proto associated with this new sock instance
  *	@zero_it: if we should zero the newly allocated sock
  */
-struct sock *sk_alloc(int family, unsigned int __nocast priority,
+struct sock *sk_alloc(int family, gfp_t priority,
 		      struct proto *prot, int zero_it)
 {
 	struct sock *sk = NULL;
@@ -704,7 +704,7 @@ void sk_free(struct sock *sk)
 	module_put(owner);
 }
 
-struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority)
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 {
 	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
 
@@ -845,7 +845,7 @@ unsigned long sock_i_ino(struct sock *sk)
  * Allocate a skb from the socket's send buffer.
  */
 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
-			     unsigned int __nocast priority)
+			     gfp_t priority)
 {
 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
 		struct sk_buff * skb = alloc_skb(size, priority);
@@ -861,7 +861,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
  * Allocate a skb from the socket's receive buffer.
  */ 
 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
-			     unsigned int __nocast priority)
+			     gfp_t priority)
 {
 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
 		struct sk_buff *skb = alloc_skb(size, priority);
@@ -876,7 +876,7 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
 /* 
  * Allocate a memory block from the socket's option memory buffer.
  */ 
-void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
+void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
 {
 	if ((unsigned)size <= sysctl_optmem_max &&
 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c
index 6530283eafca..c9a62cca22fc 100644
--- a/net/dccp/ackvec.c
+++ b/net/dccp/ackvec.c
@@ -91,7 +91,7 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb)
 }
 
 struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len,
-				      const unsigned int __nocast priority)
+				      const gfp_t priority)
 {
 	struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority);
 
diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h
index 8ca51c9191f7..d0fd6c60c574 100644
--- a/net/dccp/ackvec.h
+++ b/net/dccp/ackvec.h
@@ -74,7 +74,7 @@ struct sk_buff;
 
 #ifdef CONFIG_IP_DCCP_ACKVEC
 extern struct dccp_ackvec *dccp_ackvec_alloc(unsigned int len,
-					  const unsigned int __nocast priority);
+					  const gfp_t priority);
 extern void dccp_ackvec_free(struct dccp_ackvec *av);
 
 extern int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk,
@@ -93,7 +93,7 @@ static inline int dccp_ackvec_pending(const struct dccp_ackvec *av)
 }
 #else /* CONFIG_IP_DCCP_ACKVEC */
 static inline struct dccp_ackvec *dccp_ackvec_alloc(unsigned int len,
-					   const unsigned int __nocast priority)
+					   const gfp_t priority)
 {
 	return NULL;
 }
diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h
index 13ad47ba1420..417d9d82df3e 100644
--- a/net/dccp/ccids/lib/loss_interval.h
+++ b/net/dccp/ccids/lib/loss_interval.h
@@ -36,7 +36,7 @@ struct dccp_li_hist_entry {
 
 static inline struct dccp_li_hist_entry *
 		dccp_li_hist_entry_new(struct dccp_li_hist *hist,
-				       const unsigned int __nocast prio)
+				       const gfp_t prio)
 {
 	return kmem_cache_alloc(hist->dccplih_slab, prio);
 }
diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h
index b375ebdb7dcf..122e96737ff6 100644
--- a/net/dccp/ccids/lib/packet_history.h
+++ b/net/dccp/ccids/lib/packet_history.h
@@ -86,7 +86,7 @@ extern struct dccp_rx_hist_entry *
 
 static inline struct dccp_tx_hist_entry *
 		dccp_tx_hist_entry_new(struct dccp_tx_hist *hist,
-				       const unsigned int __nocast prio)
+				       const gfp_t prio)
 {
 	struct dccp_tx_hist_entry *entry = kmem_cache_alloc(hist->dccptxh_slab,
 							    prio);
@@ -137,7 +137,7 @@ static inline struct dccp_rx_hist_entry *
 				     	    const struct sock *sk, 
 				     	    const u32 ndp, 
 					    const struct sk_buff *skb,
-					    const unsigned int __nocast prio)
+					    const gfp_t prio)
 {
 	struct dccp_rx_hist_entry *entry = kmem_cache_alloc(hist->dccprxh_slab,
 							    prio);
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 34d4128d56d5..1186dc44cdff 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -452,8 +452,7 @@ static struct proto dn_proto = {
 	.obj_size = sizeof(struct dn_sock),
 };
 
-static struct sock *dn_alloc_sock(struct socket *sock,
-				  unsigned int __nocast gfp)
+static struct sock *dn_alloc_sock(struct socket *sock, gfp_t gfp)
 {
 	struct dn_scp *scp;
 	struct sock *sk = sk_alloc(PF_DECnet, gfp, &dn_proto, 1);
@@ -805,8 +804,7 @@ static int dn_auto_bind(struct socket *sock)
 	return rv;
 }
 
-static int dn_confirm_accept(struct sock *sk, long *timeo,
-				unsigned int __nocast allocation)
+static int dn_confirm_accept(struct sock *sk, long *timeo, gfp_t allocation)
 {
 	struct dn_scp *scp = DN_SK(sk);
 	DEFINE_WAIT(wait);
diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c
index cd08244aa10c..c96c767b1f74 100644
--- a/net/decnet/dn_nsp_out.c
+++ b/net/decnet/dn_nsp_out.c
@@ -117,8 +117,7 @@ try_again:
  * The eventual aim is for each socket to have a cached header size
  * for its outgoing packets, and to set hdr from this when sk != NULL.
  */
-struct sk_buff *dn_alloc_skb(struct sock *sk, int size,
-			     unsigned int __nocast pri)
+struct sk_buff *dn_alloc_skb(struct sock *sk, int size, gfp_t pri)
 {
 	struct sk_buff *skb;
 	int hdr = 64;
@@ -212,7 +211,7 @@ static void dn_nsp_rtt(struct sock *sk, long rtt)
  * Returns: The number of times the packet has been sent previously
  */
 static inline unsigned dn_nsp_clone_and_send(struct sk_buff *skb,
-					     unsigned int __nocast gfp)
+					     gfp_t gfp)
 {
 	struct dn_skb_cb *cb = DN_SKB_CB(skb);
 	struct sk_buff *skb2;
@@ -353,7 +352,7 @@ static unsigned short *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *sk
 }
 
 void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb,
-			unsigned int __nocast gfp, int oth)
+			gfp_t gfp, int oth)
 {
 	struct dn_scp *scp = DN_SK(sk);
 	struct dn_skb_cb *cb = DN_SKB_CB(skb);
@@ -520,7 +519,7 @@ static int dn_nsp_retrans_conn_conf(struct sock *sk)
 	return 0;
 }
 
-void dn_send_conn_conf(struct sock *sk, unsigned int __nocast gfp)
+void dn_send_conn_conf(struct sock *sk, gfp_t gfp)
 {
 	struct dn_scp *scp = DN_SK(sk);
 	struct sk_buff *skb = NULL;
@@ -552,7 +551,7 @@ void dn_send_conn_conf(struct sock *sk, unsigned int __nocast gfp)
 
 
 static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, 
-			unsigned short reason, unsigned int __nocast gfp,
+			unsigned short reason, gfp_t gfp,
 			struct dst_entry *dst,
 			int ddl, unsigned char *dd, __u16 rem, __u16 loc)
 {
@@ -595,7 +594,7 @@ static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg,
 
 
 void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg, 
-			unsigned short reason, unsigned int __nocast gfp)
+			unsigned short reason, gfp_t gfp)
 {
 	struct dn_scp *scp = DN_SK(sk);
 	int ddl = 0;
@@ -616,7 +615,7 @@ void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg,
 {
 	struct dn_skb_cb *cb = DN_SKB_CB(skb);
 	int ddl = 0;
-	unsigned int __nocast gfp = GFP_ATOMIC;
+	gfp_t gfp = GFP_ATOMIC;
 
 	dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb->dst, ddl, 
 			NULL, cb->src_port, cb->dst_port);
@@ -628,7 +627,7 @@ void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval)
 	struct dn_scp *scp = DN_SK(sk);
 	struct sk_buff *skb;
 	unsigned char *ptr;
-	unsigned int __nocast gfp = GFP_ATOMIC;
+	gfp_t gfp = GFP_ATOMIC;
 
 	if ((skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, gfp)) == NULL)
 		return;
@@ -663,8 +662,7 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg)
 	unsigned char menuver;
 	struct dn_skb_cb *cb;
 	unsigned char type = 1;
-	unsigned int __nocast allocation =
-			(msgflg == NSP_CI) ? sk->sk_allocation : GFP_ATOMIC;
+	gfp_t allocation = (msgflg == NSP_CI) ? sk->sk_allocation : GFP_ATOMIC;
 	struct sk_buff *skb = dn_alloc_skb(sk, 200, allocation);
 
 	if (!skb)
diff --git a/net/ieee80211/ieee80211_tx.c b/net/ieee80211/ieee80211_tx.c
index ecdf9f7a538f..eed07bbbe6b6 100644
--- a/net/ieee80211/ieee80211_tx.c
+++ b/net/ieee80211/ieee80211_tx.c
@@ -207,7 +207,7 @@ void ieee80211_txb_free(struct ieee80211_txb *txb)
 }
 
 static struct ieee80211_txb *ieee80211_alloc_txb(int nr_frags, int txb_size,
-						 unsigned int __nocast gfp_mask)
+						 gfp_t gfp_mask)
 {
 	struct ieee80211_txb *txb;
 	int i;
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index fe3c6d3d0c91..94468a76c5b4 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -494,7 +494,7 @@ void inet_csk_reqsk_queue_prune(struct sock *parent,
 EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_prune);
 
 struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req,
-			    const unsigned int __nocast priority)
+			    const gfp_t priority)
 {
 	struct sock *newsk = sk_clone(sk, priority);
 
diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c
index b942ff3c8860..fc6f95aaa969 100644
--- a/net/ipv4/ipvs/ip_vs_app.c
+++ b/net/ipv4/ipvs/ip_vs_app.c
@@ -604,7 +604,7 @@ static struct file_operations ip_vs_app_fops = {
 /*
  *	Replace a segment of data with a new segment
  */
-int ip_vs_skb_replace(struct sk_buff *skb, unsigned int __nocast pri,
+int ip_vs_skb_replace(struct sk_buff *skb, gfp_t pri,
 		      char *o_buf, int o_len, char *n_buf, int n_len)
 {
 	struct iphdr *iph;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c5b911f9b662..8225e4257258 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1610,7 +1610,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index bbf0f69181ba..39031684b65c 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -185,7 +185,7 @@ static int pfkey_release(struct socket *sock)
 }
 
 static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
-			       unsigned int __nocast allocation, struct sock *sk)
+			       gfp_t allocation, struct sock *sk)
 {
 	int err = -ENOBUFS;
 
@@ -217,7 +217,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2,
 #define BROADCAST_ONE		1
 #define BROADCAST_REGISTERED	2
 #define BROADCAST_PROMISC_ONLY	4
-static int pfkey_broadcast(struct sk_buff *skb, unsigned int __nocast allocation,
+static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation,
 			   int broadcast_flags, struct sock *one_sk)
 {
 	struct sock *sk;
@@ -1417,7 +1417,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr,
 }
 
 static struct sk_buff *compose_sadb_supported(struct sadb_msg *orig,
-					      unsigned int __nocast allocation)
+					      gfp_t allocation)
 {
 	struct sk_buff *skb;
 	struct sadb_msg *hdr;
diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c
index 042b24a8ca4c..c761c15da421 100644
--- a/net/llc/llc_conn.c
+++ b/net/llc/llc_conn.c
@@ -867,8 +867,7 @@ static void llc_sk_init(struct sock* sk)
  *	Allocates a LLC sock and initializes it. Returns the new LLC sock
  *	or %NULL if there's no memory available for one
  */
-struct sock *llc_sk_alloc(int family, unsigned int __nocast priority,
-			 struct proto *prot)
+struct sock *llc_sk_alloc(int family, gfp_t priority, struct proto *prot)
 {
 	struct sock *sk = sk_alloc(family, priority, prot, 1);
 
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 34d671974a4d..1caaca06f698 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -195,8 +195,7 @@ nfnetlink_check_attributes(struct nfnetlink_subsystem *subsys,
 
 int nfnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
 {
-	unsigned int __nocast allocation =
-			in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
+	gfp_t allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
 	int err = 0;
 
 	NETLINK_CB(skb).dst_group = group;
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a64e1d5ce3ca..678c3f2c0d0b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -758,7 +758,7 @@ void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
 }
 
 static inline struct sk_buff *netlink_trim(struct sk_buff *skb,
-					   unsigned int __nocast allocation)
+					   gfp_t allocation)
 {
 	int delta;
 
@@ -880,7 +880,7 @@ out:
 }
 
 int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid,
-		      u32 group, unsigned int __nocast allocation)
+		      u32 group, gfp_t allocation)
 {
 	struct netlink_broadcast_data info;
 	struct hlist_node *node;
diff --git a/net/rxrpc/call.c b/net/rxrpc/call.c
index 86f777052633..c4aeb7d40266 100644
--- a/net/rxrpc/call.c
+++ b/net/rxrpc/call.c
@@ -1923,7 +1923,7 @@ int rxrpc_call_write_data(struct rxrpc_call *call,
 			  size_t sioc,
 			  struct kvec *siov,
 			  u8 rxhdr_flags,
-			  unsigned int __nocast alloc_flags,
+			  gfp_t alloc_flags,
 			  int dup_data,
 			  size_t *size_sent)
 {
diff --git a/net/rxrpc/connection.c b/net/rxrpc/connection.c
index be4b2be58956..2ba14a75dbbe 100644
--- a/net/rxrpc/connection.c
+++ b/net/rxrpc/connection.c
@@ -522,7 +522,7 @@ int rxrpc_conn_newmsg(struct rxrpc_connection *conn,
 		      uint8_t type,
 		      int dcount,
 		      struct kvec diov[],
-		      unsigned int __nocast alloc_flags,
+		      gfp_t alloc_flags,
 		      struct rxrpc_message **_msg)
 {
 	struct rxrpc_message *msg;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 5b24ae0650d3..12b0f582a66b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -71,7 +71,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a
 					  const struct sctp_endpoint *ep,
 					  const struct sock *sk,
 					  sctp_scope_t scope,
-					  unsigned int __nocast gfp)
+					  gfp_t gfp)
 {
 	struct sctp_sock *sp;
 	int i;
@@ -273,7 +273,7 @@ fail_init:
 struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep,
 					 const struct sock *sk,
 					 sctp_scope_t scope,
-					 unsigned int __nocast gfp)
+					 gfp_t gfp)
 {
 	struct sctp_association *asoc;
 
@@ -479,7 +479,7 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
 /* Add a transport address to an association.  */
 struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
 					   const union sctp_addr *addr,
-					   const unsigned int __nocast gfp,
+					   const gfp_t gfp,
 					   const int peer_state)
 {
 	struct sctp_transport *peer;
@@ -1231,7 +1231,7 @@ void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len)
  * local endpoint and the remote peer.
  */
 int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
-				     unsigned int __nocast gfp)
+				     gfp_t gfp)
 {
 	sctp_scope_t scope;
 	int flags;
@@ -1254,7 +1254,7 @@ int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
 /* Build the association's bind address list from the cookie.  */
 int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc,
 					 struct sctp_cookie *cookie,
-					 unsigned int __nocast gfp)
+					 gfp_t gfp)
 {
 	int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length);
 	int var_size3 = cookie->raw_addr_list_len;
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index f71549710f2e..2b962627f631 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -53,7 +53,7 @@
 
 /* Forward declarations for internal helpers. */
 static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *,
-			      sctp_scope_t scope, unsigned int __nocast gfp,
+			      sctp_scope_t scope, gfp_t gfp,
 			      int flags);
 static void sctp_bind_addr_clean(struct sctp_bind_addr *);
 
@@ -64,7 +64,7 @@ static void sctp_bind_addr_clean(struct sctp_bind_addr *);
  */
 int sctp_bind_addr_copy(struct sctp_bind_addr *dest, 
 			const struct sctp_bind_addr *src,
-			sctp_scope_t scope, unsigned int __nocast gfp,
+			sctp_scope_t scope, gfp_t gfp,
 			int flags)
 {
 	struct sctp_sockaddr_entry *addr;
@@ -146,7 +146,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp)
 
 /* Add an address to the bind address list in the SCTP_bind_addr structure. */
 int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
-		       unsigned int __nocast gfp)
+		       gfp_t gfp)
 {
 	struct sctp_sockaddr_entry *addr;
 
@@ -200,7 +200,7 @@ int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr)
  */
 union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp,
 					 int *addrs_len,
-					 unsigned int __nocast gfp)
+					 gfp_t gfp)
 {
 	union sctp_params addrparms;
 	union sctp_params retval;
@@ -252,7 +252,7 @@ end_raw:
  * address parameters).
  */
 int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
-			   int addrs_len, __u16 port, unsigned int __nocast gfp)
+			   int addrs_len, __u16 port, gfp_t gfp)
 {
 	union sctp_addr_param *rawaddr;
 	struct sctp_paramhdr *param;
@@ -350,7 +350,7 @@ union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr	*bp,
 /* Copy out addresses from the global local address list. */
 static int sctp_copy_one_addr(struct sctp_bind_addr *dest, 
 			      union sctp_addr *addr,
-			      sctp_scope_t scope, unsigned int __nocast gfp,
+			      sctp_scope_t scope, gfp_t gfp,
 			      int flags)
 {
 	int error = 0;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index 61da2937e641..83ef411772f4 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -62,7 +62,7 @@ static void sctp_datamsg_init(struct sctp_datamsg *msg)
 }
 
 /* Allocate and initialize datamsg. */
-SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(unsigned int __nocast gfp)
+SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
 {
 	struct sctp_datamsg *msg;
 	msg = kmalloc(sizeof(struct sctp_datamsg), gfp);
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index e22ccd655965..96984f7a2d69 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -68,7 +68,7 @@ static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep);
  */
 static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 						struct sock *sk,
-						unsigned int __nocast gfp)
+						gfp_t gfp)
 {
 	struct sctp_sock *sp = sctp_sk(sk);
 	memset(ep, 0, sizeof(struct sctp_endpoint));
@@ -138,8 +138,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
 /* Create a sctp_endpoint with all that boring stuff initialized.
  * Returns NULL if there isn't enough memory.
  */
-struct sctp_endpoint *sctp_endpoint_new(struct sock *sk,
-					unsigned int __nocast gfp)
+struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, gfp_t gfp)
 {
 	struct sctp_endpoint *ep;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f01d1c9002a1..26de4d3e1bd9 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -219,7 +219,7 @@ static void sctp_free_local_addr_list(void)
 
 /* Copy the local addresses which are valid for 'scope' into 'bp'.  */
 int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope,
-			      unsigned int __nocast gfp, int copy_flags)
+			      gfp_t gfp, int copy_flags)
 {
 	struct sctp_sockaddr_entry *addr;
 	int error = 0;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 3868a8d70cc0..10e82ec2ebd3 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -78,7 +78,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
 static int sctp_process_param(struct sctp_association *asoc,
 			      union sctp_params param,
 			      const union sctp_addr *peer_addr,
-			      unsigned int __nocast gfp);
+			      gfp_t gfp);
 
 /* What was the inbound interface for this chunk? */
 int sctp_chunk_iif(const struct sctp_chunk *chunk)
@@ -174,7 +174,7 @@ void  sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code,
  */
 struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
 			     const struct sctp_bind_addr *bp,
-			     unsigned int __nocast gfp, int vparam_len)
+			     gfp_t gfp, int vparam_len)
 {
 	sctp_inithdr_t init;
 	union sctp_params addrs;
@@ -261,7 +261,7 @@ nodata:
 
 struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
 				 const struct sctp_chunk *chunk,
-				 unsigned int __nocast gfp, int unkparam_len)
+				 gfp_t gfp, int unkparam_len)
 {
 	sctp_inithdr_t initack;
 	struct sctp_chunk *retval;
@@ -1234,7 +1234,7 @@ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk)
 /* Create a CLOSED association to use with an incoming packet.  */
 struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep,
 					struct sctp_chunk *chunk,
-					unsigned int __nocast gfp)
+					gfp_t gfp)
 {
 	struct sctp_association *asoc;
 	struct sk_buff *skb;
@@ -1349,7 +1349,7 @@ nodata:
 struct sctp_association *sctp_unpack_cookie(
 	const struct sctp_endpoint *ep,
 	const struct sctp_association *asoc,
-	struct sctp_chunk *chunk, unsigned int __nocast gfp,
+	struct sctp_chunk *chunk, gfp_t gfp,
 	int *error, struct sctp_chunk **errp)
 {
 	struct sctp_association *retval = NULL;
@@ -1814,7 +1814,7 @@ int sctp_verify_init(const struct sctp_association *asoc,
  */
 int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid,
 		      const union sctp_addr *peer_addr,
-		      sctp_init_chunk_t *peer_init, unsigned int __nocast gfp)
+		      sctp_init_chunk_t *peer_init, gfp_t gfp)
 {
 	union sctp_params param;
 	struct sctp_transport *transport;
@@ -1985,7 +1985,7 @@ nomem:
 static int sctp_process_param(struct sctp_association *asoc,
 			      union sctp_params param,
 			      const union sctp_addr *peer_addr,
-			      unsigned int __nocast gfp)
+			      gfp_t gfp)
 {
 	union sctp_addr addr;
 	int i;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 39c970b5b198..f84173ea8ec1 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 				void *event_arg,
 			 	sctp_disposition_t status,
 				sctp_cmd_seq_t *commands,
-				unsigned int __nocast gfp);
+				gfp_t gfp);
 static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     sctp_state_t state,
 			     struct sctp_endpoint *ep,
@@ -71,7 +71,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     void *event_arg,
 			     sctp_disposition_t status,
 			     sctp_cmd_seq_t *commands,
-			     unsigned int __nocast gfp);
+			     gfp_t gfp);
 
 /********************************************************************
  * Helper functions
@@ -498,7 +498,7 @@ static int sctp_cmd_process_init(sctp_cmd_seq_t *commands,
 				 struct sctp_association *asoc,
 				 struct sctp_chunk *chunk,
 				 sctp_init_chunk_t *peer_init,
-				 unsigned int __nocast gfp)
+				 gfp_t gfp)
 {
 	int error;
 
@@ -853,7 +853,7 @@ int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype,
 	       struct sctp_endpoint *ep,
 	       struct sctp_association *asoc,
 	       void *event_arg,
-	       unsigned int __nocast gfp)
+	       gfp_t gfp)
 {
 	sctp_cmd_seq_t commands;
 	const sctp_sm_table_entry_t *state_fn;
@@ -898,7 +898,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
 			     void *event_arg,
 			     sctp_disposition_t status,
 			     sctp_cmd_seq_t *commands,
-			     unsigned int __nocast gfp)
+			     gfp_t gfp)
 {
 	int error;
 
@@ -986,7 +986,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
 				void *event_arg,
 			 	sctp_disposition_t status,
 				sctp_cmd_seq_t *commands,
-				unsigned int __nocast gfp)
+				gfp_t gfp)
 {
 	int error = 0;
 	int force;
diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c
index 25037daf3fa0..cbe2513d2822 100644
--- a/net/sctp/ssnmap.c
+++ b/net/sctp/ssnmap.c
@@ -58,7 +58,7 @@ static inline size_t sctp_ssnmap_size(__u16 in, __u16 out)
  * Allocate room to store at least 'len' contiguous TSNs.
  */
 struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out,
-				    unsigned int __nocast gfp)
+				    gfp_t gfp)
 {
 	struct sctp_ssnmap *retval;
 	int size;
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index d2f04ebe5081..6bc27200e6ca 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -57,7 +57,7 @@
 /* Initialize a new transport from provided memory.  */
 static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 						  const union sctp_addr *addr,
-						  unsigned int __nocast gfp)
+						  gfp_t gfp)
 {
 	/* Copy in the address.  */
 	peer->ipaddr = *addr;
@@ -122,7 +122,7 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer,
 
 /* Allocate and initialize a new transport.  */
 struct sctp_transport *sctp_transport_new(const union sctp_addr *addr,
-					  unsigned int __nocast gfp)
+					  gfp_t gfp)
 {
         struct sctp_transport *transport;
 
diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c
index 0abd5101107c..057e7fac3af0 100644
--- a/net/sctp/ulpevent.c
+++ b/net/sctp/ulpevent.c
@@ -74,7 +74,7 @@ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags)
 
 /* Create a new sctp_ulpevent.  */
 SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags,
-						    unsigned int __nocast gfp)
+						    gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sk_buff *skb;
@@ -136,7 +136,7 @@ static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event)
 struct sctp_ulpevent  *sctp_ulpevent_make_assoc_change(
 	const struct sctp_association *asoc,
 	__u16 flags, __u16 state, __u16 error, __u16 outbound,
-	__u16 inbound, unsigned int __nocast gfp)
+	__u16 inbound, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_assoc_change *sac;
@@ -237,7 +237,7 @@ fail:
 struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change(
 	const struct sctp_association *asoc,
 	const struct sockaddr_storage *aaddr,
-	int flags, int state, int error, unsigned int __nocast gfp)
+	int flags, int state, int error, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_paddr_change  *spc;
@@ -350,7 +350,7 @@ fail:
  */
 struct sctp_ulpevent *sctp_ulpevent_make_remote_error(
 	const struct sctp_association *asoc, struct sctp_chunk *chunk,
-	__u16 flags, unsigned int __nocast gfp)
+	__u16 flags, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_remote_error *sre;
@@ -448,7 +448,7 @@ fail:
  */
 struct sctp_ulpevent *sctp_ulpevent_make_send_failed(
 	const struct sctp_association *asoc, struct sctp_chunk *chunk,
-	__u16 flags, __u32 error, unsigned int __nocast gfp)
+	__u16 flags, __u32 error, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_send_failed *ssf;
@@ -557,7 +557,7 @@ fail:
  */
 struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event(
 	const struct sctp_association *asoc,
-	__u16 flags, unsigned int __nocast gfp)
+	__u16 flags, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_shutdown_event *sse;
@@ -620,7 +620,7 @@ fail:
  * 5.3.1.6 SCTP_ADAPTION_INDICATION
  */
 struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication(
-	const struct sctp_association *asoc, unsigned int __nocast gfp)
+	const struct sctp_association *asoc, gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_adaption_event *sai;
@@ -657,7 +657,7 @@ fail:
  */
 struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc,
 						struct sctp_chunk *chunk,
-						unsigned int __nocast gfp)
+						gfp_t gfp)
 {
 	struct sctp_ulpevent *event = NULL;
 	struct sk_buff *skb;
@@ -719,7 +719,7 @@ fail:
  */
 struct sctp_ulpevent *sctp_ulpevent_make_pdapi(
 	const struct sctp_association *asoc, __u32 indication,
-	unsigned int __nocast gfp)
+	gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_pdapi_event *pd;
diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c
index ec2c857eae7f..2080b2d28c98 100644
--- a/net/sctp/ulpqueue.c
+++ b/net/sctp/ulpqueue.c
@@ -100,7 +100,7 @@ void sctp_ulpq_free(struct sctp_ulpq *ulpq)
 
 /* Process an incoming DATA chunk.  */
 int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
-			unsigned int __nocast gfp)
+			gfp_t gfp)
 {
 	struct sk_buff_head temp;
 	sctp_data_chunk_t *hdr;
@@ -792,7 +792,7 @@ static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed)
 /* Partial deliver the first message as there is pressure on rwnd. */
 void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
 				struct sctp_chunk *chunk,
-				unsigned int __nocast gfp)
+				gfp_t gfp)
 {
 	struct sctp_ulpevent *event;
 	struct sctp_association *asoc;
@@ -816,7 +816,7 @@ void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq,
 
 /* Renege some packets to make room for an incoming chunk.  */
 void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
-		      unsigned int __nocast gfp)
+		      gfp_t gfp)
 {
 	struct sctp_association *asoc;
 	__u16 needed, freed;
@@ -855,7 +855,7 @@ void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk,
 /* Notify the application if an association is aborted and in
  * partial delivery mode.  Send up any pending received messages.
  */
-void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, unsigned int __nocast gfp)
+void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp)
 {
 	struct sctp_ulpevent *ev = NULL;
 	struct sock *sk;
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index ade730eaf401..54e60a657500 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -719,7 +719,7 @@ static void rpc_async_schedule(void *arg)
 void *
 rpc_malloc(struct rpc_task *task, size_t size)
 {
-	unsigned int __nocast	gfp;
+	gfp_t	gfp;
 
 	if (task->tk_flags & RPC_TASK_SWAPPER)
 		gfp = GFP_ATOMIC;
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 061b44cc2451..cbb0ba34a600 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -225,7 +225,7 @@ expired:
  * SPD calls.
  */
 
-struct xfrm_policy *xfrm_policy_alloc(unsigned int __nocast gfp)
+struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp)
 {
 	struct xfrm_policy *policy;
 
diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c
index 91124ddbdda9..e72cec77f0db 100644
--- a/sound/core/memalloc.c
+++ b/sound/core/memalloc.c
@@ -106,7 +106,7 @@ struct snd_mem_list {
 
 static void *snd_dma_hack_alloc_coherent(struct device *dev, size_t size,
 					 dma_addr_t *dma_handle,
-					 unsigned int __nocast flags)
+					 gfp_t flags)
 {
 	void *ret;
 	u64 dma_mask, coherent_dma_mask;
diff --git a/sound/core/memory.c b/sound/core/memory.c
index 8fa888fc53a0..7d8e2eebba51 100644
--- a/sound/core/memory.c
+++ b/sound/core/memory.c
@@ -89,7 +89,7 @@ void snd_memory_done(void)
 	}
 }
 
-static void *__snd_kmalloc(size_t size, unsigned int __nocast flags, void *caller)
+static void *__snd_kmalloc(size_t size, gfp_t flags, void *caller)
 {
 	unsigned long cpu_flags;
 	struct snd_alloc_track *t;
@@ -111,12 +111,12 @@ static void *__snd_kmalloc(size_t size, unsigned int __nocast flags, void *calle
 }
 
 #define _snd_kmalloc(size, flags) __snd_kmalloc((size), (flags), __builtin_return_address(0));
-void *snd_hidden_kmalloc(size_t size, unsigned int __nocast flags)
+void *snd_hidden_kmalloc(size_t size, gfp_t flags)
 {
 	return _snd_kmalloc(size, flags);
 }
 
-void *snd_hidden_kzalloc(size_t size, unsigned int __nocast flags)
+void *snd_hidden_kzalloc(size_t size, gfp_t flags)
 {
 	void *ret = _snd_kmalloc(size, flags);
 	if (ret)
@@ -125,7 +125,7 @@ void *snd_hidden_kzalloc(size_t size, unsigned int __nocast flags)
 }
 EXPORT_SYMBOL(snd_hidden_kzalloc);
 
-void *snd_hidden_kcalloc(size_t n, size_t size, unsigned int __nocast flags)
+void *snd_hidden_kcalloc(size_t n, size_t size, gfp_t flags)
 {
 	void *ret = NULL;
 	if (n != 0 && size > INT_MAX / n)
@@ -190,7 +190,7 @@ void snd_hidden_vfree(void *obj)
 	snd_wrapper_vfree(obj);
 }
 
-char *snd_hidden_kstrdup(const char *s, unsigned int __nocast flags)
+char *snd_hidden_kstrdup(const char *s, gfp_t flags)
 {
 	int len;
 	char *buf;
diff --git a/sound/core/seq/instr/ainstr_iw.c b/sound/core/seq/instr/ainstr_iw.c
index b3cee092b1a4..67c24c8e8e7b 100644
--- a/sound/core/seq/instr/ainstr_iw.c
+++ b/sound/core/seq/instr/ainstr_iw.c
@@ -58,7 +58,7 @@ static int snd_seq_iwffff_copy_env_from_stream(__u32 req_stype,
 					       iwffff_xenv_t *ex,
 					       char __user **data,
 					       long *len,
-					       unsigned int __nocast gfp_mask)
+					       gfp_t gfp_mask)
 {
 	__u32 stype;
 	iwffff_env_record_t *rp, *rp_last;
diff --git a/sound/core/wrappers.c b/sound/core/wrappers.c
index 508e6d67ee19..296b716f1376 100644
--- a/sound/core/wrappers.c
+++ b/sound/core/wrappers.c
@@ -27,7 +27,7 @@
 #include <linux/fs.h>
 
 #ifdef CONFIG_SND_DEBUG_MEMORY
-void *snd_wrapper_kmalloc(size_t size, unsigned int __nocast flags)
+void *snd_wrapper_kmalloc(size_t size, gfp_t flags)
 {
 	return kmalloc(size, flags);
 }
-- 
cgit v1.2.3


From 3dd083255ddcfa87751fa8e32f61a9547a15a541 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 9 Oct 2005 21:19:40 +0200
Subject: [PATCH] x86_64: Set up safe page tables during resume

The following patch makes swsusp avoid the possible temporary corruption
of page translation tables during resume on x86-64.  This is achieved by
creating a copy of the relevant page tables that will not be modified by
swsusp and can be safely used by it on resume.

The problem is that during resume on x86-64 swsusp may temporarily
corrupt the page tables used for the direct mapping of RAM.  If that
happens, a page fault occurs and cannot be handled properly, which leads
to the solid hang of the affected system.  This leads to the loss of the
system's state from before suspend and may result in the loss of data or
the corruption of filesystems, so it is a serious issue.  Also, it
appears to happen quite often (for me, as often as 50% of the time).

The problem is related to the fact that (at least) one of the PMD
entries used in the direct memory mapping (starting at PAGE_OFFSET)
points to a page table the physical address of which is much greater
than the physical address of the PMD entry itself.  Moreover,
unfortunately, the physical address of the page table before suspend
(i.e.  the one stored in the suspend image) happens to be different to
the physical address of the corresponding page table used during resume
(i.e.  the one that is valid right before swsusp_arch_resume() in
arch/x86_64/kernel/suspend_asm.S is executed).  Thus while the image is
restored, the "offending" PMD entry gets overwritten, so it does not
point to the right physical address any more (i.e.  there's no page
table at the address pointed to by it, because it points to the address
the page table has been at during suspend).  Consequently, if the PMD
entry is used later on, and it _is_ used in the process of copying the
image pages, a page fault occurs, but it cannot be handled in the normal
way and the system hangs.

In principle we can call create_resume_mapping() from
swsusp_arch_resume() (ie.  from suspend_asm.S), but then the memory
allocations in create_resume_mapping(), resume_pud_mapping(), and
resume_pmd_mapping() must be made carefully so that we use _only_
NosaveFree pages in them (the other pages are overwritten by the loop in
swsusp_arch_resume()).  Additionally, we are in atomic context at that
time, so we cannot use GFP_KERNEL.  Moreover, if one of the allocations
fails, we should free all of the allocated pages, so we need to trace
them somehow.

All of this is done in the appended patch, except that the functions
populating the page tables are located in arch/x86_64/kernel/suspend.c
rather than in init.c.  It may be done in a more elegan way in the
future, with the help of some swsusp patches that are in the works now.

[AK: move some externs into headers, renamed a function]

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/suspend.c     | 127 +++++++++++++++++++++++++++++++++++++++
 arch/x86_64/kernel/suspend_asm.S |  17 ++++--
 include/linux/suspend.h          |   2 +
 kernel/power/swsusp.c            |   7 ++-
 4 files changed, 144 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index ebb9abf3ce6d..f066c6ab3618 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -11,6 +11,8 @@
 #include <linux/smp.h>
 #include <linux/suspend.h>
 #include <asm/proto.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
 
 struct saved_context saved_context;
 
@@ -140,4 +142,129 @@ void fix_processor_context(void)
 
 }
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/* Defined in arch/x86_64/kernel/suspend_asm.S */
+extern int restore_image(void);
 
+pgd_t *temp_level4_pgt;
+
+static void **pages;
+
+static inline void *__add_page(void)
+{
+	void **c;
+
+	c = (void **)get_usable_page(GFP_ATOMIC);
+	if (c) {
+		*c = pages;
+		pages = c;
+	}
+	return c;
+}
+
+static inline void *__next_page(void)
+{
+	void **c;
+
+	c = pages;
+	if (c) {
+		pages = *c;
+		*c = NULL;
+	}
+	return c;
+}
+
+/*
+ * Try to allocate as many usable pages as needed and daisy chain them.
+ * If one allocation fails, free the pages allocated so far
+ */
+static int alloc_usable_pages(unsigned long n)
+{
+	void *p;
+
+	pages = NULL;
+	do
+		if (!__add_page())
+			break;
+	while (--n);
+	if (n) {
+		p = __next_page();
+		while (p) {
+			free_page((unsigned long)p);
+			p = __next_page();
+		}
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+{
+	long i, j;
+
+	i = pud_index(address);
+	pud = pud + i;
+	for (; i < PTRS_PER_PUD; pud++, i++) {
+		unsigned long paddr;
+		pmd_t *pmd;
+
+		paddr = address + i*PUD_SIZE;
+		if (paddr >= end)
+			break;
+
+		pmd = (pmd_t *)__next_page();
+		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
+			unsigned long pe;
+
+			if (paddr >= end)
+				break;
+			pe = _PAGE_NX | _PAGE_PSE | _KERNPG_TABLE | paddr;
+			pe &= __supported_pte_mask;
+			set_pmd(pmd, __pmd(pe));
+		}
+	}
+}
+
+static void set_up_temporary_mappings(void)
+{
+	unsigned long start, end, next;
+
+	temp_level4_pgt = (pgd_t *)__next_page();
+
+	/* It is safe to reuse the original kernel mapping */
+	set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
+		init_level4_pgt[pgd_index(__START_KERNEL_map)]);
+
+	/* Set up the direct mapping from scratch */
+	start = (unsigned long)pfn_to_kaddr(0);
+	end = (unsigned long)pfn_to_kaddr(end_pfn);
+
+	for (; start < end; start = next) {
+		pud_t *pud = (pud_t *)__next_page();
+		next = start + PGDIR_SIZE;
+		if (next > end)
+			next = end;
+		res_phys_pud_init(pud, __pa(start), __pa(next));
+		set_pgd(temp_level4_pgt + pgd_index(start),
+			mk_kernel_pgd(__pa(pud)));
+	}
+}
+
+int swsusp_arch_resume(void)
+{
+	unsigned long n;
+
+	n = ((end_pfn << PAGE_SHIFT) + PUD_SIZE - 1) >> PUD_SHIFT;
+	n += (n + PTRS_PER_PUD - 1) / PTRS_PER_PUD + 1;
+	pr_debug("swsusp_arch_resume(): pages needed = %lu\n", n);
+	if (alloc_usable_pages(n)) {
+		free_eaten_memory();
+		return -ENOMEM;
+	}
+	/* We have got enough memory and from now on we cannot recover */
+	set_up_temporary_mappings();
+	restore_image();
+	return 0;
+}
+#endif /* CONFIG_SOFTWARE_SUSPEND */
diff --git a/arch/x86_64/kernel/suspend_asm.S b/arch/x86_64/kernel/suspend_asm.S
index 4d659e97df10..320b6fb00cca 100644
--- a/arch/x86_64/kernel/suspend_asm.S
+++ b/arch/x86_64/kernel/suspend_asm.S
@@ -39,12 +39,13 @@ ENTRY(swsusp_arch_suspend)
 	call swsusp_save
 	ret
 
-ENTRY(swsusp_arch_resume)
-	/* set up cr3 */	
-	leaq	init_level4_pgt(%rip),%rax
-	subq	$__START_KERNEL_map,%rax
-	movq	%rax,%cr3
-
+ENTRY(restore_image)
+	/* switch to temporary page tables */
+	movq	$__PAGE_OFFSET, %rdx
+	movq	temp_level4_pgt(%rip), %rax
+	subq	%rdx, %rax
+	movq	%rax, %cr3
+	/* Flush TLB */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
 	andq	$~(1<<7), %rdx	# PGE
@@ -69,6 +70,10 @@ loop:
 	movq	pbe_next(%rdx), %rdx
 	jmp	loop
 done:
+	/* go back to the original page tables */
+	leaq	init_level4_pgt(%rip), %rax
+	subq	$__START_KERNEL_map, %rax
+	movq	%rax, %cr3
 	/* Flush TLB, including "global" things (vmalloc) */
 	movq	mmu_cr4_features(%rip), %rax
 	movq	%rax, %rdx
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index f2e96fdfaae0..ad15a54806d8 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -71,5 +71,7 @@ void restore_processor_state(void);
 struct saved_context;
 void __save_processor_state(struct saved_context *ctxt);
 void __restore_processor_state(struct saved_context *ctxt);
+extern unsigned long get_usable_page(unsigned gfp_mask);
+extern void free_eaten_memory(void);
 
 #endif /* _LINUX_SWSUSP_H */
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index acf79ac1cb6d..2d5c45676442 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1095,7 +1095,7 @@ static inline void eat_page(void *page)
 	*eaten_memory = c;
 }
 
-static unsigned long get_usable_page(unsigned gfp_mask)
+unsigned long get_usable_page(unsigned gfp_mask)
 {
 	unsigned long m;
 
@@ -1109,7 +1109,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
 	return m;
 }
 
-static void free_eaten_memory(void)
+void free_eaten_memory(void)
 {
 	unsigned long m;
 	void **c;
@@ -1481,11 +1481,12 @@ static int read_suspend_image(void)
 	/* Allocate memory for the image and read the data from swap */
 
 	error = check_pagedir(pagedir_nosave);
-	free_eaten_memory();
+
 	if (!error)
 		error = data_read(pagedir_nosave);
 
 	if (error) { /* We fail cleanly */
+		free_eaten_memory();
 		for_each_pbe (p, pagedir_nosave)
 			if (p->address) {
 				free_page(p->address);
-- 
cgit v1.2.3


From 46113830a18847cff8da73005e57bc49c2f95a56 Mon Sep 17 00:00:00 2001
From: Harald Welte <laforge@gnumonks.org>
Date: Mon, 10 Oct 2005 19:44:29 +0200
Subject: [PATCH] Fix signal sending in usbdevio on async URB completion

If a process issues an URB from userspace and (starts to) terminate
before the URB comes back, we run into the issue described above.  This
is because the urb saves a pointer to "current" when it is posted to the
device, but there's no guarantee that this pointer is still valid
afterwards.

In fact, there are three separate issues:

1) the pointer to "current" can become invalid, since the task could be
   completely gone when the URB completion comes back from the device.

2) Even if the saved task pointer is still pointing to a valid task_struct,
   task_struct->sighand could have gone meanwhile.

3) Even if the process is perfectly fine, permissions may have changed,
   and we can no longer send it a signal.

So what we do instead, is to save the PID and uid's of the process, and
introduce a new kill_proc_info_as_uid() function.

Signed-off-by: Harald Welte <laforge@gnumonks.org>
[ Fixed up types and added symbol exports ]
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/usb/core/devio.c | 12 +++++++++---
 include/linux/sched.h    |  1 +
 kernel/signal.c          | 34 ++++++++++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index b4265aa7d45e..6c35dcbea664 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -30,6 +30,8 @@
  *  Revision history
  *    22.12.1999   0.1   Initial release (split from proc_usb.c)
  *    04.01.2000   0.2   Turned into its own filesystem
+ *    30.09.2005   0.3   Fix user-triggerable oops in async URB delivery
+ *    			 (CAN-2005-3055)
  */
 
 /*****************************************************************************/
@@ -58,7 +60,8 @@ static struct class *usb_device_class;
 struct async {
 	struct list_head asynclist;
 	struct dev_state *ps;
-	struct task_struct *task;
+	pid_t pid;
+	uid_t uid, euid;
 	unsigned int signr;
 	unsigned int ifnum;
 	void __user *userbuffer;
@@ -290,7 +293,8 @@ static void async_completed(struct urb *urb, struct pt_regs *regs)
 		sinfo.si_errno = as->urb->status;
 		sinfo.si_code = SI_ASYNCIO;
 		sinfo.si_addr = as->userurb;
-		send_sig_info(as->signr, &sinfo, as->task);
+		kill_proc_info_as_uid(as->signr, &sinfo, as->pid, as->uid, 
+				      as->euid);
 	}
         wake_up(&ps->wait);
 }
@@ -988,7 +992,9 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
 		as->userbuffer = NULL;
 	as->signr = uurb->signr;
 	as->ifnum = ifnum;
-	as->task = current;
+	as->pid = current->pid;
+	as->uid = current->uid;
+	as->euid = current->euid;
 	if (!(uurb->endpoint & USB_DIR_IN)) {
 		if (copy_from_user(as->urb->transfer_buffer, uurb->buffer, as->urb->transfer_buffer_length)) {
 			free_async(as);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c3ba31f210a9..27519df0f987 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1018,6 +1018,7 @@ extern int force_sig_info(int, struct siginfo *, struct task_struct *);
 extern int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp);
 extern int kill_pg_info(int, struct siginfo *, pid_t);
 extern int kill_proc_info(int, struct siginfo *, pid_t);
+extern int kill_proc_info_as_uid(int, struct siginfo *, pid_t, uid_t, uid_t);
 extern void do_notify_parent(struct task_struct *, int);
 extern void force_sig(int, struct task_struct *);
 extern void force_sig_specific(int, struct task_struct *);
diff --git a/kernel/signal.c b/kernel/signal.c
index cba193ceda0d..50c992643771 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1193,6 +1193,40 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 	return error;
 }
 
+/* like kill_proc_info(), but doesn't use uid/euid of "current" */
+int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
+		      uid_t uid, uid_t euid)
+{
+	int ret = -EINVAL;
+	struct task_struct *p;
+
+	if (!valid_signal(sig))
+		return ret;
+
+	read_lock(&tasklist_lock);
+	p = find_task_by_pid(pid);
+	if (!p) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+	if ((!info || ((unsigned long)info != 1 &&
+			(unsigned long)info != 2 && SI_FROMUSER(info)))
+	    && (euid != p->suid) && (euid != p->uid)
+	    && (uid != p->suid) && (uid != p->uid)) {
+		ret = -EPERM;
+		goto out_unlock;
+	}
+	if (sig && p->sighand) {
+		unsigned long flags;
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		ret = __group_send_sig_info(sig, info, p);
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+out_unlock:
+	read_unlock(&tasklist_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kill_proc_info_as_uid);
 
 /*
  * kill_something_info() interprets pid in interesting ways just like kill(2).
-- 
cgit v1.2.3


From c6ecf7ed3131961e5aeedb0efd217afa0808798f Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 14 Oct 2005 15:59:03 -0700
Subject: [PATCH] Add missing export of getnstimeofday()

Adds the missing EXPORT_SYMBOL_GPL for getnstimeofday() when
CONFIG_TIME_INTERPOLATION isn't set.  Needed by drivers/char/mmtimer.c

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/time.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time.c b/kernel/time.c
index dd5ae1162a8f..40c2410ac99a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -570,6 +570,7 @@ void getnstimeofday(struct timespec *tv)
 	tv->tv_sec = x.tv_sec;
 	tv->tv_nsec = x.tv_usec * NSEC_PER_USEC;
 }
+EXPORT_SYMBOL_GPL(getnstimeofday);
 #endif
 
 #if (BITS_PER_LONG < 64)
-- 
cgit v1.2.3


From 2cc78eb52bc1ae89f0a4fa5a00eb998dffde4a9f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Mon, 17 Oct 2005 09:10:15 -0700
Subject: Increase default RCU batching sharply

Dipankar made RCU limit the batch size to improve latency, but that
approach is unworkable: it can cause the RCU queues to grow without
bounds, since the batch limiter ended up limiting the callbacks.

So make the limit much higher, and start planning on instead limiting
the batch size by doing RCU callbacks more often if the queue looks like
it might be growing too long.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcupdate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index bef3b6901b76..dd99415b1551 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -71,7 +71,7 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
 
 /* Fake initialization required by compiler */
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
-static int maxbatch = 10;
+static int maxbatch = 10000;
 
 #ifndef __HAVE_ARCH_CMPXCHG
 /*
-- 
cgit v1.2.3


From 47d6b08334a43fafa61a587f721fa21ef65d81be Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 17 Oct 2005 18:49:42 +0400
Subject: [PATCH] posix-timers: fix task accounting

Make sure we release the task struct properly when releasing pending
timers.

release_task() does write_lock_irq(&tasklist_lock), so it can't race
with run_posix_cpu_timers() on any cpu.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ad85d3f0dcc4..7a51a5597c33 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -424,6 +424,7 @@ static void cleanup_timers(struct list_head *head,
 	cputime_t ptime = cputime_add(utime, stime);
 
 	list_for_each_entry_safe(timer, next, head, entry) {
+		put_task_struct(timer->task);
 		timer->task = NULL;
 		list_del_init(&timer->entry);
 		if (cputime_lt(timer->expires.cpu, ptime)) {
@@ -436,6 +437,7 @@ static void cleanup_timers(struct list_head *head,
 
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
+		put_task_struct(timer->task);
 		timer->task = NULL;
 		list_del_init(&timer->entry);
 		if (cputime_lt(timer->expires.cpu, utime)) {
@@ -448,6 +450,7 @@ static void cleanup_timers(struct list_head *head,
 
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
+		put_task_struct(timer->task);
 		timer->task = NULL;
 		list_del_init(&timer->entry);
 		if (timer->expires.sched < sched_time) {
-- 
cgit v1.2.3


From 5ee832dbc6770135ec8d63296af0a4374557bb79 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Mon, 17 Oct 2005 20:01:21 +0200
Subject: [PATCH] rcu: keep rcu callback event counter

This makes call_rcu() keep track of how many events there are on the RCU
list, and cause a reschedule event when the list gets too long.

This helps keep RCU event lists down.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/rcupdate.h |  1 +
 kernel/rcupdate.c        | 11 +++++++++++
 2 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 4e65eb44adfd..70191a5a148f 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -94,6 +94,7 @@ struct rcu_data {
 	long  	       	batch;           /* Batch # for current RCU batch */
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail;
+	long            count; /* # of queued items */
 	struct rcu_head *curlist;
 	struct rcu_head **curtail;
 	struct rcu_head *donelist;
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index dd99415b1551..2559d4b8f23f 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -109,6 +109,10 @@ void fastcall call_rcu(struct rcu_head *head,
 	rdp = &__get_cpu_var(rcu_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
+
+	if (unlikely(++rdp->count > 10000))
+		set_need_resched();
+
 	local_irq_restore(flags);
 }
 
@@ -140,6 +144,12 @@ void fastcall call_rcu_bh(struct rcu_head *head,
 	rdp = &__get_cpu_var(rcu_bh_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
+	rdp->count++;
+/*
+ *  Should we directly call rcu_do_batch() here ?
+ *  if (unlikely(rdp->count > 10000))
+ *      rcu_do_batch(rdp);
+ */
 	local_irq_restore(flags);
 }
 
@@ -157,6 +167,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
 		next = rdp->donelist = list->next;
 		list->func(list);
 		list = next;
+		rdp->count--;
 		if (++count >= maxbatch)
 			break;
 	}
-- 
cgit v1.2.3


From e03d13e985d48ac4885382c9e3b1510c78bd047f Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Wed, 19 Oct 2005 22:21:23 -0700
Subject: [PATCH] Fix cpu timers exit deadlock and races

Oleg Nesterov reported an SMP deadlock.  If there is a running timer
tracking a different process's CPU time clock when the process owning
the timer exits, we deadlock on tasklist_lock in posix_cpu_timer_del via
exit_itimers.

That code was using tasklist_lock to check for a race with __exit_signal
being called on the timer-target task and clearing its ->signal.
However, there is actually no such race.  __exit_signal will have called
posix_cpu_timers_exit and posix_cpu_timers_exit_group before it does
that.  Those will clear those k_itimer's association with the dying
task, so posix_cpu_timer_del will return early and never reach the code
in question.

In addition, posix_cpu_timer_del called from exit_itimers during execve
or directly from timer_delete in the process owning the timer can race
with an exiting timer-target task to cause a double put on timer-target
task struct.  Make sure we always access cpu_timers lists with sighand
lock held.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Chris Wright <chrisw@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7a51a5597c33..b3f3edc475de 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -387,25 +387,19 @@ int posix_cpu_timer_del(struct k_itimer *timer)
 	if (unlikely(p == NULL))
 		return 0;
 
+	spin_lock(&p->sighand->siglock);
 	if (!list_empty(&timer->it.cpu.entry)) {
-		read_lock(&tasklist_lock);
-		if (unlikely(p->signal == NULL)) {
-			/*
-			 * We raced with the reaping of the task.
-			 * The deletion should have cleared us off the list.
-			 */
-			BUG_ON(!list_empty(&timer->it.cpu.entry));
-		} else {
-			/*
-			 * Take us off the task's timer list.
-			 */
-			spin_lock(&p->sighand->siglock);
-			list_del(&timer->it.cpu.entry);
-			spin_unlock(&p->sighand->siglock);
-		}
-		read_unlock(&tasklist_lock);
+		/*
+		 * Take us off the task's timer list.  We don't need to
+		 * take tasklist_lock and check for the task being reaped.
+		 * If it was reaped, it already called posix_cpu_timers_exit
+		 * and posix_cpu_timers_exit_group to clear all the timers
+		 * that pointed to it.
+		 */
+		list_del(&timer->it.cpu.entry);
+		put_task_struct(p);
 	}
-	put_task_struct(p);
+	spin_unlock(&p->sighand->siglock);
 
 	return 0;
 }
-- 
cgit v1.2.3


From d1209d049bbc3df66650f8417637be4f7b57b604 Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Wed, 19 Oct 2005 21:23:51 -0700
Subject: [PATCH] Threads shouldn't inherit PF_NOFREEZE

The PF_NOFREEZE process flag should not be inherited when a thread is
forked.  This patch (as585) removes the flag from the child.

This problem is starting to show up more and more as drivers turn to the
kthread API instead of using kernel_thread().  As a result, their kernel
threads are now children of the kthread worker instead of modprobe, and
they inherit the PF_NOFREEZE flag.  This can cause problems during system
suspend; the kernel threads are not getting frozen as they ought to be.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 533ce27f4b2c..280bd44ac441 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -848,7 +848,7 @@ static inline void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
 
-	new_flags &= ~PF_SUPERPRIV;
+	new_flags &= ~(PF_SUPERPRIV | PF_NOFREEZE);
 	new_flags |= PF_FORKNOEXEC;
 	if (!(clone_flags & CLONE_PTRACE))
 		p->ptrace = 0;
-- 
cgit v1.2.3


From 9465bee863bc4c6cf1566c12d6f92a8133e3da5c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Fri, 21 Oct 2005 15:36:00 -0700
Subject: Revert "Fix cpu timers exit deadlock and races"

Revert commit e03d13e985d48ac4885382c9e3b1510c78bd047f, to be replaced
by a much nicer fix from Roland.
---
 kernel/posix-cpu-timers.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index b3f3edc475de..7a51a5597c33 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -387,19 +387,25 @@ int posix_cpu_timer_del(struct k_itimer *timer)
 	if (unlikely(p == NULL))
 		return 0;
 
-	spin_lock(&p->sighand->siglock);
 	if (!list_empty(&timer->it.cpu.entry)) {
-		/*
-		 * Take us off the task's timer list.  We don't need to
-		 * take tasklist_lock and check for the task being reaped.
-		 * If it was reaped, it already called posix_cpu_timers_exit
-		 * and posix_cpu_timers_exit_group to clear all the timers
-		 * that pointed to it.
-		 */
-		list_del(&timer->it.cpu.entry);
-		put_task_struct(p);
+		read_lock(&tasklist_lock);
+		if (unlikely(p->signal == NULL)) {
+			/*
+			 * We raced with the reaping of the task.
+			 * The deletion should have cleared us off the list.
+			 */
+			BUG_ON(!list_empty(&timer->it.cpu.entry));
+		} else {
+			/*
+			 * Take us off the task's timer list.
+			 */
+			spin_lock(&p->sighand->siglock);
+			list_del(&timer->it.cpu.entry);
+			spin_unlock(&p->sighand->siglock);
+		}
+		read_unlock(&tasklist_lock);
 	}
-	spin_unlock(&p->sighand->siglock);
+	put_task_struct(p);
 
 	return 0;
 }
-- 
cgit v1.2.3


From 25f407f0b668f5e4ebd5d13e1fb4306ba6427ead Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 21 Oct 2005 15:03:29 -0700
Subject: [PATCH] Call exit_itimers from do_exit, not __exit_signal

When I originally moved exit_itimers into __exit_signal, that was the only
place where we could reliably know it was the last thread in the group
dying, without races.  Since then we've gotten the signal_struct.live
counter, and do_exit can reliably do group-wide cleanup work.

This patch moves the call to do_exit, where it's made without locks.  This
avoids the deadlock issues that the old __exit_signal code's comment talks
about, and the one that Oleg found recently with process CPU timers.

[ This replaces e03d13e985d48ac4885382c9e3b1510c78bd047f, which is why
  it was just reverted. ]

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c         |  1 +
 kernel/posix-timers.c |  2 +-
 kernel/signal.c       | 14 +-------------
 3 files changed, 3 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 43077732619b..3b25b182d2be 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -843,6 +843,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
  		del_timer_sync(&tsk->signal->real_timer);
+		exit_itimers(tsk->signal);
 		acct_process(code);
 	}
 	exit_mm(tsk);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index b7b532acd9fc..dda3cda73c77 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1157,7 +1157,7 @@ retry_delete:
 }
 
 /*
- * This is called by __exit_signal, only when there are no more
+ * This is called by do_exit or de_thread, only when there are no more
  * references to the shared signal_struct.
  */
 void exit_itimers(struct signal_struct *sig)
diff --git a/kernel/signal.c b/kernel/signal.c
index 50c992643771..f2b96b08fb44 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -397,20 +397,8 @@ void __exit_signal(struct task_struct *tsk)
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
 		/*
-		 * We are cleaning up the signal_struct here.  We delayed
-		 * calling exit_itimers until after flush_sigqueue, just in
-		 * case our thread-local pending queue contained a queued
-		 * timer signal that would have been cleared in
-		 * exit_itimers.  When that called sigqueue_free, it would
-		 * attempt to re-take the tasklist_lock and deadlock.  This
-		 * can never happen if we ensure that all queues the
-		 * timer's signal might be queued on have been flushed
-		 * first.  The shared_pending queue, and our own pending
-		 * queue are the only queues the timer could be on, since
-		 * there are no other threads left in the group and timer
-		 * signals are constrained to threads inside the group.
+		 * We are cleaning up the signal_struct here.
 		 */
-		exit_itimers(sig);
 		exit_thread_group_keys(sig);
 		kmem_cache_free(signal_cachep, sig);
 	}
-- 
cgit v1.2.3


From e80eda94d3eaf1d12cfc97878eff77cd679dabc9 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Sun, 23 Oct 2005 10:02:50 -0700
Subject: Posix timers: limit number of timers firing at once

Bursty timers aren't good for anybody, very much including latency for
other programs when we trigger lots of timers in interrupt context.  So
set a random limit, after which we'll handle the rest on the next timer
tick.

Noted by Oleg Nesterov <oleg@tv-sign.ru>

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 7a51a5597c33..d30b304a3384 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -961,14 +961,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 static void check_thread_timers(struct task_struct *tsk,
 				struct list_head *firing)
 {
+	int maxfire;
 	struct list_head *timers = tsk->cpu_timers;
 
+	maxfire = 20;
 	tsk->it_prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
 			tsk->it_prof_expires = t->expires.cpu;
 			break;
 		}
@@ -977,12 +979,13 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	tsk->it_virt_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
 			tsk->it_virt_expires = t->expires.cpu;
 			break;
 		}
@@ -991,12 +994,13 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	tsk->it_sched_expires = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (tsk->sched_time < t->expires.sched) {
+		if (!--maxfire || tsk->sched_time < t->expires.sched) {
 			tsk->it_sched_expires = t->expires.sched;
 			break;
 		}
@@ -1013,6 +1017,7 @@ static void check_thread_timers(struct task_struct *tsk,
 static void check_process_timers(struct task_struct *tsk,
 				 struct list_head *firing)
 {
+	int maxfire;
 	struct signal_struct *const sig = tsk->signal;
 	cputime_t utime, stime, ptime, virt_expires, prof_expires;
 	unsigned long long sched_time, sched_expires;
@@ -1045,12 +1050,13 @@ static void check_process_timers(struct task_struct *tsk,
 	} while (t != tsk);
 	ptime = cputime_add(utime, stime);
 
+	maxfire = 20;
 	prof_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(ptime, t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
 			prof_expires = t->expires.cpu;
 			break;
 		}
@@ -1059,12 +1065,13 @@ static void check_process_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	virt_expires = cputime_zero;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (cputime_lt(utime, t->expires.cpu)) {
+		if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
 			virt_expires = t->expires.cpu;
 			break;
 		}
@@ -1073,12 +1080,13 @@ static void check_process_timers(struct task_struct *tsk,
 	}
 
 	++timers;
+	maxfire = 20;
 	sched_expires = 0;
 	while (!list_empty(timers)) {
 		struct cpu_timer_list *t = list_entry(timers->next,
 						      struct cpu_timer_list,
 						      entry);
-		if (sched_time < t->expires.sched) {
+		if (!--maxfire || sched_time < t->expires.sched) {
 			sched_expires = t->expires.sched;
 			break;
 		}
-- 
cgit v1.2.3


From 108150ea78003044e41150c75259447b2c0953b6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 23 Oct 2005 20:25:39 +0400
Subject: [PATCH] posix-timers: fix cleanup_timers() and run_posix_cpu_timers()
 races

1. cleanup_timers() sets timer->task = NULL under tasklist + ->sighand locks.
   That means that this code in posix_cpu_timer_del() and posix_cpu_timer_set()

   		lock_timer(timer);
		if (timer->task == NULL)
			return;
		read_lock(tasklist);
		put_task_struct(timer->task)

   is racy. With this patch timer->task modified and accounted only under
   timer->it_lock. Sadly, this means that dead task_struct won't be freed
   until timer deleted or armed.

2. run_posix_cpu_timers() collects expired timers into local list under
   tasklist + ->sighand again. That means that posix_cpu_timer_del()
   should check timer->it.cpu.firing under these locks too.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d30b304a3384..30ab39a27736 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -380,14 +380,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
 int posix_cpu_timer_del(struct k_itimer *timer)
 {
 	struct task_struct *p = timer->it.cpu.task;
+	int ret = 0;
 
-	if (timer->it.cpu.firing)
-		return TIMER_RETRY;
-
-	if (unlikely(p == NULL))
-		return 0;
-
-	if (!list_empty(&timer->it.cpu.entry)) {
+	if (likely(p != NULL)) {
 		read_lock(&tasklist_lock);
 		if (unlikely(p->signal == NULL)) {
 			/*
@@ -396,18 +391,20 @@ int posix_cpu_timer_del(struct k_itimer *timer)
 			 */
 			BUG_ON(!list_empty(&timer->it.cpu.entry));
 		} else {
-			/*
-			 * Take us off the task's timer list.
-			 */
 			spin_lock(&p->sighand->siglock);
-			list_del(&timer->it.cpu.entry);
+			if (timer->it.cpu.firing)
+				ret = TIMER_RETRY;
+			else
+				list_del(&timer->it.cpu.entry);
 			spin_unlock(&p->sighand->siglock);
 		}
 		read_unlock(&tasklist_lock);
+
+		if (!ret)
+			put_task_struct(p);
 	}
-	put_task_struct(p);
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -424,8 +421,6 @@ static void cleanup_timers(struct list_head *head,
 	cputime_t ptime = cputime_add(utime, stime);
 
 	list_for_each_entry_safe(timer, next, head, entry) {
-		put_task_struct(timer->task);
-		timer->task = NULL;
 		list_del_init(&timer->entry);
 		if (cputime_lt(timer->expires.cpu, ptime)) {
 			timer->expires.cpu = cputime_zero;
@@ -437,8 +432,6 @@ static void cleanup_timers(struct list_head *head,
 
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
-		put_task_struct(timer->task);
-		timer->task = NULL;
 		list_del_init(&timer->entry);
 		if (cputime_lt(timer->expires.cpu, utime)) {
 			timer->expires.cpu = cputime_zero;
@@ -450,8 +443,6 @@ static void cleanup_timers(struct list_head *head,
 
 	++head;
 	list_for_each_entry_safe(timer, next, head, entry) {
-		put_task_struct(timer->task);
-		timer->task = NULL;
 		list_del_init(&timer->entry);
 		if (timer->expires.sched < sched_time) {
 			timer->expires.sched = 0;
-- 
cgit v1.2.3


From 3de463c7d9d58f8cf3395268230cb20a4c15bffa Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 24 Oct 2005 14:34:03 +0400
Subject: [PATCH] posix-timers: remove false BUG_ON() from
 run_posix_cpu_timers()

do_exit() clears ->it_##clock##_expires, but nothing prevents
another cpu to attach the timer to exiting process after that.

After exit_notify() does 'write_unlock_irq(&tasklist_lock)' and
before do_exit() calls 'schedule() local timer interrupt can find
tsk->exit_state != 0. If that state was EXIT_DEAD (or another cpu
does sys_wait4) interrupted task has ->signal == NULL.

At this moment exiting task has no pending cpu timers, they were cleaned
up in __exit_signal()->posix_cpu_timers_exit{,_group}(), so we can just
return from irq.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c             |  8 --------
 kernel/posix-cpu-timers.c | 36 ++++++++++++++++++------------------
 2 files changed, 18 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 3b25b182d2be..4897977a1f4b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -825,14 +825,6 @@ fastcall NORET_TYPE void do_exit(long code)
 
 	tsk->flags |= PF_EXITING;
 
-	/*
-	 * Make sure we don't try to process any timer firings
-	 * while we are already exiting.
-	 */
- 	tsk->it_virt_expires = cputime_zero;
- 	tsk->it_prof_expires = cputime_zero;
-	tsk->it_sched_expires = 0;
-
 	if (unlikely(in_atomic()))
 		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
 				current->comm, current->pid,
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 30ab39a27736..ccb04683bf18 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1285,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
 #undef	UNEXPIRED
 
-	BUG_ON(tsk->exit_state);
-
 	/*
 	 * Double-check with locks held.
 	 */
 	read_lock(&tasklist_lock);
-	spin_lock(&tsk->sighand->siglock);
+	if (likely(tsk->signal != NULL)) {
+		spin_lock(&tsk->sighand->siglock);
 
-	/*
-	 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
-	 * all the timers that are firing, and put them on the firing list.
-	 */
-	check_thread_timers(tsk, &firing);
-	check_process_timers(tsk, &firing);
+		/*
+		 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
+		 * all the timers that are firing, and put them on the firing list.
+		 */
+		check_thread_timers(tsk, &firing);
+		check_process_timers(tsk, &firing);
 
-	/*
-	 * We must release these locks before taking any timer's lock.
-	 * There is a potential race with timer deletion here, as the
-	 * siglock now protects our private firing list.  We have set
-	 * the firing flag in each timer, so that a deletion attempt
-	 * that gets the timer lock before we do will give it up and
-	 * spin until we've taken care of that timer below.
-	 */
-	spin_unlock(&tsk->sighand->siglock);
+		/*
+		 * We must release these locks before taking any timer's lock.
+		 * There is a potential race with timer deletion here, as the
+		 * siglock now protects our private firing list.  We have set
+		 * the firing flag in each timer, so that a deletion attempt
+		 * that gets the timer lock before we do will give it up and
+		 * spin until we've taken care of that timer below.
+		 */
+		spin_unlock(&tsk->sighand->siglock);
+	}
 	read_unlock(&tasklist_lock);
 
 	/*
-- 
cgit v1.2.3


From ca531a0a5e01e5122f67cb6aca8fcbfc70e18e0b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 24 Oct 2005 14:36:28 +0400
Subject: [PATCH] posix-timers: exit path cleanup

No need to rebalance when task exited

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ccb04683bf18..92a038064628 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -486,6 +486,9 @@ static void process_timer_rebalance(struct task_struct *p,
  	struct task_struct *t = p;
 	unsigned int nthreads = atomic_read(&p->signal->live);
 
+	if (!nthreads)
+		return;
+
 	switch (clock_idx) {
 	default:
 		BUG();
@@ -1160,6 +1163,9 @@ static void check_process_timers(struct task_struct *tsk,
 		unsigned long long sched_left, sched;
 		const unsigned int nthreads = atomic_read(&sig->live);
 
+		if (!nthreads)
+			return;
+
 		prof_left = cputime_sub(prof_expires, utime);
 		prof_left = cputime_sub(prof_left, stime);
 		prof_left = cputime_div(prof_left, nthreads);
-- 
cgit v1.2.3


From a69ac4a78d8bd9e1ec478bd7297d4f047fcd44a8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 24 Oct 2005 18:29:58 +0400
Subject: [PATCH] posix-timers: fix posix_cpu_timer_set() vs
 run_posix_cpu_timers() race

This might be harmless, but looks like a race from code inspection (I
was unable to trigger it).  I must admit, I don't understand why we
can't return TIMER_RETRY after 'spin_unlock(&p->sighand->siglock)'
without doing bump_cpu_timer(), but this is what original code does.

posix_cpu_timer_set:

	read_lock(&tasklist_lock);

	spin_lock(&p->sighand->siglock);
	list_del_init(&timer->it.cpu.entry);
	spin_unlock(&p->sighand->siglock);

We are probaly deleting the timer from run_posix_cpu_timers's 'firing'
local list_head while run_posix_cpu_timers() does list_for_each_safe.

Various bad things can happen, for example we can just delete this timer
so that list_for_each() will not notice it and run_posix_cpu_timers()
will not reset '->firing' flag. In that case,

	....

	if (timer->it.cpu.firing) {
		read_unlock(&tasklist_lock);
		timer->it.cpu.firing = -1;
		return TIMER_RETRY;
	}

sys_timer_settime() goes to 'retry:', calls posix_cpu_timer_set() again,
it returns TIMER_RETRY ...

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 92a038064628..b15462b17a58 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -730,9 +730,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	 * Disarm any old timer after extracting its expiry time.
 	 */
 	BUG_ON(!irqs_disabled());
+
+	ret = 0;
 	spin_lock(&p->sighand->siglock);
 	old_expires = timer->it.cpu.expires;
-	list_del_init(&timer->it.cpu.entry);
+	if (unlikely(timer->it.cpu.firing)) {
+		timer->it.cpu.firing = -1;
+		ret = TIMER_RETRY;
+	} else
+		list_del_init(&timer->it.cpu.entry);
 	spin_unlock(&p->sighand->siglock);
 
 	/*
@@ -780,7 +786,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 		}
 	}
 
-	if (unlikely(timer->it.cpu.firing)) {
+	if (unlikely(ret)) {
 		/*
 		 * We are colliding with the timer actually firing.
 		 * Punt after filling in the timer's old value, and
@@ -788,8 +794,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 		 * it as an overrun (thanks to bump_cpu_timer above).
 		 */
 		read_unlock(&tasklist_lock);
-		timer->it.cpu.firing = -1;
-		ret = TIMER_RETRY;
 		goto out;
 	}
 
-- 
cgit v1.2.3


From bb32051532fed727de0d513a9a578b54c0b7ea5a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 26 Oct 2005 01:59:01 -0700
Subject: [PATCH] export cpu_online_map

With CONFIG_SMP=n:

*** Warning: "cpu_online_map" [drivers/firmware/dcdbas.ko] undefined!

due to set_cpus_allowed().

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1f31a528fdba..1e5cafdf4e27 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3879,6 +3879,7 @@ EXPORT_SYMBOL(cpu_present_map);
 
 #ifndef CONFIG_SMP
 cpumask_t cpu_online_map = CPU_MASK_ALL;
+EXPORT_SYMBOL_GPL(cpu_online_map);
 cpumask_t cpu_possible_map = CPU_MASK_ALL;
 #endif
 
-- 
cgit v1.2.3


From 70ab81c2ed3d1323e7d6805bf59cbb570dff7937 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Wed, 26 Oct 2005 11:23:06 -0700
Subject: posix cpu timers: fix timer ordering

Pointed out by Oleg Nesterov, who has been walking over the code
forwards and backwards.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index b15462b17a58..2f86424fa515 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -576,17 +576,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	listpos = head;
 	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 		list_for_each_entry(next, head, entry) {
-			if (next->expires.sched > nt->expires.sched) {
-				listpos = &next->entry;
+			if (next->expires.sched > nt->expires.sched)
 				break;
-			}
+			listpos = &next->entry;
 		}
 	} else {
 		list_for_each_entry(next, head, entry) {
-			if (cputime_gt(next->expires.cpu, nt->expires.cpu)) {
-				listpos = &next->entry;
+			if (cputime_gt(next->expires.cpu, nt->expires.cpu))
 				break;
-			}
+			listpos = &next->entry;
 		}
 	}
 	list_add(&nt->entry, listpos);
-- 
cgit v1.2.3


From 7a4ed937aa44acdeb8c6ba671509dc7b54b09d3a Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 26 Oct 2005 20:26:53 +0400
Subject: [PATCH] Fix cpu timers expiration time

There's a silly off-by-one error in the code that updates the expiration
of posix CPU timers, causing them to not be properly updated when they
hit exactly on their expiration time (which should be the normal case).

This causes them to then fire immediately again, and only _then_ get
properly updated.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 2f86424fa515..383ba22f0b62 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -91,7 +91,7 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
  * Update expiry time from increment, and increase overrun count,
  * given the current clock sample.
  */
-static inline void bump_cpu_timer(struct k_itimer *timer,
+static void bump_cpu_timer(struct k_itimer *timer,
 				  union cpu_time_count now)
 {
 	int i;
@@ -110,7 +110,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
 		for (i = 0; incr < delta - incr; i++)
 			incr = incr << 1;
 		for (; i >= 0; incr >>= 1, i--) {
-			if (delta <= incr)
+			if (delta < incr)
 				continue;
 			timer->it.cpu.expires.sched += incr;
 			timer->it_overrun += 1 << i;
@@ -128,7 +128,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
 		for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
 			     incr = cputime_add(incr, incr);
 		for (; i >= 0; incr = cputime_halve(incr), i--) {
-			if (cputime_le(delta, incr))
+			if (cputime_lt(delta, incr))
 				continue;
 			timer->it.cpu.expires.cpu =
 				cputime_add(timer->it.cpu.expires.cpu, incr);
-- 
cgit v1.2.3


From a362f463a6d316d14daed0f817e151835ce97ff7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Thu, 27 Oct 2005 09:07:33 -0700
Subject: Revert "remove false BUG_ON() from run_posix_cpu_timers()"

This reverts commit 3de463c7d9d58f8cf3395268230cb20a4c15bffa.

Roland has another patch that allows us to leave the BUG_ON() in place
by just making sure that the condition it tests for really is always
true.

That goes in next.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c             |  8 ++++++++
 kernel/posix-cpu-timers.c | 36 ++++++++++++++++++------------------
 2 files changed, 26 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 4897977a1f4b..3b25b182d2be 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -825,6 +825,14 @@ fastcall NORET_TYPE void do_exit(long code)
 
 	tsk->flags |= PF_EXITING;
 
+	/*
+	 * Make sure we don't try to process any timer firings
+	 * while we are already exiting.
+	 */
+ 	tsk->it_virt_expires = cputime_zero;
+ 	tsk->it_prof_expires = cputime_zero;
+	tsk->it_sched_expires = 0;
+
 	if (unlikely(in_atomic()))
 		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
 				current->comm, current->pid,
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 383ba22f0b62..ea1aca5e7c2b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1293,30 +1293,30 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
 #undef	UNEXPIRED
 
+	BUG_ON(tsk->exit_state);
+
 	/*
 	 * Double-check with locks held.
 	 */
 	read_lock(&tasklist_lock);
-	if (likely(tsk->signal != NULL)) {
-		spin_lock(&tsk->sighand->siglock);
+	spin_lock(&tsk->sighand->siglock);
 
-		/*
-		 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
-		 * all the timers that are firing, and put them on the firing list.
-		 */
-		check_thread_timers(tsk, &firing);
-		check_process_timers(tsk, &firing);
+	/*
+	 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
+	 * all the timers that are firing, and put them on the firing list.
+	 */
+	check_thread_timers(tsk, &firing);
+	check_process_timers(tsk, &firing);
 
-		/*
-		 * We must release these locks before taking any timer's lock.
-		 * There is a potential race with timer deletion here, as the
-		 * siglock now protects our private firing list.  We have set
-		 * the firing flag in each timer, so that a deletion attempt
-		 * that gets the timer lock before we do will give it up and
-		 * spin until we've taken care of that timer below.
-		 */
-		spin_unlock(&tsk->sighand->siglock);
-	}
+	/*
+	 * We must release these locks before taking any timer's lock.
+	 * There is a potential race with timer deletion here, as the
+	 * siglock now protects our private firing list.  We have set
+	 * the firing flag in each timer, so that a deletion attempt
+	 * that gets the timer lock before we do will give it up and
+	 * spin until we've taken care of that timer below.
+	 */
+	spin_unlock(&tsk->sighand->siglock);
 	read_unlock(&tasklist_lock);
 
 	/*
-- 
cgit v1.2.3


From 72ab373a5688a78cbdaf3bf96012e597d5399bb7 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Thu, 27 Oct 2005 03:16:42 -0700
Subject: [PATCH] Yet more posix-cpu-timer fixes

This just makes sure that a thread's expiry times can't get reset after
it clears them in do_exit.

This is what allowed us to re-introduce the stricter BUG_ON() check in
a362f463a6d316d14daed0f817e151835ce97ff7.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index ea1aca5e7c2b..bf374fceb39c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -497,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
 				   nthreads);
 		do {
-			if (!unlikely(t->exit_state)) {
+			if (!unlikely(t->flags & PF_EXITING)) {
 				ticks = cputime_add(prof_ticks(t), left);
 				if (cputime_eq(t->it_prof_expires,
 					       cputime_zero) ||
@@ -512,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
 				   nthreads);
 		do {
-			if (!unlikely(t->exit_state)) {
+			if (!unlikely(t->flags & PF_EXITING)) {
 				ticks = cputime_add(virt_ticks(t), left);
 				if (cputime_eq(t->it_virt_expires,
 					       cputime_zero) ||
@@ -527,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		nsleft = expires.sched - val.sched;
 		do_div(nsleft, nthreads);
 		do {
-			if (!unlikely(t->exit_state)) {
+			if (!unlikely(t->flags & PF_EXITING)) {
 				ns = t->sched_time + nsleft;
 				if (t->it_sched_expires == 0 ||
 				    t->it_sched_expires > ns) {
@@ -566,6 +566,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	struct cpu_timer_list *next;
 	unsigned long i;
 
+	if (CPUCLOCK_PERTHREAD(timer->it_clock)	&& (p->flags & PF_EXITING))
+		return;
+
 	head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
 		p->cpu_timers : p->signal->cpu_timers);
 	head += CPUCLOCK_WHICH(timer->it_clock);
@@ -1204,7 +1207,7 @@ static void check_process_timers(struct task_struct *tsk,
 
 			do {
 				t = next_thread(t);
-			} while (unlikely(t->exit_state));
+			} while (unlikely(t->flags & PF_EXITING));
 		} while (t != tsk);
 	}
 }
-- 
cgit v1.2.3


From 9796fdd829da626374458e8706daedcc0e432ddd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 21 Oct 2005 03:22:03 -0400
Subject: [PATCH] gfp_t: kernel/*

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/audit.h   | 4 ++--
 include/linux/suspend.h | 2 +-
 kernel/audit.c          | 6 +++---
 kernel/auditsc.c        | 2 +-
 kernel/kexec.c          | 7 +++----
 kernel/power/swsusp.c   | 2 +-
 6 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/audit.h b/include/linux/audit.h
index b2a2509bd7ea..da3c01955f3d 100644
--- a/include/linux/audit.h
+++ b/include/linux/audit.h
@@ -260,11 +260,11 @@ extern int audit_filter_user(struct netlink_skb_parms *cb, int type);
 #ifdef CONFIG_AUDIT
 /* These are defined in audit.c */
 				/* Public API */
-extern void		    audit_log(struct audit_context *ctx, int gfp_mask,
+extern void		    audit_log(struct audit_context *ctx, gfp_t gfp_mask,
 				      int type, const char *fmt, ...)
 				      __attribute__((format(printf,4,5)));
 
-extern struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask, int type);
+extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type);
 extern void		    audit_log_format(struct audit_buffer *ab,
 					     const char *fmt, ...)
 			    __attribute__((format(printf,2,3)));
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index ad15a54806d8..ba448c760168 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -71,7 +71,7 @@ void restore_processor_state(void);
 struct saved_context;
 void __save_processor_state(struct saved_context *ctxt);
 void __restore_processor_state(struct saved_context *ctxt);
-extern unsigned long get_usable_page(unsigned gfp_mask);
+extern unsigned long get_usable_page(gfp_t gfp_mask);
 extern void free_eaten_memory(void);
 
 #endif /* _LINUX_SWSUSP_H */
diff --git a/kernel/audit.c b/kernel/audit.c
index aefa73a8a586..0c56320d38dc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -133,7 +133,7 @@ struct audit_buffer {
 	struct list_head     list;
 	struct sk_buff       *skb;	/* formatted skb ready to send */
 	struct audit_context *ctx;	/* NULL or associated context */
-	int		     gfp_mask;
+	gfp_t		     gfp_mask;
 };
 
 static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
@@ -647,7 +647,7 @@ static inline void audit_get_stamp(struct audit_context *ctx,
  * will be written at syscall exit.  If there is no associated task, tsk
  * should be NULL. */
 
-struct audit_buffer *audit_log_start(struct audit_context *ctx, int gfp_mask,
+struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
 				     int type)
 {
 	struct audit_buffer	*ab	= NULL;
@@ -879,7 +879,7 @@ void audit_log_end(struct audit_buffer *ab)
 /* Log an audit record.  This is a convenience function that calls
  * audit_log_start, audit_log_vformat, and audit_log_end.  It may be
  * called in any context. */
-void audit_log(struct audit_context *ctx, int gfp_mask, int type, 
+void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, 
 	       const char *fmt, ...)
 {
 	struct audit_buffer *ab;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 88696f639aab..d8a68509e729 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -803,7 +803,7 @@ static void audit_log_task_info(struct audit_buffer *ab)
 	up_read(&mm->mmap_sem);
 }
 
-static void audit_log_exit(struct audit_context *context, unsigned int gfp_mask)
+static void audit_log_exit(struct audit_context *context, gfp_t gfp_mask)
 {
 	int i;
 	struct audit_buffer *ab;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index cdd4dcd8fb63..36c5d9cd4cc1 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -90,7 +90,7 @@ int kexec_should_crash(struct task_struct *p)
 static int kimage_is_destination_range(struct kimage *image,
 				       unsigned long start, unsigned long end);
 static struct page *kimage_alloc_page(struct kimage *image,
-				       unsigned int gfp_mask,
+				       gfp_t gfp_mask,
 				       unsigned long dest);
 
 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
@@ -326,8 +326,7 @@ static int kimage_is_destination_range(struct kimage *image,
 	return 0;
 }
 
-static struct page *kimage_alloc_pages(unsigned int gfp_mask,
-					unsigned int order)
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	struct page *pages;
 
@@ -654,7 +653,7 @@ static kimage_entry_t *kimage_dst_used(struct kimage *image,
 }
 
 static struct page *kimage_alloc_page(struct kimage *image,
-					unsigned int gfp_mask,
+					gfp_t gfp_mask,
 					unsigned long destination)
 {
 	/*
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 2d5c45676442..10bc5ec496d7 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1095,7 +1095,7 @@ static inline void eat_page(void *page)
 	*eaten_memory = c;
 }
 
-unsigned long get_usable_page(unsigned gfp_mask)
+unsigned long get_usable_page(gfp_t gfp_mask)
 {
 	unsigned long m;
 
-- 
cgit v1.2.3


From 8d027de54c77d38eedc9b331c7a2a39807d34691 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sat, 29 Oct 2005 19:37:40 +0400
Subject: [PATCH] fix ->signal->live leak in copy_process()

exit_signal() (called from copy_process's error path) should decrement
->signal->live, otherwise forking process will miss 'group_dead' in
do_exit().

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index f2b96b08fb44..6904bbbfe116 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -406,6 +406,8 @@ void __exit_signal(struct task_struct *tsk)
 
 void exit_signal(struct task_struct *tsk)
 {
+	atomic_dec(&tsk->signal->live);
+
 	write_lock_irq(&tasklist_lock);
 	__exit_signal(tsk);
 	write_unlock_irq(&tasklist_lock);
-- 
cgit v1.2.3


From 943eae03143790c71cf42fe13529f1b74ceb0266 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ftp.linux.org.uk>
Date: Sat, 29 Oct 2005 07:32:07 +0100
Subject: [PATCH] missing exports of do_settimeofday() variants

frv, sh64, ia64 and sparc64 do not have do_settimeofday() exported (the
last two are using variant in kernel/time.c).  Exports added to match
the rest of architectures.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/frv/kernel/time.c  | 1 +
 arch/sh64/kernel/time.c | 1 +
 kernel/time.c           | 1 +
 3 files changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index 8d6558b00e44..f43b734482e3 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -221,6 +221,7 @@ int do_settimeofday(struct timespec *tv)
 	clock_was_set();
 	return 0;
 }
+EXPORT_SYMBOL(do_settimeofday);
 
 /*
  * Scheduler clock - returns current time in nanosec units.
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c
index f4a62a10053c..43e395a14f49 100644
--- a/arch/sh64/kernel/time.c
+++ b/arch/sh64/kernel/time.c
@@ -253,6 +253,7 @@ int do_settimeofday(struct timespec *tv)
 
 	return 0;
 }
+EXPORT_SYMBOL(do_settimeofday);
 
 static int set_rtc_time(unsigned long nowtime)
 {
diff --git a/kernel/time.c b/kernel/time.c
index 40c2410ac99a..a3c2100470e1 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -532,6 +532,7 @@ int do_settimeofday (struct timespec *tv)
 	clock_was_set();
 	return 0;
 }
+EXPORT_SYMBOL(do_settimeofday);
 
 void do_gettimeofday (struct timeval *tv)
 {
-- 
cgit v1.2.3


From 4b8f573b5db02a3017afbba49026a6aef480174f Mon Sep 17 00:00:00 2001
From: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Date: Sat, 29 Oct 2005 18:15:42 -0700
Subject: [PATCH] TIMERS: add missing compensation for HZ == 250

Add missing compensation for (HZ == 250) != (1 << SHIFT_HZ) in
second_overflow().

Signed-off-by: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 3ba10fa35b60..6a2e5f8dc725 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -752,6 +752,15 @@ static void second_overflow(void)
     else
 	time_adj += (time_adj >> 2) + (time_adj >> 5);
 #endif
+#if HZ == 250
+    /* Compensate for (HZ==250) != (1 << SHIFT_HZ).
+     * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14)
+     */
+    if (time_adj < 0)
+	time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
+    else
+	time_adj += (time_adj >> 6) + (time_adj >> 7);
+#endif
 #if HZ == 1000
     /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
      * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
-- 
cgit v1.2.3


From ab50b8ed818016cfecd747d6d4bb9139986bc029 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:15:56 -0700
Subject: [PATCH] mm: vm_stat_account unshackled

The original vm_stat_account has fallen into disuse, with only one user, and
only one user of vm_stat_unaccount.  It's easier to keep track if we convert
them all to __vm_stat_account, then free it from its __shackles.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ia64/kernel/perfmon.c |  3 ++-
 arch/ia64/mm/fault.c       |  2 +-
 include/linux/mm.h         | 16 ++--------------
 kernel/fork.c              |  2 +-
 mm/mmap.c                  | 20 ++++++++++----------
 mm/mprotect.c              |  4 ++--
 mm/mremap.c                |  4 ++--
 7 files changed, 20 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index d71731ee5b61..f7dfc107cb7b 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2352,7 +2352,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, pfm_context_t *ctx, unsigned lon
 	insert_vm_struct(mm, vma);
 
 	mm->total_vm  += size >> PAGE_SHIFT;
-	vm_stat_account(vma);
+	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
+							vma_pages(vma));
 	up_write(&task->mm->mmap_sem);
 
 	/*
diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
index 3c32af910d60..f21b55549787 100644
--- a/arch/ia64/mm/fault.c
+++ b/arch/ia64/mm/fault.c
@@ -41,7 +41,7 @@ expand_backing_store (struct vm_area_struct *vma, unsigned long address)
 	vma->vm_mm->total_vm += grow;
 	if (vma->vm_flags & VM_LOCKED)
 		vma->vm_mm->locked_vm += grow;
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
+	vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, grow);
 	return 0;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e1649578fb0c..376a466743bc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -928,26 +928,14 @@ int remap_pfn_range(struct vm_area_struct *, unsigned long,
 		unsigned long, unsigned long, pgprot_t);
 
 #ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
+void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
 #else
-static inline void __vm_stat_account(struct mm_struct *mm,
+static inline void vm_stat_account(struct mm_struct *mm,
 			unsigned long flags, struct file *file, long pages)
 {
 }
 #endif /* CONFIG_PROC_FS */
 
-static inline void vm_stat_account(struct vm_area_struct *vma)
-{
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							vma_pages(vma));
-}
-
-static inline void vm_stat_unaccount(struct vm_area_struct *vma)
-{
-	__vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file,
-							-vma_pages(vma));
-}
-
 /* update per process rss and vm hiwater data */
 extern void update_mem_hiwater(struct task_struct *tsk);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 280bd44ac441..e2ff11f8c1b0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -212,7 +212,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 		if (mpnt->vm_flags & VM_DONTCOPY) {
 			long pages = vma_pages(mpnt);
 			mm->total_vm -= pages;
-			__vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
 								-pages);
 			continue;
 		}
diff --git a/mm/mmap.c b/mm/mmap.c
index fa11d91242e8..e1780266ac7d 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -832,7 +832,7 @@ none:
 }
 
 #ifdef CONFIG_PROC_FS
-void __vm_stat_account(struct mm_struct *mm, unsigned long flags,
+void vm_stat_account(struct mm_struct *mm, unsigned long flags,
 						struct file *file, long pages)
 {
 	const unsigned long stack_flags
@@ -1110,7 +1110,7 @@ munmap_back:
 	}
 out:	
 	mm->total_vm += len >> PAGE_SHIFT;
-	__vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
+	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
 		mm->locked_vm += len >> PAGE_SHIFT;
 		make_pages_present(addr, addr + len);
@@ -1475,7 +1475,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
 	mm->total_vm += grow;
 	if (vma->vm_flags & VM_LOCKED)
 		mm->locked_vm += grow;
-	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
+	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
 	return 0;
 }
 
@@ -1610,15 +1610,15 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
  * By the time this function is called, the area struct has been
  * removed from the process mapping list.
  */
-static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *area)
+static void unmap_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 {
-	size_t len = area->vm_end - area->vm_start;
+	long nrpages = vma_pages(vma);
 
-	area->vm_mm->total_vm -= len >> PAGE_SHIFT;
-	if (area->vm_flags & VM_LOCKED)
-		area->vm_mm->locked_vm -= len >> PAGE_SHIFT;
-	vm_stat_unaccount(area);
-	remove_vm_struct(area);
+	mm->total_vm -= nrpages;
+	if (vma->vm_flags & VM_LOCKED)
+		mm->locked_vm -= nrpages;
+	vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
+	remove_vm_struct(vma);
 }
 
 /*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 57577f63b305..b426f01c5e9c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -168,8 +168,8 @@ success:
 	vma->vm_flags = newflags;
 	vma->vm_page_prot = newprot;
 	change_protection(vma, start, end, newprot);
-	__vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
-	__vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
+	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
 
 fail:
diff --git a/mm/mremap.c b/mm/mremap.c
index f343fc73a8bd..55df8f53e84d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -233,7 +233,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	 * since do_munmap() will decrement it by old_len == new_len
 	 */
 	mm->total_vm += new_len >> PAGE_SHIFT;
-	__vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
+	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
 	if (do_munmap(mm, old_addr, old_len) < 0) {
 		/* OOM: unable to split vma, just get accounts right */
@@ -384,7 +384,7 @@ unsigned long do_mremap(unsigned long addr,
 				addr + new_len, vma->vm_pgoff, NULL);
 
 			current->mm->total_vm += pages;
-			__vm_stat_account(vma->vm_mm, vma->vm_flags,
+			vm_stat_account(vma->vm_mm, vma->vm_flags,
 							vma->vm_file, pages);
 			if (vma->vm_flags & VM_LOCKED) {
 				current->mm->locked_vm += pages;
-- 
cgit v1.2.3


From 404351e67a9facb475abf1492245374a28d13e90 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:04 -0700
Subject: [PATCH] mm: mm_init set_mm_counters

How is anon_rss initialized?  In dup_mmap, and by mm_alloc's memset; but
that's not so good if an mm_counter_t is a special type.  And how is rss
initialized?  By set_mm_counter, all over the place.  Come on, we just need to
initialize them both at once by set_mm_counter in mm_init (which follows the
memcpy when forking).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/mips/kernel/irixelf.c          | 1 -
 arch/sparc64/kernel/binfmt_aout32.c | 1 -
 arch/x86_64/ia32/ia32_aout.c        | 1 -
 fs/binfmt_aout.c                    | 1 -
 fs/binfmt_elf.c                     | 1 -
 fs/binfmt_elf_fdpic.c               | 7 -------
 fs/binfmt_flat.c                    | 1 -
 fs/binfmt_som.c                     | 1 -
 kernel/fork.c                       | 4 ++--
 9 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c
index 99262fe64560..7ce34d4aa220 100644
--- a/arch/mips/kernel/irixelf.c
+++ b/arch/mips/kernel/irixelf.c
@@ -697,7 +697,6 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	/* Do this so that we can load the interpreter, if need be.  We will
 	 * change some of these later.
 	 */
-	set_mm_counter(current->mm, rss, 0);
 	setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
 	current->mm->start_stack = bprm->p;
 
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c
index b2854ef221d0..edf52d06b280 100644
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ b/arch/sparc64/kernel/binfmt_aout32.c
@@ -241,7 +241,6 @@ static int load_aout32_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->brk = ex.a_bss +
 		(current->mm->start_brk = N_BSSADDR(ex));
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 3e6780fa0186..93c60f4aa47a 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -314,7 +314,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
 	current->mm->cached_hole_size = 0;
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index dd9baabaf016..72011826f0cb 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -318,7 +318,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->mmap = NULL;
 	compute_creds(bprm);
  	current->flags &= ~PF_FORKNOEXEC;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d4b15576e584..918ccc267e41 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -773,7 +773,6 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	/* Do this so that we can load the interpreter, if need be.  We will
 	   change some of these later */
-	set_mm_counter(current->mm, rss, 0);
 	current->mm->free_area_cache = current->mm->mmap_base;
 	current->mm->cached_hole_size = 0;
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 134c9c0d1f54..dda87c4c82a3 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -294,14 +294,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct pt_regs *regs
 				  &interp_params,
 				  &current->mm->start_stack,
 				  &current->mm->start_brk);
-#endif
-
-	/* do this so that we can load the interpreter, if need be
-	 * - we will change some of these later
-	 */
-	set_mm_counter(current->mm, rss, 0);
 
-#ifdef CONFIG_MMU
 	retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 7974efa107bc..9d6625829b99 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -650,7 +650,6 @@ static int load_flat_file(struct linux_binprm * bprm,
 		current->mm->start_brk = datapos + data_len + bss_len;
 		current->mm->brk = (current->mm->start_brk + 3) & ~3;
 		current->mm->context.end_brk = memp + ksize((void *) memp) - stack_len;
-		set_mm_counter(current->mm, rss, 0);
 	}
 
 	if (flags & FLAT_FLAG_KTRACE)
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 227a2682d2bf..00a91dc25d16 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -259,7 +259,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	create_som_tables(bprm);
 
 	current->mm->start_stack = bprm->p;
-	set_mm_counter(current->mm, rss, 0);
 
 #if 0
 	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
diff --git a/kernel/fork.c b/kernel/fork.c
index e2ff11f8c1b0..25caa02e2eac 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -198,8 +198,6 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
-	set_mm_counter(mm, rss, 0);
-	set_mm_counter(mm, anon_rss, 0);
 	cpus_clear(mm->cpu_vm_mask);
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
@@ -323,6 +321,8 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_waiters = 0;
 	mm->nr_ptes = 0;
+	set_mm_counter(mm, rss, 0);
+	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
 	rwlock_init(&mm->ioctx_list_lock);
 	mm->ioctx_list = NULL;
-- 
cgit v1.2.3


From 4294621f41a85497019fae64341aa5351a1921b7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:05 -0700
Subject: [PATCH] mm: rss = file_rss + anon_rss

I was lazy when we added anon_rss, and chose to change as few places as
possible.  So currently each anonymous page has to be counted twice, in rss
and in anon_rss.  Which won't be so good if those are atomic counts in some
configurations.

Change that around: keep file_rss and anon_rss separately, and add them
together (with get_mm_rss macro) when the total is needed - reading two
atomics is much cheaper than updating two atomics.  And update anon_rss
upfront, typically in memory.c, not tucked away in page_add_anon_rmap.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c             |  2 +-
 fs/proc/array.c       |  2 +-
 fs/proc/task_mmu.c    |  8 +++-----
 include/linux/sched.h |  4 +++-
 kernel/acct.c         |  2 +-
 kernel/fork.c         |  4 ++--
 mm/fremap.c           |  4 ++--
 mm/hugetlb.c          |  6 +++---
 mm/memory.c           | 31 +++++++++++++++++--------------
 mm/nommu.c            |  2 +-
 mm/rmap.c             |  8 +++-----
 mm/swapfile.c         |  2 +-
 12 files changed, 38 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index d2208f7c87db..cefadf5ab83b 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -330,7 +330,7 @@ void install_arg_page(struct vm_area_struct *vma,
 		pte_unmap(pte);
 		goto out;
 	}
-	inc_mm_counter(mm, rss);
+	inc_mm_counter(mm, anon_rss);
 	lru_cache_add_active(page);
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
diff --git a/fs/proc/array.c b/fs/proc/array.c
index d84eecacbeaf..3e1239e4b303 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct task_struct *task, char * buffer, int whole)
 		jiffies_to_clock_t(it_real_value),
 		start_time,
 		vsize,
-		mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */
+		mm ? get_mm_rss(mm) : 0,
 	        rsslim,
 		mm ? mm->start_code : 0,
 		mm ? mm->end_code : 0,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 994612bc72d0..bccee7cf9ccd 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -29,7 +29,7 @@ char *task_mem(struct mm_struct *mm, char *buffer)
 		"VmPTE:\t%8lu kB\n",
 		(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
-		get_mm_counter(mm, rss) << (PAGE_SHIFT-10),
+		get_mm_rss(mm) << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
@@ -44,13 +44,11 @@ unsigned long task_vsize(struct mm_struct *mm)
 int task_statm(struct mm_struct *mm, int *shared, int *text,
 	       int *data, int *resident)
 {
-	int rss = get_mm_counter(mm, rss);
-
-	*shared = rss - get_mm_counter(mm, anon_rss);
+	*shared = get_mm_counter(mm, file_rss);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
 	*data = mm->total_vm - mm->shared_vm;
-	*resident = rss;
+	*resident = *shared + get_mm_counter(mm, anon_rss);
 	return mm->total_vm;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 27519df0f987..afcaac66cbd5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -254,6 +254,8 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 #define add_mm_counter(mm, member, value) (mm)->_##member += (value)
 #define inc_mm_counter(mm, member) (mm)->_##member++
 #define dec_mm_counter(mm, member) (mm)->_##member--
+#define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
+
 typedef unsigned long mm_counter_t;
 
 struct mm_struct {
@@ -286,7 +288,7 @@ struct mm_struct {
 	unsigned long exec_vm, stack_vm, reserved_vm, def_flags, nr_ptes;
 
 	/* Special counters protected by the page_table_lock */
-	mm_counter_t _rss;
+	mm_counter_t _file_rss;
 	mm_counter_t _anon_rss;
 
 	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
diff --git a/kernel/acct.c b/kernel/acct.c
index b756f527497e..2e3f4a47e7d0 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -553,7 +553,7 @@ void acct_update_integrals(struct task_struct *tsk)
 		if (delta == 0)
 			return;
 		tsk->acct_stimexpd = tsk->stime;
-		tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss);
+		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 25caa02e2eac..2048ed7b5872 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -321,7 +321,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_waiters = 0;
 	mm->nr_ptes = 0;
-	set_mm_counter(mm, rss, 0);
+	set_mm_counter(mm, file_rss, 0);
 	set_mm_counter(mm, anon_rss, 0);
 	spin_lock_init(&mm->page_table_lock);
 	rwlock_init(&mm->ioctx_list_lock);
@@ -499,7 +499,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 	if (retval)
 		goto free_pt;
 
-	mm->hiwater_rss = get_mm_counter(mm,rss);
+	mm->hiwater_rss = get_mm_rss(mm);
 	mm->hiwater_vm = mm->total_vm;
 
 good_mm:
diff --git a/mm/fremap.c b/mm/fremap.c
index ab23a0673c35..fd7f2a17ff3e 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -39,7 +39,7 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 					set_page_dirty(page);
 				page_remove_rmap(page);
 				page_cache_release(page);
-				dec_mm_counter(mm, rss);
+				dec_mm_counter(mm, file_rss);
 			}
 		}
 	} else {
@@ -95,7 +95,7 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	zap_pte(mm, vma, addr, pte);
 
-	inc_mm_counter(mm,rss);
+	inc_mm_counter(mm, file_rss);
 	flush_icache_page(vma, page);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 	page_add_file_rmap(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 61d380678030..094455bcbbf7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -286,7 +286,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
-			add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
+			add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
@@ -324,7 +324,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 
 		page = pte_page(pte);
 		put_page(page);
-		add_mm_counter(mm, rss,  - (HPAGE_SIZE / PAGE_SIZE));
+		add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
 	}
 	flush_tlb_range(vma, start, end);
 }
@@ -386,7 +386,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				goto out;
 			}
 		}
-		add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
+		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
 	}
 out:
diff --git a/mm/memory.c b/mm/memory.c
index 51eb38574830..59d42e50fa53 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -397,9 +397,10 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
 	get_page(page);
-	inc_mm_counter(dst_mm, rss);
 	if (PageAnon(page))
 		inc_mm_counter(dst_mm, anon_rss);
+	else
+		inc_mm_counter(dst_mm, file_rss);
 	set_pte_at(dst_mm, addr, dst_pte, pte);
 	page_dup_rmap(page);
 }
@@ -581,8 +582,8 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 					set_page_dirty(page);
 				if (pte_young(ptent))
 					mark_page_accessed(page);
+				dec_mm_counter(tlb->mm, file_rss);
 			}
-			dec_mm_counter(tlb->mm, rss);
 			page_remove_rmap(page);
 			tlb_remove_page(tlb, page);
 			continue;
@@ -1290,13 +1291,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, orig_pte))) {
-		if (PageAnon(old_page))
-			dec_mm_counter(mm, anon_rss);
 		if (PageReserved(old_page))
-			inc_mm_counter(mm, rss);
-		else
+			inc_mm_counter(mm, anon_rss);
+		else {
 			page_remove_rmap(old_page);
-
+			if (!PageAnon(old_page)) {
+				inc_mm_counter(mm, anon_rss);
+				dec_mm_counter(mm, file_rss);
+			}
+		}
 		flush_cache_page(vma, address, pfn);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1701,7 +1704,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	/* The page isn't present yet, go ahead with the fault. */
 
-	inc_mm_counter(mm, rss);
+	inc_mm_counter(mm, anon_rss);
 	pte = mk_pte(page, vma->vm_page_prot);
 	if (write_access && can_share_swap_page(page)) {
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -1774,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			page_cache_release(page);
 			goto unlock;
 		}
-		inc_mm_counter(mm, rss);
+		inc_mm_counter(mm, anon_rss);
 		entry = mk_pte(page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		lru_cache_add_active(page);
@@ -1887,19 +1890,19 @@ retry:
 	 */
 	/* Only go through if we didn't race with anybody else... */
 	if (pte_none(*page_table)) {
-		if (!PageReserved(new_page))
-			inc_mm_counter(mm, rss);
-
 		flush_icache_page(vma, new_page);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
 		if (anon) {
+			inc_mm_counter(mm, anon_rss);
 			lru_cache_add_active(new_page);
 			page_add_anon_rmap(new_page, vma, address);
-		} else
+		} else if (!PageReserved(new_page)) {
+			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+		}
 	} else {
 		/* One of our sibling threads was faster, back out. */
 		page_cache_release(new_page);
@@ -2192,7 +2195,7 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 void update_mem_hiwater(struct task_struct *tsk)
 {
 	if (tsk->mm) {
-		unsigned long rss = get_mm_counter(tsk->mm, rss);
+		unsigned long rss = get_mm_rss(tsk->mm);
 
 		if (tsk->mm->hiwater_rss < rss)
 			tsk->mm->hiwater_rss = rss;
diff --git a/mm/nommu.c b/mm/nommu.c
index 0ef241ae3763..599924886eb5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1083,7 +1083,7 @@ void update_mem_hiwater(struct task_struct *tsk)
 	unsigned long rss;
 
 	if (likely(tsk->mm)) {
-		rss = get_mm_counter(tsk->mm, rss);
+		rss = get_mm_rss(tsk->mm);
 		if (tsk->mm->hiwater_rss < rss)
 			tsk->mm->hiwater_rss = rss;
 		if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
diff --git a/mm/rmap.c b/mm/rmap.c
index 1fc559e09ca8..504757624cce 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -445,8 +445,6 @@ void page_add_anon_rmap(struct page *page,
 {
 	BUG_ON(PageReserved(page));
 
-	inc_mm_counter(vma->vm_mm, anon_rss);
-
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		struct anon_vma *anon_vma = vma->anon_vma;
 
@@ -561,9 +559,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 		set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
 		dec_mm_counter(mm, anon_rss);
-	}
+	} else
+		dec_mm_counter(mm, file_rss);
 
-	dec_mm_counter(mm, rss);
 	page_remove_rmap(page);
 	page_cache_release(page);
 
@@ -667,7 +665,7 @@ static void try_to_unmap_cluster(unsigned long cursor,
 
 		page_remove_rmap(page);
 		page_cache_release(page);
-		dec_mm_counter(mm, rss);
+		dec_mm_counter(mm, file_rss);
 		(*mapcount)--;
 	}
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 05c851291241..296e0bbf7836 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -407,7 +407,7 @@ void free_swap_and_cache(swp_entry_t entry)
 static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
-	inc_mm_counter(vma->vm_mm, rss);
+	inc_mm_counter(vma->vm_mm, anon_rss);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
-- 
cgit v1.2.3


From fd3e42fcc888a773572282575d2fdbf5cfd6216e Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:06 -0700
Subject: [PATCH] mm: dup_mmap use oldmm more

Use the parent's oldmm throughout dup_mmap, instead of perversely going back
to current->mm.  (Can you hear the sigh of relief from those mpnts?  Usually I
squash them, but not today.)

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 2048ed7b5872..0e7fe4a8a8df 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -182,16 +182,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 }
 
 #ifdef CONFIG_MMU
-static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
+static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 {
-	struct vm_area_struct * mpnt, *tmp, **pprev;
+	struct vm_area_struct *mpnt, *tmp, **pprev;
 	struct rb_node **rb_link, *rb_parent;
 	int retval;
 	unsigned long charge;
 	struct mempolicy *pol;
 
 	down_write(&oldmm->mmap_sem);
-	flush_cache_mm(current->mm);
+	flush_cache_mm(oldmm);
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
@@ -204,7 +204,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	rb_parent = NULL;
 	pprev = &mm->mmap;
 
-	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
+	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
 		struct file *file;
 
 		if (mpnt->vm_flags & VM_DONTCOPY) {
@@ -265,7 +265,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 		rb_parent = &tmp->vm_rb;
 
 		mm->map_count++;
-		retval = copy_page_range(mm, current->mm, tmp);
+		retval = copy_page_range(mm, oldmm, tmp);
 		spin_unlock(&mm->page_table_lock);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
@@ -277,7 +277,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	retval = 0;
 
 out:
-	flush_tlb_mm(current->mm);
+	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
 	return retval;
 fail_nomem_policy:
-- 
cgit v1.2.3


From 7ee78232501ea9de2b6c8f10d32c9a0fee541357 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:08 -0700
Subject: [PATCH] mm: dup_mmap down new mmap_sem

One anomaly remains from when Andrea rationalized the responsibilities of
mmap_sem and page_table_lock: in dup_mmap we add vmas to the child holding its
page_table_lock, but not the mmap_sem which normally guards the vma list and
rbtree.  Which could be an issue for unuse_mm: though since it just walks down
the list (today with page_table_lock, tomorrow not), it's probably okay.  Will
need a memory barrier?  Oh, keep it simple, Nick and I agreed, no harm in
taking child's mmap_sem here.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 0e7fe4a8a8df..2a587b3224e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -192,6 +192,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 	down_write(&oldmm->mmap_sem);
 	flush_cache_mm(oldmm);
+	down_write(&mm->mmap_sem);
+
 	mm->locked_vm = 0;
 	mm->mmap = NULL;
 	mm->mmap_cache = NULL;
@@ -251,10 +253,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		}
 
 		/*
-		 * Link in the new vma and copy the page table entries:
-		 * link in first so that swapoff can see swap entries.
-		 * Note that, exceptionally, here the vma is inserted
-		 * without holding mm->mmap_sem.
+		 * Link in the new vma and copy the page table entries.
 		 */
 		spin_lock(&mm->page_table_lock);
 		*pprev = tmp;
@@ -275,8 +274,8 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			goto out;
 	}
 	retval = 0;
-
 out:
+	up_write(&mm->mmap_sem);
 	flush_tlb_mm(oldmm);
 	up_write(&oldmm->mmap_sem);
 	return retval;
-- 
cgit v1.2.3


From b5810039a54e5babf428e9a1e89fc1940fabff11 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Sat, 29 Oct 2005 18:16:12 -0700
Subject: [PATCH] core remove PageReserved

Remove PageReserved() calls from core code by tightening VM_RESERVED
handling in mm/ to cover PageReserved functionality.

PageReserved special casing is removed from get_page and put_page.

All setting and clearing of PageReserved is retained, and it is now flagged
in the page_alloc checks to help ensure we don't introduce any refcount
based freeing of Reserved pages.

MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being
deprecated.  We never completely handled it correctly anyway, and is be
reintroduced in future if required (Hugh has a proof of concept).

Once PageReserved() calls are removed from kernel/power/swsusp.c, and all
arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can
be trivially removed.

Last real user of PageReserved is swsusp, which uses PageReserved to
determine whether a struct page points to valid memory or not.  This still
needs to be addressed (a generic page_is_ram() should work).

A last caveat: the ZERO_PAGE is now refcounted and managed with rmap (and
thus mapcounted and count towards shared rss).  These writes to the struct
page could cause excessive cacheline bouncing on big systems.  There are a
number of ways this could be addressed if it is an issue.

Signed-off-by: Nick Piggin <npiggin@suse.de>

Refcount bug fix for filemap_xip.c

Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/ppc64/kernel/vdso.c  |  12 +++--
 arch/sparc/mm/generic.c   |   3 ++
 arch/sparc64/mm/generic.c |   3 ++
 drivers/scsi/sg.c         |  12 +++--
 drivers/scsi/st.c         |  10 ++--
 fs/direct-io.c            |   4 +-
 include/linux/mm.h        |   5 +-
 kernel/power/swsusp.c     |  25 +++++----
 mm/bootmem.c              |   1 +
 mm/filemap_xip.c          |  11 ++--
 mm/fremap.c               |  23 ++++----
 mm/madvise.c              |   2 +-
 mm/memory.c               | 131 ++++++++++++++++++++++++++++------------------
 mm/mempolicy.c            |  29 +++++-----
 mm/mmap.c                 |  11 ++++
 mm/mprotect.c             |   8 +++
 mm/msync.c                |  17 +++---
 mm/page_alloc.c           |  14 ++---
 mm/rmap.c                 |  14 ++---
 mm/shmem.c                |   4 +-
 mm/swap.c                 |   4 +-
 sound/core/pcm_native.c   |   9 ++--
 22 files changed, 218 insertions(+), 134 deletions(-)

(limited to 'kernel')

diff --git a/arch/ppc64/kernel/vdso.c b/arch/ppc64/kernel/vdso.c
index efa985f05aca..4aacf521e3e4 100644
--- a/arch/ppc64/kernel/vdso.c
+++ b/arch/ppc64/kernel/vdso.c
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
 		return NOPAGE_SIGBUS;
 
 	/*
-	 * Last page is systemcfg, special handling here, no get_page() a
-	 * this is a reserved page
+	 * Last page is systemcfg.
 	 */
 	if ((vma->vm_end - address) <= PAGE_SIZE)
-		return virt_to_page(systemcfg);
+		pg = virt_to_page(systemcfg);
+	else
+		pg = virt_to_page(vbase + offset);
 
-	pg = virt_to_page(vbase + offset);
 	get_page(pg);
 	DBG(" ->page count: %d\n", page_count(pg));
 
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
 	 * gettimeofday will be totally dead. It's fine to use that for setting
 	 * breakpoints in the vDSO code pages though
 	 */
-	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+	vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED;
 	vma->vm_flags |= mm->def_flags;
 	vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
 	vma->vm_ops = &vdso_vmops;
@@ -603,6 +603,8 @@ void __init vdso_init(void)
 		ClearPageReserved(pg);
 		get_page(pg);
 	}
+
+	get_page(virt_to_page(systemcfg));
 }
 
 int in_gate_area_no_task(unsigned long addr)
diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c
index 20ccb957fb77..659c9a71f867 100644
--- a/arch/sparc/mm/generic.c
+++ b/arch/sparc/mm/generic.c
@@ -73,6 +73,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	int space = GET_IOSPACE(pfn);
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+	/* See comment in mm/memory.c remap_pfn_range */
+	vma->vm_flags |= VM_IO | VM_RESERVED;
+
 	prot = __pgprot(pg_iobits);
 	offset -= from;
 	dir = pgd_offset(mm, from);
diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c
index c954d91f01d0..afc01cec701f 100644
--- a/arch/sparc64/mm/generic.c
+++ b/arch/sparc64/mm/generic.c
@@ -127,6 +127,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
 	int space = GET_IOSPACE(pfn);
 	unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
 
+	/* See comment in mm/memory.c remap_pfn_range */
+	vma->vm_flags |= VM_IO | VM_RESERVED;
+
 	prot = __pgprot(pg_iobits);
 	offset -= from;
 	dir = pgd_offset(mm, from);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 861e51375d70..2d30b46806bf 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1886,13 +1886,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		if (dirtied && !PageReserved(sgl[i].page))
-			SetPageDirty(sgl[i].page);
-		/* unlock_page(sgl[i].page); */
+		struct page *page = sgl[i].page;
+
+		/* XXX: just for debug. Remove when PageReserved is removed */
+		BUG_ON(PageReserved(page));
+		if (dirtied)
+			SetPageDirty(page);
+		/* unlock_page(page); */
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(sgl[i].page);
+		page_cache_release(page);
 	}
 
 	return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 5eb54d8019b4..da9766283bd7 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
 	int i;
 
 	for (i=0; i < nr_pages; i++) {
-		if (dirtied && !PageReserved(sgl[i].page))
-			SetPageDirty(sgl[i].page);
+		struct page *page = sgl[i].page;
+
+		/* XXX: just for debug. Remove when PageReserved is removed */
+		BUG_ON(PageReserved(page));
+		if (dirtied)
+			SetPageDirty(page);
 		/* FIXME: cache flush missing for rw==READ
 		 * FIXME: call the correct reference counting function
 		 */
-		page_cache_release(sgl[i].page);
+		page_cache_release(page);
 	}
 
 	return 0;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0d06097bc995..3931e7f1e6bf 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
 	up_read(&current->mm->mmap_sem);
 
 	if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+		struct page *page = ZERO_PAGE(dio->curr_user_address);
 		/*
 		 * A memory fault, but the filesystem has some outstanding
 		 * mapped blocks.  We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
 		 */
 		if (dio->page_errors == 0)
 			dio->page_errors = ret;
-		dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
+		page_cache_get(page);
+		dio->pages[0] = page;
 		dio->head = 0;
 		dio->tail = 1;
 		ret = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0c64484d8ae0..da42093250c3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY	0x00020000      /* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
-#define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
+#define VM_RESERVED	0x00080000	/* Pages managed in a special way */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 #define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
@@ -338,7 +338,7 @@ static inline void get_page(struct page *page)
 
 static inline void put_page(struct page *page)
 {
-	if (!PageReserved(page) && put_page_testzero(page))
+	if (put_page_testzero(page))
 		__page_cache_release(page);
 }
 
@@ -723,6 +723,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 10bc5ec496d7..016504ccfccf 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone)
 			continue;
 		page = pfn_to_page(pfn);
 		/*
-		 * This condition results from rvmalloc() sans vmalloc_32()
-		 * and architectural memory reservations. This should be
-		 * corrected eventually when the cases giving rise to this
-		 * are better understood.
+		 * PageReserved results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations.
+		 *
+		 * rvmalloc should not cause this, because all implementations
+		 * appear to always be using vmalloc_32 on architectures with
+		 * highmem. This is a good thing, because we would like to save
+		 * rvmalloc pages.
+		 *
+		 * It appears to be triggered by pages which do not point to
+		 * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
+		 * which sets PageReserved if the page does not point to valid
+		 * RAM.
+		 *
+		 * XXX: must remove usage of PageReserved!
 		 */
-		if (PageReserved(page)) {
-			printk("highmem reserved page?!\n");
+		if (PageReserved(page))
 			continue;
-		}
 		BUG_ON(PageNosave(page));
 		if (PageNosaveFree(page))
 			continue;
@@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn)
 		return 0;
 
 	page = pfn_to_page(pfn);
-	BUG_ON(PageReserved(page) && PageNosave(page));
 	if (PageNosave(page))
 		return 0;
-	if (PageReserved(page) && pfn_is_nosave(pfn)) {
+	if (pfn_is_nosave(pfn)) {
 		pr_debug("[nosave pfn 0x%lx]", pfn);
 		return 0;
 	}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index a58699b6579e..e8c567177dcf 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
 				if (j + 16 < BITS_PER_LONG)
 					prefetchw(page + j + 16);
 				__ClearPageReserved(page + j);
+				set_page_count(page + j, 0);
 			}
 			__free_pages(page, order);
 			i += BITS_PER_LONG;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 8c199f537732..9354ee279b13 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapping,
 	unsigned long address;
 	pte_t *pte;
 	pte_t pteval;
+	struct page *page = ZERO_PAGE(address);
 
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -185,15 +186,17 @@ __xip_unmap (struct address_space * mapping,
 		 * We need the page_table_lock to protect us from page faults,
 		 * munmap, fork, etc...
 		 */
-		pte = page_check_address(ZERO_PAGE(address), mm,
-					 address);
+		pte = page_check_address(page, mm, address);
 		if (!IS_ERR(pte)) {
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush(vma, address, pte);
+			page_remove_rmap(page);
+			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap(pte);
 			spin_unlock(&mm->page_table_lock);
+			page_cache_release(page);
 		}
 	}
 	spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +231,7 @@ xip_file_nopage(struct vm_area_struct * area,
 
 	page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
 	if (!IS_ERR(page)) {
-		return page;
+		goto out;
 	}
 	if (PTR_ERR(page) != -ENODATA)
 		return NULL;
@@ -249,6 +252,8 @@ xip_file_nopage(struct vm_area_struct * area,
 		page = ZERO_PAGE(address);
 	}
 
+out:
+	page_cache_get(page);
 	return page;
 }
 
diff --git a/mm/fremap.c b/mm/fremap.c
index fd7f2a17ff3e..224cc1598b35 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -29,19 +29,20 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		return;
 	if (pte_present(pte)) {
 		unsigned long pfn = pte_pfn(pte);
+		struct page *page;
 
 		flush_cache_page(vma, addr, pfn);
 		pte = ptep_clear_flush(vma, addr, ptep);
-		if (pfn_valid(pfn)) {
-			struct page *page = pfn_to_page(pfn);
-			if (!PageReserved(page)) {
-				if (pte_dirty(pte))
-					set_page_dirty(page);
-				page_remove_rmap(page);
-				page_cache_release(page);
-				dec_mm_counter(mm, file_rss);
-			}
+		if (unlikely(!pfn_valid(pfn))) {
+			print_bad_pte(vma, pte, addr);
+			return;
 		}
+		page = pfn_to_page(pfn);
+		if (pte_dirty(pte))
+			set_page_dirty(page);
+		page_remove_rmap(page);
+		page_cache_release(page);
+		dec_mm_counter(mm, file_rss);
 	} else {
 		if (!pte_file(pte))
 			free_swap_and_cache(pte_to_swp_entry(pte));
@@ -65,6 +66,8 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgd_t *pgd;
 	pte_t pte_val;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	pgd = pgd_offset(mm, addr);
 	spin_lock(&mm->page_table_lock);
 	
@@ -125,6 +128,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgd_t *pgd;
 	pte_t pte_val;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	pgd = pgd_offset(mm, addr);
 	spin_lock(&mm->page_table_lock);
 	
diff --git a/mm/madvise.c b/mm/madvise.c
index 20e075d1c64c..17aaf3e16449 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 			     unsigned long start, unsigned long end)
 {
 	*prev = vma;
-	if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
 		return -EINVAL;
 
 	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c
index da642b5528fa..e83f9440bb66 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -342,6 +342,23 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 
 #define NO_RSS 2	/* Increment neither file_rss nor anon_rss */
 
+/*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error.
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+	printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+			"vm_flags = %lx, vaddr = %lx\n",
+		(long long)pte_val(pte),
+		(vma->vm_mm == current->mm ? current->comm : "???"),
+		vma->vm_flags, vaddr);
+	dump_stack();
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -353,9 +370,10 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 
 static inline int
 copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-		pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
+		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
 		unsigned long addr)
 {
+	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 	unsigned long pfn;
@@ -375,18 +393,22 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}
 
+	/* If the region is VM_RESERVED, the mapping is not
+	 * mapped via rmap - duplicate the pte as is.
+	 */
+	if (vm_flags & VM_RESERVED)
+		goto out_set_pte;
+
 	pfn = pte_pfn(pte);
-	/* the pte points outside of valid memory, the
-	 * mapping is assumed to be good, meaningful
-	 * and not mapped via rmap - duplicate the
-	 * mapping as is.
+	/* If the pte points outside of valid memory but
+	 * the region is not VM_RESERVED, we have a problem.
 	 */
-	page = NULL;
-	if (pfn_valid(pfn))
-		page = pfn_to_page(pfn);
+	if (unlikely(!pfn_valid(pfn))) {
+		print_bad_pte(vma, pte, addr);
+		goto out_set_pte; /* try to do something sane */
+	}
 
-	if (!page || PageReserved(page))
-		goto out_set_pte;
+	page = pfn_to_page(pfn);
 
 	/*
 	 * If it's a COW mapping, write protect it both
@@ -418,7 +440,6 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		unsigned long addr, unsigned long end)
 {
 	pte_t *src_pte, *dst_pte;
-	unsigned long vm_flags = vma->vm_flags;
 	int progress = 0;
 	int rss[NO_RSS+1], anon;
 
@@ -446,8 +467,7 @@ again:
 			progress++;
 			continue;
 		}
-		anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
-							vm_flags, addr);
+		anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr);
 		rss[anon]++;
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -541,10 +561,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return 0;
 }
 
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
+	struct mm_struct *mm = tlb->mm;
 	pte_t *pte;
 	int file_rss = 0;
 	int anon_rss = 0;
@@ -556,11 +578,12 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			continue;
 		if (pte_present(ptent)) {
 			struct page *page = NULL;
-			unsigned long pfn = pte_pfn(ptent);
-			if (pfn_valid(pfn)) {
-				page = pfn_to_page(pfn);
-				if (PageReserved(page))
-					page = NULL;
+			if (!(vma->vm_flags & VM_RESERVED)) {
+				unsigned long pfn = pte_pfn(ptent);
+				if (unlikely(!pfn_valid(pfn)))
+					print_bad_pte(vma, ptent, addr);
+				else
+					page = pfn_to_page(pfn);
 			}
 			if (unlikely(details) && page) {
 				/*
@@ -580,7 +603,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 				     page->index > details->last_index))
 					continue;
 			}
-			ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
@@ -588,7 +611,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			if (unlikely(details) && details->nonlinear_vma
 			    && linear_page_index(details->nonlinear_vma,
 						addr) != page->index)
-				set_pte_at(tlb->mm, addr, pte,
+				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
 			if (PageAnon(page))
 				anon_rss++;
@@ -611,14 +634,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 			continue;
 		if (!pte_file(ptent))
 			free_swap_and_cache(pte_to_swp_entry(ptent));
-		pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
+		pte_clear_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
-	add_mm_rss(tlb->mm, -file_rss, -anon_rss);
+	add_mm_rss(mm, -file_rss, -anon_rss);
 	pte_unmap(pte - 1);
 }
 
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pud_t *pud,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -630,11 +654,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		zap_pte_range(tlb, pmd, addr, next, details);
+		zap_pte_range(tlb, vma, pmd, addr, next, details);
 	} while (pmd++, addr = next, addr != end);
 }
 
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb,
+				struct vm_area_struct *vma, pgd_t *pgd,
 				unsigned long addr, unsigned long end,
 				struct zap_details *details)
 {
@@ -646,7 +671,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		zap_pmd_range(tlb, pud, addr, next, details);
+		zap_pmd_range(tlb, vma, pud, addr, next, details);
 	} while (pud++, addr = next, addr != end);
 }
 
@@ -667,7 +692,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		zap_pud_range(tlb, pgd, addr, next, details);
+		zap_pud_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
 }
@@ -967,7 +992,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			continue;
 		}
 
-		if (!vma || (vma->vm_flags & VM_IO)
+		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
 				|| !(flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
@@ -1027,8 +1052,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				if (!PageReserved(page))
-					page_cache_get(page);
+				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1051,7 +1075,11 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	if (!pte)
 		return -ENOMEM;
 	do {
-		pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
+		struct page *page = ZERO_PAGE(addr);
+		pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+		page_cache_get(page);
+		page_add_file_rmap(page);
+		inc_mm_counter(mm, file_rss);
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1132,8 +1160,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		return -ENOMEM;
 	do {
 		BUG_ON(!pte_none(*pte));
-		if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
-			set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(pte - 1);
@@ -1195,8 +1222,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * rest of the world about it:
 	 *   VM_IO tells people not to look at these pages
 	 *	(accesses can have side effects).
-	 *   VM_RESERVED tells swapout not to try to touch
-	 *	this region.
+	 *   VM_RESERVED tells the core MM not to "manage" these pages
+         *	(e.g. refcount, mapcount, try to swap them out).
 	 */
 	vma->vm_flags |= VM_IO | VM_RESERVED;
 
@@ -1256,11 +1283,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;
 
+	BUG_ON(vma->vm_flags & VM_RESERVED);
+
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		pte_ERROR(orig_pte);
+		print_bad_pte(vma, orig_pte, address);
 		ret = VM_FAULT_OOM;
 		goto unlock;
 	}
@@ -1284,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/*
 	 * Ok, we need to copy. Oh, well..
 	 */
-	if (!PageReserved(old_page))
-		page_cache_get(old_page);
+	page_cache_get(old_page);
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
@@ -1308,14 +1336,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, orig_pte))) {
-		if (PageReserved(old_page))
+		page_remove_rmap(old_page);
+		if (!PageAnon(old_page)) {
 			inc_mm_counter(mm, anon_rss);
-		else {
-			page_remove_rmap(old_page);
-			if (!PageAnon(old_page)) {
-				inc_mm_counter(mm, anon_rss);
-				dec_mm_counter(mm, file_rss);
-			}
+			dec_mm_counter(mm, file_rss);
 		}
 		flush_cache_page(vma, address, pfn);
 		entry = mk_pte(new_page, vma->vm_page_prot);
@@ -1769,14 +1793,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		int write_access)
 {
+	struct page *page = ZERO_PAGE(addr);
 	pte_t entry;
 
 	/* Mapping of ZERO_PAGE - vm_page_prot is readonly */
-	entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot);
+	entry = mk_pte(page, vma->vm_page_prot);
 
 	if (write_access) {
-		struct page *page;
-
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
 		spin_unlock(&mm->page_table_lock);
@@ -1800,6 +1823,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		lru_cache_add_active(page);
 		SetPageReferenced(page);
 		page_add_anon_rmap(page, vma, address);
+	} else {
+		inc_mm_counter(mm, file_rss);
+		page_add_file_rmap(page);
+		page_cache_get(page);
 	}
 
 	set_pte_at(mm, address, page_table, entry);
@@ -1916,7 +1943,7 @@ retry:
 			inc_mm_counter(mm, anon_rss);
 			lru_cache_add_active(new_page);
 			page_add_anon_rmap(new_page, vma, address);
-		} else if (!PageReserved(new_page)) {
+		} else if (!(vma->vm_flags & VM_RESERVED)) {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
 		}
@@ -1957,7 +1984,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/*
 		 * Page table corrupted: show pte and kill process.
 		 */
-		pte_ERROR(orig_pte);
+		print_bad_pte(vma, orig_pte, address);
 		return VM_FAULT_OOM;
 	}
 	/* We can then assume vm->vm_ops && vma->vm_ops->populate */
@@ -2232,7 +2259,7 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = 0;
+	gate_vma.vm_flags = VM_RESERVED;
 	return 0;
 }
 __initcall(gate_vma_init);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 43b1199af591..11d824f282f1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -223,13 +223,13 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 }
 
 /* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pte_t *orig_pte;
 	pte_t *pte;
 
-	spin_lock(&mm->page_table_lock);
+	spin_lock(&vma->vm_mm->page_table_lock);
 	orig_pte = pte = pte_offset_map(pmd, addr);
 	do {
 		unsigned long pfn;
@@ -238,18 +238,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		if (!pte_present(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (!pfn_valid(pfn)) {
+			print_bad_pte(vma, *pte, addr);
 			continue;
+		}
 		nid = pfn_to_nid(pfn);
 		if (!node_isset(nid, *nodes))
 			break;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(orig_pte);
-	spin_unlock(&mm->page_table_lock);
+	spin_unlock(&vma->vm_mm->page_table_lock);
 	return addr != end;
 }
 
-static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pmd_t *pmd;
@@ -260,13 +262,13 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (check_pte_range(mm, pmd, addr, next, nodes))
+		if (check_pte_range(vma, pmd, addr, next, nodes))
 			return -EIO;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pud_t *pud;
@@ -277,24 +279,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (check_pmd_range(mm, pud, addr, next, nodes))
+		if (check_pmd_range(vma, pud, addr, next, nodes))
 			return -EIO;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
 
-static inline int check_pgd_range(struct mm_struct *mm,
+static inline int check_pgd_range(struct vm_area_struct *vma,
 		unsigned long addr, unsigned long end, nodemask_t *nodes)
 {
 	pgd_t *pgd;
 	unsigned long next;
 
-	pgd = pgd_offset(mm, addr);
+	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (check_pud_range(mm, pgd, addr, next, nodes))
+		if (check_pud_range(vma, pgd, addr, next, nodes))
 			return -EIO;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
@@ -311,6 +313,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	first = find_vma(mm, start);
 	if (!first)
 		return ERR_PTR(-EFAULT);
+	if (first->vm_flags & VM_RESERVED)
+		return ERR_PTR(-EACCES);
 	prev = NULL;
 	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 		if (!vma->vm_next && vma->vm_end < end)
@@ -323,8 +327,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 				endvma = end;
 			if (vma->vm_start > start)
 				start = vma->vm_start;
-			err = check_pgd_range(vma->vm_mm,
-					   start, endvma, nodes);
+			err = check_pgd_range(vma, start, endvma, nodes);
 			if (err) {
 				first = ERR_PTR(err);
 				break;
diff --git a/mm/mmap.c b/mm/mmap.c
index 459b9f068ad7..8a111792b8db 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1088,6 +1088,17 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
+		if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+						== (VM_WRITE | VM_RESERVED)) {
+			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+				"PROT_WRITE mmap of VM_RESERVED memory, which "
+				"is deprecated. Please report this to "
+				"linux-kernel@vger.kernel.org\n",current->comm);
+			if (vma->vm_ops && vma->vm_ops->close)
+				vma->vm_ops->close(vma);
+			error = -EACCES;
+			goto unmap_and_free_vma;
+		}
 	} else if (vm_flags & VM_SHARED) {
 		error = shmem_zero_setup(vma);
 		if (error)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b426f01c5e9c..672a76fddd5e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -125,6 +125,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 * a MAP_NORESERVE private mapping to writable will now reserve.
 	 */
 	if (newflags & VM_WRITE) {
+		if (oldflags & VM_RESERVED) {
+			BUG_ON(oldflags & VM_WRITE);
+			printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+				"PROT_WRITE mprotect of VM_RESERVED memory, "
+				"which is deprecated. Please report this to "
+				"linux-kernel@vger.kernel.org\n",current->comm);
+			return -EACCES;
+		}
 		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
diff --git a/mm/msync.c b/mm/msync.c
index 3b5f1c521d4b..860395486060 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -25,6 +25,7 @@
 static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	pte_t *pte;
 	int progress = 0;
 
@@ -37,7 +38,7 @@ again:
 		if (progress >= 64) {
 			progress = 0;
 			if (need_resched() ||
-			    need_lockbreak(&vma->vm_mm->page_table_lock))
+			    need_lockbreak(&mm->page_table_lock))
 				break;
 		}
 		progress++;
@@ -46,11 +47,11 @@ again:
 		if (!pte_maybe_dirty(*pte))
 			continue;
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (unlikely(!pfn_valid(pfn))) {
+			print_bad_pte(vma, *pte, addr);
 			continue;
+		}
 		page = pfn_to_page(pfn);
-		if (PageReserved(page))
-			continue;
 
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
 		    page_test_and_clear_dirty(page))
@@ -58,7 +59,7 @@ again:
 		progress += 3;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap(pte - 1);
-	cond_resched_lock(&vma->vm_mm->page_table_lock);
+	cond_resched_lock(&mm->page_table_lock);
 	if (addr != end)
 		goto again;
 }
@@ -102,8 +103,10 @@ static void msync_page_range(struct vm_area_struct *vma,
 
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
-	 * to do anything more on an msync() */
-	if (is_vm_hugetlb_page(vma))
+	 * to do anything more on an msync().
+	 * Can't do anything with VM_RESERVED regions either.
+	 */
+	if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
 		return;
 
 	BUG_ON(addr >= end);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 60663232fbb2..0541288ebf4b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,7 +114,8 @@ static void bad_page(const char *function, struct page *page)
 			1 << PG_reclaim |
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback);
+			1 << PG_writeback |
+			1 << PG_reserved );
 	set_page_count(page, 0);
 	reset_page_mapcount(page);
 	page->mapping = NULL;
@@ -244,7 +245,6 @@ static inline int page_is_buddy(struct page *page, int order)
 {
        if (PagePrivate(page)           &&
            (page_order(page) == order) &&
-           !PageReserved(page)         &&
             page_count(page) == 0)
                return 1;
        return 0;
@@ -327,7 +327,8 @@ static inline void free_pages_check(const char *function, struct page *page)
 			1 << PG_reclaim	|
 			1 << PG_slab	|
 			1 << PG_swapcache |
-			1 << PG_writeback )))
+			1 << PG_writeback |
+			1 << PG_reserved )))
 		bad_page(function, page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
@@ -455,7 +456,8 @@ static void prep_new_page(struct page *page, int order)
 			1 << PG_reclaim	|
 			1 << PG_slab    |
 			1 << PG_swapcache |
-			1 << PG_writeback )))
+			1 << PG_writeback |
+			1 << PG_reserved )))
 		bad_page(__FUNCTION__, page);
 
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
@@ -1016,7 +1018,7 @@ void __pagevec_free(struct pagevec *pvec)
 
 fastcall void __free_pages(struct page *page, unsigned int order)
 {
-	if (!PageReserved(page) && put_page_testzero(page)) {
+	if (put_page_testzero(page)) {
 		if (order == 0)
 			free_hot_page(page);
 		else
@@ -1674,7 +1676,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
-		set_page_count(page, 0);
+		set_page_count(page, 1);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
 		INIT_LIST_HEAD(&page->lru);
diff --git a/mm/rmap.c b/mm/rmap.c
index 504757624cce..f69d5342ce7f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -443,8 +443,6 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
 void page_add_anon_rmap(struct page *page,
 	struct vm_area_struct *vma, unsigned long address)
 {
-	BUG_ON(PageReserved(page));
-
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		struct anon_vma *anon_vma = vma->anon_vma;
 
@@ -468,8 +466,7 @@ void page_add_anon_rmap(struct page *page,
 void page_add_file_rmap(struct page *page)
 {
 	BUG_ON(PageAnon(page));
-	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
-		return;
+	BUG_ON(!pfn_valid(page_to_pfn(page)));
 
 	if (atomic_inc_and_test(&page->_mapcount))
 		inc_page_state(nr_mapped);
@@ -483,8 +480,6 @@ void page_add_file_rmap(struct page *page)
  */
 void page_remove_rmap(struct page *page)
 {
-	BUG_ON(PageReserved(page));
-
 	if (atomic_add_negative(-1, &page->_mapcount)) {
 		BUG_ON(page_mapcount(page) < 0);
 		/*
@@ -640,13 +635,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
 			continue;
 
 		pfn = pte_pfn(*pte);
-		if (!pfn_valid(pfn))
+		if (unlikely(!pfn_valid(pfn))) {
+			print_bad_pte(vma, *pte, address);
 			continue;
+		}
 
 		page = pfn_to_page(pfn);
 		BUG_ON(PageAnon(page));
-		if (PageReserved(page))
-			continue;
 
 		if (ptep_clear_flush_young(vma, address, pte))
 			continue;
@@ -808,7 +803,6 @@ int try_to_unmap(struct page *page)
 {
 	int ret;
 
-	BUG_ON(PageReserved(page));
 	BUG_ON(!PageLocked(page));
 
 	if (PageAnon(page))
diff --git a/mm/shmem.c b/mm/shmem.c
index 6796311a23ef..37777f4c11f8 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1506,8 +1506,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 			 */
 			if (!offset)
 				mark_page_accessed(page);
-		} else
+		} else {
 			page = ZERO_PAGE(0);
+			page_cache_get(page);
+		}
 
 		/*
 		 * Ok, we have the page, and it's up-to-date, so
diff --git a/mm/swap.c b/mm/swap.c
index 7771d2803f62..21d15f99805c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -48,7 +48,7 @@ void put_page(struct page *page)
 		}
 		return;
 	}
-	if (!PageReserved(page) && put_page_testzero(page))
+	if (put_page_testzero(page))
 		__page_cache_release(page);
 }
 EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
 		struct page *page = pages[i];
 		struct zone *pagezone;
 
-		if (PageReserved(page) || !put_page_testzero(page))
+		if (!put_page_testzero(page))
 			continue;
 
 		pagezone = page_zone(page);
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 67abebabf83e..e97b2d162cc7 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns
 		return NOPAGE_OOM;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->status);
-	if (!PageReserved(page))
-		get_page(page);
+	get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
@@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un
 		return NOPAGE_OOM;
 	runtime = substream->runtime;
 	page = virt_to_page(runtime->control);
-	if (!PageReserved(page))
-		get_page(page);
+	get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
@@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign
 		vaddr = runtime->dma_area + offset;
 		page = virt_to_page(vaddr);
 	}
-	if (!PageReserved(page))
-		get_page(page);
+	get_page(page);
 	if (type)
 		*type = VM_FAULT_MINOR;
 	return page;
-- 
cgit v1.2.3


From 365e9c87a982c03d0af3886e29d877f581b59611 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:18 -0700
Subject: [PATCH] mm: update_hiwaters just in time

update_mem_hiwater has attracted various criticisms, in particular from those
concerned with mm scalability.  Originally it was called whenever rss or
total_vm got raised.  Then many of those callsites were replaced by a timer
tick call from account_system_time.  Now Frank van Maarseveen reports that to
be found inadequate.  How about this?  Works for Frank.

Replace update_mem_hiwater, a poor combination of two unrelated ops, by macros
update_hiwater_rss and update_hiwater_vm.  Don't attempt to keep
mm->hiwater_rss up to date at timer tick, nor every time we raise rss (usually
by 1): those are hot paths.  Do the opposite, update only when about to lower
rss (usually by many), or just before final accounting in do_exit.  Handle
mm->hiwater_vm in the same way, though it's much less of an issue.  Demand
that whoever collects these hiwater statistics do the work of taking the
maximum with rss or total_vm.

And there has been no collector of these hiwater statistics in the tree.  The
new convention needs an example, so match Frank's usage by adding a VmPeak
line above VmSize to /proc/<pid>/status, and also a VmHWM line above VmRSS
(High-Water-Mark or High-Water-Memory).

There was a particular anomaly during mremap move, that hiwater_vm might be
captured too high.  A fleeting such anomaly remains, but it's quickly
corrected now, whereas before it would stick.

What locking?  None: if the app is racy then these statistics will be racy,
it's not worth any overhead to make them exact.  But whenever it suits,
hiwater_vm is updated under exclusive mmap_sem, and hiwater_rss under
page_table_lock (for now) or with preemption disabled (later on): without
going to any trouble, minimize the time between reading current values and
updating, to minimize those occasions when a racing thread bumps a count up
and back down in between.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/compat.c           |  1 -
 fs/exec.c             |  1 -
 fs/proc/task_mmu.c    | 23 +++++++++++++++++++++--
 include/linux/mm.h    |  3 ---
 include/linux/sched.h | 10 ++++++++++
 kernel/exit.c         |  5 ++++-
 kernel/sched.c        |  2 --
 mm/fremap.c           |  4 +++-
 mm/hugetlb.c          |  3 +++
 mm/memory.c           | 17 +----------------
 mm/mmap.c             |  4 ++++
 mm/mremap.c           | 12 ++++++++++--
 mm/nommu.c            | 15 ++-------------
 mm/rmap.c             |  6 ++++++
 14 files changed, 64 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/fs/compat.c b/fs/compat.c
index a719e158e002..8e71cdbecc7c 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1490,7 +1490,6 @@ int compat_do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
-		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
diff --git a/fs/exec.c b/fs/exec.c
index cefadf5ab83b..9bb55c8cf224 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1207,7 +1207,6 @@ int do_execve(char * filename,
 		/* execve success */
 		security_bprm_free(bprm);
 		acct_update_integrals(current);
-		update_mem_hiwater(current);
 		kfree(bprm);
 		return retval;
 	}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index bccee7cf9ccd..7c89b4549049 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,22 +14,41 @@
 char *task_mem(struct mm_struct *mm, char *buffer)
 {
 	unsigned long data, text, lib;
+	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
+
+	/*
+	 * Note: to minimize their overhead, mm maintains hiwater_vm and
+	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
+	 * collector of these hiwater stats must therefore get total_vm
+	 * and rss too, which will usually be the higher.  Barriers? not
+	 * worth the effort, such snapshots can always be inconsistent.
+	 */
+	hiwater_vm = total_vm = mm->total_vm;
+	if (hiwater_vm < mm->hiwater_vm)
+		hiwater_vm = mm->hiwater_vm;
+	hiwater_rss = total_rss = get_mm_rss(mm);
+	if (hiwater_rss < mm->hiwater_rss)
+		hiwater_rss = mm->hiwater_rss;
 
 	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	buffer += sprintf(buffer,
+		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
 		"VmLck:\t%8lu kB\n"
+		"VmHWM:\t%8lu kB\n"
 		"VmRSS:\t%8lu kB\n"
 		"VmData:\t%8lu kB\n"
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n",
-		(mm->total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
+		hiwater_vm << (PAGE_SHIFT-10),
+		(total_vm - mm->reserved_vm) << (PAGE_SHIFT-10),
 		mm->locked_vm << (PAGE_SHIFT-10),
-		get_mm_rss(mm) << (PAGE_SHIFT-10),
+		hiwater_rss << (PAGE_SHIFT-10),
+		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		(PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index da42093250c3..7d4552fe0864 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -938,9 +938,6 @@ static inline void vm_stat_account(struct mm_struct *mm,
 }
 #endif /* CONFIG_PROC_FS */
 
-/* update per process rss and vm hiwater data */
-extern void update_mem_hiwater(struct task_struct *tsk);
-
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index afcaac66cbd5..a9c0b7d26303 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -256,6 +256,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
 #define dec_mm_counter(mm, member) (mm)->_##member--
 #define get_mm_rss(mm) ((mm)->_file_rss + (mm)->_anon_rss)
 
+#define update_hiwater_rss(mm)	do {			\
+	unsigned long _rss = get_mm_rss(mm);		\
+	if ((mm)->hiwater_rss < _rss)			\
+		(mm)->hiwater_rss = _rss;		\
+} while (0)
+#define update_hiwater_vm(mm)	do {			\
+	if ((mm)->hiwater_vm < (mm)->total_vm)		\
+		(mm)->hiwater_vm = (mm)->total_vm;	\
+} while (0)
+
 typedef unsigned long mm_counter_t;
 
 struct mm_struct {
diff --git a/kernel/exit.c b/kernel/exit.c
index 3b25b182d2be..79f52b85d6ed 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -839,7 +839,10 @@ fastcall NORET_TYPE void do_exit(long code)
 				preempt_count());
 
 	acct_update_integrals(tsk);
-	update_mem_hiwater(tsk);
+	if (tsk->mm) {
+		update_hiwater_rss(tsk->mm);
+		update_hiwater_vm(tsk->mm);
+	}
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
  		del_timer_sync(&tsk->signal->real_timer);
diff --git a/kernel/sched.c b/kernel/sched.c
index 1e5cafdf4e27..4f26c544d02c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2511,8 +2511,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 		cpustat->idle = cputime64_add(cpustat->idle, tmp);
 	/* Account for system time used */
 	acct_update_integrals(p);
-	/* Update rss highwater mark */
-	update_mem_hiwater(p);
 }
 
 /*
diff --git a/mm/fremap.c b/mm/fremap.c
index 7f08d10ceaff..49719a35769a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -143,8 +143,10 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!pte)
 		goto err_unlock;
 
-	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte))
+	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
+		update_hiwater_rss(mm);
 		dec_mm_counter(mm, file_rss);
+	}
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
 	pte_val = *pte;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 094455bcbbf7..ac5f044bf514 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -310,6 +310,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	BUG_ON(start & ~HPAGE_MASK);
 	BUG_ON(end & ~HPAGE_MASK);
 
+	/* Update high watermark before we lower rss */
+	update_hiwater_rss(mm);
+
 	for (address = start; address < end; address += HPAGE_SIZE) {
 		ptep = huge_pte_offset(mm, address);
 		if (! ptep)
diff --git a/mm/memory.c b/mm/memory.c
index a25ee1d3e20a..692ad810263d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -820,6 +820,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 	lru_add_drain();
 	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
+	update_hiwater_rss(mm);
 	end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
 	tlb_finish_mmu(tlb, address, end);
 	spin_unlock(&mm->page_table_lock);
@@ -2225,22 +2226,6 @@ unsigned long vmalloc_to_pfn(void * vmalloc_addr)
 
 EXPORT_SYMBOL(vmalloc_to_pfn);
 
-/*
- * update_mem_hiwater
- *	- update per process rss and vm high water data
- */
-void update_mem_hiwater(struct task_struct *tsk)
-{
-	if (tsk->mm) {
-		unsigned long rss = get_mm_rss(tsk->mm);
-
-		if (tsk->mm->hiwater_rss < rss)
-			tsk->mm->hiwater_rss = rss;
-		if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
-			tsk->mm->hiwater_vm = tsk->mm->total_vm;
-	}
-}
-
 #if !defined(__HAVE_ARCH_GATE_AREA)
 
 #if defined(AT_SYSINFO_EHDR)
diff --git a/mm/mmap.c b/mm/mmap.c
index 8a111792b8db..c43b28457007 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1640,6 +1640,8 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
  */
 static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 {
+	/* Update high watermark before we lower total_vm */
+	update_hiwater_vm(mm);
 	do {
 		long nrpages = vma_pages(vma);
 
@@ -1668,6 +1670,7 @@ static void unmap_region(struct mm_struct *mm,
 	lru_add_drain();
 	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
+	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(&tlb, vma, prev? prev->vm_end: FIRST_USER_ADDRESS,
@@ -1953,6 +1956,7 @@ void exit_mmap(struct mm_struct *mm)
 
 	flush_cache_mm(mm);
 	tlb = tlb_gather_mmu(mm, 1);
+	/* Don't update_hiwater_rss(mm) here, do_exit already did */
 	/* Use -1 here to ensure all VMAs in the mm are unmapped */
 	end = unmap_vmas(&tlb, mm, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
diff --git a/mm/mremap.c b/mm/mremap.c
index 318eea5467a0..ccf456477020 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -167,6 +167,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	unsigned long new_pgoff;
 	unsigned long moved_len;
 	unsigned long excess = 0;
+	unsigned long hiwater_vm;
 	int split = 0;
 
 	/*
@@ -205,9 +206,15 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	}
 
 	/*
-	 * if we failed to move page tables we still do total_vm increment
-	 * since do_munmap() will decrement it by old_len == new_len
+	 * If we failed to move page tables we still do total_vm increment
+	 * since do_munmap() will decrement it by old_len == new_len.
+	 *
+	 * Since total_vm is about to be raised artificially high for a
+	 * moment, we need to restore high watermark afterwards: if stats
+	 * are taken meanwhile, total_vm and hiwater_vm appear too high.
+	 * If this were a serious issue, we'd add a flag to do_munmap().
 	 */
+	hiwater_vm = mm->hiwater_vm;
 	mm->total_vm += new_len >> PAGE_SHIFT;
 	vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT);
 
@@ -216,6 +223,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		vm_unacct_memory(excess >> PAGE_SHIFT);
 		excess = 0;
 	}
+	mm->hiwater_vm = hiwater_vm;
 
 	/* Restore VM_ACCOUNT if one or two pieces of vma left */
 	if (excess) {
diff --git a/mm/nommu.c b/mm/nommu.c
index 599924886eb5..dfb124ffb9be 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -931,6 +931,8 @@ int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len)
 	realalloc -= kobjsize(vml);
 	askedalloc -= sizeof(*vml);
 	kfree(vml);
+
+	update_hiwater_vm(mm);
 	mm->total_vm -= len >> PAGE_SHIFT;
 
 #ifdef DEBUG
@@ -1078,19 +1080,6 @@ void arch_unmap_area(struct mm_struct *mm, unsigned long addr)
 {
 }
 
-void update_mem_hiwater(struct task_struct *tsk)
-{
-	unsigned long rss;
-
-	if (likely(tsk->mm)) {
-		rss = get_mm_rss(tsk->mm);
-		if (tsk->mm->hiwater_rss < rss)
-			tsk->mm->hiwater_rss = rss;
-		if (tsk->mm->hiwater_vm < tsk->mm->total_vm)
-			tsk->mm->hiwater_vm = tsk->mm->total_vm;
-	}
-}
-
 void unmap_mapping_range(struct address_space *mapping,
 			 loff_t const holebegin, loff_t const holelen,
 			 int even_cows)
diff --git a/mm/rmap.c b/mm/rmap.c
index f69d5342ce7f..4c52c56c9905 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -538,6 +538,9 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	if (pte_dirty(pteval))
 		set_page_dirty(page);
 
+	/* Update high watermark before we lower rss */
+	update_hiwater_rss(mm);
+
 	if (PageAnon(page)) {
 		swp_entry_t entry = { .val = page->private };
 		/*
@@ -628,6 +631,9 @@ static void try_to_unmap_cluster(unsigned long cursor,
 	if (!pmd_present(*pmd))
 		goto out_unlock;
 
+	/* Update high watermark before we lower rss */
+	update_hiwater_rss(mm);
+
 	for (original_pte = pte = pte_offset_map(pmd, address);
 			address < end; pte++, address += PAGE_SIZE) {
 
-- 
cgit v1.2.3


From c74df32c724a1652ad8399b4891bb02c9d43743a Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:23 -0700
Subject: [PATCH] mm: ptd_alloc take ptlock

Second step in pushing down the page_table_lock.  Remove the temporary
bridging hack from __pud_alloc, __pmd_alloc, __pte_alloc: expect callers not
to hold page_table_lock, whether it's on init_mm or a user mm; take
page_table_lock internally to check if a racing task already allocated.

Convert their callers from common code.  But avoid coming back to change them
again later: instead of moving the spin_lock(&mm->page_table_lock) down,
switch over to new macros pte_alloc_map_lock and pte_unmap_unlock, which
encapsulate the mapping+locking and unlocking+unmapping together, and in the
end may use alternatives to the mm page_table_lock itself.

These callers all hold mmap_sem (some exclusively, some not), so at no level
can a page table be whipped away from beneath them; and pte_alloc uses the
"atomic" pmd_present to test whether it needs to allocate.  It appears that on
all arches we can safely descend without page_table_lock.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/exec.c          |  14 +++-----
 include/linux/mm.h |  18 ++++++++++
 kernel/fork.c      |   2 --
 mm/fremap.c        |  48 ++++++++++---------------
 mm/hugetlb.c       |  12 ++++---
 mm/memory.c        | 104 +++++++++++++++++------------------------------------
 mm/mremap.c        |  27 +++++---------
 7 files changed, 90 insertions(+), 135 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 9bb55c8cf224..ba73797eb4cb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -309,25 +309,24 @@ void install_arg_page(struct vm_area_struct *vma,
 	pud_t * pud;
 	pmd_t * pmd;
 	pte_t * pte;
+	spinlock_t *ptl;
 
 	if (unlikely(anon_vma_prepare(vma)))
-		goto out_sig;
+		goto out;
 
 	flush_dcache_page(page);
 	pgd = pgd_offset(mm, address);
-
-	spin_lock(&mm->page_table_lock);
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
 		goto out;
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
 		goto out;
-	pte = pte_alloc_map(mm, pmd, address);
+	pte = pte_alloc_map_lock(mm, pmd, address, &ptl);
 	if (!pte)
 		goto out;
 	if (!pte_none(*pte)) {
-		pte_unmap(pte);
+		pte_unmap_unlock(pte, ptl);
 		goto out;
 	}
 	inc_mm_counter(mm, anon_rss);
@@ -335,14 +334,11 @@ void install_arg_page(struct vm_area_struct *vma,
 	set_pte_at(mm, address, pte, pte_mkdirty(pte_mkwrite(mk_pte(
 					page, vma->vm_page_prot))));
 	page_add_anon_rmap(page, vma, address);
-	pte_unmap(pte);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(pte, ptl);
 
 	/* no need for flush_tlb */
 	return;
 out:
-	spin_unlock(&mm->page_table_lock);
-out_sig:
 	__free_page(page);
 	force_sig(SIGKILL, current);
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 22c2d6922c0e..d4c3512e7db4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -779,10 +779,28 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#define pte_offset_map_lock(mm, pmd, address, ptlp)	\
+({							\
+	spinlock_t *__ptl = &(mm)->page_table_lock;	\
+	pte_t *__pte = pte_offset_map(pmd, address);	\
+	*(ptlp) = __ptl;				\
+	spin_lock(__ptl);				\
+	__pte;						\
+})
+
+#define pte_unmap_unlock(pte, ptl)	do {		\
+	spin_unlock(ptl);				\
+	pte_unmap(pte);					\
+} while (0)
+
 #define pte_alloc_map(mm, pmd, address)			\
 	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
 		NULL: pte_offset_map(pmd, address))
 
+#define pte_alloc_map_lock(mm, pmd, address, ptlp)	\
+	((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \
+		NULL: pte_offset_map_lock(mm, pmd, address, ptlp))
+
 #define pte_alloc_kernel(pmd, address)			\
 	((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \
 		NULL: pte_offset_kernel(pmd, address))
diff --git a/kernel/fork.c b/kernel/fork.c
index 2a587b3224e3..8a069612eac3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -255,7 +255,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		/*
 		 * Link in the new vma and copy the page table entries.
 		 */
-		spin_lock(&mm->page_table_lock);
 		*pprev = tmp;
 		pprev = &tmp->vm_next;
 
@@ -265,7 +264,6 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 		mm->map_count++;
 		retval = copy_page_range(mm, oldmm, tmp);
-		spin_unlock(&mm->page_table_lock);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
diff --git a/mm/fremap.c b/mm/fremap.c
index 49719a35769a..d862be3bc3e3 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -63,23 +63,20 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud;
 	pgd_t *pgd;
 	pte_t pte_val;
+	spinlock_t *ptl;
 
 	BUG_ON(vma->vm_flags & VM_RESERVED);
 
 	pgd = pgd_offset(mm, addr);
-	spin_lock(&mm->page_table_lock);
-	
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		goto err_unlock;
-
+		goto out;
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		goto err_unlock;
-
-	pte = pte_alloc_map(mm, pmd, addr);
+		goto out;
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
-		goto err_unlock;
+		goto out;
 
 	/*
 	 * This page may have been truncated. Tell the
@@ -89,10 +86,10 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	inode = vma->vm_file->f_mapping->host;
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if (!page->mapping || page->index >= size)
-		goto err_unlock;
+		goto unlock;
 	err = -ENOMEM;
 	if (page_mapcount(page) > INT_MAX/2)
-		goto err_unlock;
+		goto unlock;
 
 	if (pte_none(*pte) || !zap_pte(mm, vma, addr, pte))
 		inc_mm_counter(mm, file_rss);
@@ -101,17 +98,15 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 	page_add_file_rmap(page);
 	pte_val = *pte;
-	pte_unmap(pte);
 	update_mmu_cache(vma, addr, pte_val);
-
 	err = 0;
-err_unlock:
-	spin_unlock(&mm->page_table_lock);
+unlock:
+	pte_unmap_unlock(pte, ptl);
+out:
 	return err;
 }
 EXPORT_SYMBOL(install_page);
 
-
 /*
  * Install a file pte to a given virtual memory address, release any
  * previously existing mapping.
@@ -125,23 +120,20 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 	pud_t *pud;
 	pgd_t *pgd;
 	pte_t pte_val;
+	spinlock_t *ptl;
 
 	BUG_ON(vma->vm_flags & VM_RESERVED);
 
 	pgd = pgd_offset(mm, addr);
-	spin_lock(&mm->page_table_lock);
-	
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		goto err_unlock;
-
+		goto out;
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		goto err_unlock;
-
-	pte = pte_alloc_map(mm, pmd, addr);
+		goto out;
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
-		goto err_unlock;
+		goto out;
 
 	if (!pte_none(*pte) && zap_pte(mm, vma, addr, pte)) {
 		update_hiwater_rss(mm);
@@ -150,17 +142,13 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff));
 	pte_val = *pte;
-	pte_unmap(pte);
 	update_mmu_cache(vma, addr, pte_val);
-	spin_unlock(&mm->page_table_lock);
-	return 0;
-
-err_unlock:
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(pte, ptl);
+	err = 0;
+out:
 	return err;
 }
 
-
 /***
  * sys_remap_file_pages - remap arbitrary pages of a shared backing store
  *                        file within an existing vma.
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac5f044bf514..ea0826ff2663 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -277,12 +277,15 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long addr;
 
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+		src_pte = huge_pte_offset(src, addr);
+		if (!src_pte)
+			continue;
 		dst_pte = huge_pte_alloc(dst, addr);
 		if (!dst_pte)
 			goto nomem;
+		spin_lock(&dst->page_table_lock);
 		spin_lock(&src->page_table_lock);
-		src_pte = huge_pte_offset(src, addr);
-		if (src_pte && !pte_none(*src_pte)) {
+		if (!pte_none(*src_pte)) {
 			entry = *src_pte;
 			ptepage = pte_page(entry);
 			get_page(ptepage);
@@ -290,6 +293,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
+		spin_unlock(&dst->page_table_lock);
 	}
 	return 0;
 
@@ -354,7 +358,6 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 
 	hugetlb_prefault_arch_hook(mm);
 
-	spin_lock(&mm->page_table_lock);
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		unsigned long idx;
 		pte_t *pte = huge_pte_alloc(mm, addr);
@@ -389,11 +392,12 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				goto out;
 			}
 		}
+		spin_lock(&mm->page_table_lock);
 		add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
 		set_huge_pte_at(mm, addr, pte, make_huge_pte(vma, page));
+		spin_unlock(&mm->page_table_lock);
 	}
 out:
-	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
 
diff --git a/mm/memory.c b/mm/memory.c
index 4bdd1186b43b..a40e4b1cee4f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -282,14 +282,11 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 
 int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 {
-	struct page *new;
-
-	spin_unlock(&mm->page_table_lock);
-	new = pte_alloc_one(mm, address);
-	spin_lock(&mm->page_table_lock);
+	struct page *new = pte_alloc_one(mm, address);
 	if (!new)
 		return -ENOMEM;
 
+	spin_lock(&mm->page_table_lock);
 	if (pmd_present(*pmd))		/* Another has populated it */
 		pte_free(new);
 	else {
@@ -297,6 +294,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
 	}
+	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 
@@ -344,9 +342,6 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
  * covered by this vma.
- *
- * dst->page_table_lock is held on entry and exit,
- * but may be dropped within p[mg]d_alloc() and pte_alloc_map().
  */
 
 static inline void
@@ -419,17 +414,19 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		unsigned long addr, unsigned long end)
 {
 	pte_t *src_pte, *dst_pte;
+	spinlock_t *src_ptl, *dst_ptl;
 	int progress = 0;
 	int rss[2];
 
 again:
 	rss[1] = rss[0] = 0;
-	dst_pte = pte_alloc_map(dst_mm, dst_pmd, addr);
+	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
 	if (!dst_pte)
 		return -ENOMEM;
 	src_pte = pte_offset_map_nested(src_pmd, addr);
+	src_ptl = &src_mm->page_table_lock;
+	spin_lock(src_ptl);
 
-	spin_lock(&src_mm->page_table_lock);
 	do {
 		/*
 		 * We are holding two locks at this point - either of them
@@ -438,8 +435,8 @@ again:
 		if (progress >= 32) {
 			progress = 0;
 			if (need_resched() ||
-			    need_lockbreak(&src_mm->page_table_lock) ||
-			    need_lockbreak(&dst_mm->page_table_lock))
+			    need_lockbreak(src_ptl) ||
+			    need_lockbreak(dst_ptl))
 				break;
 		}
 		if (pte_none(*src_pte)) {
@@ -449,12 +446,12 @@ again:
 		copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss);
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
-	spin_unlock(&src_mm->page_table_lock);
 
+	spin_unlock(src_ptl);
 	pte_unmap_nested(src_pte - 1);
-	pte_unmap(dst_pte - 1);
 	add_mm_rss(dst_mm, rss[0], rss[1]);
-	cond_resched_lock(&dst_mm->page_table_lock);
+	pte_unmap_unlock(dst_pte - 1, dst_ptl);
+	cond_resched();
 	if (addr != end)
 		goto again;
 	return 0;
@@ -1049,8 +1046,9 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long addr, unsigned long end, pgprot_t prot)
 {
 	pte_t *pte;
+	spinlock_t *ptl;
 
-	pte = pte_alloc_map(mm, pmd, addr);
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
 	do {
@@ -1062,7 +1060,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, addr, pte, zero_pte);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
+	pte_unmap_unlock(pte - 1, ptl);
 	return 0;
 }
 
@@ -1112,14 +1110,12 @@ int zeromap_page_range(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
-	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = zeromap_pud_range(mm, pgd, addr, next, prot);
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	spin_unlock(&mm->page_table_lock);
 	return err;
 }
 
@@ -1133,8 +1129,9 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 			unsigned long pfn, pgprot_t prot)
 {
 	pte_t *pte;
+	spinlock_t *ptl;
 
-	pte = pte_alloc_map(mm, pmd, addr);
+	pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (!pte)
 		return -ENOMEM;
 	do {
@@ -1142,7 +1139,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
 		set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
 		pfn++;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
-	pte_unmap(pte - 1);
+	pte_unmap_unlock(pte - 1, ptl);
 	return 0;
 }
 
@@ -1210,7 +1207,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	pfn -= addr >> PAGE_SHIFT;
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
-	spin_lock(&mm->page_table_lock);
 	do {
 		next = pgd_addr_end(addr, end);
 		err = remap_pud_range(mm, pgd, addr, next,
@@ -1218,7 +1214,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 		if (err)
 			break;
 	} while (pgd++, addr = next, addr != end);
-	spin_unlock(&mm->page_table_lock);
 	return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
@@ -1985,17 +1980,9 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
  * with external mmu caches can use to update those (ie the Sparc or
  * PowerPC hashed page tables that act as extended TLBs).
  *
- * Note the "page_table_lock". It is to protect against kswapd removing
- * pages from under us. Note that kswapd only ever _removes_ pages, never
- * adds them. As such, once we have noticed that the page is not present,
- * we can drop the lock early.
- *
- * The adding of pages is protected by the MM semaphore (which we hold),
- * so we don't need to worry about a page being suddenly been added into
- * our VM.
- *
- * We enter with the pagetable spinlock held, we are supposed to
- * release it when done.
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
 static inline int handle_pte_fault(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
@@ -2003,6 +1990,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 {
 	pte_t entry;
 
+	spin_lock(&mm->page_table_lock);
 	entry = *pte;
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
@@ -2051,30 +2039,18 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(is_vm_hugetlb_page(vma)))
 		return hugetlb_fault(mm, vma, address, write_access);
 
-	/*
-	 * We need the page table lock to synchronize with kswapd
-	 * and the SMP-safe atomic PTE updates.
-	 */
 	pgd = pgd_offset(mm, address);
-	spin_lock(&mm->page_table_lock);
-
 	pud = pud_alloc(mm, pgd, address);
 	if (!pud)
-		goto oom;
-
+		return VM_FAULT_OOM;
 	pmd = pmd_alloc(mm, pud, address);
 	if (!pmd)
-		goto oom;
-
+		return VM_FAULT_OOM;
 	pte = pte_alloc_map(mm, pmd, address);
 	if (!pte)
-		goto oom;
-	
-	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
+		return VM_FAULT_OOM;
 
- oom:
-	spin_unlock(&mm->page_table_lock);
-	return VM_FAULT_OOM;
+	return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
 #ifndef __PAGETABLE_PUD_FOLDED
@@ -2084,24 +2060,16 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  */
 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 {
-	pud_t *new;
-
-	if (mm != &init_mm)		/* Temporary bridging hack */
-		spin_unlock(&mm->page_table_lock);
-	new = pud_alloc_one(mm, address);
-	if (!new) {
-		if (mm != &init_mm)	/* Temporary bridging hack */
-			spin_lock(&mm->page_table_lock);
+	pud_t *new = pud_alloc_one(mm, address);
+	if (!new)
 		return -ENOMEM;
-	}
 
 	spin_lock(&mm->page_table_lock);
 	if (pgd_present(*pgd))		/* Another has populated it */
 		pud_free(new);
 	else
 		pgd_populate(mm, pgd, new);
-	if (mm == &init_mm)		/* Temporary bridging hack */
-		spin_unlock(&mm->page_table_lock);
+	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 #endif /* __PAGETABLE_PUD_FOLDED */
@@ -2113,16 +2081,9 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
  */
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 {
-	pmd_t *new;
-
-	if (mm != &init_mm)		/* Temporary bridging hack */
-		spin_unlock(&mm->page_table_lock);
-	new = pmd_alloc_one(mm, address);
-	if (!new) {
-		if (mm != &init_mm)	/* Temporary bridging hack */
-			spin_lock(&mm->page_table_lock);
+	pmd_t *new = pmd_alloc_one(mm, address);
+	if (!new)
 		return -ENOMEM;
-	}
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
@@ -2136,8 +2097,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	else
 		pgd_populate(mm, pud, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
-	if (mm == &init_mm)		/* Temporary bridging hack */
-		spin_unlock(&mm->page_table_lock);
+	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
 #endif /* __PAGETABLE_PMD_FOLDED */
diff --git a/mm/mremap.c b/mm/mremap.c
index 616facc3d28a..8de77b632a20 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -28,9 +28,6 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
 	pud_t *pud;
 	pmd_t *pmd;
 
-	/*
-	 * We don't need page_table_lock: we have mmap_sem exclusively.
-	 */
 	pgd = pgd_offset(mm, addr);
 	if (pgd_none_or_clear_bad(pgd))
 		return NULL;
@@ -50,25 +47,20 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	pud_t *pud;
-	pmd_t *pmd = NULL;
+	pmd_t *pmd;
 
-	/*
-	 * We do need page_table_lock: because allocators expect that.
-	 */
-	spin_lock(&mm->page_table_lock);
 	pgd = pgd_offset(mm, addr);
 	pud = pud_alloc(mm, pgd, addr);
 	if (!pud)
-		goto out;
+		return NULL;
 
 	pmd = pmd_alloc(mm, pud, addr);
 	if (!pmd)
-		goto out;
+		return NULL;
 
 	if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr))
-		pmd = NULL;
-out:
-	spin_unlock(&mm->page_table_lock);
+		return NULL;
+
 	return pmd;
 }
 
@@ -80,6 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	struct address_space *mapping = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
+	spinlock_t *old_ptl;
 
 	if (vma->vm_file) {
 		/*
@@ -95,9 +88,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 			new_vma->vm_truncate_count = 0;
 	}
 
-	spin_lock(&mm->page_table_lock);
-	old_pte = pte_offset_map(old_pmd, old_addr);
-	new_pte = pte_offset_map_nested(new_pmd, new_addr);
+	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
+ 	new_pte = pte_offset_map_nested(new_pmd, new_addr);
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
@@ -110,8 +102,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	}
 
 	pte_unmap_nested(new_pte - 1);
-	pte_unmap(old_pte - 1);
-	spin_unlock(&mm->page_table_lock);
+	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
 		spin_unlock(&mapping->i_mmap_lock);
 }
-- 
cgit v1.2.3


From deceb6cd17e6dfafe4c4f81b1b4153bc41b2cb70 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:33 -0700
Subject: [PATCH] mm: follow_page with inner ptlock

Final step in pushing down common core's page_table_lock.  follow_page no
longer wants caller to hold page_table_lock, uses pte_offset_map_lock itself;
and so no page_table_lock is taken in get_user_pages itself.

But get_user_pages (and get_futex_key) do then need follow_page to pin the
page for them: take Daniel's suggestion of bitflags to follow_page.

Need one for WRITE, another for TOUCH (it was the accessed flag before:
vanished along with check_user_page_readable, but surely get_numa_maps is
wrong to mark every page it finds as accessed), another for GET.

And another, ANON to dispose of untouched_anonymous_page: it seems silly for
that to descend a second time, let follow_page observe if there was no page
table and return ZERO_PAGE if so.  Fix minor bug in that: check VM_LOCKED -
make_pages_present ought to make readonly anonymous present.

Give get_numa_maps a cond_resched while we're there.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/task_mmu.c |   3 +-
 include/linux/mm.h |  20 ++++---
 kernel/futex.c     |   6 +--
 mm/memory.c        | 152 +++++++++++++++++++++++++----------------------------
 mm/nommu.c         |   3 +-
 5 files changed, 88 insertions(+), 96 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7e5e7ec2e36d..d2fa42006d8f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -419,7 +419,6 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 	for_each_node(i)
 		md->node[i] =0;
 
-	spin_lock(&mm->page_table_lock);
  	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
 		page = follow_page(mm, vaddr, 0);
 		if (page) {
@@ -434,8 +433,8 @@ static struct numa_maps *get_numa_maps(const struct vm_area_struct *vma)
 				md->anon++;
 			md->node[page_to_nid(page)]++;
 		}
+		cond_resched();
 	}
-	spin_unlock(&mm->page_table_lock);
 	return md;
 }
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index aa8de20e2e80..e8d1424153bb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -938,14 +938,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma)
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
 }
 
-extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr);
-
-extern struct page * vmalloc_to_page(void *addr);
-extern unsigned long vmalloc_to_pfn(void *addr);
-extern struct page * follow_page(struct mm_struct *mm, unsigned long address,
-		int write);
-int remap_pfn_range(struct vm_area_struct *, unsigned long,
-		unsigned long, unsigned long, pgprot_t);
+struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr);
+struct page *vmalloc_to_page(void *addr);
+unsigned long vmalloc_to_pfn(void *addr);
+int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
+			unsigned long pfn, unsigned long size, pgprot_t);
+
+struct page *follow_page(struct mm_struct *, unsigned long address,
+			unsigned int foll_flags);
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_TOUCH	0x02	/* mark page accessed */
+#define FOLL_GET	0x04	/* do get_page on page */
+#define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
 
 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
diff --git a/kernel/futex.c b/kernel/futex.c
index ca05fe6a70b2..3b4d5ad44cc6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -205,15 +205,13 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	/*
 	 * Do a quick atomic lookup first - this is the fastpath.
 	 */
-	spin_lock(&current->mm->page_table_lock);
-	page = follow_page(mm, uaddr, 0);
+	page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
 	if (likely(page != NULL)) {
 		key->shared.pgoff =
 			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		spin_unlock(&current->mm->page_table_lock);
+		put_page(page);
 		return 0;
 	}
-	spin_unlock(&current->mm->page_table_lock);
 
 	/*
 	 * Do it the general way.
diff --git a/mm/memory.c b/mm/memory.c
index 51f7c0a220d4..8461e2dd91d7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -807,86 +807,82 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 
 /*
  * Do a quick page-table lookup for a single page.
- * mm->page_table_lock must be held.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int flags)
 {
 	pgd_t *pgd;
 	pud_t *pud;
 	pmd_t *pmd;
 	pte_t *ptep, pte;
+	spinlock_t *ptl;
 	unsigned long pfn;
 	struct page *page;
 
-	page = follow_huge_addr(mm, address, write);
-	if (! IS_ERR(page))
-		return page;
+	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
+	if (!IS_ERR(page)) {
+		BUG_ON(flags & FOLL_GET);
+		goto out;
+	}
 
+	page = NULL;
 	pgd = pgd_offset(mm, address);
 	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		goto out;
+		goto no_page_table;
 
 	pud = pud_offset(pgd, address);
 	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		goto out;
+		goto no_page_table;
 	
 	pmd = pmd_offset(pud, address);
 	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto no_page_table;
+
+	if (pmd_huge(*pmd)) {
+		BUG_ON(flags & FOLL_GET);
+		page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
 		goto out;
-	if (pmd_huge(*pmd))
-		return follow_huge_pmd(mm, address, pmd, write);
+	}
 
-	ptep = pte_offset_map(pmd, address);
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (!ptep)
 		goto out;
 
 	pte = *ptep;
-	pte_unmap(ptep);
-	if (pte_present(pte)) {
-		if (write && !pte_write(pte))
-			goto out;
-		pfn = pte_pfn(pte);
-		if (pfn_valid(pfn)) {
-			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) &&!PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
-			return page;
-		}
-	}
+	if (!pte_present(pte))
+		goto unlock;
+	if ((flags & FOLL_WRITE) && !pte_write(pte))
+		goto unlock;
+	pfn = pte_pfn(pte);
+	if (!pfn_valid(pfn))
+		goto unlock;
 
+	page = pfn_to_page(pfn);
+	if (flags & FOLL_GET)
+		get_page(page);
+	if (flags & FOLL_TOUCH) {
+		if ((flags & FOLL_WRITE) &&
+		    !pte_dirty(pte) && !PageDirty(page))
+			set_page_dirty(page);
+		mark_page_accessed(page);
+	}
+unlock:
+	pte_unmap_unlock(ptep, ptl);
 out:
-	return NULL;
-}
-
-static inline int
-untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
-			 unsigned long address)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-
-	/* Check if the vma is for an anonymous mapping. */
-	if (vma->vm_ops && vma->vm_ops->nopage)
-		return 0;
-
-	/* Check if page directory entry exists. */
-	pgd = pgd_offset(mm, address);
-	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
-		return 1;
-
-	pud = pud_offset(pgd, address);
-	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
-		return 1;
-
-	/* Check if page middle directory entry exists. */
-	pmd = pmd_offset(pud, address);
-	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-		return 1;
+	return page;
 
-	/* There is a pte slot for 'address' in 'mm'. */
-	return 0;
+no_page_table:
+	/*
+	 * When core dumping an enormous anonymous area that nobody
+	 * has touched so far, we don't want to allocate page tables.
+	 */
+	if (flags & FOLL_ANON) {
+		page = ZERO_PAGE(address);
+		if (flags & FOLL_GET)
+			get_page(page);
+		BUG_ON(flags & FOLL_WRITE);
+	}
+	return page;
 }
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
@@ -894,18 +890,19 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		struct page **pages, struct vm_area_struct **vmas)
 {
 	int i;
-	unsigned int flags;
+	unsigned int vm_flags;
 
 	/* 
 	 * Require read or write permissions.
 	 * If 'force' is set, we only require the "MAY" flags.
 	 */
-	flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
-	flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+	vm_flags  = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+	vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
 	i = 0;
 
 	do {
-		struct vm_area_struct *	vma;
+		struct vm_area_struct *vma;
+		unsigned int foll_flags;
 
 		vma = find_extend_vma(mm, start);
 		if (!vma && in_gate_area(tsk, start)) {
@@ -946,7 +943,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}
 
 		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
-				|| !(flags & vma->vm_flags))
+				|| !(vm_flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
 		if (is_vm_hugetlb_page(vma)) {
@@ -954,29 +951,25 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 						&start, &len, i);
 			continue;
 		}
-		spin_lock(&mm->page_table_lock);
+
+		foll_flags = FOLL_TOUCH;
+		if (pages)
+			foll_flags |= FOLL_GET;
+		if (!write && !(vma->vm_flags & VM_LOCKED) &&
+		    (!vma->vm_ops || !vma->vm_ops->nopage))
+			foll_flags |= FOLL_ANON;
+
 		do {
-			int write_access = write;
 			struct page *page;
 
-			cond_resched_lock(&mm->page_table_lock);
-			while (!(page = follow_page(mm, start, write_access))) {
-				int ret;
-
-				/*
-				 * Shortcut for anonymous pages. We don't want
-				 * to force the creation of pages tables for
-				 * insanely big anonymously mapped areas that
-				 * nobody touched so far. This is important
-				 * for doing a core dump for these mappings.
-				 */
-				if (!write && untouched_anonymous_page(mm,vma,start)) {
-					page = ZERO_PAGE(start);
-					break;
-				}
-				spin_unlock(&mm->page_table_lock);
-				ret = __handle_mm_fault(mm, vma, start, write_access);
+			if (write)
+				foll_flags |= FOLL_WRITE;
 
+			cond_resched();
+			while (!(page = follow_page(mm, start, foll_flags))) {
+				int ret;
+				ret = __handle_mm_fault(mm, vma, start,
+						foll_flags & FOLL_WRITE);
 				/*
 				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
 				 * broken COW when necessary, even if maybe_mkwrite
@@ -984,7 +977,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * subsequent page lookups as if they were reads.
 				 */
 				if (ret & VM_FAULT_WRITE)
-					write_access = 0;
+					foll_flags &= ~FOLL_WRITE;
 				
 				switch (ret & ~VM_FAULT_WRITE) {
 				case VM_FAULT_MINOR:
@@ -1000,12 +993,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				default:
 					BUG();
 				}
-				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
 				pages[i] = page;
 				flush_dcache_page(page);
-				page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
@@ -1013,7 +1004,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			start += PAGE_SIZE;
 			len--;
 		} while (len && start < vma->vm_end);
-		spin_unlock(&mm->page_table_lock);
 	} while (len);
 	return i;
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index dfb124ffb9be..d1e076a487cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1049,7 +1049,8 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 
 EXPORT_SYMBOL(find_vma);
 
-struct page * follow_page(struct mm_struct *mm, unsigned long addr, int write)
+struct page *follow_page(struct mm_struct *mm, unsigned long address,
+			unsigned int foll_flags)
 {
 	return NULL;
 }
-- 
cgit v1.2.3


From 4c21e2f2441dc5fbb957b030333f5a3f2d02dea7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Sat, 29 Oct 2005 18:16:40 -0700
Subject: [PATCH] mm: split page table lock

Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.

This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock.  (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)

In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.

Splitting the lock is not quite for free: another cacheline access.  Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS.  But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.

There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mm/mm-armv.c       |  1 +
 arch/frv/mm/pgalloc.c       |  4 ++--
 arch/i386/mm/pgtable.c      |  8 ++++----
 arch/um/kernel/skas/mmu.c   |  1 +
 fs/afs/file.c               |  4 ++--
 fs/buffer.c                 |  2 +-
 fs/jfs/jfs_metapage.c       | 12 ++++++------
 fs/xfs/linux-2.6/xfs_buf.c  |  7 ++++---
 include/linux/buffer_head.h |  6 +++---
 include/linux/mm.h          | 46 +++++++++++++++++++++++++++++++++++++--------
 kernel/kexec.c              |  4 ++--
 mm/Kconfig                  | 13 +++++++++++++
 mm/filemap.c                |  2 +-
 mm/memory.c                 | 24 +++++++++++++----------
 mm/mremap.c                 | 11 ++++++++++-
 mm/page_alloc.c             | 16 ++++++++--------
 mm/page_io.c                |  6 ++++--
 mm/rmap.c                   |  4 ++--
 mm/shmem.c                  | 22 ++++++++++------------
 mm/swap.c                   |  2 +-
 mm/swap_state.c             |  8 ++++----
 mm/swapfile.c               | 12 ++++++------
 mm/vmscan.c                 |  2 +-
 23 files changed, 138 insertions(+), 79 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mm/mm-armv.c b/arch/arm/mm/mm-armv.c
index 60f3e039bac2..1221fdde1769 100644
--- a/arch/arm/mm/mm-armv.c
+++ b/arch/arm/mm/mm-armv.c
@@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
 	pte = pmd_page(*pmd);
 	pmd_clear(pmd);
 	dec_page_state(nr_page_table_pages);
+	pte_lock_deinit(pte);
 	pte_free(pte);
 	pmd_free(pmd);
 free:
diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c
index 4eaec0f3525b..2c67dfe5a6b3 100644
--- a/arch/frv/mm/pgalloc.c
+++ b/arch/frv/mm/pgalloc.c
@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd)
 	if (pgd_list)
 		pgd_list->private = (unsigned long) &page->index;
 	pgd_list = page;
-	page->private = (unsigned long) &pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *) page->index;
-	pprev = (struct page **) page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
 		next->private = (unsigned long) pprev;
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index dcdce2c6c532..39c099f15b5f 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -188,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd)
 	struct page *page = virt_to_page(pgd);
 	page->index = (unsigned long)pgd_list;
 	if (pgd_list)
-		pgd_list->private = (unsigned long)&page->index;
+		set_page_private(pgd_list, (unsigned long)&page->index);
 	pgd_list = page;
-	page->private = (unsigned long)&pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *)page->index;
-	pprev = (struct page **)page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
-		next->private = (unsigned long)pprev;
+		set_page_private(next, (unsigned long)pprev);
 }
 
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 02cf36e0331a..9e5e39cea821 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm)
 
 	if(!proc_mm || !ptrace_faultinfo){
 		free_page(mmu->id.stack);
+		pte_lock_deinit(virt_to_page(mmu->last_page_table));
 		pte_free_kernel((pte_t *) mmu->last_page_table);
                 dec_page_state(nr_page_table_pages);
 #ifdef CONFIG_3_LEVEL_PGTABLES
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0d576987ec67..4975c9c193dd 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
 		cachefs_uncache_page(vnode->cache, page);
 #endif
 
-		pageio = (struct cachefs_page *) page->private;
-		page->private = 0;
+		pageio = (struct cachefs_page *) page_private(page);
+		set_page_private(page, 0);
 		ClearPagePrivate(page);
 
 		if (pageio)
diff --git a/fs/buffer.c b/fs/buffer.c
index b1667986442f..2066e4cb700c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -96,7 +96,7 @@ static void
 __clear_page_buffers(struct page *page)
 {
 	ClearPagePrivate(page);
-	page->private = 0;
+	set_page_private(page, 0);
 	page_cache_release(page);
 }
 
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index 26091a5f88d4..8a53981f9f27 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -86,7 +86,7 @@ struct meta_anchor {
 	atomic_t io_count;
 	struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 		if (!a)
 			return -ENOMEM;
 		memset(a, 0, sizeof(struct meta_anchor));
-		page->private = (unsigned long)a;
+		set_page_private(page, (unsigned long)a);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 	a->mp[index] = NULL;
 	if (--a->mp_count == 0) {
 		kfree(a);
-		page->private = 0;
+		set_page_private(page, 0);
 		ClearPagePrivate(page);
 		kunmap(page);
 	}
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-	return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+	return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
 
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
 	if (mp) {
-		page->private = (unsigned long)mp;
+		set_page_private(page, (unsigned long)mp);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-	page->private = 0;
+	set_page_private(page, 0);
 	ClearPagePrivate(page);
 	kunmap(page);
 }
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ba4767c04adf..4cd46abe8434 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -181,8 +181,9 @@ set_page_region(
 	size_t		offset,
 	size_t		length)
 {
-	page->private |= page_region_mask(offset, length);
-	if (page->private == ~0UL)
+	set_page_private(page,
+		page_private(page) | page_region_mask(offset, length));
+	if (page_private(page) == ~0UL)
 		SetPageUptodate(page);
 }
 
@@ -194,7 +195,7 @@ test_page_region(
 {
 	unsigned long	mask = page_region_mask(offset, length);
 
-	return (mask && (page->private & mask) == mask);
+	return (mask && (page_private(page) & mask) == mask);
 }
 
 /*
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 88af42f5e04a..c937d6e65502 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)					\
 	({							\
-		BUG_ON(!PagePrivate(page));		\
-		((struct buffer_head *)(page)->private);	\
+		BUG_ON(!PagePrivate(page));			\
+		((struct buffer_head *)page_private(page));	\
 	})
 #define page_has_buffers(page)	PagePrivate(page)
 
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
 {
 	page_cache_get(page);
 	SetPagePrivate(page);
-	page->private = (unsigned long)head;
+	set_page_private(page, (unsigned long)head);
 }
 
 static inline void get_bh(struct buffer_head *bh)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8d1424153bb..8a514eca40d5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -226,13 +226,18 @@ struct page {
 					 * to show when page is mapped
 					 * & limit reverse map searches.
 					 */
-	unsigned long private;		/* Mapping-private opaque data:
+	union {
+		unsigned long private;	/* Mapping-private opaque data:
 					 * usually used for buffer_heads
 					 * if PagePrivate set; used for
 					 * swp_entry_t if PageSwapCache
 					 * When page is free, this indicates
 					 * order in the buddy system.
 					 */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spinlock_t ptl;
+#endif
+	} u;
 	struct address_space *mapping;	/* If low bit clear, points to
 					 * inode address_space, or NULL.
 					 * If page mapped as anonymous
@@ -260,6 +265,9 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+#define page_private(page)		((page)->u.private)
+#define set_page_private(page, v)	((page)->u.private = (v))
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-static inline int page_count(struct page *p)
+static inline int page_count(struct page *page)
 {
-	if (PageCompound(p))
-		p = (struct page *)p->private;
-	return atomic_read(&(p)->_count) + 1;
+	if (PageCompound(page))
+		page = (struct page *)page_private(page);
+	return atomic_read(&page->_count) + 1;
 }
 
 static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-		page = (struct page *)page->private;
+		page = (struct page *)page_private(page);
 	atomic_inc(&page->_count);
 }
 
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page)
 static inline pgoff_t page_index(struct page *page)
 {
 	if (unlikely(PageSwapCache(page)))
-		return page->private;
+		return page_private(page);
 	return page->index;
 }
 
@@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page)	&((page)->u.ptl)
+#define pte_lock_init(_page)	do {					\
+	spin_lock_init(__pte_lockptr(_page));				\
+} while (0)
+#define pte_lock_deinit(page)	((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page)	do {} while (0)
+#define pte_lock_deinit(page)	do {} while (0)
+#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
 #define pte_offset_map_lock(mm, pmd, address, ptlp)	\
 ({							\
-	spinlock_t *__ptl = &(mm)->page_table_lock;	\
+	spinlock_t *__ptl = pte_lockptr(mm, pmd);	\
 	pte_t *__pte = pte_offset_map(pmd, address);	\
 	*(ptlp) = __ptl;				\
 	spin_lock(__ptl);				\
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 36c5d9cd4cc1..2c95848fbce8 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 	if (pages) {
 		unsigned int count, i;
 		pages->mapping = NULL;
-		pages->private = order;
+		set_page_private(pages, order);
 		count = 1 << order;
 		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
@@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page)
 {
 	unsigned int order, count, i;
 
-	order = page->private;
+	order = page_private(page);
 	count = 1 << order;
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
diff --git a/mm/Kconfig b/mm/Kconfig
index 391ffc54d136..f35a550ba4b9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -111,3 +111,16 @@ config SPARSEMEM_STATIC
 config SPARSEMEM_EXTREME
 	def_bool y
 	depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+	int
+	default "4096" if ARM && !CPU_CACHE_VIPT
+	default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+	default "4"
diff --git a/mm/filemap.c b/mm/filemap.c
index 8aa344e88489..f560b41c8f61 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -152,7 +152,7 @@ static int sync_page(void *word)
 	 * in the ->sync_page() methods make essential use of the
 	 * page_mapping(), merely passing the page down to the backing
 	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page->private is
+	 * ignore it for all cases but swap, where only page_private(page) is
 	 * of interest. When page_mapping() does go NULL, the entire
 	 * call stack gracefully ignores the page and returns.
 	 * -- wli
diff --git a/mm/memory.c b/mm/memory.c
index 8461e2dd91d7..e9ef599498b5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
 	struct page *page = pmd_page(*pmd);
 	pmd_clear(pmd);
+	pte_lock_deinit(page);
 	pte_free_tlb(tlb, page);
 	dec_page_state(nr_page_table_pages);
 	tlb->mm->nr_ptes--;
@@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
 
+	pte_lock_init(new);
 	spin_lock(&mm->page_table_lock);
-	if (pmd_present(*pmd))		/* Another has populated it */
+	if (pmd_present(*pmd)) {	/* Another has populated it */
+		pte_lock_deinit(new);
 		pte_free(new);
-	else {
+	} else {
 		mm->nr_ptes++;
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
@@ -432,7 +435,7 @@ again:
 	if (!dst_pte)
 		return -ENOMEM;
 	src_pte = pte_offset_map_nested(src_pmd, addr);
-	src_ptl = &src_mm->page_table_lock;
+	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock(src_ptl);
 
 	do {
@@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range);
  * (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page and do_no_page can safely check later on).
  */
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 				pte_t *page_table, pte_t orig_pte)
 {
 	int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spin_lock(&mm->page_table_lock);
+		spinlock_t *ptl = pte_lockptr(mm, pmd);
+		spin_lock(ptl);
 		same = pte_same(*page_table, orig_pte);
-		spin_unlock(&mm->page_table_lock);
+		spin_unlock(ptl);
 	}
 #endif
 	pte_unmap(page_table);
@@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
 
-	if (!pte_unmap_same(mm, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
 
 	entry = pte_to_swp_entry(orig_pte);
@@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page_cache_get(page);
 		entry = mk_pte(page, vma->vm_page_prot);
 
-		ptl = &mm->page_table_lock;
+		ptl = pte_lockptr(mm, pmd);
 		spin_lock(ptl);
 		if (!pte_none(*page_table))
 			goto release;
@@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgoff_t pgoff;
 	int err;
 
-	if (!pte_unmap_same(mm, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return VM_FAULT_MINOR;
 
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
@@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, write_access, entry);
 	}
 
-	ptl = &mm->page_table_lock;
+	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
diff --git a/mm/mremap.c b/mm/mremap.c
index 8de77b632a20..b535438c363c 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -72,7 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	struct address_space *mapping = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
-	spinlock_t *old_ptl;
+	spinlock_t *old_ptl, *new_ptl;
 
 	if (vma->vm_file) {
 		/*
@@ -88,8 +88,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 			new_vma->vm_truncate_count = 0;
 	}
 
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * pte locks because exclusive mmap_sem prevents deadlock.
+	 */
 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
  	new_pte = pte_offset_map_nested(new_pmd, new_addr);
+	new_ptl = pte_lockptr(mm, new_pmd);
+	if (new_ptl != old_ptl)
+		spin_lock(new_ptl);
 
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
@@ -101,6 +108,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
 
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
 	pte_unmap_nested(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0541288ebf4b..a2995a5d012c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -154,7 +154,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 		struct page *p = page + i;
 
 		SetPageCompound(p);
-		p->private = (unsigned long)page;
+		set_page_private(p, (unsigned long)page);
 	}
 }
 
@@ -174,7 +174,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 
 		if (!PageCompound(p))
 			bad_page(__FUNCTION__, page);
-		if (p->private != (unsigned long)page)
+		if (page_private(p) != (unsigned long)page)
 			bad_page(__FUNCTION__, page);
 		ClearPageCompound(p);
 	}
@@ -187,18 +187,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
  * So, we don't need atomic page->flags operations here.
  */
 static inline unsigned long page_order(struct page *page) {
-	return page->private;
+	return page_private(page);
 }
 
 static inline void set_page_order(struct page *page, int order) {
-	page->private = order;
+	set_page_private(page, order);
 	__SetPagePrivate(page);
 }
 
 static inline void rmv_page_order(struct page *page)
 {
 	__ClearPagePrivate(page);
-	page->private = 0;
+	set_page_private(page, 0);
 }
 
 /*
@@ -238,7 +238,7 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (a) the buddy is free &&
  * (b) the buddy is on the buddy system &&
  * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
@@ -264,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order)
  * parts of the VM system.
  * At each level, we keep a list of pages, which are heads of continuous
  * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
  * So when we are allocating or freeing one, we can derive the state of the
  * other.  That is, if we allocate a small block, and both were   
  * free, the remainder of the region must be split into blocks.   
@@ -463,7 +463,7 @@ static void prep_new_page(struct page *page, int order)
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_checked | 1 << PG_mappedtodisk);
-	page->private = 0;
+	set_page_private(page, 0);
 	set_page_refs(page, order);
 	kernel_map_pages(page, 1 << order, 1);
 }
diff --git a/mm/page_io.c b/mm/page_io.c
index 330e00d6db00..bb2b0d53889c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		goto out;
 	}
-	bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+	bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+				end_swap_bio_write);
 	if (bio == NULL) {
 		set_page_dirty(page);
 		unlock_page(page);
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page)
 
 	BUG_ON(!PageLocked(page));
 	ClearPageUptodate(page);
-	bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+				end_swap_bio_read);
 	if (bio == NULL) {
 		unlock_page(page);
 		ret = -ENOMEM;
diff --git a/mm/rmap.c b/mm/rmap.c
index a84bdfe582c0..a33e779d1bd8 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -274,7 +274,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 		return NULL;
 	}
 
-	ptl = &mm->page_table_lock;
+	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
@@ -550,7 +550,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	update_hiwater_rss(mm);
 
 	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page->private };
+		swp_entry_t entry = { .val = page_private(page) };
 		/*
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
diff --git a/mm/shmem.c b/mm/shmem.c
index 37777f4c11f8..dc25565a61e9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -71,9 +71,6 @@
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
 
-/* Keep swapped page count in private field of indirect struct page */
-#define nr_swapped		private
-
 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
 enum sgp_type {
 	SGP_QUICK,	/* don't try more than file page cache lookup */
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
 
 	entry->val = value;
 	info->swapped += incdec;
-	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
-		kmap_atomic_to_page(entry)->nr_swapped += incdec;
+	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
+		struct page *page = kmap_atomic_to_page(entry);
+		set_page_private(page, page_private(page) + incdec);
+	}
 }
 
 /*
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 
 		spin_unlock(&info->lock);
 		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
-		if (page) {
-			page->nr_swapped = 0;
-		}
+		if (page)
+			set_page_private(page, 0);
 		spin_lock(&info->lock);
 
 		if (!page) {
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode)
 			diroff = 0;
 		}
 		subdir = dir[diroff];
-		if (subdir && subdir->nr_swapped) {
+		if (subdir && page_private(subdir)) {
 			size = limit - idx;
 			if (size > ENTRIES_PER_PAGE)
 				size = ENTRIES_PER_PAGE;
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode)
 			nr_swaps_freed += freed;
 			if (offset)
 				spin_lock(&info->lock);
-			subdir->nr_swapped -= freed;
+			set_page_private(subdir, page_private(subdir) - freed);
 			if (offset)
 				spin_unlock(&info->lock);
-			BUG_ON(subdir->nr_swapped > offset);
+			BUG_ON(page_private(subdir) > offset);
 		}
 		if (offset)
 			offset = 0;
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
 			dir = shmem_dir_map(subdir);
 		}
 		subdir = *dir;
-		if (subdir && subdir->nr_swapped) {
+		if (subdir && page_private(subdir)) {
 			ptr = shmem_swp_map(subdir);
 			size = limit - idx;
 			if (size > ENTRIES_PER_PAGE)
diff --git a/mm/swap.c b/mm/swap.c
index 21d15f99805c..b89512877ec2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -39,7 +39,7 @@ int page_cluster;
 void put_page(struct page *page)
 {
 	if (unlikely(PageCompound(page))) {
-		page = (struct page *)page->private;
+		page = (struct page *)page_private(page);
 		if (put_page_testzero(page)) {
 			void (*dtor)(struct page *page);
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 132164f7d0a7..cafc1edcbeba 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 			page_cache_get(page);
 			SetPageLocked(page);
 			SetPageSwapCache(page);
-			page->private = entry.val;
+			set_page_private(page, entry.val);
 			total_swapcache_pages++;
 			pagecache_acct(1);
 		}
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
-	radix_tree_delete(&swapper_space.page_tree, page->private);
-	page->private = 0;
+	radix_tree_delete(&swapper_space.page_tree, page_private(page));
+	set_page_private(page, 0);
 	ClearPageSwapCache(page);
 	total_swapcache_pages--;
 	pagecache_acct(-1);
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page)
 {
 	swp_entry_t entry;
 
-	entry.val = page->private;
+	entry.val = page_private(page);
 
 	write_lock_irq(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 510f0039b000..8970c0b74194 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 	swp_entry_t entry;
 
 	down_read(&swap_unplug_sem);
-	entry.val = page->private;
+	entry.val = page_private(page);
 	if (PageSwapCache(page)) {
 		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
 		struct backing_dev_info *bdi;
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 		/*
 		 * If the page is removed from swapcache from under us (with a
 		 * racy try_to_unuse/swapoff) we need an additional reference
-		 * count to avoid reading garbage from page->private above. If
-		 * the WARN_ON triggers during a swapoff it maybe the race
+		 * count to avoid reading garbage from page_private(page) above.
+		 * If the WARN_ON triggers during a swapoff it maybe the race
 		 * condition and it's harmless. However if it triggers without
 		 * swapoff it signals a problem.
 		 */
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page)
 	struct swap_info_struct *p;
 	swp_entry_t entry;
 
-	entry.val = page->private;
+	entry.val = page_private(page);
 	p = swap_info_get(entry);
 	if (p) {
 		/* Subtract the 1 for the swap cache itself */
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page)
 	if (page_count(page) != 2) /* 2: us + cache */
 		return 0;
 
-	entry.val = page->private;
+	entry.val = page_private(page);
 	p = swap_info_get(entry);
 	if (!p)
 		return 0;
@@ -1042,7 +1042,7 @@ int page_queue_congested(struct page *page)
 	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
 
 	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page->private };
+		swp_entry_t entry = { .val = page_private(page) };
 		struct swap_info_struct *sis;
 
 		sis = get_swap_info_struct(swp_type(entry));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 41d1064aabfb..135bf8ca96ee 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -521,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page->private };
+			swp_entry_t swap = { .val = page_private(page) };
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
-- 
cgit v1.2.3


From 4276d32260662d5401a15a0a46e506fb5c8ab563 Mon Sep 17 00:00:00 2001
From: Brian Gerst <bgerst@didntduck.org>
Date: Sun, 30 Oct 2005 14:59:18 -0800
Subject: [PATCH] Remove redundant configs.o

Since CONFIG_IKCONFIG_PROC already depends on CONFIG_IKCONFIG, adding
configs.o again is redundant.

Signed-off-by: Brian Gerst <bgerst@didntduck.org>
Cc: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/Makefile | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index ff4dc02ce170..980b5e454441 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_IKCONFIG) += configs.o
-obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-- 
cgit v1.2.3


From c32b6b8e524d2c337767d312814484d9289550cf Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Sun, 30 Oct 2005 14:59:54 -0800
Subject: [PATCH] create and destroy cpufreq sysfs entries based on cpu
 notifiers

cpufreq entries in sysfs should only be populated when CPU is online state.
 When we either boot with maxcpus=x and then boot the other cpus by echoing
to sysfs online file, these entries should be created and destroyed when
CPU_DEAD is notified.  Same treatement as cache entries under sysfs.

We place the processor in the lowest frequency, so hw managed P-State
transitions can still work on the other threads to save power.

Primary goal was to just make these directories appear/disapper dynamically.

There is one in this patch i had to do, which i really dont like myself but
probably best if someone handling the cpufreq infrastructure could give
this code right treatment if this is not acceptable.  I guess its probably
good for the first cut.

- Converting lock_cpu_hotplug()/unlock_cpu_hotplug() to disable/enable preempt.
  The locking was smack in the middle of the notification path, when the
  hotplug is already holding the lock. I tried another solution to avoid this
  so avoid taking locks if we know we are from notification path. The solution
  was getting very ugly and i decided this was probably good for this iteration
  until someone who understands cpufreq could do a better job than me.

(akpm: export cpucontrol to GPL modules: drivers/cpufreq/cpufreq_stats.c now
does lock_cpu_hotplug())

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Zwane Mwaikambo <zwane@holomorphy.com>
Cc: Greg KH <greg@kroah.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/cpufreq/cpufreq.c       | 69 ++++++++++++++++++++++++++++++++++++++---
 drivers/cpufreq/cpufreq_stats.c | 42 ++++++++++++++++++++++---
 kernel/cpu.c                    |  1 +
 3 files changed, 103 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 346906a96e8b..6c6121b85a54 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -4,6 +4,9 @@
  *  Copyright (C) 2001 Russell King
  *            (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
  *
+ *  Oct 2005 - Ashok Raj <ashok.raj@intel.com>
+ *         		Added handling for CPU hotplug
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -567,6 +570,9 @@ static int cpufreq_add_dev (struct sys_device * sys_dev)
 	unsigned long flags;
 	unsigned int j;
 
+	if (cpu_is_offline(cpu))
+		return 0;
+
 	cpufreq_debug_disable_ratelimit();
 	dprintk("adding CPU %u\n", cpu);
 
@@ -673,7 +679,7 @@ err_out:
 
 nomem_out:
 	module_put(cpufreq_driver->owner);
- module_out:
+module_out:
 	cpufreq_debug_enable_ratelimit();
 	return ret;
 }
@@ -762,7 +768,6 @@ static int cpufreq_remove_dev (struct sys_device * sys_dev)
 	down(&data->lock);
 	if (cpufreq_driver->target)
 		__cpufreq_governor(data, CPUFREQ_GOV_STOP);
-	cpufreq_driver->target = NULL;
 	up(&data->lock);
 
 	kobject_unregister(&data->kobj);
@@ -1109,17 +1114,30 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
 			    unsigned int relation)
 {
 	int retval = -EINVAL;
-	lock_cpu_hotplug();
+
+	/*
+	 * Converted the lock_cpu_hotplug to preempt_disable()
+	 * and preempt_enable(). This is a bit kludgy and relies on how cpu
+	 * hotplug works. All we need is a guarantee that cpu hotplug won't make
+	 * progress on any cpu. Once we do preempt_disable(), this would ensure
+	 * that hotplug threads don't get onto this cpu, thereby delaying
+	 * the cpu remove process.
+	 *
+	 * We removed the lock_cpu_hotplug since we need to call this function
+	 * via cpu hotplug callbacks, which result in locking the cpu hotplug
+	 * thread itself. Agree this is not very clean, cpufreq community
+	 * could improve this if required. - Ashok Raj <ashok.raj@intel.com>
+	 */
+	preempt_disable();
 	dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu,
 		target_freq, relation);
 	if (cpu_online(policy->cpu) && cpufreq_driver->target)
 		retval = cpufreq_driver->target(policy, target_freq, relation);
-	unlock_cpu_hotplug();
+	preempt_enable();
 	return retval;
 }
 EXPORT_SYMBOL_GPL(__cpufreq_driver_target);
 
-
 int cpufreq_driver_target(struct cpufreq_policy *policy,
 			  unsigned int target_freq,
 			  unsigned int relation)
@@ -1406,6 +1424,45 @@ int cpufreq_update_policy(unsigned int cpu)
 }
 EXPORT_SYMBOL(cpufreq_update_policy);
 
+static int __cpuinit cpufreq_cpu_callback(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct cpufreq_policy *policy;
+	struct sys_device *sys_dev;
+
+	sys_dev = get_cpu_sysdev(cpu);
+
+	if (sys_dev) {
+		switch (action) {
+		case CPU_ONLINE:
+			cpufreq_add_dev(sys_dev);
+			break;
+		case CPU_DOWN_PREPARE:
+			/*
+			 * We attempt to put this cpu in lowest frequency
+			 * possible before going down. This will permit
+			 * hardware-managed P-State to switch other related
+			 * threads to min or higher speeds if possible.
+			 */
+			policy = cpufreq_cpu_data[cpu];
+			if (policy) {
+				cpufreq_driver_target(policy, policy->min,
+						CPUFREQ_RELATION_H);
+			}
+			break;
+		case CPU_DEAD:
+			cpufreq_remove_dev(sys_dev);
+			break;
+		}
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_cpu_notifier =
+{
+    .notifier_call = cpufreq_cpu_callback,
+};
 
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
@@ -1466,6 +1523,7 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
 	}
 
 	if (!ret) {
+		register_cpu_notifier(&cpufreq_cpu_notifier);
 		dprintk("driver %s up and running\n", driver_data->name);
 		cpufreq_debug_enable_ratelimit();
 	}
@@ -1497,6 +1555,7 @@ int cpufreq_unregister_driver(struct cpufreq_driver *driver)
 	dprintk("unregistering driver %s\n", driver->name);
 
 	sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
+	unregister_cpu_notifier(&cpufreq_cpu_notifier);
 
 	spin_lock_irqsave(&cpufreq_driver_lock, flags);
 	cpufreq_driver = NULL;
diff --git a/drivers/cpufreq/cpufreq_stats.c b/drivers/cpufreq/cpufreq_stats.c
index 741b6b191e6a..3597f25d5efa 100644
--- a/drivers/cpufreq/cpufreq_stats.c
+++ b/drivers/cpufreq/cpufreq_stats.c
@@ -19,6 +19,7 @@
 #include <linux/percpu.h>
 #include <linux/kobject.h>
 #include <linux/spinlock.h>
+#include <linux/notifier.h>
 #include <asm/cputime.h>
 
 static spinlock_t cpufreq_stats_lock;
@@ -298,6 +299,27 @@ cpufreq_stat_notifier_trans (struct notifier_block *nb, unsigned long val,
 	return 0;
 }
 
+static int __cpuinit cpufreq_stat_cpu_callback(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	switch (action) {
+	case CPU_ONLINE:
+		cpufreq_update_policy(cpu);
+		break;
+	case CPU_DEAD:
+		cpufreq_stats_free_table(cpu);
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block cpufreq_stat_cpu_notifier =
+{
+	.notifier_call = cpufreq_stat_cpu_callback,
+};
+
 static struct notifier_block notifier_policy_block = {
 	.notifier_call = cpufreq_stat_notifier_policy
 };
@@ -311,6 +333,7 @@ __init cpufreq_stats_init(void)
 {
 	int ret;
 	unsigned int cpu;
+
 	spin_lock_init(&cpufreq_stats_lock);
 	if ((ret = cpufreq_register_notifier(&notifier_policy_block,
 				CPUFREQ_POLICY_NOTIFIER)))
@@ -323,20 +346,31 @@ __init cpufreq_stats_init(void)
 		return ret;
 	}
 
-	for_each_cpu(cpu)
-		cpufreq_update_policy(cpu);
+	register_cpu_notifier(&cpufreq_stat_cpu_notifier);
+	lock_cpu_hotplug();
+	for_each_online_cpu(cpu) {
+		cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_ONLINE,
+			(void *)(long)cpu);
+	}
+	unlock_cpu_hotplug();
 	return 0;
 }
 static void
 __exit cpufreq_stats_exit(void)
 {
 	unsigned int cpu;
+
 	cpufreq_unregister_notifier(&notifier_policy_block,
 			CPUFREQ_POLICY_NOTIFIER);
 	cpufreq_unregister_notifier(&notifier_trans_block,
 			CPUFREQ_TRANSITION_NOTIFIER);
-	for_each_cpu(cpu)
-		cpufreq_stats_free_table(cpu);
+	unregister_cpu_notifier(&cpufreq_stat_cpu_notifier);
+	lock_cpu_hotplug();
+	for_each_online_cpu(cpu) {
+		cpufreq_stat_cpu_callback(&cpufreq_stat_cpu_notifier, CPU_DEAD,
+			(void *)(long)cpu);
+	}
+	unlock_cpu_hotplug();
 }
 
 MODULE_AUTHOR ("Zou Nan hai <nanhai.zou@intel.com>");
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 53d8263ae12e..3619e939182e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -17,6 +17,7 @@
 
 /* This protects CPUs going up and down... */
 DECLARE_MUTEX(cpucontrol);
+EXPORT_SYMBOL_GPL(cpucontrol);
 
 static struct notifier_block *cpu_chain;
 
-- 
cgit v1.2.3


From 351619baf9878731b4272fa10dda0f84f5582241 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 30 Oct 2005 14:59:55 -0800
Subject: [PATCH] swsusp: rework image freeing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The following patch makes swsusp use PG_nosave and PG_nosave_free flags to
mark pages that should be freed after the state of the system has been
restored from the image (or in case of an error during suspend).

This allows us to avoid storing metadata in swap twice and to reduce the
amount of memory needed by swsusp.  �Additionally, it allows us to simplify
the code by removing a couple of functions that are no longer necessary.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 159 ++++++++++++++++++++++++--------------------------
 1 file changed, 75 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 016504ccfccf..ae46506e2137 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -708,24 +708,28 @@ static void count_data_pages(void)
 	}
 }
 
-
 static void copy_data_pages(void)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
-	struct pbe * pbe = pagedir_nosave;
+	struct pbe *pbe = pagedir_nosave, *p;
 
 	pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
 		mark_free_pages(zone);
+		/* This is necessary for swsusp_free() */
+		for_each_pb_page (p, pagedir_nosave)
+			SetPageNosaveFree(virt_to_page(p));
+		for_each_pbe(p, pagedir_nosave)
+			SetPageNosaveFree(virt_to_page(p->address));
 		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
 			if (saveable(zone, &zone_pfn)) {
 				struct page * page;
 				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
 				BUG_ON(!pbe);
-				pbe->orig_address = (long) page_address(page);
+				pbe->orig_address = (unsigned long)page_address(page);
 				/* copy_page is not usable for copying task structs. */
 				memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
 				pbe = pbe->next;
@@ -736,15 +740,6 @@ static void copy_data_pages(void)
 }
 
 
-/**
- *	calc_nr - Determine the number of pages needed for a pbe list.
- */
-
-static int calc_nr(int nr_copy)
-{
-	return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1);
-}
-
 /**
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
@@ -755,6 +750,8 @@ static inline void free_pagedir(struct pbe *pblist)
 
 	while (pblist) {
 		pbe = (pblist + PB_PAGE_SKIP)->next;
+		ClearPageNosave(virt_to_page(pblist));
+		ClearPageNosaveFree(virt_to_page(pblist));
 		free_page((unsigned long)pblist);
 		pblist = pbe;
 	}
@@ -800,6 +797,16 @@ static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
 	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
 }
 
+static void *alloc_image_page(void)
+{
+	void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+	if (res) {
+		SetPageNosave(virt_to_page(res));
+		SetPageNosaveFree(virt_to_page(res));
+	}
+	return res;
+}
+
 /**
  *	alloc_pagedir - Allocate the page directory.
  *
@@ -822,11 +829,11 @@ static struct pbe * alloc_pagedir(unsigned nr_pages)
 		return NULL;
 
 	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
-	pblist = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+	pblist = (struct pbe *)alloc_image_page();
 	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
         		pbe = pbe->next, num += PBES_PER_PAGE) {
 		pbe += PB_PAGE_SKIP;
-		pbe->next = (struct pbe *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+		pbe->next = (struct pbe *)alloc_image_page();
 	}
 	if (!pbe) { /* get_zeroed_page() failed */
 		free_pagedir(pblist);
@@ -836,51 +843,29 @@ static struct pbe * alloc_pagedir(unsigned nr_pages)
 }
 
 /**
- *	free_image_pages - Free pages allocated for snapshot
+ * Free pages we allocated for suspend. Suspend pages are alocated
+ * before atomic copy, so we need to free them after resume.
  */
 
-static void free_image_pages(void)
+void swsusp_free(void)
 {
-	struct pbe * p;
+	struct zone *zone;
+	unsigned long zone_pfn;
 
-	for_each_pbe (p, pagedir_save) {
-		if (p->address) {
-			ClearPageNosave(virt_to_page(p->address));
-			free_page(p->address);
-			p->address = 0;
-		}
+	for_each_zone(zone) {
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
+				struct page * page;
+				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+				if (PageNosave(page) && PageNosaveFree(page)) {
+					ClearPageNosave(page);
+					ClearPageNosaveFree(page);
+					free_page((long) page_address(page));
+				}
+			}
 	}
 }
 
-/**
- *	alloc_image_pages - Allocate pages for the snapshot.
- */
-
-static int alloc_image_pages(void)
-{
-	struct pbe * p;
-
-	for_each_pbe (p, pagedir_save) {
-		p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
-		if (!p->address)
-			return -ENOMEM;
-		SetPageNosave(virt_to_page(p->address));
-	}
-	return 0;
-}
-
-/* Free pages we allocated for suspend. Suspend pages are alocated
- * before atomic copy, so we need to free them after resume.
- */
-void swsusp_free(void)
-{
-	BUG_ON(PageNosave(virt_to_page(pagedir_save)));
-	BUG_ON(PageNosaveFree(virt_to_page(pagedir_save)));
-	free_image_pages();
-	free_pagedir(pagedir_save);
-}
-
-
 /**
  *	enough_free_mem - Make sure we enough free memory to snapshot.
  *
@@ -890,12 +875,9 @@ void swsusp_free(void)
 
 static int enough_free_mem(void)
 {
-	if (nr_free_pages() < (nr_copy_pages + PAGES_FOR_IO)) {
-		pr_debug("swsusp: Not enough free pages: Have %d\n",
-			 nr_free_pages());
-		return 0;
-	}
-	return 1;
+	pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
+	return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO +
+		nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE));
 }
 
 
@@ -914,33 +896,16 @@ static int enough_swap(void)
 	struct sysinfo i;
 
 	si_swapinfo(&i);
-	if (i.freeswap < (nr_copy_pages + PAGES_FOR_IO))  {
-		pr_debug("swsusp: Not enough swap. Need %ld\n",i.freeswap);
-		return 0;
-	}
-	return 1;
+	pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
+	return i.freeswap > (nr_copy_pages + PAGES_FOR_IO +
+		nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE));
 }
 
 static int swsusp_alloc(void)
 {
-	int error;
+	struct pbe * p;
 
 	pagedir_nosave = NULL;
-	nr_copy_pages = calc_nr(nr_copy_pages);
-	nr_copy_pages_check = nr_copy_pages;
-
-	pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
-		 nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
-
-	if (!enough_free_mem())
-		return -ENOMEM;
-
-	if (!enough_swap())
-		return -ENOSPC;
-
-	if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE +
-	    !!(nr_copy_pages % PBES_PER_PAGE))
-		return -ENOSPC;
 
 	if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
@@ -948,10 +913,14 @@ static int swsusp_alloc(void)
 	}
 	create_pbe_list(pagedir_save, nr_copy_pages);
 	pagedir_nosave = pagedir_save;
-	if ((error = alloc_image_pages())) {
-		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
-		swsusp_free();
-		return error;
+
+	for_each_pbe (p, pagedir_save) {
+		p->address = (unsigned long)alloc_image_page();
+		if (!p->address) {
+			printk(KERN_ERR "suspend: Allocating image pages failed.\n");
+			swsusp_free();
+			return -ENOMEM;
+		}
 	}
 
 	return 0;
@@ -963,7 +932,7 @@ static int suspend_prepare_image(void)
 
 	pr_debug("swsusp: critical section: \n");
 	if (save_highmem()) {
-		printk(KERN_CRIT "Suspend machine: Not enough free pages for highmem\n");
+		printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n");
 		restore_highmem();
 		return -ENOMEM;
 	}
@@ -971,6 +940,28 @@ static int suspend_prepare_image(void)
 	drain_local_pages();
 	count_data_pages();
 	printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
+	nr_copy_pages_check = nr_copy_pages;
+
+	pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
+		 nr_copy_pages,
+		 nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE),
+		 PAGES_FOR_IO, nr_free_pages());
+
+	if (!enough_free_mem()) {
+		printk(KERN_ERR "swsusp: Not enough free memory\n");
+		return -ENOMEM;
+	}
+
+	if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE +
+	    !!(nr_copy_pages % PBES_PER_PAGE)) {
+		printk(KERN_ERR "swsusp: Too many image pages\n");
+		return -ENOSPC;
+	}
+
+	if (!enough_swap()) {
+		printk(KERN_ERR "swsusp: Not enough free swap\n");
+		return -ENOSPC;
+	}
 
 	error = swsusp_alloc();
 	if (error)
-- 
cgit v1.2.3


From 25761b6eb7b33823bcfff6bfe2a015badcd76fb8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 30 Oct 2005 14:59:56 -0800
Subject: [PATCH] swsusp: move snapshot functionality to separate file

The following patch moves the functionality of swsusp related to creating and
handling the snapshot of memory to a separate file, snapshot.c

This should enable us to untangle the code in the future and eventually to
implement some parts of swsusp.c in the user space.

The patch does not change the code.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/suspend.h |   6 +
 kernel/power/Makefile   |   2 +-
 kernel/power/power.h    |  17 ++
 kernel/power/snapshot.c | 464 ++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/power/swsusp.c   | 440 +--------------------------------------------
 5 files changed, 492 insertions(+), 437 deletions(-)
 create mode 100644 kernel/power/snapshot.c

(limited to 'kernel')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index ba448c760168..380915e9563d 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -74,4 +74,10 @@ void __restore_processor_state(struct saved_context *ctxt);
 extern unsigned long get_usable_page(gfp_t gfp_mask);
 extern void free_eaten_memory(void);
 
+/*
+ * XXX: We try to keep some more pages free so that I/O operations succeed
+ * without paging. Might this be more?
+ */
+#define PAGES_FOR_IO	512
+
 #endif /* _LINUX_SWSUSP_H */
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 2f438d0eaa13..c71eb4579c07 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -4,7 +4,7 @@ EXTRA_CFLAGS	+=	-DDEBUG
 endif
 
 obj-y				:= main.o process.o console.o pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o
 
 obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 6748de23e83c..e54dd8435de7 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,3 +53,20 @@ extern void thaw_processes(void);
 
 extern int pm_prepare_console(void);
 extern void pm_restore_console(void);
+
+
+/* References to section boundaries */
+extern const void __nosave_begin, __nosave_end;
+
+extern unsigned int nr_copy_pages;
+extern suspend_pagedir_t *pagedir_nosave;
+extern suspend_pagedir_t *pagedir_save;
+
+extern asmlinkage int swsusp_arch_suspend(void);
+extern asmlinkage int swsusp_arch_resume(void);
+
+extern int restore_highmem(void);
+extern void free_pagedir(struct pbe *pblist);
+extern struct pbe * alloc_pagedir(unsigned nr_pages);
+extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
+extern int enough_swap(void);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
new file mode 100644
index 000000000000..0f0a7f306b0d
--- /dev/null
+++ b/kernel/power/snapshot.c
@@ -0,0 +1,464 @@
+/*
+ * linux/kernel/power/swsusp.c
+ *
+ * This file is to realize architecture-independent
+ * machine suspend feature using pretty near only high-level routines
+ *
+ * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
+ *
+ * This file is released under the GPLv2, and is based on swsusp.c.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/suspend.h>
+#include <linux/smp_lock.h>
+#include <linux/file.h>
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/bitops.h>
+#include <linux/vt_kern.h>
+#include <linux/kbd_kern.h>
+#include <linux/keyboard.h>
+#include <linux/spinlock.h>
+#include <linux/genhd.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/swap.h>
+#include <linux/pm.h>
+#include <linux/device.h>
+#include <linux/buffer_head.h>
+#include <linux/swapops.h>
+#include <linux/bootmem.h>
+#include <linux/syscalls.h>
+#include <linux/console.h>
+#include <linux/highmem.h>
+#include <linux/bio.h>
+#include <linux/mount.h>
+
+#include <asm/uaccess.h>
+#include <asm/mmu_context.h>
+#include <asm/pgtable.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+
+#include <linux/random.h>
+#include <linux/crypto.h>
+#include <asm/scatterlist.h>
+
+#include "power.h"
+
+
+
+
+#ifdef CONFIG_HIGHMEM
+struct highmem_page {
+	char *data;
+	struct page *page;
+	struct highmem_page *next;
+};
+
+static struct highmem_page *highmem_copy;
+
+static int save_highmem_zone(struct zone *zone)
+{
+	unsigned long zone_pfn;
+	mark_free_pages(zone);
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		struct highmem_page *save;
+		void *kaddr;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+		if (!(pfn%1000))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		/*
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
+		 */
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
+			continue;
+		}
+		BUG_ON(PageNosave(page));
+		if (PageNosaveFree(page))
+			continue;
+		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+		if (!save)
+			return -ENOMEM;
+		save->next = highmem_copy;
+		save->page = page;
+		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+		if (!save->data) {
+			kfree(save);
+			return -ENOMEM;
+		}
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(save->data, kaddr, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		highmem_copy = save;
+	}
+	return 0;
+}
+#endif /* CONFIG_HIGHMEM */
+
+
+static int save_highmem(void)
+{
+#ifdef CONFIG_HIGHMEM
+	struct zone *zone;
+	int res = 0;
+
+	pr_debug("swsusp: Saving Highmem\n");
+	for_each_zone (zone) {
+		if (is_highmem(zone))
+			res = save_highmem_zone(zone);
+		if (res)
+			return res;
+	}
+#endif
+	return 0;
+}
+
+int restore_highmem(void)
+{
+#ifdef CONFIG_HIGHMEM
+	printk("swsusp: Restoring Highmem\n");
+	while (highmem_copy) {
+		struct highmem_page *save = highmem_copy;
+		void *kaddr;
+		highmem_copy = save->next;
+
+		kaddr = kmap_atomic(save->page, KM_USER0);
+		memcpy(kaddr, save->data, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long) save->data);
+		kfree(save);
+	}
+#endif
+	return 0;
+}
+
+
+static int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
+
+/**
+ *	saveable - Determine whether a page should be cloned or not.
+ *	@pfn:	The page
+ *
+ *	We save a page if it's Reserved, and not in the range of pages
+ *	statically defined as 'unsaveable', or if it isn't reserved, and
+ *	isn't part of a free chunk of pages.
+ */
+
+static int saveable(struct zone * zone, unsigned long * zone_pfn)
+{
+	unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
+	struct page * page;
+
+	if (!pfn_valid(pfn))
+		return 0;
+
+	page = pfn_to_page(pfn);
+	BUG_ON(PageReserved(page) && PageNosave(page));
+	if (PageNosave(page))
+		return 0;
+	if (PageReserved(page) && pfn_is_nosave(pfn)) {
+		pr_debug("[nosave pfn 0x%lx]", pfn);
+		return 0;
+	}
+	if (PageNosaveFree(page))
+		return 0;
+
+	return 1;
+}
+
+static void count_data_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+
+	nr_copy_pages = 0;
+
+	for_each_zone (zone) {
+		if (is_highmem(zone))
+			continue;
+		mark_free_pages(zone);
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			nr_copy_pages += saveable(zone, &zone_pfn);
+	}
+}
+
+static void copy_data_pages(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe *pbe = pagedir_nosave, *p;
+
+	pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
+	for_each_zone (zone) {
+		if (is_highmem(zone))
+			continue;
+		mark_free_pages(zone);
+		/* This is necessary for swsusp_free() */
+		for_each_pb_page (p, pagedir_nosave)
+			SetPageNosaveFree(virt_to_page(p));
+		for_each_pbe(p, pagedir_nosave)
+			SetPageNosaveFree(virt_to_page(p->address));
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+			if (saveable(zone, &zone_pfn)) {
+				struct page * page;
+				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+				BUG_ON(!pbe);
+				pbe->orig_address = (unsigned long)page_address(page);
+				/* copy_page is not usable for copying task structs. */
+				memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
+				pbe = pbe->next;
+			}
+		}
+	}
+	BUG_ON(pbe);
+}
+
+
+/**
+ *	free_pagedir - free pages allocated with alloc_pagedir()
+ */
+
+void free_pagedir(struct pbe *pblist)
+{
+	struct pbe *pbe;
+
+	while (pblist) {
+		pbe = (pblist + PB_PAGE_SKIP)->next;
+		ClearPageNosave(virt_to_page(pblist));
+		ClearPageNosaveFree(virt_to_page(pblist));
+		free_page((unsigned long)pblist);
+		pblist = pbe;
+	}
+}
+
+/**
+ *	fill_pb_page - Create a list of PBEs on a given memory page
+ */
+
+static inline void fill_pb_page(struct pbe *pbpage)
+{
+	struct pbe *p;
+
+	p = pbpage;
+	pbpage += PB_PAGE_SKIP;
+	do
+		p->next = p + 1;
+	while (++p < pbpage);
+}
+
+/**
+ *	create_pbe_list - Create a list of PBEs on top of a given chain
+ *	of memory pages allocated with alloc_pagedir()
+ */
+
+void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
+{
+	struct pbe *pbpage, *p;
+	unsigned num = PBES_PER_PAGE;
+
+	for_each_pb_page (pbpage, pblist) {
+		if (num >= nr_pages)
+			break;
+
+		fill_pb_page(pbpage);
+		num += PBES_PER_PAGE;
+	}
+	if (pbpage) {
+		for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
+			p->next = p + 1;
+		p->next = NULL;
+	}
+	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
+}
+
+static void *alloc_image_page(void)
+{
+	void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+	if (res) {
+		SetPageNosave(virt_to_page(res));
+		SetPageNosaveFree(virt_to_page(res));
+	}
+	return res;
+}
+
+/**
+ *	alloc_pagedir - Allocate the page directory.
+ *
+ *	First, determine exactly how many pages we need and
+ *	allocate them.
+ *
+ *	We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
+ *	struct pbe elements (pbes) and the last element in the page points
+ *	to the next page.
+ *
+ *	On each page we set up a list of struct_pbe elements.
+ */
+
+struct pbe * alloc_pagedir(unsigned nr_pages)
+{
+	unsigned num;
+	struct pbe *pblist, *pbe;
+
+	if (!nr_pages)
+		return NULL;
+
+	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
+	pblist = (struct pbe *)alloc_image_page();
+	/* FIXME: rewrite this ugly loop */
+	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
+        		pbe = pbe->next, num += PBES_PER_PAGE) {
+		pbe += PB_PAGE_SKIP;
+		pbe->next = (struct pbe *)alloc_image_page();
+	}
+	if (!pbe) { /* get_zeroed_page() failed */
+		free_pagedir(pblist);
+		pblist = NULL;
+        }
+	return pblist;
+}
+
+/**
+ * Free pages we allocated for suspend. Suspend pages are alocated
+ * before atomic copy, so we need to free them after resume.
+ */
+
+void swsusp_free(void)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+
+	for_each_zone(zone) {
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
+				struct page * page;
+				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
+				if (PageNosave(page) && PageNosaveFree(page)) {
+					ClearPageNosave(page);
+					ClearPageNosaveFree(page);
+					free_page((long) page_address(page));
+				}
+			}
+	}
+}
+
+
+/**
+ *	enough_free_mem - Make sure we enough free memory to snapshot.
+ *
+ *	Returns TRUE or FALSE after checking the number of available
+ *	free pages.
+ */
+
+static int enough_free_mem(void)
+{
+	pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
+	return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO +
+		nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE));
+}
+
+
+static int swsusp_alloc(void)
+{
+	struct pbe * p;
+
+	pagedir_nosave = NULL;
+
+	if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE +
+	    !!(nr_copy_pages % PBES_PER_PAGE))
+		return -ENOSPC;
+
+	if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
+		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
+		return -ENOMEM;
+	}
+	create_pbe_list(pagedir_save, nr_copy_pages);
+	pagedir_nosave = pagedir_save;
+
+	for_each_pbe (p, pagedir_save) {
+		p->address = (unsigned long)alloc_image_page();
+		if (!p->address) {
+			printk(KERN_ERR "suspend: Allocating image pages failed.\n");
+			swsusp_free();
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static int suspend_prepare_image(void)
+{
+	int error;
+
+	pr_debug("swsusp: critical section: \n");
+	if (save_highmem()) {
+		printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n");
+		restore_highmem();
+		return -ENOMEM;
+	}
+
+	drain_local_pages();
+	count_data_pages();
+	printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
+
+	pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
+		 nr_copy_pages,
+		 nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE),
+		 PAGES_FOR_IO, nr_free_pages());
+
+	if (!enough_free_mem()) {
+		printk(KERN_ERR "swsusp: Not enough free memory\n");
+		return -ENOMEM;
+	}
+
+	if (!enough_swap()) {
+		printk(KERN_ERR "swsusp: Not enough free swap\n");
+		return -ENOSPC;
+	}
+
+	error = swsusp_alloc();
+	if (error)
+		return error;
+
+	/* During allocating of suspend pagedir, new cold pages may appear.
+	 * Kill them.
+	 */
+	drain_local_pages();
+	copy_data_pages();
+
+	/*
+	 * End of critical section. From now on, we can write to memory,
+	 * but we should not touch disk. This specially means we must _not_
+	 * touch swap space! Except we must write out our image of course.
+	 */
+
+	printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
+	return 0;
+}
+
+
+asmlinkage int swsusp_save(void)
+{
+	return suspend_prepare_image();
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index ae46506e2137..fc50b5d2dd26 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -5,7 +5,7 @@
  * machine suspend feature using pretty near only high-level routines
  *
  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
  *
  * This file is released under the GPLv2.
  *
@@ -84,16 +84,10 @@
 #define MAXKEY 32
 #define MAXIV  32
 
-/* References to section boundaries */
-extern const void __nosave_begin, __nosave_end;
-
-/* Variables to be preserved over suspend */
-static int nr_copy_pages_check;
-
 extern char resume_file[];
 
 /* Local variables that should not be affected by save */
-static unsigned int nr_copy_pages __nosavedata = 0;
+unsigned int nr_copy_pages __nosavedata = 0;
 
 /* Suspend pagedir is allocated before final copy, therefore it
    must be freed after resume
@@ -109,7 +103,7 @@ static unsigned int nr_copy_pages __nosavedata = 0;
    MMU hardware.
  */
 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
-static suspend_pagedir_t *pagedir_save;
+suspend_pagedir_t *pagedir_save;
 
 #define SWSUSP_SIG	"S1SUSPEND"
 
@@ -123,12 +117,6 @@ static struct swsusp_header {
 
 static struct swsusp_info swsusp_info;
 
-/*
- * XXX: We try to keep some more pages free so that I/O operations succeed
- * without paging. Might this be more?
- */
-#define PAGES_FOR_IO	512
-
 /*
  * Saving part...
  */
@@ -552,335 +540,6 @@ static int write_suspend_image(void)
 	goto Done;
 }
 
-
-#ifdef CONFIG_HIGHMEM
-struct highmem_page {
-	char *data;
-	struct page *page;
-	struct highmem_page *next;
-};
-
-static struct highmem_page *highmem_copy;
-
-static int save_highmem_zone(struct zone *zone)
-{
-	unsigned long zone_pfn;
-	mark_free_pages(zone);
-	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
-		struct page *page;
-		struct highmem_page *save;
-		void *kaddr;
-		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
-
-		if (!(pfn%1000))
-			printk(".");
-		if (!pfn_valid(pfn))
-			continue;
-		page = pfn_to_page(pfn);
-		/*
-		 * PageReserved results from rvmalloc() sans vmalloc_32()
-		 * and architectural memory reservations.
-		 *
-		 * rvmalloc should not cause this, because all implementations
-		 * appear to always be using vmalloc_32 on architectures with
-		 * highmem. This is a good thing, because we would like to save
-		 * rvmalloc pages.
-		 *
-		 * It appears to be triggered by pages which do not point to
-		 * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
-		 * which sets PageReserved if the page does not point to valid
-		 * RAM.
-		 *
-		 * XXX: must remove usage of PageReserved!
-		 */
-		if (PageReserved(page))
-			continue;
-		BUG_ON(PageNosave(page));
-		if (PageNosaveFree(page))
-			continue;
-		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
-		if (!save)
-			return -ENOMEM;
-		save->next = highmem_copy;
-		save->page = page;
-		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
-		if (!save->data) {
-			kfree(save);
-			return -ENOMEM;
-		}
-		kaddr = kmap_atomic(page, KM_USER0);
-		memcpy(save->data, kaddr, PAGE_SIZE);
-		kunmap_atomic(kaddr, KM_USER0);
-		highmem_copy = save;
-	}
-	return 0;
-}
-#endif /* CONFIG_HIGHMEM */
-
-
-static int save_highmem(void)
-{
-#ifdef CONFIG_HIGHMEM
-	struct zone *zone;
-	int res = 0;
-
-	pr_debug("swsusp: Saving Highmem\n");
-	for_each_zone (zone) {
-		if (is_highmem(zone))
-			res = save_highmem_zone(zone);
-		if (res)
-			return res;
-	}
-#endif
-	return 0;
-}
-
-static int restore_highmem(void)
-{
-#ifdef CONFIG_HIGHMEM
-	printk("swsusp: Restoring Highmem\n");
-	while (highmem_copy) {
-		struct highmem_page *save = highmem_copy;
-		void *kaddr;
-		highmem_copy = save->next;
-
-		kaddr = kmap_atomic(save->page, KM_USER0);
-		memcpy(kaddr, save->data, PAGE_SIZE);
-		kunmap_atomic(kaddr, KM_USER0);
-		free_page((long) save->data);
-		kfree(save);
-	}
-#endif
-	return 0;
-}
-
-
-static int pfn_is_nosave(unsigned long pfn)
-{
-	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
-	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
-	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
-}
-
-/**
- *	saveable - Determine whether a page should be cloned or not.
- *	@pfn:	The page
- *
- *	We save a page if it's Reserved, and not in the range of pages
- *	statically defined as 'unsaveable', or if it isn't reserved, and
- *	isn't part of a free chunk of pages.
- */
-
-static int saveable(struct zone * zone, unsigned long * zone_pfn)
-{
-	unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
-	struct page * page;
-
-	if (!pfn_valid(pfn))
-		return 0;
-
-	page = pfn_to_page(pfn);
-	if (PageNosave(page))
-		return 0;
-	if (pfn_is_nosave(pfn)) {
-		pr_debug("[nosave pfn 0x%lx]", pfn);
-		return 0;
-	}
-	if (PageNosaveFree(page))
-		return 0;
-
-	return 1;
-}
-
-static void count_data_pages(void)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-
-	nr_copy_pages = 0;
-
-	for_each_zone (zone) {
-		if (is_highmem(zone))
-			continue;
-		mark_free_pages(zone);
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			nr_copy_pages += saveable(zone, &zone_pfn);
-	}
-}
-
-static void copy_data_pages(void)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-	struct pbe *pbe = pagedir_nosave, *p;
-
-	pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
-	for_each_zone (zone) {
-		if (is_highmem(zone))
-			continue;
-		mark_free_pages(zone);
-		/* This is necessary for swsusp_free() */
-		for_each_pb_page (p, pagedir_nosave)
-			SetPageNosaveFree(virt_to_page(p));
-		for_each_pbe(p, pagedir_nosave)
-			SetPageNosaveFree(virt_to_page(p->address));
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
-			if (saveable(zone, &zone_pfn)) {
-				struct page * page;
-				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
-				BUG_ON(!pbe);
-				pbe->orig_address = (unsigned long)page_address(page);
-				/* copy_page is not usable for copying task structs. */
-				memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE);
-				pbe = pbe->next;
-			}
-		}
-	}
-	BUG_ON(pbe);
-}
-
-
-/**
- *	free_pagedir - free pages allocated with alloc_pagedir()
- */
-
-static inline void free_pagedir(struct pbe *pblist)
-{
-	struct pbe *pbe;
-
-	while (pblist) {
-		pbe = (pblist + PB_PAGE_SKIP)->next;
-		ClearPageNosave(virt_to_page(pblist));
-		ClearPageNosaveFree(virt_to_page(pblist));
-		free_page((unsigned long)pblist);
-		pblist = pbe;
-	}
-}
-
-/**
- *	fill_pb_page - Create a list of PBEs on a given memory page
- */
-
-static inline void fill_pb_page(struct pbe *pbpage)
-{
-	struct pbe *p;
-
-	p = pbpage;
-	pbpage += PB_PAGE_SKIP;
-	do
-		p->next = p + 1;
-	while (++p < pbpage);
-}
-
-/**
- *	create_pbe_list - Create a list of PBEs on top of a given chain
- *	of memory pages allocated with alloc_pagedir()
- */
-
-static void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
-{
-	struct pbe *pbpage, *p;
-	unsigned num = PBES_PER_PAGE;
-
-	for_each_pb_page (pbpage, pblist) {
-		if (num >= nr_pages)
-			break;
-
-		fill_pb_page(pbpage);
-		num += PBES_PER_PAGE;
-	}
-	if (pbpage) {
-		for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
-			p->next = p + 1;
-		p->next = NULL;
-	}
-	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
-}
-
-static void *alloc_image_page(void)
-{
-	void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
-	if (res) {
-		SetPageNosave(virt_to_page(res));
-		SetPageNosaveFree(virt_to_page(res));
-	}
-	return res;
-}
-
-/**
- *	alloc_pagedir - Allocate the page directory.
- *
- *	First, determine exactly how many pages we need and
- *	allocate them.
- *
- *	We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
- *	struct pbe elements (pbes) and the last element in the page points
- *	to the next page.
- *
- *	On each page we set up a list of struct_pbe elements.
- */
-
-static struct pbe * alloc_pagedir(unsigned nr_pages)
-{
-	unsigned num;
-	struct pbe *pblist, *pbe;
-
-	if (!nr_pages)
-		return NULL;
-
-	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
-	pblist = (struct pbe *)alloc_image_page();
-	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
-        		pbe = pbe->next, num += PBES_PER_PAGE) {
-		pbe += PB_PAGE_SKIP;
-		pbe->next = (struct pbe *)alloc_image_page();
-	}
-	if (!pbe) { /* get_zeroed_page() failed */
-		free_pagedir(pblist);
-		pblist = NULL;
-        }
-	return pblist;
-}
-
-/**
- * Free pages we allocated for suspend. Suspend pages are alocated
- * before atomic copy, so we need to free them after resume.
- */
-
-void swsusp_free(void)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-
-	for_each_zone(zone) {
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
-				struct page * page;
-				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
-				if (PageNosave(page) && PageNosaveFree(page)) {
-					ClearPageNosave(page);
-					ClearPageNosaveFree(page);
-					free_page((long) page_address(page));
-				}
-			}
-	}
-}
-
-/**
- *	enough_free_mem - Make sure we enough free memory to snapshot.
- *
- *	Returns TRUE or FALSE after checking the number of available
- *	free pages.
- */
-
-static int enough_free_mem(void)
-{
-	pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
-	return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO +
-		nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE));
-}
-
-
 /**
  *	enough_swap - Make sure we have enough swap to save the image.
  *
@@ -891,7 +550,7 @@ static int enough_free_mem(void)
  *	We should only consider resume_device.
  */
 
-static int enough_swap(void)
+int enough_swap(void)
 {
 	struct sysinfo i;
 
@@ -901,88 +560,6 @@ static int enough_swap(void)
 		nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE));
 }
 
-static int swsusp_alloc(void)
-{
-	struct pbe * p;
-
-	pagedir_nosave = NULL;
-
-	if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
-		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
-		return -ENOMEM;
-	}
-	create_pbe_list(pagedir_save, nr_copy_pages);
-	pagedir_nosave = pagedir_save;
-
-	for_each_pbe (p, pagedir_save) {
-		p->address = (unsigned long)alloc_image_page();
-		if (!p->address) {
-			printk(KERN_ERR "suspend: Allocating image pages failed.\n");
-			swsusp_free();
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-
-static int suspend_prepare_image(void)
-{
-	int error;
-
-	pr_debug("swsusp: critical section: \n");
-	if (save_highmem()) {
-		printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n");
-		restore_highmem();
-		return -ENOMEM;
-	}
-
-	drain_local_pages();
-	count_data_pages();
-	printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
-	nr_copy_pages_check = nr_copy_pages;
-
-	pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
-		 nr_copy_pages,
-		 nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE),
-		 PAGES_FOR_IO, nr_free_pages());
-
-	if (!enough_free_mem()) {
-		printk(KERN_ERR "swsusp: Not enough free memory\n");
-		return -ENOMEM;
-	}
-
-	if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE +
-	    !!(nr_copy_pages % PBES_PER_PAGE)) {
-		printk(KERN_ERR "swsusp: Too many image pages\n");
-		return -ENOSPC;
-	}
-
-	if (!enough_swap()) {
-		printk(KERN_ERR "swsusp: Not enough free swap\n");
-		return -ENOSPC;
-	}
-
-	error = swsusp_alloc();
-	if (error)
-		return error;
-
-	/* During allocating of suspend pagedir, new cold pages may appear.
-	 * Kill them.
-	 */
-	drain_local_pages();
-	copy_data_pages();
-
-	/*
-	 * End of critical section. From now on, we can write to memory,
-	 * but we should not touch disk. This specially means we must _not_
-	 * touch swap space! Except we must write out our image of course.
-	 */
-
-	printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
-	return 0;
-}
-
 
 /* It is important _NOT_ to umount filesystems at this point. We want
  * them synced (in case something goes wrong) but we DO not want to mark
@@ -1002,14 +579,6 @@ int swsusp_write(void)
 }
 
 
-extern asmlinkage int swsusp_arch_suspend(void);
-extern asmlinkage int swsusp_arch_resume(void);
-
-
-asmlinkage int swsusp_save(void)
-{
-	return suspend_prepare_image();
-}
 
 int swsusp_suspend(void)
 {
@@ -1041,7 +610,6 @@ int swsusp_suspend(void)
 		printk(KERN_ERR "Error %d suspending\n", error);
 	/* Restore control flow magically appears here */
 	restore_processor_state();
-	BUG_ON (nr_copy_pages_check != nr_copy_pages);
 	restore_highmem();
 	device_power_up();
 	local_irq_enable();
-- 
cgit v1.2.3


From a0f496517f3e28d651d0cbbcf2d4fb701ed6957e Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 30 Oct 2005 14:59:57 -0800
Subject: [PATCH] swsusp: reduce the use of global variables

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h    |  2 +-
 kernel/power/snapshot.c | 78 ++++++++++++++++++++++++-------------------------
 kernel/power/swsusp.c   |  6 ++--
 3 files changed, 43 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index e54dd8435de7..28afcb090149 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -69,4 +69,4 @@ extern int restore_highmem(void);
 extern void free_pagedir(struct pbe *pblist);
 extern struct pbe * alloc_pagedir(unsigned nr_pages);
 extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
-extern int enough_swap(void);
+extern int enough_swap(unsigned nr_pages);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0f0a7f306b0d..03916cf3ff02 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -187,37 +187,38 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn)
 	return 1;
 }
 
-static void count_data_pages(void)
+static unsigned count_data_pages(void)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
+	unsigned n;
 
-	nr_copy_pages = 0;
-
+	n = 0;
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
 		mark_free_pages(zone);
 		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			nr_copy_pages += saveable(zone, &zone_pfn);
+			n += saveable(zone, &zone_pfn);
 	}
+	return n;
 }
 
-static void copy_data_pages(void)
+static void copy_data_pages(struct pbe *pblist)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
-	struct pbe *pbe = pagedir_nosave, *p;
+	struct pbe *pbe, *p;
 
-	pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
+	pbe = pblist;
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
 		mark_free_pages(zone);
 		/* This is necessary for swsusp_free() */
-		for_each_pb_page (p, pagedir_nosave)
+		for_each_pb_page (p, pblist)
 			SetPageNosaveFree(virt_to_page(p));
-		for_each_pbe(p, pagedir_nosave)
+		for_each_pbe (p, pblist)
 			SetPageNosaveFree(virt_to_page(p->address));
 		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
 			if (saveable(zone, &zone_pfn)) {
@@ -370,46 +371,39 @@ void swsusp_free(void)
  *	free pages.
  */
 
-static int enough_free_mem(void)
+static int enough_free_mem(unsigned nr_pages)
 {
 	pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
-	return nr_free_pages() > (nr_copy_pages + PAGES_FOR_IO +
-		nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE));
+	return nr_free_pages() > (nr_pages + PAGES_FOR_IO +
+		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
 
-static int swsusp_alloc(void)
+static struct pbe *swsusp_alloc(unsigned nr_pages)
 {
-	struct pbe * p;
-
-	pagedir_nosave = NULL;
-
-	if (MAX_PBES < nr_copy_pages / PBES_PER_PAGE +
-	    !!(nr_copy_pages % PBES_PER_PAGE))
-		return -ENOSPC;
+	struct pbe *pblist, *p;
 
-	if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
+	if (!(pblist = alloc_pagedir(nr_pages))) {
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
-		return -ENOMEM;
+		return NULL;
 	}
-	create_pbe_list(pagedir_save, nr_copy_pages);
-	pagedir_nosave = pagedir_save;
+	create_pbe_list(pblist, nr_pages);
 
-	for_each_pbe (p, pagedir_save) {
+	for_each_pbe (p, pblist) {
 		p->address = (unsigned long)alloc_image_page();
 		if (!p->address) {
 			printk(KERN_ERR "suspend: Allocating image pages failed.\n");
 			swsusp_free();
-			return -ENOMEM;
+			return NULL;
 		}
 	}
 
-	return 0;
+	return pblist;
 }
 
 static int suspend_prepare_image(void)
 {
-	int error;
+	unsigned nr_pages;
 
 	pr_debug("swsusp: critical section: \n");
 	if (save_highmem()) {
@@ -419,33 +413,37 @@ static int suspend_prepare_image(void)
 	}
 
 	drain_local_pages();
-	count_data_pages();
-	printk("swsusp: Need to copy %u pages\n", nr_copy_pages);
+	nr_pages = count_data_pages();
+	printk("swsusp: Need to copy %u pages\n", nr_pages);
 
 	pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
-		 nr_copy_pages,
-		 nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE),
+		 nr_pages,
+		 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
 		 PAGES_FOR_IO, nr_free_pages());
 
-	if (!enough_free_mem()) {
+	/* This is needed because of the fixed size of swsusp_info */
+	if (MAX_PBES < (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE)
+		return -ENOSPC;
+
+	if (!enough_free_mem(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free memory\n");
 		return -ENOMEM;
 	}
 
-	if (!enough_swap()) {
+	if (!enough_swap(nr_pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
 	}
 
-	error = swsusp_alloc();
-	if (error)
-		return error;
+	pagedir_nosave = swsusp_alloc(nr_pages);
+	if (!pagedir_nosave)
+		return -ENOMEM;
 
 	/* During allocating of suspend pagedir, new cold pages may appear.
 	 * Kill them.
 	 */
 	drain_local_pages();
-	copy_data_pages();
+	copy_data_pages(pagedir_nosave);
 
 	/*
 	 * End of critical section. From now on, we can write to memory,
@@ -453,7 +451,9 @@ static int suspend_prepare_image(void)
 	 * touch swap space! Except we must write out our image of course.
 	 */
 
-	printk("swsusp: critical section/: done (%d pages copied)\n", nr_copy_pages );
+	nr_copy_pages = nr_pages;
+
+	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
 	return 0;
 }
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index fc50b5d2dd26..f6abfdb0a02a 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -550,14 +550,14 @@ static int write_suspend_image(void)
  *	We should only consider resume_device.
  */
 
-int enough_swap(void)
+int enough_swap(unsigned nr_pages)
 {
 	struct sysinfo i;
 
 	si_swapinfo(&i);
 	pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
-	return i.freeswap > (nr_copy_pages + PAGES_FOR_IO +
-		nr_copy_pages/PBES_PER_PAGE + !!(nr_copy_pages%PBES_PER_PAGE));
+	return i.freeswap > (nr_pages + PAGES_FOR_IO +
+		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
 
-- 
cgit v1.2.3


From 2c1b4a5ca48831595979a850f40ced8e7da026f8 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 30 Oct 2005 14:59:58 -0800
Subject: [PATCH] swsusp: rework memory freeing on resume

The following patch makes swsusp use the PG_nosave and PG_nosave_free flags to
mark pages that should be freed in case of an error during resume.

This allows us to simplify the code and to use swsusp_free() in all of the
swsusp's resume error paths, which makes them actually work.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/suspend.c |  84 ++++++++-------------------------
 include/linux/suspend.h      |   3 +-
 kernel/power/disk.c          |  14 +++---
 kernel/power/power.h         |   2 +-
 kernel/power/snapshot.c      |   2 +-
 kernel/power/swsusp.c        | 110 ++++++++++++++-----------------------------
 6 files changed, 65 insertions(+), 150 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index 02516823f514..fd2bef780882 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -147,57 +147,7 @@ extern int restore_image(void);
 
 pgd_t *temp_level4_pgt;
 
-static void **pages;
-
-static inline void *__add_page(void)
-{
-	void **c;
-
-	c = (void **)get_usable_page(GFP_ATOMIC);
-	if (c) {
-		*c = pages;
-		pages = c;
-	}
-	return c;
-}
-
-static inline void *__next_page(void)
-{
-	void **c;
-
-	c = pages;
-	if (c) {
-		pages = *c;
-		*c = NULL;
-	}
-	return c;
-}
-
-/*
- * Try to allocate as many usable pages as needed and daisy chain them.
- * If one allocation fails, free the pages allocated so far
- */
-static int alloc_usable_pages(unsigned long n)
-{
-	void *p;
-
-	pages = NULL;
-	do
-		if (!__add_page())
-			break;
-	while (--n);
-	if (n) {
-		p = __next_page();
-		while (p) {
-			free_page((unsigned long)p);
-			p = __next_page();
-		}
-		return -ENOMEM;
-	}
-	return 0;
-}
-
-static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
+static int res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long end)
 {
 	long i, j;
 
@@ -211,7 +161,9 @@ static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long e
 		if (paddr >= end)
 			break;
 
-		pmd = (pmd_t *)__next_page();
+		pmd = (pmd_t *)get_safe_page(GFP_ATOMIC);
+		if (!pmd)
+			return -ENOMEM;
 		set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
 		for (j = 0; j < PTRS_PER_PMD; pmd++, j++, paddr += PMD_SIZE) {
 			unsigned long pe;
@@ -223,13 +175,17 @@ static void res_phys_pud_init(pud_t *pud, unsigned long address, unsigned long e
 			set_pmd(pmd, __pmd(pe));
 		}
 	}
+	return 0;
 }
 
-static void set_up_temporary_mappings(void)
+static int set_up_temporary_mappings(void)
 {
 	unsigned long start, end, next;
+	int error;
 
-	temp_level4_pgt = (pgd_t *)__next_page();
+	temp_level4_pgt = (pgd_t *)get_safe_page(GFP_ATOMIC);
+	if (!temp_level4_pgt)
+		return -ENOMEM;
 
 	/* It is safe to reuse the original kernel mapping */
 	set_pgd(temp_level4_pgt + pgd_index(__START_KERNEL_map),
@@ -240,29 +196,27 @@ static void set_up_temporary_mappings(void)
 	end = (unsigned long)pfn_to_kaddr(end_pfn);
 
 	for (; start < end; start = next) {
-		pud_t *pud = (pud_t *)__next_page();
+		pud_t *pud = (pud_t *)get_safe_page(GFP_ATOMIC);
+		if (!pud)
+			return -ENOMEM;
 		next = start + PGDIR_SIZE;
 		if (next > end)
 			next = end;
-		res_phys_pud_init(pud, __pa(start), __pa(next));
+		if ((error = res_phys_pud_init(pud, __pa(start), __pa(next))))
+			return error;
 		set_pgd(temp_level4_pgt + pgd_index(start),
 			mk_kernel_pgd(__pa(pud)));
 	}
+	return 0;
 }
 
 int swsusp_arch_resume(void)
 {
-	unsigned long n;
+	int error;
 
-	n = ((end_pfn << PAGE_SHIFT) + PUD_SIZE - 1) >> PUD_SHIFT;
-	n += (n + PTRS_PER_PUD - 1) / PTRS_PER_PUD + 1;
-	pr_debug("swsusp_arch_resume(): pages needed = %lu\n", n);
-	if (alloc_usable_pages(n)) {
-		free_eaten_memory();
-		return -ENOMEM;
-	}
 	/* We have got enough memory and from now on we cannot recover */
-	set_up_temporary_mappings();
+	if ((error = set_up_temporary_mappings()))
+		return error;
 	restore_image();
 	return 0;
 }
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 380915e9563d..a61c04f804b2 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -71,8 +71,7 @@ void restore_processor_state(void);
 struct saved_context;
 void __save_processor_state(struct saved_context *ctxt);
 void __restore_processor_state(struct saved_context *ctxt);
-extern unsigned long get_usable_page(gfp_t gfp_mask);
-extern void free_eaten_memory(void);
+unsigned long get_safe_page(gfp_t gfp_mask);
 
 /*
  * XXX: We try to keep some more pages free so that I/O operations succeed
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 761956e813f5..44ef5e799df0 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -30,7 +30,6 @@ extern int swsusp_check(void);
 extern int swsusp_read(void);
 extern void swsusp_close(void);
 extern int swsusp_resume(void);
-extern int swsusp_free(void);
 
 
 static int noresume = 0;
@@ -252,14 +251,17 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read()))
-		goto Cleanup;
+	if ((error = swsusp_read())) {
+		swsusp_free();
+		goto Thaw;
+	}
 
 	pr_debug("PM: Preparing devices for restore.\n");
 
 	if ((error = device_suspend(PMSG_FREEZE))) {
 		printk("Some devices failed to suspend\n");
-		goto Free;
+		swsusp_free();
+		goto Thaw;
 	}
 
 	mb();
@@ -268,9 +270,7 @@ static int software_resume(void)
 	swsusp_resume();
 	pr_debug("PM: Restore failed, recovering.n");
 	device_resume();
- Free:
-	swsusp_free();
- Cleanup:
+ Thaw:
 	unprepare_processes();
  Done:
 	/* For success case, the suspend path will release the lock */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 28afcb090149..d4fd96a135ab 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -66,7 +66,7 @@ extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern int restore_highmem(void);
-extern void free_pagedir(struct pbe *pblist);
 extern struct pbe * alloc_pagedir(unsigned nr_pages);
 extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
+extern void swsusp_free(void);
 extern int enough_swap(unsigned nr_pages);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 03916cf3ff02..84e686bdb40b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -240,7 +240,7 @@ static void copy_data_pages(struct pbe *pblist)
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
 
-void free_pagedir(struct pbe *pblist)
+static void free_pagedir(struct pbe *pblist)
 {
 	struct pbe *pbe;
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f6abfdb0a02a..50667f4f3a2b 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -629,6 +629,11 @@ int swsusp_resume(void)
 	 * execution continues at place where swsusp_arch_suspend was called
          */
 	BUG_ON(!error);
+	/* The only reason why swsusp_arch_resume() can fail is memory being
+	 * very tight, so we have to free it as soon as we can to avoid
+	 * subsequent failures
+	 */
+	swsusp_free();
 	restore_processor_state();
 	restore_highmem();
 	touch_softlockup_watchdog();
@@ -644,54 +649,28 @@ int swsusp_resume(void)
  *
  *	We don't know which pages are usable until we allocate them.
  *
- *	Allocated but unusable (ie eaten) memory pages are linked together
- *	to create a list, so that we can free them easily
- *
- *	We could have used a type other than (void *)
- *	for this purpose, but ...
+ *	Allocated but unusable (ie eaten) memory pages are marked so that
+ *	swsusp_free() can release them
  */
-static void **eaten_memory = NULL;
 
-static inline void eat_page(void *page)
-{
-	void **c;
-
-	c = eaten_memory;
-	eaten_memory = page;
-	*eaten_memory = c;
-}
-
-unsigned long get_usable_page(gfp_t gfp_mask)
+unsigned long get_safe_page(gfp_t gfp_mask)
 {
 	unsigned long m;
 
-	m = get_zeroed_page(gfp_mask);
-	while (!PageNosaveFree(virt_to_page(m))) {
-		eat_page((void *)m);
+	do {
 		m = get_zeroed_page(gfp_mask);
-		if (!m)
-			break;
+		if (m && PageNosaveFree(virt_to_page(m)))
+			/* This is for swsusp_free() */
+			SetPageNosave(virt_to_page(m));
+	} while (m && PageNosaveFree(virt_to_page(m)));
+	if (m) {
+		/* This is for swsusp_free() */
+		SetPageNosave(virt_to_page(m));
+		SetPageNosaveFree(virt_to_page(m));
 	}
 	return m;
 }
 
-void free_eaten_memory(void)
-{
-	unsigned long m;
-	void **c;
-	int i = 0;
-
-	c = eaten_memory;
-	while (c) {
-		m = (unsigned long)c;
-		c = *c;
-		free_page(m);
-		i++;
-	}
-	eaten_memory = NULL;
-	pr_debug("swsusp: %d unused pages freed\n", i);
-}
-
 /**
  *	check_pagedir - We ensure here that pages that the PBEs point to
  *	won't collide with pages where we're going to restore from the loaded
@@ -709,7 +688,7 @@ static int check_pagedir(struct pbe *pblist)
 		p->address = 0UL;
 
 	for_each_pbe (p, pblist) {
-		p->address = get_usable_page(GFP_ATOMIC);
+		p->address = get_safe_page(GFP_ATOMIC);
 		if (!p->address)
 			return -ENOMEM;
 	}
@@ -728,7 +707,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 	unsigned long zone_pfn;
 	struct pbe *pbpage, *tail, *p;
 	void *m;
-	int rel = 0, error = 0;
+	int rel = 0;
 
 	if (!pblist) /* a sanity check */
 		return NULL;
@@ -736,41 +715,37 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 	pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
 			swsusp_info.pagedir_pages);
 
-	/* Set page flags */
+	/* Clear page flags */
 
 	for_each_zone (zone) {
         	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-                	SetPageNosaveFree(pfn_to_page(zone_pfn +
+        		if (pfn_valid(zone_pfn + zone->zone_start_pfn))
+                		ClearPageNosaveFree(pfn_to_page(zone_pfn +
 					zone->zone_start_pfn));
 	}
 
-	/* Clear orig addresses */
+	/* Mark orig addresses */
 
 	for_each_pbe (p, pblist)
-		ClearPageNosaveFree(virt_to_page(p->orig_address));
+		SetPageNosaveFree(virt_to_page(p->orig_address));
 
 	tail = pblist + PB_PAGE_SKIP;
 
 	/* Relocate colliding pages */
 
 	for_each_pb_page (pbpage, pblist) {
-		if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
-			m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
-			if (!m) {
-				error = -ENOMEM;
-				break;
-			}
+		if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
+			m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD);
+			if (!m)
+				return NULL;
 			memcpy(m, (void *)pbpage, PAGE_SIZE);
 			if (pbpage == pblist)
 				pblist = (struct pbe *)m;
 			else
 				tail->next = (struct pbe *)m;
-
-			eat_page((void *)pbpage);
 			pbpage = (struct pbe *)m;
 
 			/* We have to link the PBEs again */
-
 			for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
 				if (p->next) /* needed to save the end */
 					p->next = p + 1;
@@ -780,15 +755,13 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 		tail = pbpage + PB_PAGE_SKIP;
 	}
 
-	if (error) {
-		printk("\nswsusp: Out of memory\n\n");
-		free_pagedir(pblist);
-		free_eaten_memory();
-		pblist = NULL;
-		/* Is this even worth handling? It should never ever happen, and we
-		   have just lost user's state, anyway... */
-	} else
-		printk("swsusp: Relocated %d pages\n", rel);
+	/* This is for swsusp_free() */
+	for_each_pb_page (pbpage, pblist) {
+		SetPageNosave(virt_to_page(pbpage));
+		SetPageNosaveFree(virt_to_page(pbpage));
+	}
+
+	printk("swsusp: Relocated %d pages\n", rel);
 
 	return pblist;
 }
@@ -1006,9 +979,7 @@ static int read_pagedir(struct pbe *pblist)
 			break;
 	}
 
-	if (error)
-		free_pagedir(pblist);
-	else
+	if (!error)
 		BUG_ON(i != swsusp_info.pagedir_pages);
 
 	return error;
@@ -1051,15 +1022,6 @@ static int read_suspend_image(void)
 	if (!error)
 		error = data_read(pagedir_nosave);
 
-	if (error) { /* We fail cleanly */
-		free_eaten_memory();
-		for_each_pbe (p, pagedir_nosave)
-			if (p->address) {
-				free_page(p->address);
-				p->address = 0UL;
-			}
-		free_pagedir(pagedir_nosave);
-	}
 	return error;
 }
 
-- 
cgit v1.2.3


From 96bc7aec20b50761822f96130127b8e31e168af1 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sun, 30 Oct 2005 14:59:58 -0800
Subject: [PATCH] swsusp: remove unneccessary includes

Cleanup comments and remove unneccessary includes.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 25 ++-----------------------
 kernel/power/swsusp.c   |  9 +--------
 2 files changed, 3 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 84e686bdb40b..b4a923e59bc5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1,8 +1,7 @@
 /*
- * linux/kernel/power/swsusp.c
+ * linux/kernel/power/snapshot.c
  *
- * This file is to realize architecture-independent
- * machine suspend feature using pretty near only high-level routines
+ * This file provide system snapshot/restore functionality.
  *
  * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz>
  *
@@ -15,30 +14,16 @@
 #include <linux/mm.h>
 #include <linux/suspend.h>
 #include <linux/smp_lock.h>
-#include <linux/file.h>
-#include <linux/utsname.h>
-#include <linux/version.h>
 #include <linux/delay.h>
-#include <linux/reboot.h>
 #include <linux/bitops.h>
-#include <linux/vt_kern.h>
-#include <linux/kbd_kern.h>
-#include <linux/keyboard.h>
 #include <linux/spinlock.h>
-#include <linux/genhd.h>
 #include <linux/kernel.h>
-#include <linux/major.h>
-#include <linux/swap.h>
 #include <linux/pm.h>
 #include <linux/device.h>
-#include <linux/buffer_head.h>
-#include <linux/swapops.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/console.h>
 #include <linux/highmem.h>
-#include <linux/bio.h>
-#include <linux/mount.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -46,15 +31,9 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
-#include <linux/random.h>
-#include <linux/crypto.h>
-#include <asm/scatterlist.h>
-
 #include "power.h"
 
 
-
-
 #ifdef CONFIG_HIGHMEM
 struct highmem_page {
 	char *data;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 50667f4f3a2b..ae8425b69b44 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,8 +1,7 @@
 /*
  * linux/kernel/power/swsusp.c
  *
- * This file is to realize architecture-independent
- * machine suspend feature using pretty near only high-level routines
+ * This file provides code to write suspend image to swap and read it back.
  *
  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
  * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
@@ -47,11 +46,7 @@
 #include <linux/utsname.h>
 #include <linux/version.h>
 #include <linux/delay.h>
-#include <linux/reboot.h>
 #include <linux/bitops.h>
-#include <linux/vt_kern.h>
-#include <linux/kbd_kern.h>
-#include <linux/keyboard.h>
 #include <linux/spinlock.h>
 #include <linux/genhd.h>
 #include <linux/kernel.h>
@@ -63,10 +58,8 @@
 #include <linux/swapops.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
-#include <linux/console.h>
 #include <linux/highmem.h>
 #include <linux/bio.h>
-#include <linux/mount.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
-- 
cgit v1.2.3


From de491861e1457c31aed6d44d96afb549365ff790 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Sun, 30 Oct 2005 14:59:59 -0800
Subject: [PATCH] swsusp: cleanups

Reduce number of ifdefs somehow, and fix whitespace a bit.  No real code
changes.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b4a923e59bc5..72787f925630 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -33,7 +33,6 @@
 
 #include "power.h"
 
-
 #ifdef CONFIG_HIGHMEM
 struct highmem_page {
 	char *data;
@@ -88,12 +87,10 @@ static int save_highmem_zone(struct zone *zone)
 	}
 	return 0;
 }
-#endif /* CONFIG_HIGHMEM */
 
 
 static int save_highmem(void)
 {
-#ifdef CONFIG_HIGHMEM
 	struct zone *zone;
 	int res = 0;
 
@@ -104,13 +101,11 @@ static int save_highmem(void)
 		if (res)
 			return res;
 	}
-#endif
 	return 0;
 }
 
 int restore_highmem(void)
 {
-#ifdef CONFIG_HIGHMEM
 	printk("swsusp: Restoring Highmem\n");
 	while (highmem_copy) {
 		struct highmem_page *save = highmem_copy;
@@ -123,9 +118,12 @@ int restore_highmem(void)
 		free_page((long) save->data);
 		kfree(save);
 	}
-#endif
 	return 0;
 }
+#else
+static int save_highmem(void) { return 0; }
+int restore_highmem(void) { return 0; }
+#endif /* CONFIG_HIGHMEM */
 
 
 static int pfn_is_nosave(unsigned long pfn)
@@ -144,10 +142,10 @@ static int pfn_is_nosave(unsigned long pfn)
  *	isn't part of a free chunk of pages.
  */
 
-static int saveable(struct zone * zone, unsigned long * zone_pfn)
+static int saveable(struct zone *zone, unsigned long *zone_pfn)
 {
 	unsigned long pfn = *zone_pfn + zone->zone_start_pfn;
-	struct page * page;
+	struct page *page;
 
 	if (!pfn_valid(pfn))
 		return 0;
@@ -201,7 +199,7 @@ static void copy_data_pages(struct pbe *pblist)
 			SetPageNosaveFree(virt_to_page(p->address));
 		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
 			if (saveable(zone, &zone_pfn)) {
-				struct page * page;
+				struct page *page;
 				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
 				BUG_ON(!pbe);
 				pbe->orig_address = (unsigned long)page_address(page);
@@ -295,7 +293,7 @@ static void *alloc_image_page(void)
  *	On each page we set up a list of struct_pbe elements.
  */
 
-struct pbe * alloc_pagedir(unsigned nr_pages)
+struct pbe *alloc_pagedir(unsigned nr_pages)
 {
 	unsigned num;
 	struct pbe *pblist, *pbe;
@@ -304,12 +302,12 @@ struct pbe * alloc_pagedir(unsigned nr_pages)
 		return NULL;
 
 	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
-	pblist = (struct pbe *)alloc_image_page();
+	pblist = alloc_image_page();
 	/* FIXME: rewrite this ugly loop */
 	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
         		pbe = pbe->next, num += PBES_PER_PAGE) {
 		pbe += PB_PAGE_SKIP;
-		pbe->next = (struct pbe *)alloc_image_page();
+		pbe->next = alloc_image_page();
 	}
 	if (!pbe) { /* get_zeroed_page() failed */
 		free_pagedir(pblist);
-- 
cgit v1.2.3


From 2e32a43efdc8175579cc91e8b620ac331376a437 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 30 Oct 2005 15:00:00 -0800
Subject: [PATCH] swsusp: get rid of unnecessary wrapper function

The following patch merges two functions in a trivial way.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72787f925630..42a628704398 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -378,7 +378,7 @@ static struct pbe *swsusp_alloc(unsigned nr_pages)
 	return pblist;
 }
 
-static int suspend_prepare_image(void)
+asmlinkage int swsusp_save(void)
 {
 	unsigned nr_pages;
 
@@ -433,9 +433,3 @@ static int suspend_prepare_image(void)
 	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
 	return 0;
 }
-
-
-asmlinkage int swsusp_save(void)
-{
-	return suspend_prepare_image();
-}
-- 
cgit v1.2.3


From 0245b3e787dc3267a915e1f56419e7e9c197e148 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 30 Oct 2005 15:00:01 -0800
Subject: [PATCH] swsusp: two simplifications

The following patch simplifies the progress meter in disk.c:free_some_memory()
and makes disk.c:pm_suspend_disk() call device_resume() explicitly in the
suspend path.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/disk.c   | 8 ++------
 kernel/power/swsusp.c | 2 +-
 2 files changed, 3 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 44ef5e799df0..027322a564f4 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -92,10 +92,7 @@ static void free_some_memory(void)
 	printk("Freeing memory...  ");
 	while ((tmp = shrink_all_memory(10000))) {
 		pages += tmp;
-		printk("\b%c", p[i]);
-		i++;
-		if (i > 3)
-			i = 0;
+		printk("\b%c", p[i++ % 4]);
 	}
 	printk("\bdone (%li pages freed)\n", pages);
 }
@@ -177,13 +174,12 @@ int pm_suspend_disk(void)
 		goto Done;
 
 	if (in_suspend) {
+		device_resume();
 		pr_debug("PM: writing image.\n");
 		error = swsusp_write();
 		if (!error)
 			power_down(pm_disk_mode);
 		else {
-		/* swsusp_write can not fail in device_resume,
-		   no need to do second device_resume */
 			swsusp_free();
 			unprepare_processes();
 			return error;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index ae8425b69b44..12db1d2ad61f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -562,7 +562,7 @@ int enough_swap(unsigned nr_pages)
 int swsusp_write(void)
 {
 	int error;
-	device_resume();
+
 	lock_swapdevices();
 	error = write_suspend_image();
 	/* This will unlock ignored swap devices since writing is finished */
-- 
cgit v1.2.3


From eb9289eb20df6b54214c45ac7c6bf5179a149026 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Sun, 30 Oct 2005 15:00:01 -0800
Subject: [PATCH] introduce .valid callback for pm_ops

Add pm_ops.valid callback, so only the available pm states show in
/sys/power/state.  And this also makes an earlier states error report at
enter_state before we do actual suspend/resume.

Signed-off-by: Shaohua Li<shaohua.li@intel.com>
Acked-by: Pavel Machek<pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/acpi/sleep/main.c | 8 ++++++++
 include/linux/pm.h        | 1 +
 kernel/power/main.c       | 5 ++++-
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c
index aee50b453265..930427fc0c4b 100644
--- a/drivers/acpi/sleep/main.c
+++ b/drivers/acpi/sleep/main.c
@@ -158,7 +158,15 @@ int acpi_suspend(u32 acpi_state)
 	return -EINVAL;
 }
 
+static int acpi_pm_state_valid(suspend_state_t pm_state)
+{
+	u32 acpi_state = acpi_suspend_states[pm_state];
+
+	return sleep_states[acpi_state];
+}
+
 static struct pm_ops acpi_pm_ops = {
+	.valid = acpi_pm_state_valid,
 	.prepare = acpi_pm_prepare,
 	.enter = acpi_pm_enter,
 	.finish = acpi_pm_finish,
diff --git a/include/linux/pm.h b/include/linux/pm.h
index c61d5de837ef..1514098d156d 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -170,6 +170,7 @@ typedef int __bitwise suspend_disk_method_t;
 
 struct pm_ops {
 	suspend_disk_method_t pm_disk_mode;
+	int (*valid)(suspend_state_t state);
 	int (*prepare)(suspend_state_t state);
 	int (*enter)(suspend_state_t state);
 	int (*finish)(suspend_state_t state);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 22bdc93cc038..18d7d693fbba 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -167,6 +167,8 @@ static int enter_state(suspend_state_t state)
 {
 	int error;
 
+	if (pm_ops->valid && !pm_ops->valid(state))
+		return -ENODEV;
 	if (down_trylock(&pm_sem))
 		return -EBUSY;
 
@@ -236,7 +238,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
 	char * s = buf;
 
 	for (i = 0; i < PM_SUSPEND_MAX; i++) {
-		if (pm_states[i])
+		if (pm_states[i] && pm_ops && (!pm_ops->valid
+			||(pm_ops->valid && pm_ops->valid(i))))
 			s += sprintf(s,"%s ",pm_states[i]);
 	}
 	s += sprintf(s,"\n");
-- 
cgit v1.2.3


From a8db2db1e6a8d323d87a67c5391d48fe2b97faf5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 30 Oct 2005 15:01:38 -0800
Subject: [PATCH] introduce setup_timer() helper

Every user of init_timer() also needs to initialize ->function and ->data
fields.  This patch adds a simple setup_timer() helper for that.

The schedule_timeout() is patched as an example of usage.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/kernel/i8259.c | 8 ++++----
 include/linux/timer.h      | 9 +++++++++
 kernel/timer.c             | 8 ++------
 3 files changed, 15 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index b2a238b5a17e..c6c9791d77c1 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -494,7 +494,7 @@ void invalidate_interrupt7(void);
 void thermal_interrupt(void);
 void i8254_timer_resume(void);
 
-static void setup_timer(void)
+static void setup_timer_hardware(void)
 {
 	outb_p(0x34,0x43);		/* binary, mode 2, LSB/MSB, ch 0 */
 	udelay(10);
@@ -505,13 +505,13 @@ static void setup_timer(void)
 
 static int timer_resume(struct sys_device *dev)
 {
-	setup_timer();
+	setup_timer_hardware();
 	return 0;
 }
 
 void i8254_timer_resume(void)
 {
-	setup_timer();
+	setup_timer_hardware();
 }
 
 static struct sysdev_class timer_sysclass = {
@@ -594,7 +594,7 @@ void __init init_IRQ(void)
 	 * Set the clock to HZ Hz, we already have a valid
 	 * vector now:
 	 */
-	setup_timer();
+	setup_timer_hardware();
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 3340f3bd135d..ddd5bbe1fc8e 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -38,6 +38,15 @@ extern struct timer_base_s __init_timer_base;
 
 void fastcall init_timer(struct timer_list * timer);
 
+static inline void setup_timer(struct timer_list * timer,
+				void (*function)(unsigned long),
+				unsigned long data)
+{
+	timer->function = function;
+	timer->data = data;
+	init_timer(timer);
+}
+
 /***
  * timer_pending - is a timer pending?
  * @timer: the timer in question
diff --git a/kernel/timer.c b/kernel/timer.c
index 6a2e5f8dc725..6ed1a826e5ce 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1146,12 +1146,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
 
 	expire = timeout + jiffies;
 
-	init_timer(&timer);
-	timer.expires = expire;
-	timer.data = (unsigned long) current;
-	timer.function = process_timeout;
-
-	add_timer(&timer);
+	setup_timer(&timer, process_timeout, (unsigned long)current);
+	__mod_timer(&timer, expire);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 
-- 
cgit v1.2.3


From 61e1a9ea4b425eb8c3b4965c35fe953bd881728f Mon Sep 17 00:00:00 2001
From: Alan Stern <stern@rowland.harvard.edu>
Date: Sun, 30 Oct 2005 15:01:40 -0800
Subject: [PATCH] Add kthread_stop_sem()

Enhance the kthread API by adding kthread_stop_sem, for use in stopping
threads that spend their idle time waiting on a semaphore.

Signed-off-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kthread.h | 12 ++++++++++++
 kernel/kthread.c        | 13 +++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 3fa786448db3..ebdd41fd1082 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -69,6 +69,18 @@ void kthread_bind(struct task_struct *k, unsigned int cpu);
  * was never called. */
 int kthread_stop(struct task_struct *k);
 
+/**
+ * kthread_stop_sem: stop a thread created by kthread_create().
+ * @k: thread created by kthread_create().
+ * @s: semaphore that @k waits on while idle.
+ *
+ * Does essentially the same thing as kthread_stop() above, but wakes
+ * @k by calling up(@s).
+ *
+ * Returns the result of threadfn(), or -EINTR if wake_up_process()
+ * was never called. */
+int kthread_stop_sem(struct task_struct *k, struct semaphore *s);
+
 /**
  * kthread_should_stop: should this kthread return now?
  *
diff --git a/kernel/kthread.c b/kernel/kthread.c
index f50f174e92da..e75950a1092c 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -164,6 +164,12 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
 EXPORT_SYMBOL(kthread_bind);
 
 int kthread_stop(struct task_struct *k)
+{
+	return kthread_stop_sem(k, NULL);
+}
+EXPORT_SYMBOL(kthread_stop);
+
+int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 {
 	int ret;
 
@@ -178,7 +184,10 @@ int kthread_stop(struct task_struct *k)
 
 	/* Now set kthread_should_stop() to true, and wake it up. */
 	kthread_stop_info.k = k;
-	wake_up_process(k);
+	if (s)
+		up(s);
+	else
+		wake_up_process(k);
 	put_task_struct(k);
 
 	/* Once it dies, reset stop ptr, gather result and we're done. */
@@ -189,7 +198,7 @@ int kthread_stop(struct task_struct *k)
 
 	return ret;
 }
-EXPORT_SYMBOL(kthread_stop);
+EXPORT_SYMBOL(kthread_stop_sem);
 
 static __init int helper_init(void)
 {
-- 
cgit v1.2.3


From 1bb34a412750291e4e5e9f1d0fe7ae1b7e976098 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Sun, 30 Oct 2005 15:01:42 -0800
Subject: [PATCH] NTP shift_right cleanup

Create a macro shift_right() that avoids the numerous ugly conditionals in the
NTP code that look like:

        if(a < 0)
                b = -(-a >> shift);
        else
                b = a >> shift;

Replacing it with:

        b = shift_right(a, shift);

This should have zero effect on the logic, however it should probably have
a bit of testing just to be sure.

Also replace open-coded min/max with the macros.

Signed-off-by : John Stultz <johnstul@us.ibm.com>

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/timex.h |  7 +++++++
 kernel/time.c         | 25 ++++++------------------
 kernel/timer.c        | 53 +++++++++++----------------------------------------
 3 files changed, 24 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 7e050a2cc35b..04a4a8cb4ed3 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -282,6 +282,13 @@ static inline int ntp_synced(void)
 	return !(time_status & STA_UNSYNC);
 }
 
+/* Required to safely shift negative values */
+#define shift_right(x, s) ({	\
+	__typeof__(x) __x = (x);	\
+	__typeof__(s) __s = (s);	\
+	__x < 0 ? -(-__x >> __s) : __x >> __s;	\
+})
+
 
 #ifdef CONFIG_TIME_INTERPOLATION
 
diff --git a/kernel/time.c b/kernel/time.c
index a3c2100470e1..245d595a13cb 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -338,30 +338,20 @@ int do_adjtimex(struct timex *txc)
 		        if (mtemp >= MINSEC) {
 			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
 							      SHIFT_UPDATE);
-			    if (ltemp < 0)
-			        time_freq -= -ltemp >> SHIFT_KH;
-			    else
-			        time_freq += ltemp >> SHIFT_KH;
+			    time_freq += shift_right(ltemp, SHIFT_KH);
 			} else /* calibration interval too short (p. 12) */
 				result = TIME_ERROR;
 		    } else {	/* PLL mode */
 		        if (mtemp < MAXSEC) {
 			    ltemp *= mtemp;
-			    if (ltemp < 0)
-			        time_freq -= -ltemp >> (time_constant +
-							time_constant +
-							SHIFT_KF - SHIFT_USEC);
-			    else
-			        time_freq += ltemp >> (time_constant +
+			    time_freq += shift_right(ltemp,(time_constant +
 						       time_constant +
-						       SHIFT_KF - SHIFT_USEC);
+						       SHIFT_KF - SHIFT_USEC));
 			} else /* calibration interval too long (p. 12) */
 				result = TIME_ERROR;
 		    }
-		    if (time_freq > time_tolerance)
-		        time_freq = time_tolerance;
-		    else if (time_freq < -time_tolerance)
-		        time_freq = -time_tolerance;
+		    time_freq = min(time_freq, time_tolerance);
+		    time_freq = max(time_freq, -time_tolerance);
 		} /* STA_PLL || STA_PPSTIME */
 	    } /* txc->modes & ADJ_OFFSET */
 	    if (txc->modes & ADJ_TICK) {
@@ -384,10 +374,7 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
 	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
 	    txc->offset	   = save_adjust;
 	else {
-	    if (time_offset < 0)
-		txc->offset = -(-time_offset >> SHIFT_UPDATE);
-	    else
-		txc->offset = time_offset >> SHIFT_UPDATE;
+	    txc->offset = shift_right(time_offset, SHIFT_UPDATE);
 	}
 	txc->freq	   = time_freq + pps_freq;
 	txc->maxerror	   = time_maxerror;
diff --git a/kernel/timer.c b/kernel/timer.c
index 6ed1a826e5ce..6b94adb45b03 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -703,23 +703,13 @@ static void second_overflow(void)
      * the adjustment over not more than the number of
      * seconds between updates.
      */
-    if (time_offset < 0) {
-	ltemp = -time_offset;
-	if (!(time_status & STA_FLL))
-	    ltemp >>= SHIFT_KG + time_constant;
-	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
-	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
-	time_offset += ltemp;
-	time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-    } else {
 	ltemp = time_offset;
 	if (!(time_status & STA_FLL))
-	    ltemp >>= SHIFT_KG + time_constant;
-	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
-	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
+	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
+	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
 	time_offset -= ltemp;
 	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-    }
 
     /*
      * Compute the frequency estimate and additional phase
@@ -736,39 +726,25 @@ static void second_overflow(void)
 			 STA_PPSWANDER | STA_PPSERROR);
     }
     ltemp = time_freq + pps_freq;
-    if (ltemp < 0)
-	time_adj -= -ltemp >>
-	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
-    else
-	time_adj += ltemp >>
-	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+    time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
 
 #if HZ == 100
     /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
      * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
      */
-    if (time_adj < 0)
-	time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
-    else
-	time_adj += (time_adj >> 2) + (time_adj >> 5);
+    time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
 #endif
 #if HZ == 250
     /* Compensate for (HZ==250) != (1 << SHIFT_HZ).
      * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14)
      */
-    if (time_adj < 0)
-	time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
-    else
-	time_adj += (time_adj >> 6) + (time_adj >> 7);
+    time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 #if HZ == 1000
     /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
      * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
      */
-    if (time_adj < 0)
-	time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
-    else
-	time_adj += (time_adj >> 6) + (time_adj >> 7);
+    time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 }
 
@@ -787,10 +763,8 @@ static void update_wall_time_one_tick(void)
 	     * Limit the amount of the step to be in the range
 	     * -tickadj .. +tickadj
 	     */
-	     if (time_adjust > tickadj)
-		time_adjust_step = tickadj;
-	     else if (time_adjust < -tickadj)
-		time_adjust_step = -tickadj;
+	     time_adjust_step = min(time_adjust_step, (long)tickadj);
+	     time_adjust_step = max(time_adjust_step, (long)-tickadj);
 
 	    /* Reduce by this step the amount of time left  */
 	    time_adjust -= time_adjust_step;
@@ -801,13 +775,8 @@ static void update_wall_time_one_tick(void)
 	 * advance the tick more.
 	 */
 	time_phase += time_adj;
-	if (time_phase <= -FINENSEC) {
-		long ltemp = -time_phase >> (SHIFT_SCALE - 10);
-		time_phase += ltemp << (SHIFT_SCALE - 10);
-		delta_nsec -= ltemp;
-	}
-	else if (time_phase >= FINENSEC) {
-		long ltemp = time_phase >> (SHIFT_SCALE - 10);
+	if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
+		long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
 		time_phase -= ltemp << (SHIFT_SCALE - 10);
 		delta_nsec += ltemp;
 	}
-- 
cgit v1.2.3


From a5a0d52c7305cb3629ef0cc9e2e0e106869e1907 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 30 Oct 2005 15:01:42 -0800
Subject: [PATCH] ntp whitespace cleanup

Fix bizarre 4-space coding style in the NTP code.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/timer.c | 252 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 126 insertions(+), 126 deletions(-)

(limited to 'kernel')

diff --git a/kernel/timer.c b/kernel/timer.c
index 6b94adb45b03..cc18857601e2 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -632,77 +632,74 @@ long time_next_adjust;
  */
 static void second_overflow(void)
 {
-    long ltemp;
-
-    /* Bump the maxerror field */
-    time_maxerror += time_tolerance >> SHIFT_USEC;
-    if ( time_maxerror > NTP_PHASE_LIMIT ) {
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_status |= STA_UNSYNC;
-    }
-
-    /*
-     * Leap second processing. If in leap-insert state at
-     * the end of the day, the system clock is set back one
-     * second; if in leap-delete state, the system clock is
-     * set ahead one second. The microtime() routine or
-     * external clock driver will insure that reported time
-     * is always monotonic. The ugly divides should be
-     * replaced.
-     */
-    switch (time_state) {
-
-    case TIME_OK:
-	if (time_status & STA_INS)
-	    time_state = TIME_INS;
-	else if (time_status & STA_DEL)
-	    time_state = TIME_DEL;
-	break;
-
-    case TIME_INS:
-	if (xtime.tv_sec % 86400 == 0) {
-	    xtime.tv_sec--;
-	    wall_to_monotonic.tv_sec++;
-	    /* The timer interpolator will make time change gradually instead
-	     * of an immediate jump by one second.
-	     */
-	    time_interpolator_update(-NSEC_PER_SEC);
-	    time_state = TIME_OOP;
-	    clock_was_set();
-	    printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
+	long ltemp;
+
+	/* Bump the maxerror field */
+	time_maxerror += time_tolerance >> SHIFT_USEC;
+	if (time_maxerror > NTP_PHASE_LIMIT) {
+		time_maxerror = NTP_PHASE_LIMIT;
+		time_status |= STA_UNSYNC;
 	}
-	break;
-
-    case TIME_DEL:
-	if ((xtime.tv_sec + 1) % 86400 == 0) {
-	    xtime.tv_sec++;
-	    wall_to_monotonic.tv_sec--;
-	    /* Use of time interpolator for a gradual change of time */
-	    time_interpolator_update(NSEC_PER_SEC);
-	    time_state = TIME_WAIT;
-	    clock_was_set();
-	    printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
+
+	/*
+	 * Leap second processing. If in leap-insert state at the end of the
+	 * day, the system clock is set back one second; if in leap-delete
+	 * state, the system clock is set ahead one second. The microtime()
+	 * routine or external clock driver will insure that reported time is
+	 * always monotonic. The ugly divides should be replaced.
+	 */
+	switch (time_state) {
+	case TIME_OK:
+		if (time_status & STA_INS)
+			time_state = TIME_INS;
+		else if (time_status & STA_DEL)
+			time_state = TIME_DEL;
+		break;
+	case TIME_INS:
+		if (xtime.tv_sec % 86400 == 0) {
+			xtime.tv_sec--;
+			wall_to_monotonic.tv_sec++;
+			/*
+			 * The timer interpolator will make time change
+			 * gradually instead of an immediate jump by one second
+			 */
+			time_interpolator_update(-NSEC_PER_SEC);
+			time_state = TIME_OOP;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: inserting leap second "
+					"23:59:60 UTC\n");
+		}
+		break;
+	case TIME_DEL:
+		if ((xtime.tv_sec + 1) % 86400 == 0) {
+			xtime.tv_sec++;
+			wall_to_monotonic.tv_sec--;
+			/*
+			 * Use of time interpolator for a gradual change of
+			 * time
+			 */
+			time_interpolator_update(NSEC_PER_SEC);
+			time_state = TIME_WAIT;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: deleting leap second "
+					"23:59:59 UTC\n");
+		}
+		break;
+	case TIME_OOP:
+		time_state = TIME_WAIT;
+		break;
+	case TIME_WAIT:
+		if (!(time_status & (STA_INS | STA_DEL)))
+		time_state = TIME_OK;
 	}
-	break;
-
-    case TIME_OOP:
-	time_state = TIME_WAIT;
-	break;
-
-    case TIME_WAIT:
-	if (!(time_status & (STA_INS | STA_DEL)))
-	    time_state = TIME_OK;
-    }
-
-    /*
-     * Compute the phase adjustment for the next second. In
-     * PLL mode, the offset is reduced by a fixed factor
-     * times the time constant. In FLL mode the offset is
-     * used directly. In either mode, the maximum phase
-     * adjustment for each second is clamped so as to spread
-     * the adjustment over not more than the number of
-     * seconds between updates.
-     */
+
+	/*
+	 * Compute the phase adjustment for the next second. In PLL mode, the
+	 * offset is reduced by a fixed factor times the time constant. In FLL
+	 * mode the offset is used directly. In either mode, the maximum phase
+	 * adjustment for each second is clamped so as to spread the adjustment
+	 * over not more than the number of seconds between updates.
+	 */
 	ltemp = time_offset;
 	if (!(time_status & STA_FLL))
 		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
@@ -711,40 +708,42 @@ static void second_overflow(void)
 	time_offset -= ltemp;
 	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
 
-    /*
-     * Compute the frequency estimate and additional phase
-     * adjustment due to frequency error for the next
-     * second. When the PPS signal is engaged, gnaw on the
-     * watchdog counter and update the frequency computed by
-     * the pll and the PPS signal.
-     */
-    pps_valid++;
-    if (pps_valid == PPS_VALID) {	/* PPS signal lost */
-	pps_jitter = MAXTIME;
-	pps_stabil = MAXFREQ;
-	time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
-			 STA_PPSWANDER | STA_PPSERROR);
-    }
-    ltemp = time_freq + pps_freq;
-    time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
+	/*
+	 * Compute the frequency estimate and additional phase adjustment due
+	 * to frequency error for the next second. When the PPS signal is
+	 * engaged, gnaw on the watchdog counter and update the frequency
+	 * computed by the pll and the PPS signal.
+	 */
+	pps_valid++;
+	if (pps_valid == PPS_VALID) {	/* PPS signal lost */
+		pps_jitter = MAXTIME;
+		pps_stabil = MAXFREQ;
+		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+				STA_PPSWANDER | STA_PPSERROR);
+	}
+	ltemp = time_freq + pps_freq;
+	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
 
 #if HZ == 100
-    /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
-     * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
-     */
-    time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
+	/*
+	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).  Add 25% and 3.125% to
+	 * get 128.125; => only 0.125% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
 #endif
 #if HZ == 250
-    /* Compensate for (HZ==250) != (1 << SHIFT_HZ).
-     * Add 1.5625% and 0.78125% to get 255.85938; => only 0.05% error (p. 14)
-     */
-    time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
+	/*
+	 * Compensate for (HZ==250) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 #if HZ == 1000
-    /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
-     * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
-     */
-    time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
+	/*
+	 * Compensate for (HZ==1000) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 }
 
@@ -753,21 +752,20 @@ static void update_wall_time_one_tick(void)
 {
 	long time_adjust_step, delta_nsec;
 
-	if ( (time_adjust_step = time_adjust) != 0 ) {
-	    /* We are doing an adjtime thing. 
-	     *
-	     * Prepare time_adjust_step to be within bounds.
-	     * Note that a positive time_adjust means we want the clock
-	     * to run faster.
-	     *
-	     * Limit the amount of the step to be in the range
-	     * -tickadj .. +tickadj
-	     */
-	     time_adjust_step = min(time_adjust_step, (long)tickadj);
-	     time_adjust_step = max(time_adjust_step, (long)-tickadj);
-
-	    /* Reduce by this step the amount of time left  */
-	    time_adjust -= time_adjust_step;
+	if ((time_adjust_step = time_adjust) != 0 ) {
+		/*
+		 * We are doing an adjtime thing.  Prepare time_adjust_step to
+		 * be within bounds.  Note that a positive time_adjust means we
+		 * want the clock to run faster.
+		 *
+		 * Limit the amount of the step to be in the range
+		 * -tickadj .. +tickadj
+		 */
+		time_adjust_step = min(time_adjust_step, (long)tickadj);
+		time_adjust_step = max(time_adjust_step, (long)-tickadj);
+
+		/* Reduce by this step the amount of time left  */
+		time_adjust -= time_adjust_step;
 	}
 	delta_nsec = tick_nsec + time_adjust_step * 1000;
 	/*
@@ -1106,8 +1104,8 @@ fastcall signed long __sched schedule_timeout(signed long timeout)
 		if (timeout < 0)
 		{
 			printk(KERN_ERR "schedule_timeout: wrong timeout "
-			       "value %lx from %p\n", timeout,
-			       __builtin_return_address(0));
+				"value %lx from %p\n", timeout,
+				__builtin_return_address(0));
 			current->state = TASK_RUNNING;
 			goto out;
 		}
@@ -1133,15 +1131,15 @@ EXPORT_SYMBOL(schedule_timeout);
  */
 signed long __sched schedule_timeout_interruptible(signed long timeout)
 {
-       __set_current_state(TASK_INTERRUPTIBLE);
-       return schedule_timeout(timeout);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	return schedule_timeout(timeout);
 }
 EXPORT_SYMBOL(schedule_timeout_interruptible);
 
 signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 {
-       __set_current_state(TASK_UNINTERRUPTIBLE);
-       return schedule_timeout(timeout);
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	return schedule_timeout(timeout);
 }
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
@@ -1481,16 +1479,18 @@ static void time_interpolator_update(long delta_nsec)
 	if (!time_interpolator)
 		return;
 
-	/* The interpolator compensates for late ticks by accumulating
-         * the late time in time_interpolator->offset. A tick earlier than
-	 * expected will lead to a reset of the offset and a corresponding
-	 * jump of the clock forward. Again this only works if the
-	 * interpolator clock is running slightly slower than the regular clock
-	 * and the tuning logic insures that.
-         */
+	/*
+	 * The interpolator compensates for late ticks by accumulating the late
+	 * time in time_interpolator->offset. A tick earlier than expected will
+	 * lead to a reset of the offset and a corresponding jump of the clock
+	 * forward. Again this only works if the interpolator clock is running
+	 * slightly slower than the regular clock and the tuning logic insures
+	 * that.
+	 */
 
 	counter = time_interpolator_get_counter(1);
-	offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
+	offset = time_interpolator->offset +
+			GET_TI_NSECS(counter, time_interpolator);
 
 	if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
 		time_interpolator->offset = offset - delta_nsec;
-- 
cgit v1.2.3


From 89ada67917f516212452443a56b9fd3b65b74dc7 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Sun, 30 Oct 2005 15:01:59 -0800
Subject: [PATCH] Use alloc_percpu to allocate workqueues locally

This patch makes the workqueus use alloc_percpu instead of an array.  The
workqueues are placed on nodes local to each processor.

The workqueue structure can grow to a significant size on a system with
lots of processors if this patch is not applied.  64 bit architectures with
all debugging features enabled and configured for 512 processors will not
be able to boot without this patch.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 91bacb13a7e2..7cee222231bc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -12,6 +12,8 @@
  *   Andrew Morton <andrewm@uow.edu.au>
  *   Kai Petzke <wpp@marie.physik.tu-berlin.de>
  *   Theodore Ts'o <tytso@mit.edu>
+ *
+ * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>.
  */
 
 #include <linux/module.h>
@@ -57,7 +59,7 @@ struct cpu_workqueue_struct {
  * per-CPU workqueues:
  */
 struct workqueue_struct {
-	struct cpu_workqueue_struct cpu_wq[NR_CPUS];
+	struct cpu_workqueue_struct *cpu_wq;
 	const char *name;
 	struct list_head list; 	/* Empty if single thread */
 };
@@ -102,7 +104,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
 		if (unlikely(is_single_threaded(wq)))
 			cpu = 0;
 		BUG_ON(!list_empty(&work->entry));
-		__queue_work(wq->cpu_wq + cpu, work);
+		__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 		ret = 1;
 	}
 	put_cpu();
@@ -118,7 +120,7 @@ static void delayed_work_timer_fn(unsigned long __data)
 	if (unlikely(is_single_threaded(wq)))
 		cpu = 0;
 
-	__queue_work(wq->cpu_wq + cpu, work);
+	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
 int fastcall queue_delayed_work(struct workqueue_struct *wq,
@@ -265,13 +267,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 
 	if (is_single_threaded(wq)) {
 		/* Always use cpu 0's area. */
-		flush_cpu_workqueue(wq->cpu_wq + 0);
+		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0));
 	} else {
 		int cpu;
 
 		lock_cpu_hotplug();
 		for_each_online_cpu(cpu)
-			flush_cpu_workqueue(wq->cpu_wq + cpu);
+			flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 		unlock_cpu_hotplug();
 	}
 }
@@ -279,7 +281,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
 						   int cpu)
 {
-	struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
+	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 	struct task_struct *p;
 
 	spin_lock_init(&cwq->lock);
@@ -312,6 +314,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	if (!wq)
 		return NULL;
 
+	wq->cpu_wq = alloc_percpu(struct cpu_workqueue_struct);
 	wq->name = name;
 	/* We don't need the distraction of CPUs appearing and vanishing. */
 	lock_cpu_hotplug();
@@ -353,7 +356,7 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
 	unsigned long flags;
 	struct task_struct *p;
 
-	cwq = wq->cpu_wq + cpu;
+	cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 	spin_lock_irqsave(&cwq->lock, flags);
 	p = cwq->thread;
 	cwq->thread = NULL;
@@ -380,6 +383,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 		spin_unlock(&workqueue_lock);
 	}
 	unlock_cpu_hotplug();
+	free_percpu(wq->cpu_wq);
 	kfree(wq);
 }
 
@@ -458,7 +462,7 @@ int current_is_keventd(void)
 
 	BUG_ON(!keventd_wq);
 
-	cwq = keventd_wq->cpu_wq + cpu;
+	cwq = per_cpu_ptr(keventd_wq->cpu_wq, cpu);
 	if (current == cwq->thread)
 		ret = 1;
 
@@ -470,7 +474,7 @@ int current_is_keventd(void)
 /* Take the work from this (downed) CPU. */
 static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 {
-	struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu;
+	struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 	LIST_HEAD(list);
 	struct work_struct *work;
 
@@ -481,7 +485,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
 		printk("Taking work for %s\n", wq->name);
 		work = list_entry(list.next,struct work_struct,entry);
 		list_del(&work->entry);
-		__queue_work(wq->cpu_wq + smp_processor_id(), work);
+		__queue_work(per_cpu_ptr(wq->cpu_wq, smp_processor_id()), work);
 	}
 	spin_unlock_irq(&cwq->lock);
 }
@@ -508,15 +512,18 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	case CPU_ONLINE:
 		/* Kick off worker threads. */
 		list_for_each_entry(wq, &workqueues, list) {
-			kthread_bind(wq->cpu_wq[hotcpu].thread, hotcpu);
-			wake_up_process(wq->cpu_wq[hotcpu].thread);
+			struct cpu_workqueue_struct *cwq;
+
+			cwq = per_cpu_ptr(wq->cpu_wq, hotcpu);
+			kthread_bind(cwq->thread, hotcpu);
+			wake_up_process(cwq->thread);
 		}
 		break;
 
 	case CPU_UP_CANCELED:
 		list_for_each_entry(wq, &workqueues, list) {
 			/* Unbind so it can run. */
-			kthread_bind(wq->cpu_wq[hotcpu].thread,
+			kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
 				     smp_processor_id());
 			cleanup_workqueue_thread(wq, hotcpu);
 		}
-- 
cgit v1.2.3


From dfc4f94d2ff95fc92127d3e512c1df7cab274fb8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 30 Oct 2005 15:02:03 -0800
Subject: [PATCH] remove timer debug field

Remove timer_list.magic and associated debugging code.

I originally added this when a spinlock was added to timer_list - this meant
that an all-zeroes timer became illegal and init_timer() was required.

That spinlock isn't even there any more, although timer.base must now be
initialised.

I'll keep this debugging code in -mm.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/kernel/vtime.c | 18 ++----------------
 include/linux/timer.h    |  5 -----
 kernel/timer.c           | 35 -----------------------------------
 3 files changed, 2 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index fa0726507b3d..22a895ecb7a4 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -24,7 +24,6 @@
 #include <asm/s390_ext.h>
 #include <asm/timer.h>
 
-#define VTIMER_MAGIC (TIMER_MAGIC + 1)
 static ext_int_info_t ext_int_info_timer;
 DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer);
 
@@ -277,20 +276,12 @@ static void do_cpu_timer_interrupt(struct pt_regs *regs, __u16 error_code)
 
 void init_virt_timer(struct vtimer_list *timer)
 {
-	timer->magic = VTIMER_MAGIC;
 	timer->function = NULL;
 	INIT_LIST_HEAD(&timer->entry);
 	spin_lock_init(&timer->lock);
 }
 EXPORT_SYMBOL(init_virt_timer);
 
-static inline int check_vtimer(struct vtimer_list *timer)
-{
-	if (timer->magic != VTIMER_MAGIC)
-		return -EINVAL;
-	return 0;
-}
-
 static inline int vtimer_pending(struct vtimer_list *timer)
 {
 	return (!list_empty(&timer->entry));
@@ -346,7 +337,7 @@ static void internal_add_vtimer(struct vtimer_list *timer)
 
 static inline int prepare_vtimer(struct vtimer_list *timer)
 {
-	if (check_vtimer(timer) || !timer->function) {
+	if (!timer->function) {
 		printk("add_virt_timer: uninitialized timer\n");
 		return -EINVAL;
 	}
@@ -414,7 +405,7 @@ int mod_virt_timer(struct vtimer_list *timer, __u64 expires)
 	unsigned long flags;
 	int cpu;
 
-	if (check_vtimer(timer) || !timer->function) {
+	if (!timer->function) {
 		printk("mod_virt_timer: uninitialized timer\n");
 		return	-EINVAL;
 	}
@@ -481,11 +472,6 @@ int del_virt_timer(struct vtimer_list *timer)
 	unsigned long flags;
 	struct vtimer_queue *vt_list;
 
-	if (check_vtimer(timer)) {
-		printk("del_virt_timer: timer not initialized\n");
-		return -EINVAL;
-	}
-
 	/* check if timer is pending */
 	if (!vtimer_pending(timer))
 		return 0;
diff --git a/include/linux/timer.h b/include/linux/timer.h
index ddd5bbe1fc8e..b1dc583bb4d4 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -12,16 +12,12 @@ struct timer_list {
 	struct list_head entry;
 	unsigned long expires;
 
-	unsigned long magic;
-
 	void (*function)(unsigned long);
 	unsigned long data;
 
 	struct timer_base_s *base;
 };
 
-#define TIMER_MAGIC	0x4b87ad6e
-
 extern struct timer_base_s __init_timer_base;
 
 #define TIMER_INITIALIZER(_function, _expires, _data) {		\
@@ -29,7 +25,6 @@ extern struct timer_base_s __init_timer_base;
 		.expires = (_expires),				\
 		.data = (_data),				\
 		.base = &__init_timer_base,			\
-		.magic = TIMER_MAGIC,				\
 	}
 
 #define DEFINE_TIMER(_name, _function, _expires, _data)		\
diff --git a/kernel/timer.c b/kernel/timer.c
index cc18857601e2..562d53eabb46 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -91,30 +91,6 @@ static inline void set_running_timer(tvec_base_t *base,
 #endif
 }
 
-static void check_timer_failed(struct timer_list *timer)
-{
-	static int whine_count;
-	if (whine_count < 16) {
-		whine_count++;
-		printk("Uninitialised timer!\n");
-		printk("This is just a warning.  Your computer is OK\n");
-		printk("function=0x%p, data=0x%lx\n",
-			timer->function, timer->data);
-		dump_stack();
-	}
-	/*
-	 * Now fix it up
-	 */
-	timer->magic = TIMER_MAGIC;
-}
-
-static inline void check_timer(struct timer_list *timer)
-{
-	if (timer->magic != TIMER_MAGIC)
-		check_timer_failed(timer);
-}
-
-
 static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
@@ -177,7 +153,6 @@ void fastcall init_timer(struct timer_list *timer)
 {
 	timer->entry.next = NULL;
 	timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
-	timer->magic = TIMER_MAGIC;
 }
 EXPORT_SYMBOL(init_timer);
 
@@ -230,7 +205,6 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 	int ret = 0;
 
 	BUG_ON(!timer->function);
-	check_timer(timer);
 
 	base = lock_timer_base(timer, &flags);
 
@@ -283,9 +257,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
   	unsigned long flags;
 
   	BUG_ON(timer_pending(timer) || !timer->function);
-
-	check_timer(timer);
-
 	spin_lock_irqsave(&base->t_base.lock, flags);
 	timer->base = &base->t_base;
 	internal_add_timer(base, timer);
@@ -316,8 +287,6 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
 {
 	BUG_ON(!timer->function);
 
-	check_timer(timer);
-
 	/*
 	 * This is a common optimization triggered by the
 	 * networking code - if the timer is re-modified
@@ -348,8 +317,6 @@ int del_timer(struct timer_list *timer)
 	unsigned long flags;
 	int ret = 0;
 
-	check_timer(timer);
-
 	if (timer_pending(timer)) {
 		base = lock_timer_base(timer, &flags);
 		if (timer_pending(timer)) {
@@ -412,8 +379,6 @@ out:
  */
 int del_timer_sync(struct timer_list *timer)
 {
-	check_timer(timer);
-
 	for (;;) {
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
-- 
cgit v1.2.3


From 19a4fcb531659f2f7d18b5d04cee039176e9540d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 30 Oct 2005 15:02:17 -0800
Subject: [PATCH] kill sigqueue->lock

This lock is used in sigqueue_free(), but it is always equal to
current->sighand->siglock, so we don't need to keep it in the struct
sigqueue.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/signal.h |  1 -
 kernel/signal.c        | 10 ++++------
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/signal.h b/include/linux/signal.h
index 7be18b5e2fb4..5dd5f02c5c5f 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -25,7 +25,6 @@
 
 struct sigqueue {
 	struct list_head list;
-	spinlock_t *lock;
 	int flags;
 	siginfo_t info;
 	struct user_struct *user;
diff --git a/kernel/signal.c b/kernel/signal.c
index 6904bbbfe116..8d6e64dfa5c6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -277,7 +277,6 @@ static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags,
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
-		q->lock = NULL;
 		q->user = get_uid(t->user);
 	}
 	return(q);
@@ -1371,11 +1370,12 @@ void sigqueue_free(struct sigqueue *q)
 	 * pending queue.
 	 */
 	if (unlikely(!list_empty(&q->list))) {
-		read_lock(&tasklist_lock);  
-		spin_lock_irqsave(q->lock, flags);
+		spinlock_t *lock = &current->sighand->siglock;
+		read_lock(&tasklist_lock);
+		spin_lock_irqsave(lock, flags);
 		if (!list_empty(&q->list))
 			list_del_init(&q->list);
-		spin_unlock_irqrestore(q->lock, flags);
+		spin_unlock_irqrestore(lock, flags);
 		read_unlock(&tasklist_lock);
 	}
 	q->flags &= ~SIGQUEUE_PREALLOC;
@@ -1414,7 +1414,6 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 		goto out;
 	}
 
-	q->lock = &p->sighand->siglock;
 	list_add_tail(&q->list, &p->pending.list);
 	sigaddset(&p->pending.signal, sig);
 	if (!sigismember(&p->blocked, sig))
@@ -1462,7 +1461,6 @@ send_group_sigqueue(int sig, struct sigqueue *q, struct task_struct *p)
 	 * We always use the shared queue for process-wide signals,
 	 * to avoid several races.
 	 */
-	q->lock = &p->sighand->siglock;
 	list_add_tail(&q->list, &p->signal->shared_pending.list);
 	sigaddset(&p->signal->shared_pending.signal, sig);
 
-- 
cgit v1.2.3


From 6dd69f1061bfdeca230509b173438e0731bff767 Mon Sep 17 00:00:00 2001
From: Vadim Lobanov <vlobanov@speakeasy.net>
Date: Sun, 30 Oct 2005 15:02:18 -0800
Subject: [PATCH] Unify sys_tkill() and sys_tgkill()

The majority of the sys_tkill() and sys_tgkill() function code is
duplicated between the two of them.  This patch pulls the duplication out
into a separate function -- do_tkill() -- and lets sys_tkill() and
sys_tgkill() be simple wrappers around it.  This should make it easier to
maintain in light of future changes.

Signed-off-by: Vadim Lobanov <vlobanov@speakeasy.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 71 ++++++++++++++++++++-------------------------------------
 1 file changed, 25 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8d6e64dfa5c6..1d905ec74bde 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2283,26 +2283,13 @@ sys_kill(int pid, int sig)
 	return kill_something_info(sig, &info, pid);
 }
 
-/**
- *  sys_tgkill - send signal to one specific thread
- *  @tgid: the thread group ID of the thread
- *  @pid: the PID of the thread
- *  @sig: signal to be sent
- *
- *  This syscall also checks the tgid and returns -ESRCH even if the PID
- *  exists but it's not belonging to the target process anymore. This
- *  method solves the problem of threads exiting and PIDs getting reused.
- */
-asmlinkage long sys_tgkill(int tgid, int pid, int sig)
+static int do_tkill(int tgid, int pid, int sig)
 {
-	struct siginfo info;
 	int error;
+	struct siginfo info;
 	struct task_struct *p;
 
-	/* This is only valid for single tasks */
-	if (pid <= 0 || tgid <= 0)
-		return -EINVAL;
-
+	error = -ESRCH;
 	info.si_signo = sig;
 	info.si_errno = 0;
 	info.si_code = SI_TKILL;
@@ -2311,8 +2298,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
 
 	read_lock(&tasklist_lock);
 	p = find_task_by_pid(pid);
-	error = -ESRCH;
-	if (p && (p->tgid == tgid)) {
+	if (p && (tgid <= 0 || p->tgid == tgid)) {
 		error = check_kill_permission(sig, &info, p);
 		/*
 		 * The null signal is a permissions and process existence
@@ -2326,47 +2312,40 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
 		}
 	}
 	read_unlock(&tasklist_lock);
+
 	return error;
 }
 
+/**
+ *  sys_tgkill - send signal to one specific thread
+ *  @tgid: the thread group ID of the thread
+ *  @pid: the PID of the thread
+ *  @sig: signal to be sent
+ *
+ *  This syscall also checks the tgid and returns -ESRCH even if the PID
+ *  exists but it's not belonging to the target process anymore. This
+ *  method solves the problem of threads exiting and PIDs getting reused.
+ */
+asmlinkage long sys_tgkill(int tgid, int pid, int sig)
+{
+	/* This is only valid for single tasks */
+	if (pid <= 0 || tgid <= 0)
+		return -EINVAL;
+
+	return do_tkill(tgid, pid, sig);
+}
+
 /*
  *  Send a signal to only one task, even if it's a CLONE_THREAD task.
  */
 asmlinkage long
 sys_tkill(int pid, int sig)
 {
-	struct siginfo info;
-	int error;
-	struct task_struct *p;
-
 	/* This is only valid for single tasks */
 	if (pid <= 0)
 		return -EINVAL;
 
-	info.si_signo = sig;
-	info.si_errno = 0;
-	info.si_code = SI_TKILL;
-	info.si_pid = current->tgid;
-	info.si_uid = current->uid;
-
-	read_lock(&tasklist_lock);
-	p = find_task_by_pid(pid);
-	error = -ESRCH;
-	if (p) {
-		error = check_kill_permission(sig, &info, p);
-		/*
-		 * The null signal is a permissions and process existence
-		 * probe.  No signal is actually delivered.
-		 */
-		if (!error && sig && p->sighand) {
-			spin_lock_irq(&p->sighand->siglock);
-			handle_stop_signal(sig, p);
-			error = specific_send_sig_info(sig, &info, p);
-			spin_unlock_irq(&p->sighand->siglock);
-		}
-	}
-	read_unlock(&tasklist_lock);
-	return error;
+	return do_tkill(0, pid, sig);
 }
 
 asmlinkage long
-- 
cgit v1.2.3


From 4eb9af2a8a431a832830f986fead7332dab27229 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 30 Oct 2005 15:02:21 -0800
Subject: [PATCH] posix-timers: use schedule_timeout() in common_nsleep()

common_nsleep() reimplements schedule_timeout_interruptible() for unknown
reason.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-timers.c | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dda3cda73c77..ea55c7a1cd75 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -1295,13 +1295,6 @@ sys_clock_getres(clockid_t which_clock, struct timespec __user *tp)
 	return error;
 }
 
-static void nanosleep_wake_up(unsigned long __data)
-{
-	struct task_struct *p = (struct task_struct *) __data;
-
-	wake_up_process(p);
-}
-
 /*
  * The standard says that an absolute nanosleep call MUST wake up at
  * the requested time in spite of clock settings.  Here is what we do:
@@ -1442,7 +1435,6 @@ static int common_nsleep(clockid_t which_clock,
 			 int flags, struct timespec *tsave)
 {
 	struct timespec t, dum;
-	struct timer_list new_timer;
 	DECLARE_WAITQUEUE(abs_wqueue, current);
 	u64 rq_time = (u64)0;
 	s64 left;
@@ -1451,10 +1443,6 @@ static int common_nsleep(clockid_t which_clock,
 	    &current_thread_info()->restart_block;
 
 	abs_wqueue.flags = 0;
-	init_timer(&new_timer);
-	new_timer.expires = 0;
-	new_timer.data = (unsigned long) current;
-	new_timer.function = nanosleep_wake_up;
 	abs = flags & TIMER_ABSTIME;
 
 	if (restart_block->fn == clock_nanosleep_restart) {
@@ -1490,13 +1478,8 @@ static int common_nsleep(clockid_t which_clock,
 		if (left < (s64)0)
 			break;
 
-		new_timer.expires = jiffies + left;
-		__set_current_state(TASK_INTERRUPTIBLE);
-		add_timer(&new_timer);
-
-		schedule();
+		schedule_timeout_interruptible(left);
 
-		del_timer_sync(&new_timer);
 		left = rq_time - get_jiffies_64();
 	} while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
 
-- 
cgit v1.2.3


From f35f31d7ed0150f9865619f21b5050c91b46c03f Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 30 Oct 2005 15:02:27 -0800
Subject: [PATCH] cpuset cleanup

Remove one more useless line from cpuset_common_file_read().

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 28176d083f7b..b9342f90d28f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -995,7 +995,6 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 		goto out;
 	}
 	*s++ = '\n';
-	*s = '\0';
 
 	retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
 out:
-- 
cgit v1.2.3


From 5aa15b5f27fc2c404530c6c8eabdb8437deb3163 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 30 Oct 2005 15:02:28 -0800
Subject: [PATCH] cpusets: remove depth counted locking hack

Remove a rather hackish depth counter on cpuset locking.  The depth counter
was avoiding a possible double trip on the global cpuset_sem semaphore.  It
worked, but now an improved version of cpuset locking is available, to come
in the next patch, using two global semaphores.

This patch reverses "cpuset semaphore depth check deadlock fix"

The kernel still works, even after this patch, except for some rare and
difficult to reproduce race conditions when agressively creating and
destroying cpusets marked with the notify_on_release option, on very large
systems.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 105 +++++++++++++++++++++-----------------------------------
 1 file changed, 40 insertions(+), 65 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b9342f90d28f..cd54dba2be18 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -180,42 +180,6 @@ static struct super_block *cpuset_sb = NULL;
  */
 
 static DECLARE_MUTEX(cpuset_sem);
-static struct task_struct *cpuset_sem_owner;
-static int cpuset_sem_depth;
-
-/*
- * The global cpuset semaphore cpuset_sem can be needed by the
- * memory allocator to update a tasks mems_allowed (see the calls
- * to cpuset_update_current_mems_allowed()) or to walk up the
- * cpuset hierarchy to find a mem_exclusive cpuset see the calls
- * to cpuset_excl_nodes_overlap()).
- *
- * But if the memory allocation is being done by cpuset.c code, it
- * usually already holds cpuset_sem.  Double tripping on a kernel
- * semaphore deadlocks the current task, and any other task that
- * subsequently tries to obtain the lock.
- *
- * Run all up's and down's on cpuset_sem through the following
- * wrappers, which will detect this nested locking, and avoid
- * deadlocking.
- */
-
-static inline void cpuset_down(struct semaphore *psem)
-{
-	if (cpuset_sem_owner != current) {
-		down(psem);
-		cpuset_sem_owner = current;
-	}
-	cpuset_sem_depth++;
-}
-
-static inline void cpuset_up(struct semaphore *psem)
-{
-	if (--cpuset_sem_depth == 0) {
-		cpuset_sem_owner = NULL;
-		up(psem);
-	}
-}
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -558,10 +522,19 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * Refresh current tasks mems_allowed and mems_generation from
  * current tasks cpuset.  Call with cpuset_sem held.
  *
- * This routine is needed to update the per-task mems_allowed
- * data, within the tasks context, when it is trying to allocate
- * memory (in various mm/mempolicy.c routines) and notices
- * that some other task has been modifying its cpuset.
+ * Be sure to call refresh_mems() on any cpuset operation which
+ * (1) holds cpuset_sem, and (2) might possibly alloc memory.
+ * Call after obtaining cpuset_sem lock, before any possible
+ * allocation.  Otherwise one risks trying to allocate memory
+ * while the task cpuset_mems_generation is not the same as
+ * the mems_generation in its cpuset, which would deadlock on
+ * cpuset_sem in cpuset_update_current_mems_allowed().
+ *
+ * Since we hold cpuset_sem, once refresh_mems() is called, the
+ * test (current->cpuset_mems_generation != cs->mems_generation)
+ * in cpuset_update_current_mems_allowed() will remain false,
+ * until we drop cpuset_sem.  Anyone else who would change our
+ * cpusets mems_generation needs to lock cpuset_sem first.
  */
 
 static void refresh_mems(void)
@@ -867,7 +840,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
 
-	cpuset_down(&cpuset_sem);
+	down(&cpuset_sem);
 
 	if (is_removed(cs)) {
 		retval = -ENODEV;
@@ -901,7 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	if (retval == 0)
 		retval = nbytes;
 out2:
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
@@ -941,9 +914,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 	cpumask_t mask;
 
-	cpuset_down(&cpuset_sem);
+	down(&cpuset_sem);
 	mask = cs->cpus_allowed;
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 
 	return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -952,9 +925,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 	nodemask_t mask;
 
-	cpuset_down(&cpuset_sem);
+	down(&cpuset_sem);
 	mask = cs->mems_allowed;
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -1351,7 +1324,8 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	if (!cs)
 		return -ENOMEM;
 
-	cpuset_down(&cpuset_sem);
+	down(&cpuset_sem);
+	refresh_mems();
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1376,14 +1350,14 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	 * will down() this new directory's i_sem and if we race with
 	 * another mkdir, we might deadlock.
 	 */
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 
 	err = cpuset_populate_dir(cs->dentry);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 	return 0;
 err:
 	list_del(&cs->sibling);
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 	kfree(cs);
 	return err;
 }
@@ -1405,13 +1379,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	/* the vfs holds both inode->i_sem already */
 
-	cpuset_down(&cpuset_sem);
+	down(&cpuset_sem);
+	refresh_mems();
 	if (atomic_read(&cs->count) > 0) {
-		cpuset_up(&cpuset_sem);
+		up(&cpuset_sem);
 		return -EBUSY;
 	}
 	if (!list_empty(&cs->children)) {
-		cpuset_up(&cpuset_sem);
+		up(&cpuset_sem);
 		return -EBUSY;
 	}
 	parent = cs->parent;
@@ -1427,7 +1402,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 	cpuset_release_agent(pathbuf);
 	return 0;
 }
@@ -1530,10 +1505,10 @@ void cpuset_exit(struct task_struct *tsk)
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
-		cpuset_down(&cpuset_sem);
+		down(&cpuset_sem);
 		if (atomic_dec_and_test(&cs->count))
 			check_for_release(cs, &pathbuf);
-		cpuset_up(&cpuset_sem);
+		up(&cpuset_sem);
 		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
@@ -1554,11 +1529,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
 {
 	cpumask_t mask;
 
-	cpuset_down(&cpuset_sem);
+	down(&cpuset_sem);
 	task_lock((struct task_struct *)tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
 	task_unlock((struct task_struct *)tsk);
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 
 	return mask;
 }
@@ -1583,9 +1558,9 @@ void cpuset_update_current_mems_allowed(void)
 	if (!cs)
 		return;		/* task is exiting */
 	if (current->cpuset_mems_generation != cs->mems_generation) {
-		cpuset_down(&cpuset_sem);
+		down(&cpuset_sem);
 		refresh_mems();
-		cpuset_up(&cpuset_sem);
+		up(&cpuset_sem);
 	}
 }
 
@@ -1684,14 +1659,14 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 		return 0;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	cpuset_down(&cpuset_sem);
+	down(&cpuset_sem);
 	cs = current->cpuset;
 	if (!cs)
 		goto done;		/* current task exiting */
 	cs = nearest_exclusive_ancestor(cs);
 	allowed = node_isset(node, cs->mems_allowed);
 done:
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 	return allowed;
 }
 
@@ -1712,7 +1687,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
 	int overlap = 0;		/* do cpusets overlap? */
 
-	cpuset_down(&cpuset_sem);
+	down(&cpuset_sem);
 	cs1 = current->cpuset;
 	if (!cs1)
 		goto done;		/* current task exiting */
@@ -1723,7 +1698,7 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	cs2 = nearest_exclusive_ancestor(cs2);
 	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
 done:
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 
 	return overlap;
 }
@@ -1746,7 +1721,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		return -ENOMEM;
 
 	tsk = m->private;
-	cpuset_down(&cpuset_sem);
+	down(&cpuset_sem);
 	task_lock(tsk);
 	cs = tsk->cpuset;
 	task_unlock(tsk);
@@ -1761,7 +1736,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
 out:
-	cpuset_up(&cpuset_sem);
+	up(&cpuset_sem);
 	kfree(buf);
 	return retval;
 }
-- 
cgit v1.2.3


From 053199edf54f685e7dea765b60d4d5e9070dadec Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 30 Oct 2005 15:02:30 -0800
Subject: [PATCH] cpusets: dual semaphore locking overhaul

Overhaul cpuset locking.  Replace single semaphore with two semaphores.

The suggestion to use two locks was made by Roman Zippel.

Both locks are global.  Code that wants to modify cpusets must first
acquire the exclusive manage_sem, which allows them read-only access to
cpusets, and holds off other would-be modifiers.  Before making actual
changes, the second semaphore, callback_sem must be acquired as well.  Code
that needs only to query cpusets must acquire callback_sem, which is also a
global exclusive lock.

The earlier problems with double tripping are avoided, because it is
allowed for holders of manage_sem to nest the second callback_sem lock, and
only callback_sem is needed by code called from within __alloc_pages(),
where the double tripping had been possible.

This is not quite the same as a normal read/write semaphore, because
obtaining read-only access with intent to change must hold off other such
attempts, while allowing read-only access w/o such intention.  Changing
cpusets involves several related checks and changes, which must be done
while allowing read-only queries (to avoid the double trip), but while
ensuring nothing changes (holding off other would be modifiers.)

This overhaul of cpuset locking also makes careful use of task_lock() to
guard access to the task->cpuset pointer, closing a couple of race
conditions noticed while reading this code (thanks, Roman).  I've never
seen these races fail in any use or test.

See further the comments in the code.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h |   2 +-
 kernel/cpuset.c       | 418 +++++++++++++++++++++++++++++++++-----------------
 2 files changed, 282 insertions(+), 138 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c30bc308ef1..b2d2dc14f0b9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1211,7 +1211,7 @@ extern void unhash_process(struct task_struct *p);
 /*
  * Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
- * pins the final release of task.io_context.
+ * pins the final release of task.io_context.  Also protects ->cpuset.
  *
  * Nests both inside and outside of read_lock(&tasklist_lock).
  * It must not be nested with write_lock_irq(&tasklist_lock),
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index cd54dba2be18..7491352276b2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -60,6 +60,9 @@ struct cpuset {
 	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
 	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
 
+	/*
+	 * Count is atomic so can incr (fork) or decr (exit) without a lock.
+	 */
 	atomic_t count;			/* count tasks using this cpuset */
 
 	/*
@@ -142,44 +145,91 @@ static struct vfsmount *cpuset_mount;
 static struct super_block *cpuset_sb = NULL;
 
 /*
- * cpuset_sem should be held by anyone who is depending on the children
- * or sibling lists of any cpuset, or performing non-atomic operations
- * on the flags or *_allowed values of a cpuset, such as raising the
- * CS_REMOVED flag bit iff it is not already raised, or reading and
- * conditionally modifying the *_allowed values.  One kernel global
- * cpuset semaphore should be sufficient - these things don't change
- * that much.
- *
- * The code that modifies cpusets holds cpuset_sem across the entire
- * operation, from cpuset_common_file_write() down, single threading
- * all cpuset modifications (except for counter manipulations from
- * fork and exit) across the system.  This presumes that cpuset
- * modifications are rare - better kept simple and safe, even if slow.
- *
- * The code that reads cpusets, such as in cpuset_common_file_read()
- * and below, only holds cpuset_sem across small pieces of code, such
- * as when reading out possibly multi-word cpumasks and nodemasks, as
- * the risks are less, and the desire for performance a little greater.
- * The proc_cpuset_show() routine needs to hold cpuset_sem to insure
- * that no cs->dentry is NULL, as it walks up the cpuset tree to root.
- *
- * The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
- * (usually) grab cpuset_sem.  These are the two most performance
- * critical pieces of code here.  The exception occurs on exit(),
- * when a task in a notify_on_release cpuset exits.  Then cpuset_sem
+ * We have two global cpuset semaphores below.  They can nest.
+ * It is ok to first take manage_sem, then nest callback_sem.  We also
+ * require taking task_lock() when dereferencing a tasks cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both semaphores to modify cpusets.  If a task
+ * holds manage_sem, then it blocks others wanting that semaphore,
+ * ensuring that it is the only task able to also acquire callback_sem
+ * and be able to modify cpusets.  It can perform various checks on
+ * the cpuset structure first, knowing nothing will change.  It can
+ * also allocate memory while just holding manage_sem.  While it is
+ * performing these checks, various callback routines can briefly
+ * acquire callback_sem to query cpusets.  Once it is ready to make
+ * the changes, it takes callback_sem, blocking everyone else.
+ *
+ * Calls to the kernel memory allocator can not be made while holding
+ * callback_sem, as that would risk double tripping on callback_sem
+ * from one of the callbacks into the cpuset code from within
+ * __alloc_pages().
+ *
+ * If a task is only holding callback_sem, then it has read-only
+ * access to cpusets.
+ *
+ * The task_struct fields mems_allowed and mems_generation may only
+ * be accessed in the context of that task, so require no locks.
+ *
+ * Any task can increment and decrement the count field without lock.
+ * So in general, code holding manage_sem or callback_sem can't rely
+ * on the count field not changing.  However, if the count goes to
+ * zero, then only attach_task(), which holds both semaphores, can
+ * increment it again.  Because a count of zero means that no tasks
+ * are currently attached, therefore there is no way a task attached
+ * to that cpuset can fork (the other way to increment the count).
+ * So code holding manage_sem or callback_sem can safely assume that
+ * if the count is zero, it will stay zero.  Similarly, if a task
+ * holds manage_sem or callback_sem on a cpuset with zero count, it
+ * knows that the cpuset won't be removed, as cpuset_rmdir() needs
+ * both of those semaphores.
+ *
+ * A possible optimization to improve parallelism would be to make
+ * callback_sem a R/W semaphore (rwsem), allowing the callback routines
+ * to proceed in parallel, with read access, until the holder of
+ * manage_sem needed to take this rwsem for exclusive write access
+ * and modify some cpusets.
+ *
+ * The cpuset_common_file_write handler for operations that modify
+ * the cpuset hierarchy holds manage_sem across the entire operation,
+ * single threading all such cpuset modifications across the system.
+ *
+ * The cpuset_common_file_read() handlers only hold callback_sem across
+ * small pieces of code, such as when reading out possibly multi-word
+ * cpumasks and nodemasks.
+ *
+ * The fork and exit callbacks cpuset_fork() and cpuset_exit(), don't
+ * (usually) take either semaphore.  These are the two most performance
+ * critical pieces of code here.  The exception occurs on cpuset_exit(),
+ * when a task in a notify_on_release cpuset exits.  Then manage_sem
  * is taken, and if the cpuset count is zero, a usermode call made
  * to /sbin/cpuset_release_agent with the name of the cpuset (path
  * relative to the root of cpuset file system) as the argument.
  *
- * A cpuset can only be deleted if both its 'count' of using tasks is
- * zero, and its list of 'children' cpusets is empty.  Since all tasks
- * in the system use _some_ cpuset, and since there is always at least
- * one task in the system (init, pid == 1), therefore, top_cpuset
- * always has either children cpusets and/or using tasks.  So no need
- * for any special hack to ensure that top_cpuset cannot be deleted.
+ * A cpuset can only be deleted if both its 'count' of using tasks
+ * is zero, and its list of 'children' cpusets is empty.  Since all
+ * tasks in the system use _some_ cpuset, and since there is always at
+ * least one task in the system (init, pid == 1), therefore, top_cpuset
+ * always has either children cpusets and/or using tasks.  So we don't
+ * need a special hack to ensure that top_cpuset cannot be deleted.
+ *
+ * The above "Tale of Two Semaphores" would be complete, but for:
+ *
+ *	The task_lock() exception
+ *
+ * The need for this exception arises from the action of attach_task(),
+ * which overwrites one tasks cpuset pointer with another.  It does
+ * so using both semaphores, however there are several performance
+ * critical places that need to reference task->cpuset without the
+ * expense of grabbing a system global semaphore.  Therefore except as
+ * noted below, when dereferencing or, as in attach_task(), modifying
+ * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
+ * (task->alloc_lock) already in the task_struct routinely used for
+ * such matters.
  */
 
-static DECLARE_MUTEX(cpuset_sem);
+static DECLARE_MUTEX(manage_sem);
+static DECLARE_MUTEX(callback_sem);
 
 /*
  * A couple of forward declarations required, due to cyclic reference loop:
@@ -354,7 +404,7 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 }
 
 /*
- * Call with cpuset_sem held.  Writes path of cpuset into buf.
+ * Call with manage_sem held.  Writes path of cpuset into buf.
  * Returns 0 on success, -errno on error.
  */
 
@@ -406,10 +456,11 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
  * status of the /sbin/cpuset_release_agent task, so no sense holding
  * our caller up for that.
  *
- * The simple act of forking that task might require more memory,
- * which might need cpuset_sem.  So this routine must be called while
- * cpuset_sem is not held, to avoid a possible deadlock.  See also
- * comments for check_for_release(), below.
+ * When we had only one cpuset semaphore, we had to call this
+ * without holding it, to avoid deadlock when call_usermodehelper()
+ * allocated memory.  With two locks, we could now call this while
+ * holding manage_sem, but we still don't, so as to minimize
+ * the time manage_sem is held.
  */
 
 static void cpuset_release_agent(const char *pathbuf)
@@ -441,15 +492,15 @@ static void cpuset_release_agent(const char *pathbuf)
  * cs is notify_on_release() and now both the user count is zero and
  * the list of children is empty, prepare cpuset path in a kmalloc'd
  * buffer, to be returned via ppathbuf, so that the caller can invoke
- * cpuset_release_agent() with it later on, once cpuset_sem is dropped.
- * Call here with cpuset_sem held.
+ * cpuset_release_agent() with it later on, once manage_sem is dropped.
+ * Call here with manage_sem held.
  *
  * This check_for_release() routine is responsible for kmalloc'ing
  * pathbuf.  The above cpuset_release_agent() is responsible for
  * kfree'ing pathbuf.  The caller of these routines is responsible
  * for providing a pathbuf pointer, initialized to NULL, then
- * calling check_for_release() with cpuset_sem held and the address
- * of the pathbuf pointer, then dropping cpuset_sem, then calling
+ * calling check_for_release() with manage_sem held and the address
+ * of the pathbuf pointer, then dropping manage_sem, then calling
  * cpuset_release_agent() with pathbuf, as set by check_for_release().
  */
 
@@ -480,7 +531,7 @@ static void check_for_release(struct cpuset *cs, char **ppathbuf)
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_map.
  *
- * Call with cpuset_sem held.
+ * Call with callback_sem held.
  */
 
 static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
@@ -504,7 +555,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, cpumask_t *pmask)
  * One way or another, we guarantee to return some non-empty subset
  * of node_online_map.
  *
- * Call with cpuset_sem held.
+ * Call with callback_sem held.
  */
 
 static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
@@ -519,31 +570,44 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 }
 
 /*
- * Refresh current tasks mems_allowed and mems_generation from
- * current tasks cpuset.  Call with cpuset_sem held.
- *
- * Be sure to call refresh_mems() on any cpuset operation which
- * (1) holds cpuset_sem, and (2) might possibly alloc memory.
- * Call after obtaining cpuset_sem lock, before any possible
- * allocation.  Otherwise one risks trying to allocate memory
- * while the task cpuset_mems_generation is not the same as
- * the mems_generation in its cpuset, which would deadlock on
- * cpuset_sem in cpuset_update_current_mems_allowed().
- *
- * Since we hold cpuset_sem, once refresh_mems() is called, the
- * test (current->cpuset_mems_generation != cs->mems_generation)
- * in cpuset_update_current_mems_allowed() will remain false,
- * until we drop cpuset_sem.  Anyone else who would change our
- * cpusets mems_generation needs to lock cpuset_sem first.
+ * Refresh current tasks mems_allowed and mems_generation from current
+ * tasks cpuset.
+ *
+ * Call without callback_sem or task_lock() held.  May be called with
+ * or without manage_sem held.  Will acquire task_lock() and might
+ * acquire callback_sem during call.
+ *
+ * The task_lock() is required to dereference current->cpuset safely.
+ * Without it, we could pick up the pointer value of current->cpuset
+ * in one instruction, and then attach_task could give us a different
+ * cpuset, and then the cpuset we had could be removed and freed,
+ * and then on our next instruction, we could dereference a no longer
+ * valid cpuset pointer to get its mems_generation field.
+ *
+ * This routine is needed to update the per-task mems_allowed data,
+ * within the tasks context, when it is trying to allocate memory
+ * (in various mm/mempolicy.c routines) and notices that some other
+ * task has been modifying its cpuset.
  */
 
 static void refresh_mems(void)
 {
-	struct cpuset *cs = current->cpuset;
+	int my_cpusets_mem_gen;
+
+	task_lock(current);
+	my_cpusets_mem_gen = current->cpuset->mems_generation;
+	task_unlock(current);
 
-	if (current->cpuset_mems_generation != cs->mems_generation) {
+	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
+		struct cpuset *cs;
+
+		down(&callback_sem);
+		task_lock(current);
+		cs = current->cpuset;
 		guarantee_online_mems(cs, &current->mems_allowed);
 		current->cpuset_mems_generation = cs->mems_generation;
+		task_unlock(current);
+		up(&callback_sem);
 	}
 }
 
@@ -552,7 +616,7 @@ static void refresh_mems(void)
  *
  * One cpuset is a subset of another if all its allowed CPUs and
  * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set.
+ * are only set if the other's are set.  Call holding manage_sem.
  */
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -570,7 +634,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  * If we replaced the flag and mask values of the current cpuset
  * (cur) with those values in the trial cpuset (trial), would
  * our various subset and exclusive rules still be valid?  Presumes
- * cpuset_sem held.
+ * manage_sem held.
  *
  * 'cur' is the address of an actual, in-use cpuset.  Operations
  * such as list traversal that depend on the actual address of the
@@ -624,7 +688,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *    exclusive child cpusets
  * Build these two partitions by calling partition_sched_domains
  *
- * Call with cpuset_sem held.  May nest a call to the
+ * Call with manage_sem held.  May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
  */
 
@@ -669,6 +733,10 @@ static void update_cpu_domains(struct cpuset *cur)
 	unlock_cpu_hotplug();
 }
 
+/*
+ * Call with manage_sem held.  May take callback_sem during call.
+ */
+
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
@@ -685,12 +753,18 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (retval < 0)
 		return retval;
 	cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	down(&callback_sem);
 	cs->cpus_allowed = trialcs.cpus_allowed;
+	up(&callback_sem);
 	if (is_cpu_exclusive(cs) && !cpus_unchanged)
 		update_cpu_domains(cs);
 	return 0;
 }
 
+/*
+ * Call with manage_sem held.  May take callback_sem during call.
+ */
+
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
@@ -705,9 +779,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 		return -ENOSPC;
 	retval = validate_change(cs, &trialcs);
 	if (retval == 0) {
+		down(&callback_sem);
 		cs->mems_allowed = trialcs.mems_allowed;
 		atomic_inc(&cpuset_mems_generation);
 		cs->mems_generation = atomic_read(&cpuset_mems_generation);
+		up(&callback_sem);
 	}
 	return retval;
 }
@@ -718,6 +794,8 @@ static int update_nodemask(struct cpuset *cs, char *buf)
  *						CS_NOTIFY_ON_RELEASE)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
+ *
+ * Call with manage_sem held.
  */
 
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
@@ -739,16 +817,27 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 		return err;
 	cpu_exclusive_changed =
 		(is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
+	down(&callback_sem);
 	if (turning_on)
 		set_bit(bit, &cs->flags);
 	else
 		clear_bit(bit, &cs->flags);
+	up(&callback_sem);
 
 	if (cpu_exclusive_changed)
                 update_cpu_domains(cs);
 	return 0;
 }
 
+/*
+ * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly
+ * writing the path of the old cpuset in 'ppathbuf' if it needs to be
+ * notified on release.
+ *
+ * Call holding manage_sem.  May take callback_sem and task_lock of
+ * the task 'pid' during call.
+ */
+
 static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 {
 	pid_t pid;
@@ -765,7 +854,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 		read_lock(&tasklist_lock);
 
 		tsk = find_task_by_pid(pid);
-		if (!tsk) {
+		if (!tsk || tsk->flags & PF_EXITING) {
 			read_unlock(&tasklist_lock);
 			return -ESRCH;
 		}
@@ -783,10 +872,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 		get_task_struct(tsk);
 	}
 
+	down(&callback_sem);
+
 	task_lock(tsk);
 	oldcs = tsk->cpuset;
 	if (!oldcs) {
 		task_unlock(tsk);
+		up(&callback_sem);
 		put_task_struct(tsk);
 		return -ESRCH;
 	}
@@ -797,6 +889,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	guarantee_online_cpus(cs, &cpus);
 	set_cpus_allowed(tsk, cpus);
 
+	up(&callback_sem);
 	put_task_struct(tsk);
 	if (atomic_dec_and_test(&oldcs->count))
 		check_for_release(oldcs, ppathbuf);
@@ -840,7 +933,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	}
 	buffer[nbytes] = 0;	/* nul-terminate */
 
-	down(&cpuset_sem);
+	down(&manage_sem);
 
 	if (is_removed(cs)) {
 		retval = -ENODEV;
@@ -874,7 +967,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	if (retval == 0)
 		retval = nbytes;
 out2:
-	up(&cpuset_sem);
+	up(&manage_sem);
 	cpuset_release_agent(pathbuf);
 out1:
 	kfree(buffer);
@@ -914,9 +1007,9 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	down(&callback_sem);
 	mask = cs->cpus_allowed;
-	up(&cpuset_sem);
+	up(&callback_sem);
 
 	return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -925,9 +1018,9 @@ static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
 	nodemask_t mask;
 
-	down(&cpuset_sem);
+	down(&callback_sem);
 	mask = cs->mems_allowed;
-	up(&cpuset_sem);
+	up(&callback_sem);
 
 	return nodelist_scnprintf(page, PAGE_SIZE, mask);
 }
@@ -1135,7 +1228,9 @@ struct ctr_struct {
 
 /*
  * Load into 'pidarray' up to 'npids' of the tasks using cpuset 'cs'.
- * Return actual number of pids loaded.
+ * Return actual number of pids loaded.  No need to task_lock(p)
+ * when reading out p->cpuset, as we don't really care if it changes
+ * on the next cycle, and we are not going to try to dereference it.
  */
 static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
 {
@@ -1177,6 +1272,12 @@ static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
 	return cnt;
 }
 
+/*
+ * Handle an open on 'tasks' file.  Prepare a buffer listing the
+ * process id's of tasks currently attached to the cpuset being opened.
+ *
+ * Does not require any specific cpuset semaphores, and does not take any.
+ */
 static int cpuset_tasks_open(struct inode *unused, struct file *file)
 {
 	struct cpuset *cs = __d_cs(file->f_dentry->d_parent);
@@ -1324,7 +1425,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	if (!cs)
 		return -ENOMEM;
 
-	down(&cpuset_sem);
+	down(&manage_sem);
 	refresh_mems();
 	cs->flags = 0;
 	if (notify_on_release(parent))
@@ -1339,25 +1440,27 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 
 	cs->parent = parent;
 
+	down(&callback_sem);
 	list_add(&cs->sibling, &cs->parent->children);
+	up(&callback_sem);
 
 	err = cpuset_create_dir(cs, name, mode);
 	if (err < 0)
 		goto err;
 
 	/*
-	 * Release cpuset_sem before cpuset_populate_dir() because it
+	 * Release manage_sem before cpuset_populate_dir() because it
 	 * will down() this new directory's i_sem and if we race with
 	 * another mkdir, we might deadlock.
 	 */
-	up(&cpuset_sem);
+	up(&manage_sem);
 
 	err = cpuset_populate_dir(cs->dentry);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 	return 0;
 err:
 	list_del(&cs->sibling);
-	up(&cpuset_sem);
+	up(&manage_sem);
 	kfree(cs);
 	return err;
 }
@@ -1379,30 +1482,32 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 
 	/* the vfs holds both inode->i_sem already */
 
-	down(&cpuset_sem);
+	down(&manage_sem);
 	refresh_mems();
 	if (atomic_read(&cs->count) > 0) {
-		up(&cpuset_sem);
+		up(&manage_sem);
 		return -EBUSY;
 	}
 	if (!list_empty(&cs->children)) {
-		up(&cpuset_sem);
+		up(&manage_sem);
 		return -EBUSY;
 	}
 	parent = cs->parent;
+	down(&callback_sem);
 	set_bit(CS_REMOVED, &cs->flags);
 	if (is_cpu_exclusive(cs))
 		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
-	if (list_empty(&parent->children))
-		check_for_release(parent, &pathbuf);
 	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
 	cs->dentry = NULL;
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
-	up(&cpuset_sem);
+	up(&callback_sem);
+	if (list_empty(&parent->children))
+		check_for_release(parent, &pathbuf);
+	up(&manage_sem);
 	cpuset_release_agent(pathbuf);
 	return 0;
 }
@@ -1462,16 +1567,26 @@ void __init cpuset_init_smp(void)
  * cpuset_fork - attach newly forked task to its parents cpuset.
  * @tsk: pointer to task_struct of forking parent process.
  *
- * Description: By default, on fork, a task inherits its
- * parent's cpuset.  The pointer to the shared cpuset is
- * automatically copied in fork.c by dup_task_struct().
- * This cpuset_fork() routine need only increment the usage
- * counter in that cpuset.
+ * Description: A task inherits its parent's cpuset at fork().
+ *
+ * A pointer to the shared cpuset was automatically copied in fork.c
+ * by dup_task_struct().  However, we ignore that copy, since it was
+ * not made under the protection of task_lock(), so might no longer be
+ * a valid cpuset pointer.  attach_task() might have already changed
+ * current->cpuset, allowing the previously referenced cpuset to
+ * be removed and freed.  Instead, we task_lock(current) and copy
+ * its present value of current->cpuset for our freshly forked child.
+ *
+ * At the point that cpuset_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
  **/
 
-void cpuset_fork(struct task_struct *tsk)
+void cpuset_fork(struct task_struct *child)
 {
-	atomic_inc(&tsk->cpuset->count);
+	task_lock(current);
+	child->cpuset = current->cpuset;
+	atomic_inc(&child->cpuset->count);
+	task_unlock(current);
 }
 
 /**
@@ -1480,35 +1595,42 @@ void cpuset_fork(struct task_struct *tsk)
  *
  * Description: Detach cpuset from @tsk and release it.
  *
- * Note that cpusets marked notify_on_release force every task
- * in them to take the global cpuset_sem semaphore when exiting.
- * This could impact scaling on very large systems.  Be reluctant
- * to use notify_on_release cpusets where very high task exit
- * scaling is required on large systems.
- *
- * Don't even think about derefencing 'cs' after the cpuset use
- * count goes to zero, except inside a critical section guarded
- * by the cpuset_sem semaphore.  If you don't hold cpuset_sem,
- * then a zero cpuset use count is a license to any other task to
- * nuke the cpuset immediately.
+ * Note that cpusets marked notify_on_release force every task in
+ * them to take the global manage_sem semaphore when exiting.
+ * This could impact scaling on very large systems.  Be reluctant to
+ * use notify_on_release cpusets where very high task exit scaling
+ * is required on large systems.
+ *
+ * Don't even think about derefencing 'cs' after the cpuset use count
+ * goes to zero, except inside a critical section guarded by manage_sem
+ * or callback_sem.   Otherwise a zero cpuset use count is a license to
+ * any other task to nuke the cpuset immediately, via cpuset_rmdir().
+ *
+ * This routine has to take manage_sem, not callback_sem, because
+ * it is holding that semaphore while calling check_for_release(),
+ * which calls kmalloc(), so can't be called holding callback__sem().
+ *
+ * We don't need to task_lock() this reference to tsk->cpuset,
+ * because tsk is already marked PF_EXITING, so attach_task() won't
+ * mess with it.
  **/
 
 void cpuset_exit(struct task_struct *tsk)
 {
 	struct cpuset *cs;
 
-	task_lock(tsk);
+	BUG_ON(!(tsk->flags & PF_EXITING));
+
 	cs = tsk->cpuset;
 	tsk->cpuset = NULL;
-	task_unlock(tsk);
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
 
-		down(&cpuset_sem);
+		down(&manage_sem);
 		if (atomic_dec_and_test(&cs->count))
 			check_for_release(cs, &pathbuf);
-		up(&cpuset_sem);
+		up(&manage_sem);
 		cpuset_release_agent(pathbuf);
 	} else {
 		atomic_dec(&cs->count);
@@ -1529,11 +1651,11 @@ cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
 {
 	cpumask_t mask;
 
-	down(&cpuset_sem);
+	down(&callback_sem);
 	task_lock((struct task_struct *)tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
 	task_unlock((struct task_struct *)tsk);
-	up(&cpuset_sem);
+	up(&callback_sem);
 
 	return mask;
 }
@@ -1549,19 +1671,28 @@ void cpuset_init_current_mems_allowed(void)
  * If the current tasks cpusets mems_allowed changed behind our backs,
  * update current->mems_allowed and mems_generation to the new value.
  * Do not call this routine if in_interrupt().
+ *
+ * Call without callback_sem or task_lock() held.  May be called
+ * with or without manage_sem held.  Unless exiting, it will acquire
+ * task_lock().  Also might acquire callback_sem during call to
+ * refresh_mems().
  */
 
 void cpuset_update_current_mems_allowed(void)
 {
-	struct cpuset *cs = current->cpuset;
+	struct cpuset *cs;
+	int need_to_refresh = 0;
 
+	task_lock(current);
+	cs = current->cpuset;
 	if (!cs)
-		return;		/* task is exiting */
-	if (current->cpuset_mems_generation != cs->mems_generation) {
-		down(&cpuset_sem);
+		goto done;
+	if (current->cpuset_mems_generation != cs->mems_generation)
+		need_to_refresh = 1;
+done:
+	task_unlock(current);
+	if (need_to_refresh)
 		refresh_mems();
-		up(&cpuset_sem);
-	}
 }
 
 /**
@@ -1595,7 +1726,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 
 /*
  * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset.  Call while holding cpuset_sem.
+ * ancestor to the specified cpuset.  Call holding callback_sem.
  * If no ancestor is mem_exclusive (an unusual configuration), then
  * returns the root cpuset.
  */
@@ -1622,12 +1753,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * GFP_KERNEL allocations are not so marked, so can escape to the
  * nearest mem_exclusive ancestor cpuset.
  *
- * Scanning up parent cpusets requires cpuset_sem.  The __alloc_pages()
+ * Scanning up parent cpusets requires callback_sem.  The __alloc_pages()
  * routine only calls here with __GFP_HARDWALL bit _not_ set if
  * it's a GFP_KERNEL allocation, and all nodes in the current tasks
  * mems_allowed came up empty on the first pass over the zonelist.
  * So only GFP_KERNEL allocations, if all nodes in the cpuset are
- * short of memory, might require taking the cpuset_sem semaphore.
+ * short of memory, might require taking the callback_sem semaphore.
  *
  * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages()
  * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing
@@ -1659,14 +1790,16 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 		return 0;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	down(&cpuset_sem);
-	cs = current->cpuset;
-	if (!cs)
-		goto done;		/* current task exiting */
-	cs = nearest_exclusive_ancestor(cs);
+	down(&callback_sem);
+
+	if (current->flags & PF_EXITING) /* Let dying task have memory */
+		return 1;
+	task_lock(current);
+	cs = nearest_exclusive_ancestor(current->cpuset);
+	task_unlock(current);
+
 	allowed = node_isset(node, cs->mems_allowed);
-done:
-	up(&cpuset_sem);
+	up(&callback_sem);
 	return allowed;
 }
 
@@ -1679,7 +1812,7 @@ done:
  * determine if task @p's memory usage might impact the memory
  * available to the current task.
  *
- * Acquires cpuset_sem - not suitable for calling from a fast path.
+ * Acquires callback_sem - not suitable for calling from a fast path.
  **/
 
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -1687,18 +1820,27 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
 	int overlap = 0;		/* do cpusets overlap? */
 
-	down(&cpuset_sem);
-	cs1 = current->cpuset;
-	if (!cs1)
-		goto done;		/* current task exiting */
-	cs2 = p->cpuset;
-	if (!cs2)
-		goto done;		/* task p is exiting */
-	cs1 = nearest_exclusive_ancestor(cs1);
-	cs2 = nearest_exclusive_ancestor(cs2);
+	down(&callback_sem);
+
+	task_lock(current);
+	if (current->flags & PF_EXITING) {
+		task_unlock(current);
+		goto done;
+	}
+	cs1 = nearest_exclusive_ancestor(current->cpuset);
+	task_unlock(current);
+
+	task_lock((struct task_struct *)p);
+	if (p->flags & PF_EXITING) {
+		task_unlock((struct task_struct *)p);
+		goto done;
+	}
+	cs2 = nearest_exclusive_ancestor(p->cpuset);
+	task_unlock((struct task_struct *)p);
+
 	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
 done:
-	up(&cpuset_sem);
+	up(&callback_sem);
 
 	return overlap;
 }
@@ -1707,6 +1849,10 @@ done:
  * proc_cpuset_show()
  *  - Print tasks cpuset path into seq_file.
  *  - Used for /proc/<pid>/cpuset.
+ *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
+ *    doesn't really matter if tsk->cpuset changes after we read it,
+ *    and we take manage_sem, keeping attach_task() from changing it
+ *    anyway.
  */
 
 static int proc_cpuset_show(struct seq_file *m, void *v)
@@ -1721,10 +1867,8 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 		return -ENOMEM;
 
 	tsk = m->private;
-	down(&cpuset_sem);
-	task_lock(tsk);
+	down(&manage_sem);
 	cs = tsk->cpuset;
-	task_unlock(tsk);
 	if (!cs) {
 		retval = -EINVAL;
 		goto out;
@@ -1736,7 +1880,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 	seq_puts(m, buf);
 	seq_putc(m, '\n');
 out:
-	up(&cpuset_sem);
+	up(&manage_sem);
 	kfree(buf);
 	return retval;
 }
-- 
cgit v1.2.3


From 18a19cb3047e454ee5ecbc35d7acf3f8e09e0466 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 30 Oct 2005 15:02:31 -0800
Subject: [PATCH] cpusets: simple rename

Add support for renaming cpusets.  Only allow simple rename of cpuset
directories in place.  Don't allow moving cpusets elsewhere in hierarchy or
renaming the special cpuset files in each cpuset directory.

The usefulness of this simple rename became apparent when developing task
migration facilities.  It allows building a second cpuset hierarchy using
new names and containing new CPUs and Memory Nodes, moving tasks from the
old to the new cpusets, removing the old cpusets, and then renaming the new
cpusets to be just like the old names, so that any knowledge that the tasks
had of their cpuset names will still be valid.

Leaf node cpusets can be migrated to other CPUs or Memory Nodes by just
updating their 'cpus' and 'mems' files, but because no cpuset can contain
CPUs or Nodes not in its parent cpuset, one cannot do this in a cpuset
hierarchy without first expanding all the non-leaf cpusets to contain the
union of both the old and new CPUs and Nodes, which would obfuscate the
one-to-one migration of a task from one cpuset to another required to
correctly migrate the physical page frames currently allocated to that
task.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7491352276b2..6633f3fb6417 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1113,6 +1113,21 @@ static int cpuset_file_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/*
+ * cpuset_rename - Only allow simple rename of directories in place.
+ */
+static int cpuset_rename(struct inode *old_dir, struct dentry *old_dentry,
+                  struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (!S_ISDIR(old_dentry->d_inode->i_mode))
+		return -ENOTDIR;
+	if (new_dentry->d_inode)
+		return -EEXIST;
+	if (old_dir != new_dir)
+		return -EIO;
+	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
 static struct file_operations cpuset_file_operations = {
 	.read = cpuset_file_read,
 	.write = cpuset_file_write,
@@ -1125,6 +1140,7 @@ static struct inode_operations cpuset_dir_inode_operations = {
 	.lookup = simple_lookup,
 	.mkdir = cpuset_mkdir,
 	.rmdir = cpuset_rmdir,
+	.rename = cpuset_rename,
 };
 
 static int cpuset_create_file(struct dentry *dentry, int mode)
-- 
cgit v1.2.3


From 68860ec10bcc07ab4f89f9d940e3b77ae5ca13b3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 30 Oct 2005 15:02:36 -0800
Subject: [PATCH] cpusets: automatic numa mempolicy rebinding

This patch automatically updates a tasks NUMA mempolicy when its cpuset
memory placement changes.  It does so within the context of the task,
without any need to support low level external mempolicy manipulation.

If a system is not using cpusets, or if running on a system with just the
root (all-encompassing) cpuset, then this remap is a no-op.  Only when a
task is moved between cpusets, or a cpusets memory placement is changed
does the following apply.  Otherwise, the main routine below,
rebind_policy() is not even called.

When mixing cpusets, scheduler affinity, and NUMA mempolicies, the
essential role of cpusets is to place jobs (several related tasks) on a set
of CPUs and Memory Nodes, the essential role of sched_setaffinity is to
manage a jobs processor placement within its allowed cpuset, and the
essential role of NUMA mempolicy (mbind, set_mempolicy) is to manage a jobs
memory placement within its allowed cpuset.

However, CPU affinity and NUMA memory placement are managed within the
kernel using absolute system wide numbering, not cpuset relative numbering.

This is ok until a job is migrated to a different cpuset, or what's the
same, a jobs cpuset is moved to different CPUs and Memory Nodes.

Then the CPU affinity and NUMA memory placement of the tasks in the job
need to be updated, to preserve their cpuset-relative position.  This can
be done for CPU affinity using sched_setaffinity() from user code, as one
task can modify anothers CPU affinity.  This cannot be done from an
external task for NUMA memory placement, as that can only be modified in
the context of the task using it.

However, it easy enough to remap a tasks NUMA mempolicy automatically when
a task is migrated, using the existing cpuset mechanism to trigger a
refresh of a tasks memory placement after its cpuset has changed.  All that
is needed is the old and new nodemask, and notice to the task that it needs
to rebind its mempolicy.  The tasks mems_allowed has the old mask, the
tasks cpuset has the new mask, and the existing
cpuset_update_current_mems_allowed() mechanism provides the notice.  The
bitmap/cpumask/nodemask remap operators provide the cpuset relative
calculations.

This patch leaves open a couple of issues:

 1) Updating vma and shmfs/tmpfs/hugetlbfs memory policies:

    These mempolicies may reference nodes outside of those allowed to
    the current task by its cpuset.  Tasks are migrated as part of jobs,
    which reside on what might be several cpusets in a subtree.  When such
    a job is migrated, all NUMA memory policy references to nodes within
    that cpuset subtree should be translated, and references to any nodes
    outside that subtree should be left untouched.  A future patch will
    provide the cpuset mechanism needed to mark such subtrees.  With that
    patch, we will be able to correctly migrate these other memory policies
    across a job migration.

 2) Updating cpuset, affinity and memory policies in user space:

    This is harder.  Any placement state stored in user space using
    system-wide numbering will be invalidated across a migration.  More
    work will be required to provide user code with a migration-safe means
    to manage its cpuset relative placement, while preserving the current
    API's that pass system wide numbers, not cpuset relative numbers across
    the kernel-user boundary.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |  6 +++++
 kernel/cpuset.c           |  4 +++
 mm/mempolicy.c            | 64 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 74 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 7af8cb836e78..8b67cf837ca9 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -154,6 +154,7 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
 
 extern void numa_default_policy(void);
 extern void numa_policy_init(void);
+extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
 extern struct mempolicy default_policy;
 
 #else
@@ -226,6 +227,11 @@ static inline void numa_default_policy(void)
 {
 }
 
+static inline void numa_policy_rebind(const nodemask_t *old,
+					const nodemask_t *new)
+{
+}
+
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6633f3fb6417..5a737ed9dac7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -32,6 +32,7 @@
 #include <linux/kernel.h>
 #include <linux/kmod.h>
 #include <linux/list.h>
+#include <linux/mempolicy.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mount.h>
@@ -600,6 +601,7 @@ static void refresh_mems(void)
 
 	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
 		struct cpuset *cs;
+		nodemask_t oldmem = current->mems_allowed;
 
 		down(&callback_sem);
 		task_lock(current);
@@ -608,6 +610,8 @@ static void refresh_mems(void)
 		current->cpuset_mems_generation = cs->mems_generation;
 		task_unlock(current);
 		up(&callback_sem);
+		if (!nodes_equal(oldmem, current->mems_allowed))
+			numa_policy_rebind(&oldmem, &current->mems_allowed);
 	}
 }
 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 2076b1542b8a..5abc57c2b8bd 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -457,6 +457,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	struct vm_area_struct *vma = NULL;
 	struct mempolicy *pol = current->mempolicy;
 
+	cpuset_update_current_mems_allowed();
 	if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 		return -EINVAL;
 	if (flags & MPOL_F_ADDR) {
@@ -1206,3 +1207,66 @@ void numa_default_policy(void)
 {
 	do_set_mempolicy(MPOL_DEFAULT, NULL);
 }
+
+/* Migrate a policy to a different set of nodes */
+static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
+							const nodemask_t *new)
+{
+	nodemask_t tmp;
+
+	if (!pol)
+		return;
+
+	switch (pol->policy) {
+	case MPOL_DEFAULT:
+		break;
+	case MPOL_INTERLEAVE:
+		nodes_remap(tmp, pol->v.nodes, *old, *new);
+		pol->v.nodes = tmp;
+		current->il_next = node_remap(current->il_next, *old, *new);
+		break;
+	case MPOL_PREFERRED:
+		pol->v.preferred_node = node_remap(pol->v.preferred_node,
+								*old, *new);
+		break;
+	case MPOL_BIND: {
+		nodemask_t nodes;
+		struct zone **z;
+		struct zonelist *zonelist;
+
+		nodes_clear(nodes);
+		for (z = pol->v.zonelist->zones; *z; z++)
+			node_set((*z)->zone_pgdat->node_id, nodes);
+		nodes_remap(tmp, nodes, *old, *new);
+		nodes = tmp;
+
+		zonelist = bind_zonelist(&nodes);
+
+		/* If no mem, then zonelist is NULL and we keep old zonelist.
+		 * If that old zonelist has no remaining mems_allowed nodes,
+		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
+		 */
+
+		if (zonelist) {
+			/* Good - got mem - substitute new zonelist */
+			kfree(pol->v.zonelist);
+			pol->v.zonelist = zonelist;
+		}
+		break;
+	}
+	default:
+		BUG();
+		break;
+	}
+}
+
+/*
+ * Someone moved this task to different nodes.  Fixup mempolicies.
+ *
+ * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
+ * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ */
+void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+{
+	rebind_policy(current->mempolicy, old, new);
+}
-- 
cgit v1.2.3


From 30e0fca6c1d7d26f3f2daa4dd2b12c51dadc778a Mon Sep 17 00:00:00 2001
From: Andrea Arcangeli <andrea@suse.de>
Date: Sun, 30 Oct 2005 15:02:38 -0800
Subject: [PATCH] ptrace/coredump/exit_group deadlock

I could seldom reproduce a deadlock with a task not killable in T state
(TASK_STOPPED, not TASK_TRACED) by attaching a NPTL threaded program to
gdb, by segfaulting the task and triggering a core dump while some other
task is executing exit_group and while one task is in ptrace_attached
TASK_STOPPED state (not TASK_TRACED yet).  This originated from a gdb
bugreport (the fact gdb was segfaulting the task wasn't a kernel bug), but
I just incidentally noticed the gdb bug triggered a real kernel bug as
well.

Most threads hangs in exit_mm because the core_dumping is still going, the
core dumping hangs because the stopped task doesn't exit, the stopped task
can't wakeup because it has SIGNAL_GROUP_EXIT set, hence the deadlock.

To me it seems that the problem is that the force_sig_specific(SIGKILL) in
zap_threads is a noop if the task has PF_PTRACED set (like in this case
because gdb is attached).  The __ptrace_unlink does nothing because the
signal->flags is set to SIGNAL_GROUP_EXIT|SIGNAL_STOP_DEQUEUED (verified).

The above info also shows that the stopped task hit a race and got the stop
signal (presumably by the ptrace_attach, only the attach, state is still
TASK_STOPPED and gdb hangs waiting the core before it can set it to
TASK_TRACED) after one of the thread invoked the core dump (it's the core
dump that sets signal->flags to SIGNAL_GROUP_EXIT).

So beside the fact nobody would wakeup the task in __ptrace_unlink (the
state is _not_ TASK_TRACED), there's a secondary problem in the signal
handling code, where a task should ignore the ptrace-sigstops as long as
SIGNAL_GROUP_EXIT is set (or the wakeup in __ptrace_unlink path wouldn't be
enough).

So I attempted to make this patch that seems to fix the problem.  There
were various ways to fix it, perhaps you prefer a different one, I just
opted to the one that looked safer to me.

I also removed the clearing of the stopped bits from the zap_other_threads
(zap_other_threads was safe unlike zap_threads).  I don't like useless
code, this whole NPTL signal/ptrace thing is already unreadable enough and
full of corner cases without confusing useless code into it to make it even
less readable.  And if this code is really needed, then you may want to
explain why it's not being done in the other paths that sets
SIGNAL_GROUP_EXIT at least.

Even after this patch I still wonder who serializes the read of
p->ptrace in zap_threads.

Patch is called ptrace-core_dump-exit_group-deadlock-1.

This was the trace I've got:

test          T ffff81003e8118c0     0 14305      1         14311 14309 (NOTLB)
ffff810058ccdde8 0000000000000082 000001f4000037e1 ffff810000000013
       00000000000000f8 ffff81003e811b00 ffff81003e8118c0 ffff810011362100
       0000000000000012 ffff810017ca4180
Call Trace:<ffffffff801317ed>{try_to_wake_up+893} <ffffffff80141677>{finish_stop+87}
       <ffffffff8014367f>{get_signal_to_deliver+1359} <ffffffff8010d3ad>{do_signal+157}
       <ffffffff8013deee>{ptrace_check_attach+222} <ffffffff80111575>{sys_ptrace+2293}
       <ffffffff80131810>{default_wake_function+0} <ffffffff80196399>{sys_ioctl+73}
       <ffffffff8010dd27>{sysret_signal+28} <ffffffff8010e00f>{ptregscall_common+103}

test          D ffff810011362100     0 14309      1         14305 14312 (NOTLB)
ffff810053c81cf8 0000000000000082 0000000000000286 0000000000000001
       0000000000000195 ffff810011362340 ffff810011362100 ffff81002e338040
       ffff810001e0ca80 0000000000000001
Call Trace:<ffffffff801317ed>{try_to_wake_up+893} <ffffffff8044677d>{wait_for_completion+173}
       <ffffffff80131810>{default_wake_function+0} <ffffffff80137435>{exit_mm+149}
       <ffffffff801381af>{do_exit+479} <ffffffff80138d0c>{do_group_exit+252}
       <ffffffff801436db>{get_signal_to_deliver+1451} <ffffffff8010d3ad>{do_signal+157}
       <ffffffff8013deee>{ptrace_check_attach+222} <ffffffff80140850>{specific_send_sig_info+2

       <ffffffff8014208a>{force_sig_info+186} <ffffffff804479a0>{do_int3+112}
       <ffffffff8010e308>{retint_signal+61}
test          D ffff81002e338040     0 14311      1         14716 14305 (NOTLB)
ffff81005ca8dcf8 0000000000000082 0000000000000286 0000000000000001
       0000000000000120 ffff81002e338280 ffff81002e338040 ffff8100481cb740
       ffff810001e0ca80 0000000000000001
Call Trace:<ffffffff801317ed>{try_to_wake_up+893} <ffffffff8044677d>{wait_for_completion+173}
       <ffffffff80131810>{default_wake_function+0} <ffffffff80137435>{exit_mm+149}
       <ffffffff801381af>{do_exit+479} <ffffffff80142d0e>{__dequeue_signal+558}
       <ffffffff80138d0c>{do_group_exit+252} <ffffffff801436db>{get_signal_to_deliver+1451}
       <ffffffff8010d3ad>{do_signal+157} <ffffffff8013deee>{ptrace_check_attach+222}
       <ffffffff80140850>{specific_send_sig_info+208} <ffffffff8014208a>{force_sig_info+186}
       <ffffffff804479a0>{do_int3+112} <ffffffff8010e308>{retint_signal+61}

test          D ffff810017ca4180     0 14312      1         14309 13882 (NOTLB)
ffff81005d15fcb8 0000000000000082 ffff81005d15fc58 ffffffff80130816
       0000000000000897 ffff810017ca43c0 ffff810017ca4180 ffff81003e8118c0
       0000000000000082 ffffffff801317ed
Call Trace:<ffffffff80130816>{activate_task+150} <ffffffff801317ed>{try_to_wake_up+893}
       <ffffffff8044677d>{wait_for_completion+173} <ffffffff80131810>{default_wake_function+0}
       <ffffffff8018cdc3>{do_coredump+819} <ffffffff80445f52>{thread_return+82}
       <ffffffff801436d4>{get_signal_to_deliver+1444} <ffffffff8010d3ad>{do_signal+157}
       <ffffffff8013deee>{ptrace_check_attach+222} <ffffffff80140850>{specific_send_sig_info+2

       <ffffffff804472e5>{_spin_unlock_irqrestore+5} <ffffffff8014208a>{force_sig_info+186}
       <ffffffff804476ff>{do_general_protection+159} <ffffffff8010e308>{retint_signal+61}

Signed-off-by: Andrea Arcangeli <andrea@suse.de>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/ptrace.c | 7 +++++--
 kernel/signal.c | 6 +++---
 2 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 019e04ec065a..863eee8bff47 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -56,6 +56,10 @@ void ptrace_untrace(task_t *child)
 			signal_wake_up(child, 1);
 		}
 	}
+	if (child->signal->flags & SIGNAL_GROUP_EXIT) {
+		sigaddset(&child->pending.signal, SIGKILL);
+		signal_wake_up(child, 1);
+	}
 	spin_unlock(&child->sighand->siglock);
 }
 
@@ -77,8 +81,7 @@ void __ptrace_unlink(task_t *child)
 		SET_LINKS(child);
 	}
 
-	if (child->state == TASK_TRACED)
-		ptrace_untrace(child);
+	ptrace_untrace(child);
 }
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index 1d905ec74bde..9d1512dcf176 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1108,8 +1108,8 @@ void zap_other_threads(struct task_struct *p)
 		if (t != p->group_leader)
 			t->exit_signal = -1;
 
+		/* SIGKILL will be handled before any pending SIGSTOP */
 		sigaddset(&t->pending.signal, SIGKILL);
-		rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
 		signal_wake_up(t, 1);
 	}
 }
@@ -1879,9 +1879,9 @@ relock:
 			/* Let the debugger run.  */
 			ptrace_stop(signr, signr, info);
 
-			/* We're back.  Did the debugger cancel the sig?  */
+			/* We're back.  Did the debugger cancel the sig or group_exit? */
 			signr = current->exit_code;
-			if (signr == 0)
+			if (signr == 0 || current->signal->flags & SIGNAL_GROUP_EXIT)
 				continue;
 
 			current->exit_code = 0;
-- 
cgit v1.2.3


From 20e1129ab831486c811d50d3905343ad48c4275f Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Sun, 30 Oct 2005 15:02:44 -0800
Subject: [PATCH] Keys: Get rid of warning in kmod.c if keys disabled

The attached patch gets rid of a "statement without effect" warning when
CONFIG_KEYS is disabled by making use of the return value of key_get().
The compiler will optimise all of this away when keys are disabled.

Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kmod.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kmod.c b/kernel/kmod.c
index 44166e3bb8af..51a892063aaa 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -131,14 +131,14 @@ struct subprocess_info {
 static int ____call_usermodehelper(void *data)
 {
 	struct subprocess_info *sub_info = data;
-	struct key *old_session;
+	struct key *new_session, *old_session;
 	int retval;
 
 	/* Unblock all signals and set the session keyring. */
-	key_get(sub_info->ring);
+	new_session = key_get(sub_info->ring);
 	flush_signals(current);
 	spin_lock_irq(&current->sighand->siglock);
-	old_session = __install_session_keyring(current, sub_info->ring);
+	old_session = __install_session_keyring(current, new_session);
 	flush_signal_handlers(current, 1);
 	sigemptyset(&current->blocked);
 	recalc_sigpending();
-- 
cgit v1.2.3


From 40dc565122ed1e180a0637f88cdfca734d33db78 Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jesper.juhl@gmail.com>
Date: Sun, 30 Oct 2005 15:02:46 -0800
Subject: [PATCH] cleanup for kernel/printk.c

- Removes some trailing whitespace

- Breaks long lines and make other small changes to conform to CodingStyle

- Add explicit printk loglevels in two places.

Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 78 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 42 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 4b8f0f9230a4..3cb9708209bc 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -10,7 +10,7 @@
  * elsewhere, in preparation for a serial line console (someday).
  * Ted Ts'o, 2/11/93.
  * Modified for sysctl support, 1/8/97, Chris Horn.
- * Fixed SMP synchronization, 08/08/99, Manfred Spraul 
+ * Fixed SMP synchronization, 08/08/99, Manfred Spraul
  *     manfreds@colorfullife.com
  * Rewrote bits to get rid of console_lock
  *	01Mar01 Andrew Morton <andrewm@uow.edu.au>
@@ -148,7 +148,7 @@ static int __init console_setup(char *str)
 	if (!strcmp(str, "ttyb"))
 		strcpy(name, "ttyS1");
 #endif
-	for(s = name; *s; s++)
+	for (s = name; *s; s++)
 		if ((*s >= '0' && *s <= '9') || *s == ',')
 			break;
 	idx = simple_strtoul(s, NULL, 10);
@@ -169,11 +169,11 @@ static int __init log_buf_len_setup(char *str)
 		size = roundup_pow_of_two(size);
 	if (size > log_buf_len) {
 		unsigned long start, dest_idx, offset;
-		char * new_log_buf;
+		char *new_log_buf;
 
 		new_log_buf = alloc_bootmem(size);
 		if (!new_log_buf) {
-			printk("log_buf_len: allocation failed\n");
+			printk(KERN_WARNING "log_buf_len: allocation failed\n");
 			goto out;
 		}
 
@@ -193,10 +193,9 @@ static int __init log_buf_len_setup(char *str)
 		log_end -= offset;
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 
-		printk("log_buf_len: %d\n", log_buf_len);
+		printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len);
 	}
 out:
-
 	return 1;
 }
 
@@ -217,7 +216,7 @@ __setup("log_buf_len=", log_buf_len_setup);
  *	9 -- Return number of unread characters in the log buffer
  *     10 -- Return size of the log buffer
  */
-int do_syslog(int type, char __user * buf, int len)
+int do_syslog(int type, char __user *buf, int len)
 {
 	unsigned long i, j, limit, count;
 	int do_clear = 0;
@@ -244,7 +243,8 @@ int do_syslog(int type, char __user * buf, int len)
 			error = -EFAULT;
 			goto out;
 		}
-		error = wait_event_interruptible(log_wait, (log_start - log_end));
+		error = wait_event_interruptible(log_wait,
+							(log_start - log_end));
 		if (error)
 			goto out;
 		i = 0;
@@ -264,7 +264,7 @@ int do_syslog(int type, char __user * buf, int len)
 			error = i;
 		break;
 	case 4:		/* Read/clear last kernel messages */
-		do_clear = 1; 
+		do_clear = 1;
 		/* FALL THRU */
 	case 3:		/* Read last kernel messages */
 		error = -EINVAL;
@@ -288,11 +288,11 @@ int do_syslog(int type, char __user * buf, int len)
 		limit = log_end;
 		/*
 		 * __put_user() could sleep, and while we sleep
-		 * printk() could overwrite the messages 
+		 * printk() could overwrite the messages
 		 * we try to copy to user space. Therefore
 		 * the messages are copied in reverse. <manfreds>
 		 */
-		for(i = 0; i < count && !error; i++) {
+		for (i = 0; i < count && !error; i++) {
 			j = limit-1-i;
 			if (j + log_buf_len < log_end)
 				break;
@@ -306,10 +306,10 @@ int do_syslog(int type, char __user * buf, int len)
 		if (error)
 			break;
 		error = i;
-		if(i != count) {
+		if (i != count) {
 			int offset = count-error;
 			/* buffer overflow during copy, correct user buffer. */
-			for(i=0;i<error;i++) {
+			for (i = 0; i < error; i++) {
 				if (__get_user(c,&buf[i+offset]) ||
 				    __put_user(c,&buf[i])) {
 					error = -EFAULT;
@@ -351,7 +351,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_syslog(int type, char __user * buf, int len)
+asmlinkage long sys_syslog(int type, char __user *buf, int len)
 {
 	return do_syslog(type, buf, len);
 }
@@ -404,21 +404,19 @@ static void call_console_drivers(unsigned long start, unsigned long end)
 	cur_index = start;
 	start_print = start;
 	while (cur_index != end) {
-		if (	msg_level < 0 &&
-			((end - cur_index) > 2) &&
-			LOG_BUF(cur_index + 0) == '<' &&
-			LOG_BUF(cur_index + 1) >= '0' &&
-			LOG_BUF(cur_index + 1) <= '7' &&
-			LOG_BUF(cur_index + 2) == '>')
-		{
+		if (msg_level < 0 && ((end - cur_index) > 2) &&
+				LOG_BUF(cur_index + 0) == '<' &&
+				LOG_BUF(cur_index + 1) >= '0' &&
+				LOG_BUF(cur_index + 1) <= '7' &&
+				LOG_BUF(cur_index + 2) == '>') {
 			msg_level = LOG_BUF(cur_index + 1) - '0';
 			cur_index += 3;
 			start_print = cur_index;
 		}
 		while (cur_index != end) {
 			char c = LOG_BUF(cur_index);
-			cur_index++;
 
+			cur_index++;
 			if (c == '\n') {
 				if (msg_level < 0) {
 					/*
@@ -461,7 +459,7 @@ static void zap_locks(void)
 	static unsigned long oops_timestamp;
 
 	if (time_after_eq(jiffies, oops_timestamp) &&
-			!time_after(jiffies, oops_timestamp + 30*HZ))
+			!time_after(jiffies, oops_timestamp + 30 * HZ))
 		return;
 
 	oops_timestamp = jiffies;
@@ -495,7 +493,7 @@ __attribute__((weak)) unsigned long long printk_clock(void)
 
 /*
  * This is printk.  It can be called from any context.  We want it to work.
- * 
+ *
  * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
  * call the console drivers.  If we fail to get the semaphore we place the output
  * into the log buffer and return.  The current holder of the console_sem will
@@ -639,13 +637,19 @@ EXPORT_SYMBOL(vprintk);
 
 #else
 
-asmlinkage long sys_syslog(int type, char __user * buf, int len)
+asmlinkage long sys_syslog(int type, char __user *buf, int len)
 {
 	return 0;
 }
 
-int do_syslog(int type, char __user * buf, int len) { return 0; }
-static void call_console_drivers(unsigned long start, unsigned long end) {}
+int do_syslog(int type, char __user *buf, int len)
+{
+	return 0;
+}
+
+static void call_console_drivers(unsigned long start, unsigned long end)
+{
+}
 
 #endif
 
@@ -851,9 +855,9 @@ EXPORT_SYMBOL(console_start);
  * print any messages that were printed by the kernel before the
  * console driver was initialized.
  */
-void register_console(struct console * console)
+void register_console(struct console *console)
 {
-	int     i;
+	int i;
 	unsigned long flags;
 
 	if (preferred_console < 0)
@@ -878,7 +882,8 @@ void register_console(struct console * console)
 	 *	See if this console matches one we selected on
 	 *	the command line.
 	 */
-	for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) {
+	for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0];
+			i++) {
 		if (strcmp(console_cmdline[i].name, console->name) != 0)
 			continue;
 		if (console->index >= 0 &&
@@ -933,9 +938,9 @@ void register_console(struct console * console)
 }
 EXPORT_SYMBOL(register_console);
 
-int unregister_console(struct console * console)
+int unregister_console(struct console *console)
 {
-        struct console *a,*b;
+        struct console *a, *b;
 	int res = 1;
 
 	acquire_console_sem();
@@ -949,10 +954,10 @@ int unregister_console(struct console * console)
 				b->next = a->next;
 				res = 0;
 				break;
-			}  
+			}
 		}
 	}
-	
+
 	/* If last console is removed, we re-enable picking the first
 	 * one that gets registered. Without that, pmac early boot console
 	 * would prevent fbcon from taking over.
@@ -994,7 +999,7 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 {
 	static DEFINE_SPINLOCK(ratelimit_lock);
-	static unsigned long toks = 10*5*HZ;
+	static unsigned long toks = 10 * 5 * HZ;
 	static unsigned long last_msg;
 	static int missed;
 	unsigned long flags;
@@ -1007,6 +1012,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 		toks = ratelimit_burst * ratelimit_jiffies;
 	if (toks >= ratelimit_jiffies) {
 		int lost = missed;
+
 		missed = 0;
 		toks -= ratelimit_jiffies;
 		spin_unlock_irqrestore(&ratelimit_lock, flags);
@@ -1021,7 +1027,7 @@ int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 EXPORT_SYMBOL(__printk_ratelimit);
 
 /* minimum time in jiffies between messages */
-int printk_ratelimit_jiffies = 5*HZ;
+int printk_ratelimit_jiffies = 5 * HZ;
 
 /* number of messages we send before ratelimiting */
 int printk_ratelimit_burst = 10;
-- 
cgit v1.2.3


From 7407251a0e2ed099e4b12b742b635503e981507c Mon Sep 17 00:00:00 2001
From: Coywolf Qi Hunt <qiyong@fc-cn.com>
Date: Sun, 30 Oct 2005 15:02:47 -0800
Subject: [PATCH] PF_DEAD cleanup

The PF_DEAD setting doesn't belong to exit_notify(), move it to a proper
place.

Signed-off-by: Coywolf Qi Hunt <qiyong@fc-cn.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 79f52b85d6ed..6ef8f7356a74 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -783,10 +783,6 @@ static void exit_notify(struct task_struct *tsk)
 	/* If the process is dead, release it - nobody will wait for it */
 	if (state == EXIT_DEAD)
 		release_task(tsk);
-
-	/* PF_DEAD causes final put_task_struct after we schedule. */
-	preempt_disable();
-	tsk->flags |= PF_DEAD;
 }
 
 fastcall NORET_TYPE void do_exit(long code)
@@ -873,7 +869,11 @@ fastcall NORET_TYPE void do_exit(long code)
 	tsk->mempolicy = NULL;
 #endif
 
-	BUG_ON(!(current->flags & PF_DEAD));
+	/* PF_DEAD causes final put_task_struct after we schedule. */
+	preempt_disable();
+	BUG_ON(tsk->flags & PF_DEAD);
+	tsk->flags |= PF_DEAD;
+
 	schedule();
 	BUG();
 	/* Avoid "noreturn function does return".  */
-- 
cgit v1.2.3


From 7f2a52555998c699a7e89f24636c909d6fc08a60 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Sun, 30 Oct 2005 15:02:50 -0800
Subject: [PATCH] wait4 PTRACE_ATTACH race fix

Back about a year ago when I last fiddled heavily with the do_wait code, I
was thinking too hard about the wrong thing and I now think I introduced a
bug whose inverse thought I was fixing.

Apparently noone was looking too hard over much shoulder, so as to cite my
bogus reasoning at the time.  In the race condition when PTRACE_ATTACH is
about to steal a child and then the child hits a tracing event (what
my_ptrace_child checks for), the real parent does need to set its flag
noting it has some eligible live children.  Otherwise a spurious ECHILD
error is possible, since the child in question is not yet on the
ptrace_children list.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 6ef8f7356a74..2d39ccc367e6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1383,6 +1383,15 @@ repeat:
 
 			switch (p->state) {
 			case TASK_TRACED:
+				/*
+				 * When we hit the race with PTRACE_ATTACH,
+				 * we will not report this child.  But the
+				 * race means it has not yet been moved to
+				 * our ptrace_children list, so we need to
+				 * set the flag here to avoid a spurious ECHILD
+				 * when the race happens with the only child.
+				 */
+				flag = 1;
 				if (!my_ptrace_child(p))
 					continue;
 				/*FALLTHROUGH*/
-- 
cgit v1.2.3


From ecea8d19c9f0ebd62ddaa07fc919ff4e4b820d99 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 30 Oct 2005 15:03:00 -0800
Subject: [PATCH] jiffies_64 cleanup

Define jiffies_64 in kernel/timer.c rather than having 24 duplicated
defines in each architecture.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/kernel/time.c     | 4 ----
 arch/arm/kernel/time.c       | 4 ----
 arch/arm26/kernel/time.c     | 4 ----
 arch/cris/kernel/time.c      | 4 ----
 arch/frv/kernel/time.c       | 3 ---
 arch/h8300/kernel/time.c     | 4 ----
 arch/i386/kernel/time.c      | 4 ----
 arch/ia64/kernel/time.c      | 4 ----
 arch/m32r/kernel/time.c      | 4 ----
 arch/m68k/kernel/time.c      | 4 ----
 arch/m68knommu/kernel/time.c | 4 ----
 arch/mips/kernel/time.c      | 4 ----
 arch/parisc/kernel/time.c    | 4 ----
 arch/ppc/kernel/time.c       | 5 -----
 arch/ppc64/kernel/time.c     | 4 ----
 arch/s390/kernel/time.c      | 4 ----
 arch/sh/kernel/time.c        | 4 ----
 arch/sh64/kernel/time.c      | 2 --
 arch/sparc/kernel/time.c     | 4 ----
 arch/sparc64/kernel/time.c   | 4 ----
 arch/um/kernel/time_kern.c   | 4 ----
 arch/v850/kernel/time.c      | 4 ----
 arch/x86_64/kernel/time.c    | 4 ----
 arch/xtensa/kernel/time.c    | 3 ---
 kernel/timer.c               | 4 ++++
 25 files changed, 4 insertions(+), 93 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 67be50b7d80a..6b2921be1909 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -55,10 +55,6 @@
 #include "proto.h"
 #include "irq_impl.h"
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 extern unsigned long wall_jiffies;	/* kernel/timer.c */
 
 static int set_rtc_mmss(unsigned long);
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index 69449a818dcc..fc4729106a32 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -36,10 +36,6 @@
 #include <asm/thread_info.h>
 #include <asm/mach/time.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /*
  * Our system timer.
  */
diff --git a/arch/arm26/kernel/time.c b/arch/arm26/kernel/time.c
index e66aedd02fad..335525339ad6 100644
--- a/arch/arm26/kernel/time.c
+++ b/arch/arm26/kernel/time.c
@@ -34,10 +34,6 @@
 #include <asm/irq.h>
 #include <asm/ioc.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 extern unsigned long wall_jiffies;
 
 /* this needs a better home */
diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c
index a2d99b4aedcd..b863815de78d 100644
--- a/arch/cris/kernel/time.c
+++ b/arch/cris/kernel/time.c
@@ -32,10 +32,6 @@
 #include <linux/init.h>
 #include <linux/profile.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 int have_rtc;  /* used to remember if we have an RTC or not */;
 
 #define TICK_SIZE tick
diff --git a/arch/frv/kernel/time.c b/arch/frv/kernel/time.c
index f43b734482e3..2e9741227b73 100644
--- a/arch/frv/kernel/time.c
+++ b/arch/frv/kernel/time.c
@@ -34,9 +34,6 @@
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-EXPORT_SYMBOL(jiffies_64);
-
 unsigned long __nongprelbss __clkin_clock_speed_HZ;
 unsigned long __nongprelbss __ext_bus_clock_speed_HZ;
 unsigned long __nongprelbss __res_bus_clock_speed_HZ;
diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c
index af8c5d2057dd..688a5100604c 100644
--- a/arch/h8300/kernel/time.c
+++ b/arch/h8300/kernel/time.c
@@ -32,10 +32,6 @@
 
 #define	TICK_SIZE (tick_nsec / 1000)
 
-u64 jiffies_64;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /*
  * timer_interrupt() needs to keep up the real-time clock,
  * as well as call the "do_timer()" routine every clocktick
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 46c35ec9137d..07471bba2dc6 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -74,10 +74,6 @@ int pit_latch_buggy;              /* extern */
 
 #include "do_timer.h"
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 unsigned int cpu_khz;	/* Detected as we calibrate the TSC */
 EXPORT_SYMBOL(cpu_khz);
 
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 8b8a5a45b621..5b7e736f3b49 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -32,10 +32,6 @@
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 #define TIME_KEEPER_ID	0	/* smp_processor_id() of time-keeper */
 
 #ifdef CONFIG_IA64_DEBUG_IRQ
diff --git a/arch/m32r/kernel/time.c b/arch/m32r/kernel/time.c
index 539c562cd54d..2ebce2063fea 100644
--- a/arch/m32r/kernel/time.c
+++ b/arch/m32r/kernel/time.c
@@ -39,10 +39,6 @@ extern void send_IPI_allbutself(int, int);
 extern void smp_local_timer_interrupt(struct pt_regs *);
 #endif
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 extern unsigned long wall_jiffies;
 #define TICK_SIZE	(tick_nsec / 1000)
 
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index 4ec95e3cb874..98e4b1adfa29 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -27,10 +27,6 @@
 #include <linux/timex.h>
 #include <linux/profile.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 static inline int set_rtc_mmss(unsigned long nowtime)
 {
   if (mach_set_clock_mmss)
diff --git a/arch/m68knommu/kernel/time.c b/arch/m68knommu/kernel/time.c
index b17c1ecba966..b9d8abb45430 100644
--- a/arch/m68knommu/kernel/time.c
+++ b/arch/m68knommu/kernel/time.c
@@ -27,10 +27,6 @@
 
 #define	TICK_SIZE (tick_nsec / 1000)
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 extern unsigned long wall_jiffies;
 
 
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index a24651dfaaba..787ed541d442 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -45,10 +45,6 @@
 
 #define TICK_SIZE	(tick_nsec / 1000)
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /*
  * forward reference
  */
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index bc979e1abdec..cded25680787 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -33,10 +33,6 @@
 
 #include <linux/timex.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /* xtime and wall_jiffies keep wall-clock time */
 extern unsigned long wall_jiffies;
 
diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c
index 22d7fd1e0aea..67797184f4eb 100644
--- a/arch/ppc/kernel/time.c
+++ b/arch/ppc/kernel/time.c
@@ -66,11 +66,6 @@
 
 #include <asm/time.h>
 
-/* XXX false sharing with below? */
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 unsigned long disarm_decr[NR_CPUS];
 
 extern struct timezone sys_tz;
diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c
index b56c6a324e17..ab565468578a 100644
--- a/arch/ppc64/kernel/time.c
+++ b/arch/ppc64/kernel/time.c
@@ -68,10 +68,6 @@
 #include <asm/systemcfg.h>
 #include <asm/firmware.h>
 
-u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /* keep track of when we need to update the rtc */
 time_t last_rtc_update;
 extern int piranha_simulator;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 2fd75da15495..9a1d95894f3d 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -49,10 +49,6 @@
 
 #define TICK_SIZE tick
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 static ext_int_info_t ext_int_info_cc;
 static u64 init_timer_cc;
 static u64 jiffies_timer_cc;
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index 02ca69918d7c..671b876416bf 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -56,10 +56,6 @@ extern unsigned long wall_jiffies;
 #define TICK_SIZE (tick_nsec / 1000)
 DEFINE_SPINLOCK(tmu0_lock);
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /* XXX: Can we initialize this in a routine somewhere?  Dreamcast doesn't want
  * these routines anywhere... */
 #ifdef CONFIG_SH_RTC
diff --git a/arch/sh64/kernel/time.c b/arch/sh64/kernel/time.c
index 43e395a14f49..870fe5327e09 100644
--- a/arch/sh64/kernel/time.c
+++ b/arch/sh64/kernel/time.c
@@ -116,8 +116,6 @@
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
 static unsigned long tmu_base, rtc_base;
 unsigned long cprc_base;
 
diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c
index 279a62627c10..24814d58f9e1 100644
--- a/arch/sparc/kernel/time.c
+++ b/arch/sparc/kernel/time.c
@@ -45,10 +45,6 @@
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 DEFINE_SPINLOCK(rtc_lock);
 enum sparc_clock_type sp_clock_typ;
 DEFINE_SPINLOCK(mostek_lock);
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index 3f08a32f51a1..38c5525087a2 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -55,10 +55,6 @@ unsigned long ds1287_regs = 0UL;
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 static void __iomem *mstk48t08_regs;
 static void __iomem *mstk48t59_regs;
 
diff --git a/arch/um/kernel/time_kern.c b/arch/um/kernel/time_kern.c
index 4e08f7545d63..020ca79b8d33 100644
--- a/arch/um/kernel/time_kern.c
+++ b/arch/um/kernel/time_kern.c
@@ -22,10 +22,6 @@
 #include "mode.h"
 #include "os.h"
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 int hz(void)
 {
 	return(HZ);
diff --git a/arch/v850/kernel/time.c b/arch/v850/kernel/time.c
index ea3fd8844ff0..c1e85c2aef65 100644
--- a/arch/v850/kernel/time.c
+++ b/arch/v850/kernel/time.c
@@ -26,10 +26,6 @@
 
 #include "mach.h"
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 #define TICK_SIZE	(tick_nsec / 1000)
 
 /*
diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 47d25ad08160..bd5ea09bfdf9 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -42,10 +42,6 @@
 #include <asm/apic.h>
 #endif
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 #ifdef CONFIG_CPU_FREQ
 static void cpufreq_delayed_get(void);
 #endif
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 8e423d1335ce..cb6e38ed2b1d 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -29,9 +29,6 @@
 
 extern volatile unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-EXPORT_SYMBOL(jiffies_64);
-
 spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
 EXPORT_SYMBOL(rtc_lock);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index 562d53eabb46..fd74268d8663 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -46,6 +46,10 @@ static void time_interpolator_update(long delta_nsec);
 #define time_interpolator_update(x)
 #endif
 
+u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+
+EXPORT_SYMBOL(jiffies_64);
+
 /*
  * per-CPU timer vector definitions:
  */
-- 
cgit v1.2.3


From a241ec65aeac3d69a08a7b153cccbdb7ea35063f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Sun, 30 Oct 2005 15:03:12 -0800
Subject: [PATCH] RCU torture-testing kernel module

This patch is a rewrite of the one submitted on October 1st, using modules
(http://marc.theaimsgroup.com/?l=linux-kernel&m=112819093522998&w=2).

This rewrite adds a tristate CONFIG_RCU_TORTURE_TEST, which enables an
intense torture test of the RCU infratructure.  This is needed due to the
continued changes to the RCU infrastructure to accommodate dynamic ticks,
CPU hotplug, realtime, and so on.  Most of the code is in a separate file
that is compiled only if the CONFIG variable is set.  Documentation on how
to run the test and interpret the output is also included.

This code has been tested on i386 and ppc64, and an earlier version of the
code has received extensive testing on a number of architectures as part of
the PREEMPT_RT patchset.

Signed-off-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/RCU/torture.txt | 122 +++++++++++
 include/linux/rcupdate.h      |   1 +
 kernel/Makefile               |   1 +
 kernel/rcupdate.c             |  10 +
 kernel/rcutorture.c           | 492 ++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug             |  21 ++
 mm/mmap.c                     |   2 +-
 7 files changed, 648 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/RCU/torture.txt
 create mode 100644 kernel/rcutorture.c

(limited to 'kernel')

diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
new file mode 100644
index 000000000000..e4c38152f7f7
--- /dev/null
+++ b/Documentation/RCU/torture.txt
@@ -0,0 +1,122 @@
+RCU Torture Test Operation
+
+
+CONFIG_RCU_TORTURE_TEST
+
+The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
+implementations.  It creates an rcutorture kernel module that can
+be loaded to run a torture test.  The test periodically outputs
+status messages via printk(), which can be examined via the dmesg
+command (perhaps grepping for "rcutorture").  The test is started
+when the module is loaded, and stops when the module is unloaded.
+
+However, actually setting this config option to "y" results in the system
+running the test immediately upon boot, and ending only when the system
+is taken down.  Normally, one will instead want to build the system
+with CONFIG_RCU_TORTURE_TEST=m and to use modprobe and rmmod to control
+the test, perhaps using a script similar to the one shown at the end of
+this document.  Note that you will need CONFIG_MODULE_UNLOAD in order
+to be able to end the test.
+
+
+MODULE PARAMETERS
+
+This module has the following parameters:
+
+nreaders	This is the number of RCU reading threads supported.
+		The default is twice the number of CPUs.  Why twice?
+		To properly exercise RCU implementations with preemptible
+		read-side critical sections.
+
+stat_interval	The number of seconds between output of torture
+		statistics (via printk()).  Regardless of the interval,
+		statistics are printed when the module is unloaded.
+		Setting the interval to zero causes the statistics to
+		be printed -only- when the module is unloaded, and this
+		is the default.
+
+verbose		Enable debug printk()s.  Default is disabled.
+
+
+OUTPUT
+
+The statistics output is as follows:
+
+	rcutorture: --- Start of test: nreaders=16 stat_interval=0 verbose=0
+	rcutorture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915
+	rcutorture: Reader Pipe:  1466408 9747 0 0 0 0 0 0 0 0 0
+	rcutorture: Reader Batch:  1464477 11678 0 0 0 0 0 0 0 0
+	rcutorture: Free-Block Circulation:  1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0
+	rcutorture: --- End of test
+
+The command "dmesg | grep rcutorture:" will extract this information on
+most systems.  On more esoteric configurations, it may be necessary to
+use other commands to access the output of the printk()s used by
+the RCU torture test.  The printk()s use KERN_ALERT, so they should
+be evident.  ;-)
+
+The entries are as follows:
+
+o	"ggp": The number of counter flips (or batches) since boot.
+
+o	"rtc": The hexadecimal address of the structure currently visible
+	to readers.
+
+o	"ver": The number of times since boot that the rcutw writer task
+	has changed the structure visible to readers.
+
+o	"tfle": If non-zero, indicates that the "torture freelist"
+	containing structure to be placed into the "rtc" area is empty.
+	This condition is important, since it can fool you into thinking
+	that RCU is working when it is not.  :-/
+
+o	"rta": Number of structures allocated from the torture freelist.
+
+o	"rtaf": Number of allocations from the torture freelist that have
+	failed due to the list being empty.
+
+o	"rtf": Number of frees into the torture freelist.
+
+o	"Reader Pipe": Histogram of "ages" of structures seen by readers.
+	If any entries past the first two are non-zero, RCU is broken.
+	And rcutorture prints the error flag string "!!!" to make sure
+	you notice.  The age of a newly allocated structure is zero,
+	it becomes one when removed from reader visibility, and is
+	incremented once per grace period subsequently -- and is freed
+	after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods.
+
+	The output displayed above was taken from a correctly working
+	RCU.  If you want to see what it looks like when broken, break
+	it yourself.  ;-)
+
+o	"Reader Batch": Another histogram of "ages" of structures seen
+	by readers, but in terms of counter flips (or batches) rather
+	than in terms of grace periods.  The legal number of non-zero
+	entries is again two.  The reason for this separate view is
+	that it is easier to get the third entry to show up in the
+	"Reader Batch" list than in the "Reader Pipe" list.
+
+o	"Free-Block Circulation": Shows the number of torture structures
+	that have reached a given point in the pipeline.  The first element
+	should closely correspond to the number of structures allocated,
+	the second to the number that have been removed from reader view,
+	and all but the last remaining to the corresponding number of
+	passes through a grace period.  The last entry should be zero,
+	as it is only incremented if a torture structure's counter
+	somehow gets incremented farther than it should.
+
+
+USAGE
+
+The following script may be used to torture RCU:
+
+	#!/bin/sh
+
+	modprobe rcutorture
+	sleep 100
+	rmmod rcutorture
+	dmesg | grep rcutorture:
+
+The output can be manually inspected for the error flag of "!!!".
+One could of course create a more elaborate script that automatically
+checked for such errors.
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 70191a5a148f..cce25591eec2 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -275,6 +275,7 @@ static inline int rcu_pending(int cpu)
 extern void rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
 extern void rcu_restart_cpu(int cpu);
+extern long rcu_batches_completed(void);
 
 /* Exported interfaces */
 extern void FASTCALL(call_rcu(struct rcu_head *head, 
diff --git a/kernel/Makefile b/kernel/Makefile
index 980b5e454441..4f5a1453093a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -31,6 +31,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2559d4b8f23f..c4d159a21e04 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -153,6 +153,15 @@ void fastcall call_rcu_bh(struct rcu_head *head,
 	local_irq_restore(flags);
 }
 
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_ctrlblk.completed;
+}
+
 /*
  * Invoke the completed RCU callbacks. They are expected to be in
  * a per-cpu list.
@@ -501,6 +510,7 @@ void synchronize_kernel(void)
 }
 
 module_param(maxbatch, int, 0);
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
 EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
new file mode 100644
index 000000000000..9b58f1eff3ca
--- /dev/null
+++ b/kernel/rcutorture.c
@@ -0,0 +1,492 @@
+/*
+ * Read-Copy Update /proc-based torture test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ *
+ * See also:  Documentation/RCU/torture.txt
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcuref.h>
+#include <linux/cpu.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/byteorder/swabb.h>
+#include <linux/stat.h>
+
+MODULE_LICENSE("GPL");
+
+static int nreaders = -1;	/* # reader threads, defaults to 4*ncpus */
+static int stat_interval = 0;	/* Interval between stats, in seconds. */
+				/*  Defaults to "only at end of test". */
+static int verbose = 0;		/* Print more debug info. */
+
+MODULE_PARM(nreaders, "i");
+MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
+MODULE_PARM(stat_interval, "i");
+MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
+MODULE_PARM(verbose, "i");
+MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
+#define TORTURE_FLAG "rcutorture: "
+#define PRINTK_STRING(s) \
+	do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+#define VERBOSE_PRINTK_STRING(s) \
+	do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+#define VERBOSE_PRINTK_ERRSTRING(s) \
+	do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
+
+static char printk_buf[4096];
+
+static int nrealreaders;
+static struct task_struct *writer_task;
+static struct task_struct **reader_tasks;
+static struct task_struct *stats_task;
+
+#define RCU_TORTURE_PIPE_LEN 10
+
+struct rcu_torture {
+	struct rcu_head rtort_rcu;
+	int rtort_pipe_count;
+	struct list_head rtort_free;
+};
+
+static int fullstop = 0;	/* stop generating callbacks at test end. */
+static LIST_HEAD(rcu_torture_freelist);
+static struct rcu_torture *rcu_torture_current = NULL;
+static long rcu_torture_current_version = 0;
+static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
+static DEFINE_SPINLOCK(rcu_torture_lock);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
+	{ 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
+	{ 0 };
+static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+atomic_t n_rcu_torture_alloc;
+atomic_t n_rcu_torture_alloc_fail;
+atomic_t n_rcu_torture_free;
+
+/*
+ * Allocate an element from the rcu_tortures pool.
+ */
+struct rcu_torture *
+rcu_torture_alloc(void)
+{
+	struct list_head *p;
+
+	spin_lock(&rcu_torture_lock);
+	if (list_empty(&rcu_torture_freelist)) {
+		atomic_inc(&n_rcu_torture_alloc_fail);
+		spin_unlock(&rcu_torture_lock);
+		return NULL;
+	}
+	atomic_inc(&n_rcu_torture_alloc);
+	p = rcu_torture_freelist.next;
+	list_del_init(p);
+	spin_unlock(&rcu_torture_lock);
+	return container_of(p, struct rcu_torture, rtort_free);
+}
+
+/*
+ * Free an element to the rcu_tortures pool.
+ */
+static void
+rcu_torture_free(struct rcu_torture *p)
+{
+	atomic_inc(&n_rcu_torture_free);
+	spin_lock(&rcu_torture_lock);
+	list_add_tail(&p->rtort_free, &rcu_torture_freelist);
+	spin_unlock(&rcu_torture_lock);
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+	int i;
+	struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+	if (fullstop) {
+		/* Test is ending, just drop callbacks on the floor. */
+		/* The next initialization will pick up the pieces. */
+		return;
+	}
+	i = rp->rtort_pipe_count;
+	if (i > RCU_TORTURE_PIPE_LEN)
+		i = RCU_TORTURE_PIPE_LEN;
+	atomic_inc(&rcu_torture_wcount[i]);
+	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN)
+		rcu_torture_free(rp);
+	else
+		call_rcu(p, rcu_torture_cb);
+}
+
+struct rcu_random_state {
+	unsigned long rrs_state;
+	unsigned long rrs_count;
+};
+
+#define RCU_RANDOM_MULT 39916801  /* prime */
+#define RCU_RANDOM_ADD	479001701 /* prime */
+#define RCU_RANDOM_REFRESH 10000
+
+#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static long
+rcu_random(struct rcu_random_state *rrsp)
+{
+	long refresh;
+
+	if (--rrsp->rrs_count < 0) {
+		get_random_bytes(&refresh, sizeof(refresh));
+		rrsp->rrs_state += refresh;
+		rrsp->rrs_count = RCU_RANDOM_REFRESH;
+	}
+	rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
+	return swahw32(rrsp->rrs_state);
+}
+
+/*
+ * RCU torture writer kthread.  Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+	int i;
+	long oldbatch = rcu_batches_completed();
+	struct rcu_torture *rp;
+	struct rcu_torture *old_rp;
+	static DEFINE_RCU_RANDOM(rand);
+
+	VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
+	do {
+		schedule_timeout_uninterruptible(1);
+		if (rcu_batches_completed() == oldbatch)
+			continue;
+		if ((rp = rcu_torture_alloc()) == NULL)
+			continue;
+		rp->rtort_pipe_count = 0;
+		udelay(rcu_random(&rand) & 0x3ff);
+		old_rp = rcu_torture_current;
+		rcu_assign_pointer(rcu_torture_current, rp);
+		smp_wmb();
+		if (old_rp != NULL) {
+			i = old_rp->rtort_pipe_count;
+			if (i > RCU_TORTURE_PIPE_LEN)
+				i = RCU_TORTURE_PIPE_LEN;
+			atomic_inc(&rcu_torture_wcount[i]);
+			old_rp->rtort_pipe_count++;
+			call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
+		}
+		rcu_torture_current_version++;
+		oldbatch = rcu_batches_completed();
+	} while (!kthread_should_stop() && !fullstop);
+	VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	return 0;
+}
+
+/*
+ * RCU torture reader kthread.  Repeatedly dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array.  The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static int
+rcu_torture_reader(void *arg)
+{
+	int completed;
+	DEFINE_RCU_RANDOM(rand);
+	struct rcu_torture *p;
+	int pipe_count;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
+	do {
+		rcu_read_lock();
+		completed = rcu_batches_completed();
+		p = rcu_dereference(rcu_torture_current);
+		if (p == NULL) {
+			/* Wait for rcu_torture_writer to get underway */
+			rcu_read_unlock();
+			schedule_timeout_interruptible(HZ);
+			continue;
+		}
+		udelay(rcu_random(&rand) & 0x7f);
+		preempt_disable();
+		pipe_count = p->rtort_pipe_count;
+		if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+			/* Should not happen, but... */
+			pipe_count = RCU_TORTURE_PIPE_LEN;
+		}
+		++__get_cpu_var(rcu_torture_count)[pipe_count];
+		completed = rcu_batches_completed() - completed;
+		if (completed > RCU_TORTURE_PIPE_LEN) {
+			/* Should not happen, but... */
+			completed = RCU_TORTURE_PIPE_LEN;
+		}
+		++__get_cpu_var(rcu_torture_batch)[completed];
+		preempt_enable();
+		rcu_read_unlock();
+		schedule();
+	} while (!kthread_should_stop() && !fullstop);
+	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	return 0;
+}
+
+/*
+ * Create an RCU-torture statistics message in the specified buffer.
+ */
+static int
+rcu_torture_printk(char *page)
+{
+	int cnt = 0;
+	int cpu;
+	int i;
+	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+
+	for_each_cpu(cpu) {
+		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+			pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
+			batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+		}
+	}
+	for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) {
+		if (pipesummary[i] != 0)
+			break;
+	}
+	cnt += sprintf(&page[cnt], "rcutorture: ");
+	cnt += sprintf(&page[cnt],
+		       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d",
+		       rcu_torture_current,
+		       rcu_torture_current_version,
+		       list_empty(&rcu_torture_freelist),
+		       atomic_read(&n_rcu_torture_alloc),
+		       atomic_read(&n_rcu_torture_alloc_fail),
+		       atomic_read(&n_rcu_torture_free));
+	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	if (i > 1)
+		cnt += sprintf(&page[cnt], "!!! ");
+	cnt += sprintf(&page[cnt], "Reader Pipe: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+		cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
+	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	cnt += sprintf(&page[cnt], "Reader Batch: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
+		cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
+	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+		cnt += sprintf(&page[cnt], " %d",
+			       atomic_read(&rcu_torture_wcount[i]));
+	}
+	cnt += sprintf(&page[cnt], "\n");
+	return cnt;
+}
+
+/*
+ * Print torture statistics.  Caller must ensure that there is only
+ * one call to this function at a given time!!!  This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when rcu_torture_stats
+ * thread is not running).
+ */
+static void
+rcu_torture_stats_print(void)
+{
+	int cnt;
+
+	cnt = rcu_torture_printk(printk_buf);
+	printk(KERN_ALERT "%s", printk_buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int
+rcu_torture_stats(void *arg)
+{
+	VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
+	do {
+		schedule_timeout_interruptible(stat_interval * HZ);
+		rcu_torture_stats_print();
+	} while (!kthread_should_stop());
+	VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
+	return 0;
+}
+
+static void
+rcu_torture_cleanup(void)
+{
+	int i;
+
+	fullstop = 1;
+	if (writer_task != NULL) {
+		VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
+		kthread_stop(writer_task);
+	}
+	writer_task = NULL;
+
+	if (reader_tasks != NULL) {
+		for (i = 0; i < nrealreaders; i++) {
+			if (reader_tasks[i] != NULL) {
+				VERBOSE_PRINTK_STRING(
+					"Stopping rcu_torture_reader task");
+				kthread_stop(reader_tasks[i]);
+			}
+			reader_tasks[i] = NULL;
+		}
+		kfree(reader_tasks);
+		reader_tasks = NULL;
+	}
+	rcu_torture_current = NULL;
+
+	if (stats_task != NULL) {
+		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
+		kthread_stop(stats_task);
+	}
+	stats_task = NULL;
+
+	/* Wait for all RCU callbacks to fire.  */
+
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
+		synchronize_rcu();
+	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
+	PRINTK_STRING("--- End of test");
+}
+
+static int
+rcu_torture_init(void)
+{
+	int i;
+	int cpu;
+	int firsterr = 0;
+
+	/* Process args and tell the world that the torturer is on the job. */
+
+	if (nreaders >= 0)
+		nrealreaders = nreaders;
+	else
+		nrealreaders = 2 * num_online_cpus();
+	printk(KERN_ALERT TORTURE_FLAG
+	       "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n",
+	       nrealreaders, stat_interval, verbose);
+	fullstop = 0;
+
+	/* Set up the freelist. */
+
+	INIT_LIST_HEAD(&rcu_torture_freelist);
+	for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
+		list_add_tail(&rcu_tortures[i].rtort_free,
+			      &rcu_torture_freelist);
+	}
+
+	/* Initialize the statistics so that each run gets its own numbers. */
+
+	rcu_torture_current = NULL;
+	rcu_torture_current_version = 0;
+	atomic_set(&n_rcu_torture_alloc, 0);
+	atomic_set(&n_rcu_torture_alloc_fail, 0);
+	atomic_set(&n_rcu_torture_free, 0);
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+		atomic_set(&rcu_torture_wcount[i], 0);
+	for_each_cpu(cpu) {
+		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+			per_cpu(rcu_torture_count, cpu)[i] = 0;
+			per_cpu(rcu_torture_batch, cpu)[i] = 0;
+		}
+	}
+
+	/* Start up the kthreads. */
+
+	VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
+	writer_task = kthread_run(rcu_torture_writer, NULL,
+				  "rcu_torture_writer");
+	if (IS_ERR(writer_task)) {
+		firsterr = PTR_ERR(writer_task);
+		VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
+		writer_task = NULL;
+		goto unwind;
+	}
+	reader_tasks = kmalloc(nrealreaders * sizeof(reader_tasks[0]),
+			       GFP_KERNEL);
+	if (reader_tasks == NULL) {
+		VERBOSE_PRINTK_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealreaders; i++) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
+		reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
+					      "rcu_torture_reader");
+		if (IS_ERR(reader_tasks[i])) {
+			firsterr = PTR_ERR(reader_tasks[i]);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
+			reader_tasks[i] = NULL;
+			goto unwind;
+		}
+	}
+	if (stat_interval > 0) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
+		stats_task = kthread_run(rcu_torture_stats, NULL,
+					"rcu_torture_stats");
+		if (IS_ERR(stats_task)) {
+			firsterr = PTR_ERR(stats_task);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
+			stats_task = NULL;
+			goto unwind;
+		}
+	}
+	return 0;
+
+unwind:
+	rcu_torture_cleanup();
+	return firsterr;
+}
+
+module_init(rcu_torture_init);
+module_exit(rcu_torture_cleanup);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f9306b14d2a6..156822e3cc79 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -168,6 +168,14 @@ config DEBUG_FS
 
 	  If unsure, say N.
 
+config DEBUG_VM
+	bool "Debug VM"
+	depends on DEBUG_KERNEL
+	help
+	  Enable this to debug the virtual-memory system.
+
+	  If unsure, say N.
+
 config FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
 	depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML)
@@ -178,3 +186,16 @@ config FRAME_POINTER
 	  some architectures or if you use external debuggers.
 	  If you don't debug the kernel, you can say N.
 
+config RCU_TORTURE_TEST
+	tristate "torture tests for RCU"
+	depends on DEBUG_KERNEL
+	default n
+	help
+	  This option provides a kernel module that runs torture tests
+	  on the RCU infrastructure.  The kernel module may be built
+	  after the fact on the running kernel to be tested, if desired.
+
+	  Say Y here if you want RCU torture tests to start automatically
+	  at boot time (you probably don't).
+	  Say M if you want the RCU torture tests to build as a module.
+	  Say N if you are unsure.
diff --git a/mm/mmap.c b/mm/mmap.c
index 5ecc2cf3e1d7..320dda1778c3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1840,7 +1840,7 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len)
 
 static inline void verify_mm_writelocked(struct mm_struct *mm)
 {
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_VM
 	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
 		WARN_ON(1);
 		up_read(&mm->mmap_sem);
-- 
cgit v1.2.3


From 708f430dcc50787d1c0b5c31962a5ff0dd8e35eb Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Sun, 30 Oct 2005 15:03:13 -0800
Subject: [PATCH] posix-cpu-timers: fix overrun reporting

This change corrects an omission in posix_cpu_timer_schedule, so that it
correctly propagates the overrun calculation to where it will get reported
to the user.

Signed-off-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bf374fceb39c..91a894264941 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1225,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 		/*
 		 * The task was cleaned up already, no future firings.
 		 */
-		return;
+		goto out;
 
 	/*
 	 * Fetch the current sample and update the timer's expiry time.
@@ -1235,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 		bump_cpu_timer(timer, now);
 		if (unlikely(p->exit_state)) {
 			clear_dead_task(timer, now);
-			return;
+			goto out;
 		}
 		read_lock(&tasklist_lock); /* arm_timer needs it.  */
 	} else {
@@ -1248,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 			put_task_struct(p);
 			timer->it.cpu.task = p = NULL;
 			timer->it.cpu.expires.sched = 0;
-			read_unlock(&tasklist_lock);
-			return;
+			goto out_unlock;
 		} else if (unlikely(p->exit_state) && thread_group_empty(p)) {
 			/*
 			 * We've noticed that the thread is dead, but
@@ -1257,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 			 * drop our task ref.
 			 */
 			clear_dead_task(timer, now);
-			read_unlock(&tasklist_lock);
-			return;
+			goto out_unlock;
 		}
 		cpu_clock_sample_group(timer->it_clock, p, &now);
 		bump_cpu_timer(timer, now);
@@ -1270,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 	 */
 	arm_timer(timer, now);
 
+out_unlock:
 	read_unlock(&tasklist_lock);
+
+out:
+	timer->it_overrun_last = timer->it_overrun;
+	timer->it_overrun = -1;
+	++timer->it_requeue_pending;
 }
 
 /*
-- 
cgit v1.2.3


From 4098f9918e068e51fed1727f6ba80efcec372378 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Sun, 30 Oct 2005 15:03:21 -0800
Subject: [PATCH] sched: hardcode non-smp set_cpus_allowed

Simplify the UP (1 CPU) implementatin of set_cpus_allowed.

The one CPU is hardcoded to be cpu 0 - so just test for that bit, and avoid
having to pick up the cpu_online_map.

Also, unexport cpu_online_map: it was only needed for set_cpus_allowed().

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 2 +-
 kernel/sched.c        | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b2d2dc14f0b9..41285a0e7258 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -940,7 +940,7 @@ extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
 #else
 static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
 {
-	if (!cpus_intersects(new_mask, cpu_online_map))
+	if (!cpu_isset(0, new_mask))
 		return -EINVAL;
 	return 0;
 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 4f26c544d02c..340dd238c16d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3877,7 +3877,6 @@ EXPORT_SYMBOL(cpu_present_map);
 
 #ifndef CONFIG_SMP
 cpumask_t cpu_online_map = CPU_MASK_ALL;
-EXPORT_SYMBOL_GPL(cpu_online_map);
 cpumask_t cpu_possible_map = CPU_MASK_ALL;
 #endif
 
-- 
cgit v1.2.3


From b67a1b9e4bf878aa5d4b6b44cb5a251a2f425f0d Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 30 Oct 2005 15:03:44 -0800
Subject: [PATCH] remove hardcoded SEND_SIG_xxx constants

This patch replaces hardcoded SEND_SIG_xxx constants with
their symbolic names.

No changes in affected .o files.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/exit.c            | 10 +++++-----
 kernel/signal.c          | 35 ++++++++++++++++++++---------------
 security/selinux/hooks.c |  4 ++--
 3 files changed, 27 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 2d39ccc367e6..537394b25e8d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -547,7 +547,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 
 	if (p->pdeath_signal)
 		/* We already hold the tasklist_lock here.  */
-		group_send_sig_info(p->pdeath_signal, (void *) 0, p);
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
 
 	/* Move the child from its dying parent to the new one.  */
 	if (unlikely(traced)) {
@@ -591,8 +591,8 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 		int pgrp = process_group(p);
 
 		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
-			__kill_pg_info(SIGHUP, (void *)1, pgrp);
-			__kill_pg_info(SIGCONT, (void *)1, pgrp);
+			__kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp);
+			__kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp);
 		}
 	}
 }
@@ -727,8 +727,8 @@ static void exit_notify(struct task_struct *tsk)
 	    (t->signal->session == tsk->signal->session) &&
 	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
 	    has_stopped_jobs(process_group(tsk))) {
-		__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
-		__kill_pg_info(SIGCONT, (void *)1, process_group(tsk));
+		__kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk));
+		__kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk));
 	}
 
 	/* Let father know we died 
diff --git a/kernel/signal.c b/kernel/signal.c
index 9d1512dcf176..1f7b2aaa4a39 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -651,8 +651,9 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (!valid_signal(sig))
 		return error;
 	error = -EPERM;
-	if ((!info || ((unsigned long)info != 1 &&
-			(unsigned long)info != 2 && SI_FROMUSER(info)))
+	if ((info == SEND_SIG_NOINFO ||
+			(info != SEND_SIG_PRIV && info != SEND_SIG_FORCED
+				&& SI_FROMUSER(info)))
 	    && ((sig != SIGCONT) ||
 		(current->signal->session != t->signal->session))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
@@ -789,7 +790,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	 * fast-pathed signals for kernel-internal things like SIGSTOP
 	 * or SIGKILL.
 	 */
-	if ((unsigned long)info == 2)
+	if (info == SEND_SIG_FORCED)
 		goto out_set;
 
 	/* Real-time signals must be queued if sent by sigqueue, or
@@ -801,19 +802,19 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	   pass on the info struct.  */
 
 	q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
-					     ((unsigned long) info < 2 ||
+					     (info < SEND_SIG_FORCED ||
 					      info->si_code >= 0)));
 	if (q) {
 		list_add_tail(&q->list, &signals->list);
 		switch ((unsigned long) info) {
-		case 0:
+		case (unsigned long) SEND_SIG_NOINFO:
 			q->info.si_signo = sig;
 			q->info.si_errno = 0;
 			q->info.si_code = SI_USER;
 			q->info.si_pid = current->pid;
 			q->info.si_uid = current->uid;
 			break;
-		case 1:
+		case (unsigned long) SEND_SIG_PRIV:
 			q->info.si_signo = sig;
 			q->info.si_errno = 0;
 			q->info.si_code = SI_KERNEL;
@@ -825,14 +826,15 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			break;
 		}
 	} else {
-		if (sig >= SIGRTMIN && info && (unsigned long)info != 1
+		if (sig >= SIGRTMIN
+		   && info != SEND_SIG_NOINFO && info != SEND_SIG_PRIV
 		   && info->si_code != SI_USER)
 		/*
 		 * Queue overflow, abort.  We may abort if the signal was rt
 		 * and sent by user using something other than kill().
 		 */
 			return -EAGAIN;
-		if (((unsigned long)info > 1) && (info->si_code == SI_TIMER))
+		if ((info > SEND_SIG_PRIV) && (info->si_code == SI_TIMER))
 			/*
 			 * Set up a return to indicate that we dropped 
 			 * the signal.
@@ -858,7 +860,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 		BUG();
 	assert_spin_locked(&t->sighand->siglock);
 
-	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
+	if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER))
 		/*
 		 * Set up a return to indicate that we dropped the signal.
 		 */
@@ -914,7 +916,7 @@ force_sig_specific(int sig, struct task_struct *t)
 		t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
 	sigdelset(&t->blocked, sig);
 	recalc_sigpending_tsk(t);
-	specific_send_sig_info(sig, (void *)2, t);
+	specific_send_sig_info(sig, SEND_SIG_FORCED, t);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
@@ -1050,7 +1052,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	assert_spin_locked(&p->sighand->siglock);
 	handle_stop_signal(sig, p);
 
-	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
+	if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER))
 		/*
 		 * Set up a return to indicate that we dropped the signal.
 		 */
@@ -1285,10 +1287,13 @@ send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	return ret;
 }
 
+#define __si_special(priv) \
+	((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
+
 int
 send_sig(int sig, struct task_struct *p, int priv)
 {
-	return send_sig_info(sig, (void*)(long)(priv != 0), p);
+	return send_sig_info(sig, __si_special(priv), p);
 }
 
 /*
@@ -1308,7 +1313,7 @@ send_group_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 void
 force_sig(int sig, struct task_struct *p)
 {
-	force_sig_info(sig, (void*)1L, p);
+	force_sig_info(sig, SEND_SIG_PRIV, p);
 }
 
 /*
@@ -1333,13 +1338,13 @@ force_sigsegv(int sig, struct task_struct *p)
 int
 kill_pg(pid_t pgrp, int sig, int priv)
 {
-	return kill_pg_info(sig, (void *)(long)(priv != 0), pgrp);
+	return kill_pg_info(sig, __si_special(priv), pgrp);
 }
 
 int
 kill_proc(pid_t pid, int sig, int priv)
 {
-	return kill_proc_info(sig, (void *)(long)(priv != 0), pid);
+	return kill_proc_info(sig, __si_special(priv), pid);
 }
 
 /*
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index bb62838be496..295ac472faf1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2713,8 +2713,8 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, int si
 	if (rc)
 		return rc;
 
-	if (info && ((unsigned long)info == 1 ||
-	             (unsigned long)info == 2 || SI_FROMKERNEL(info)))
+	if (info != SEND_SIG_NOINFO && (info == SEND_SIG_PRIV ||
+					info == SEND_SIG_FORCED || SI_FROMKERNEL(info)))
 		return 0;
 
 	if (!sig)
-- 
cgit v1.2.3


From 621d31219d9a788bda924a0613048053f3f5f211 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 30 Oct 2005 15:03:45 -0800
Subject: [PATCH] cleanup the usage of SEND_SIG_xxx constants

This patch simplifies some checks for magic siginfo values.  It should not
change the behaviour in any way.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h    |  5 +++++
 kernel/signal.c          | 18 +++++++-----------
 security/selinux/hooks.c |  3 +--
 3 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 41285a0e7258..03b68a7b4b82 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1084,6 +1084,11 @@ extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned lon
 #define SEND_SIG_PRIV	((struct siginfo *) 1)
 #define SEND_SIG_FORCED	((struct siginfo *) 2)
 
+static inline int is_si_special(const struct siginfo *info)
+{
+	return info <= SEND_SIG_FORCED;
+}
+
 /* True if we are on the alternate signal stack.  */
 
 static inline int on_sig_stack(unsigned long sp)
diff --git a/kernel/signal.c b/kernel/signal.c
index 1f7b2aaa4a39..27533b90c347 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -651,9 +651,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	if (!valid_signal(sig))
 		return error;
 	error = -EPERM;
-	if ((info == SEND_SIG_NOINFO ||
-			(info != SEND_SIG_PRIV && info != SEND_SIG_FORCED
-				&& SI_FROMUSER(info)))
+	if ((info == SEND_SIG_NOINFO || (!is_si_special(info) && SI_FROMUSER(info)))
 	    && ((sig != SIGCONT) ||
 		(current->signal->session != t->signal->session))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
@@ -802,7 +800,7 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	   pass on the info struct.  */
 
 	q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN &&
-					     (info < SEND_SIG_FORCED ||
+					     (is_si_special(info) ||
 					      info->si_code >= 0)));
 	if (q) {
 		list_add_tail(&q->list, &signals->list);
@@ -825,16 +823,14 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			copy_siginfo(&q->info, info);
 			break;
 		}
-	} else {
-		if (sig >= SIGRTMIN
-		   && info != SEND_SIG_NOINFO && info != SEND_SIG_PRIV
-		   && info->si_code != SI_USER)
+	} else if (!is_si_special(info)) {
+		if (sig >= SIGRTMIN && info->si_code != SI_USER)
 		/*
 		 * Queue overflow, abort.  We may abort if the signal was rt
 		 * and sent by user using something other than kill().
 		 */
 			return -EAGAIN;
-		if ((info > SEND_SIG_PRIV) && (info->si_code == SI_TIMER))
+		if (info->si_code == SI_TIMER)
 			/*
 			 * Set up a return to indicate that we dropped 
 			 * the signal.
@@ -860,7 +856,7 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 		BUG();
 	assert_spin_locked(&t->sighand->siglock);
 
-	if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER))
+	if (!is_si_special(info) && (info->si_code == SI_TIMER))
 		/*
 		 * Set up a return to indicate that we dropped the signal.
 		 */
@@ -1052,7 +1048,7 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	assert_spin_locked(&p->sighand->siglock);
 	handle_stop_signal(sig, p);
 
-	if ((info > SEND_SIG_FORCED) && (info->si_code == SI_TIMER))
+	if (!is_si_special(info) && (info->si_code == SI_TIMER))
 		/*
 		 * Set up a return to indicate that we dropped the signal.
 		 */
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 295ac472faf1..45c41490d521 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2713,8 +2713,7 @@ static int selinux_task_kill(struct task_struct *p, struct siginfo *info, int si
 	if (rc)
 		return rc;
 
-	if (info != SEND_SIG_NOINFO && (info == SEND_SIG_PRIV ||
-					info == SEND_SIG_FORCED || SI_FROMKERNEL(info)))
+	if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info)))
 		return 0;
 
 	if (!sig)
-- 
cgit v1.2.3


From ae6866c377943de73e2c95398ff0120516f167ce Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Sun, 30 Oct 2005 15:03:46 -0800
Subject: [PATCH] remove unneeded SI_TIMER checks

This patch removes checks for ->si_code == SI_TIMER from send_signal,
specific_send_sig_info, __group_send_sig_info.

I think posix-timers.c used these functions some time ago, now it sends
signals via send_{,group_}sigqueue, so these hooks are unneeded.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 18 ------------------
 1 file changed, 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 27533b90c347..1d8f84c5c6ee 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -830,12 +830,6 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 		 * and sent by user using something other than kill().
 		 */
 			return -EAGAIN;
-		if (info->si_code == SI_TIMER)
-			/*
-			 * Set up a return to indicate that we dropped 
-			 * the signal.
-			 */
-			ret = info->si_sys_private;
 	}
 
 out_set:
@@ -856,12 +850,6 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 		BUG();
 	assert_spin_locked(&t->sighand->siglock);
 
-	if (!is_si_special(info) && (info->si_code == SI_TIMER))
-		/*
-		 * Set up a return to indicate that we dropped the signal.
-		 */
-		ret = info->si_sys_private;
-
 	/* Short-circuit ignored signals.  */
 	if (sig_ignored(t, sig))
 		goto out;
@@ -1048,12 +1036,6 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 	assert_spin_locked(&p->sighand->siglock);
 	handle_stop_signal(sig, p);
 
-	if (!is_si_special(info) && (info->si_code == SI_TIMER))
-		/*
-		 * Set up a return to indicate that we dropped the signal.
-		 */
-		ret = info->si_sys_private;
-
 	/* Short-circuit ignored signals.  */
 	if (sig_ignored(p, sig))
 		return ret;
-- 
cgit v1.2.3


From b0423a0d9cc836b2c3d796623cd19236bfedfe63 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Sun, 30 Oct 2005 15:03:46 -0800
Subject: [PATCH] Remove duplicate code in signal.c

Combine a bit of redundant code between force_sig_info() and
force_sig_specific().

Signed-off-by: paulmck@us.ibm.com
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 1d8f84c5c6ee..1bf3c39d6109 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -879,11 +879,13 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 	int ret;
 
 	spin_lock_irqsave(&t->sighand->siglock, flags);
-	if (sigismember(&t->blocked, sig) || t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
+	if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) {
 		t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
+	}
+	if (sigismember(&t->blocked, sig)) {
 		sigdelset(&t->blocked, sig);
-		recalc_sigpending_tsk(t);
 	}
+	recalc_sigpending_tsk(t);
 	ret = specific_send_sig_info(sig, info, t);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 
@@ -893,15 +895,7 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 void
 force_sig_specific(int sig, struct task_struct *t)
 {
-	unsigned long int flags;
-
-	spin_lock_irqsave(&t->sighand->siglock, flags);
-	if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN)
-		t->sighand->action[sig-1].sa.sa_handler = SIG_DFL;
-	sigdelset(&t->blocked, sig);
-	recalc_sigpending_tsk(t);
-	specific_send_sig_info(sig, SEND_SIG_FORCED, t);
-	spin_unlock_irqrestore(&t->sighand->siglock, flags);
+	force_sig_info(sig, SEND_SIG_FORCED, t);
 }
 
 /*
-- 
cgit v1.2.3


From 4e57b6817880946a3a78d5d8cad1ace363f7e449 Mon Sep 17 00:00:00 2001
From: Tim Schmielau <tim@physik3.uni-rostock.de>
Date: Sun, 30 Oct 2005 15:03:48 -0800
Subject: [PATCH] fix missing includes

I recently picked up my older work to remove unnecessary #includes of
sched.h, starting from a patch by Dave Jones to not include sched.h
from module.h. This reduces the number of indirect includes of sched.h
by ~300. Another ~400 pointless direct includes can be removed after
this disentangling (patch to follow later).
However, quite a few indirect includes need to be fixed up for this.

In order to feed the patches through -mm with as little disturbance as
possible, I've split out the fixes I accumulated up to now (complete for
i386 and x86_64, more archs to follow later) and post them before the real
patch.  This way this large part of the patch is kept simple with only
adding #includes, and all hunks are independent of each other.  So if any
hunk rejects or gets in the way of other patches, just drop it.  My scripts
will pick it up again in the next round.

Signed-off-by: Tim Schmielau <tim@physik3.uni-rostock.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/firmware_class/firmware_sample_driver.c         |  1 +
 Documentation/firmware_class/firmware_sample_firmware_class.c |  2 ++
 arch/arm/common/amba.c                                        |  2 ++
 arch/arm/common/scoop.c                                       |  3 +++
 arch/arm/kernel/arthur.c                                      |  1 +
 arch/arm/mach-imx/generic.c                                   |  2 ++
 arch/arm/mach-integrator/clock.c                              |  1 +
 arch/arm/mach-integrator/integrator_ap.c                      |  1 +
 arch/arm/mach-integrator/lm.c                                 |  1 +
 arch/arm/mach-iop3xx/iq31244-pci.c                            |  2 ++
 arch/arm/mach-iop3xx/iq80321-pci.c                            |  2 ++
 arch/arm/mach-iop3xx/iq80331-pci.c                            |  2 ++
 arch/arm/mach-iop3xx/iq80332-pci.c                            |  2 ++
 arch/arm/mach-pxa/generic.c                                   |  1 +
 arch/arm/mach-sa1100/generic.c                                |  1 +
 arch/arm/mach-versatile/clock.c                               |  1 +
 arch/arm/plat-omap/clock.c                                    |  1 +
 arch/cris/arch-v10/drivers/axisflashmap.c                     |  1 +
 arch/cris/arch-v32/drivers/axisflashmap.c                     |  1 +
 arch/cris/kernel/time.c                                       |  1 +
 arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c                   |  1 +
 arch/i386/kernel/cpu/cpufreq/p4-clockmod.c                    |  1 +
 arch/i386/kernel/cpu/cpufreq/powernow-k8.c                    |  1 +
 arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c             |  1 +
 arch/i386/kernel/cpu/intel_cacheinfo.c                        |  1 +
 arch/ia64/kernel/cyclone.c                                    |  1 +
 arch/m32r/lib/csum_partial_copy.c                             |  2 +-
 arch/mips/sgi-ip27/ip27-berr.c                                |  1 +
 arch/ppc/syslib/of_device.c                                   |  2 ++
 arch/ppc64/kernel/hvcserver.c                                 |  2 ++
 arch/ppc64/kernel/of_device.c                                 |  2 ++
 arch/ppc64/lib/locks.c                                        |  2 ++
 arch/sh/drivers/dma/dma-sysfs.c                               |  1 +
 arch/sh/kernel/cpufreq.c                                      |  1 +
 arch/xtensa/kernel/platform.c                                 |  1 +
 drivers/acpi/processor_idle.c                                 |  1 +
 drivers/base/class.c                                          |  1 +
 drivers/base/platform.c                                       |  1 +
 drivers/base/sys.c                                            |  1 +
 drivers/block/cciss_scsi.c                                    | 10 +++++++---
 drivers/block/paride/paride.c                                 |  1 +
 drivers/block/paride/pg.c                                     |  2 ++
 drivers/block/paride/pt.c                                     |  1 +
 drivers/char/agp/ali-agp.c                                    |  1 +
 drivers/char/agp/amd64-agp.c                                  |  1 +
 drivers/char/agp/ati-agp.c                                    |  2 ++
 drivers/char/agp/i460-agp.c                                   |  2 ++
 drivers/char/agp/isoch.c                                      |  1 +
 drivers/char/agp/sworks-agp.c                                 |  2 ++
 drivers/char/drm/drm_sysfs.c                                  |  2 ++
 drivers/char/mwave/3780i.c                                    |  2 ++
 drivers/char/watchdog/cpu5wdt.c                               |  1 +
 drivers/char/watchdog/mixcomwd.c                              |  2 ++
 drivers/char/watchdog/pcwd.c                                  |  2 +-
 drivers/char/watchdog/sc520_wdt.c                             |  1 +
 drivers/char/watchdog/softdog.c                               |  2 ++
 drivers/infiniband/core/cache.c                               |  1 +
 drivers/infiniband/core/sa_query.c                            |  1 +
 drivers/infiniband/hw/mthca/mthca_av.c                        |  2 ++
 drivers/infiniband/hw/mthca/mthca_mad.c                       |  3 +++
 drivers/infiniband/hw/mthca/mthca_mcg.c                       |  2 ++
 drivers/infiniband/hw/mthca/mthca_profile.c                   |  2 ++
 drivers/infiniband/hw/mthca/mthca_qp.c                        |  2 ++
 drivers/infiniband/hw/mthca/mthca_reset.c                     |  1 +
 drivers/infiniband/hw/mthca/mthca_uar.c                       |  2 ++
 drivers/input/gameport/gameport.c                             |  1 +
 drivers/input/joystick/a3d.c                                  |  1 +
 drivers/input/joystick/adi.c                                  |  1 +
 drivers/input/joystick/analog.c                               |  1 +
 drivers/input/joystick/cobra.c                                |  1 +
 drivers/input/joystick/gf2k.c                                 |  1 +
 drivers/input/joystick/grip.c                                 |  1 +
 drivers/input/joystick/grip_mp.c                              |  1 +
 drivers/input/joystick/guillemot.c                            |  1 +
 drivers/input/joystick/interact.c                             |  1 +
 drivers/input/joystick/joydump.c                              |  1 +
 drivers/input/joystick/sidewinder.c                           |  1 +
 drivers/input/joystick/tmdc.c                                 |  1 +
 drivers/input/serio/hp_sdc_mlc.c                              |  1 +
 drivers/isdn/capi/capifs.c                                    |  1 +
 drivers/macintosh/macio_asic.c                                |  2 ++
 drivers/mca/mca-device.c                                      |  1 +
 drivers/media/common/ir-common.c                              |  1 +
 drivers/media/dvb/dvb-core/dvb_ca_en50221.c                   |  1 +
 drivers/media/dvb/frontends/bcm3510.c                         |  3 +++
 drivers/media/dvb/frontends/dib3000mb.c                       |  2 ++
 drivers/media/dvb/frontends/dib3000mc.c                       |  2 ++
 drivers/media/dvb/frontends/dvb_dummy_fe.c                    |  2 ++
 drivers/media/dvb/frontends/lgdt330x.c                        |  2 ++
 drivers/media/dvb/frontends/mt312.c                           |  2 ++
 drivers/media/dvb/frontends/mt352.c                           |  2 ++
 drivers/media/dvb/frontends/nxt2002.c                         |  2 ++
 drivers/media/dvb/frontends/or51132.c                         |  2 ++
 drivers/media/dvb/frontends/or51211.c                         |  2 ++
 drivers/media/dvb/frontends/s5h1420.c                         |  2 ++
 drivers/media/dvb/frontends/sp8870.c                          |  2 ++
 drivers/media/dvb/frontends/sp887x.c                          |  2 ++
 drivers/media/dvb/frontends/stv0297.c                         |  2 ++
 drivers/media/dvb/frontends/stv0299.c                         |  1 +
 drivers/media/dvb/frontends/tda1004x.c                        |  4 ++++
 drivers/media/dvb/frontends/tda8083.c                         |  1 +
 drivers/media/radio/miropcm20-rds.c                           |  1 +
 drivers/message/i2o/device.c                                  |  2 ++
 drivers/message/i2o/driver.c                                  |  3 +++
 drivers/message/i2o/exec-osm.c                                |  4 ++++
 drivers/message/i2o/iop.c                                     |  1 +
 drivers/mtd/chips/jedec.c                                     |  1 +
 drivers/mtd/devices/lart.c                                    |  1 +
 drivers/mtd/devices/phram.c                                   |  1 +
 drivers/mtd/maps/bast-flash.c                                 |  1 +
 drivers/mtd/maps/ceiva.c                                      |  1 +
 drivers/mtd/maps/dc21285.c                                    |  1 +
 drivers/mtd/maps/dilnetpc.c                                   |  5 ++++-
 drivers/mtd/maps/epxa10db-flash.c                             |  5 ++++-
 drivers/mtd/maps/fortunet.c                                   |  5 ++++-
 drivers/mtd/maps/ixp2000.c                                    |  6 ++++--
 drivers/mtd/maps/ixp4xx.c                                     |  7 +++++--
 drivers/mtd/maps/lubbock-flash.c                              |  3 +++
 drivers/mtd/maps/mainstone-flash.c                            |  3 +++
 drivers/mtd/maps/omap-toto-flash.c                            |  2 +-
 drivers/mtd/maps/omap_nor.c                                   |  2 ++
 drivers/mtd/maps/pci.c                                        |  1 +
 drivers/mtd/maps/plat-ram.c                                   |  1 +
 drivers/mtd/maps/tqm8xxl.c                                    |  4 +++-
 drivers/mtd/mtdblock.c                                        |  1 +
 drivers/mtd/mtdchar.c                                         |  1 +
 drivers/mtd/mtdconcat.c                                       |  2 +-
 drivers/mtd/nand/s3c2410.c                                    |  1 +
 drivers/pci/hotplug/cpcihp_generic.c                          |  1 +
 drivers/pci/hotplug/cpcihp_zt5550.c                           |  1 +
 drivers/pci/hotplug/fakephp.c                                 |  2 ++
 drivers/pci/hotplug/pciehprm_nonacpi.c                        |  3 +++
 drivers/pci/hotplug/rpadlpar_core.c                           |  3 +++
 drivers/pci/hotplug/rpaphp_pci.c                              |  4 +++-
 drivers/pci/hotplug/rpaphp_slot.c                             |  3 +++
 drivers/pci/hotplug/shpchp.h                                  |  2 ++
 drivers/pci/hotplug/shpchprm_nonacpi.c                        |  2 ++
 drivers/pci/pci-driver.c                                      |  2 ++
 drivers/pci/pci.c                                             |  1 +
 drivers/pci/pcie/portdrv_core.c                               |  2 ++
 drivers/pci/pcie/portdrv_pci.c                                |  1 +
 drivers/pci/rom.c                                             |  1 +
 drivers/pnp/manager.c                                         |  2 ++
 drivers/pnp/pnpbios/rsparser.c                                |  2 ++
 drivers/s390/cio/cmf.c                                        |  3 +++
 drivers/s390/cio/device.c                                     |  1 +
 drivers/s390/cio/device_fsm.c                                 |  2 ++
 drivers/scsi/scsi_transport_fc.c                              |  1 +
 drivers/scsi/scsi_transport_iscsi.c                           |  3 +++
 drivers/scsi/sym53c8xx_2/sym_hipd.c                           |  3 +++
 drivers/scsi/sym53c8xx_2/sym_hipd.h                           |  2 ++
 drivers/sh/superhyway/superhyway.c                            |  2 ++
 drivers/usb/host/ohci-omap.c                                  |  3 +++
 drivers/usb/host/ohci-pci.c                                   |  2 ++
 drivers/usb/host/ohci-pxa27x.c                                |  1 +
 drivers/w1/w1_family.c                                        |  1 +
 drivers/zorro/zorro-sysfs.c                                   |  1 +
 drivers/zorro/zorro.c                                         |  2 ++
 fs/filesystems.c                                              |  1 +
 fs/jffs2/background.c                                         |  1 +
 fs/jffs2/wbuf.c                                               |  2 ++
 include/linux/cpufreq.h                                       |  1 +
 include/linux/gameport.h                                      |  1 +
 include/linux/i2c.h                                           |  1 +
 include/linux/i2o.h                                           |  8 ++++++--
 include/linux/kobj_map.h                                      |  2 ++
 include/linux/mtd/map.h                                       |  3 +++
 include/linux/serial.h                                        |  1 +
 include/linux/textsearch.h                                    |  1 +
 include/pcmcia/ss.h                                           |  1 +
 include/scsi/scsi_cmnd.h                                      |  1 +
 include/scsi/scsi_transport_fc.h                              |  1 +
 kernel/kallsyms.c                                             |  1 +
 kernel/kprobes.c                                              |  1 +
 kernel/params.c                                               |  1 +
 lib/kobject.c                                                 |  1 +
 lib/smp_processor_id.c                                        |  1 +
 lib/sort.c                                                    |  1 +
 lib/vsprintf.c                                                |  1 +
 sound/oss/ac97_codec.c                                        |  1 +
 180 files changed, 299 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/firmware_class/firmware_sample_driver.c b/Documentation/firmware_class/firmware_sample_driver.c
index 4bef8c25172c..d3ad2c24490a 100644
--- a/Documentation/firmware_class/firmware_sample_driver.c
+++ b/Documentation/firmware_class/firmware_sample_driver.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/device.h>
+#include <linux/string.h>
 
 #include "linux/firmware.h"
 
diff --git a/Documentation/firmware_class/firmware_sample_firmware_class.c b/Documentation/firmware_class/firmware_sample_firmware_class.c
index 09eab2f1b373..57b956aecbc5 100644
--- a/Documentation/firmware_class/firmware_sample_firmware_class.c
+++ b/Documentation/firmware_class/firmware_sample_firmware_class.c
@@ -14,6 +14,8 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/timer.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 #include <linux/firmware.h>
 
 
diff --git a/arch/arm/common/amba.c b/arch/arm/common/amba.c
index c6beb751f2a9..e1013112c354 100644
--- a/arch/arm/common/amba.c
+++ b/arch/arm/common/amba.c
@@ -10,6 +10,8 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/device.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
diff --git a/arch/arm/common/scoop.c b/arch/arm/common/scoop.c
index e8356b76d7c6..4af0cf5f3bfb 100644
--- a/arch/arm/common/scoop.c
+++ b/arch/arm/common/scoop.c
@@ -12,6 +12,9 @@
  */
 
 #include <linux/device.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
 #include <asm/io.h>
 #include <asm/hardware/scoop.h>
 
diff --git a/arch/arm/kernel/arthur.c b/arch/arm/kernel/arthur.c
index a418dad6692c..0ee2e9819631 100644
--- a/arch/arm/kernel/arthur.c
+++ b/arch/arm/kernel/arthur.c
@@ -18,6 +18,7 @@
 #include <linux/stddef.h>
 #include <linux/signal.h>
 #include <linux/init.h>
+#include <linux/sched.h>
 
 #include <asm/ptrace.h>
 
diff --git a/arch/arm/mach-imx/generic.c b/arch/arm/mach-imx/generic.c
index cb14b0682cef..837d7f0bda4c 100644
--- a/arch/arm/mach-imx/generic.c
+++ b/arch/arm/mach-imx/generic.c
@@ -26,6 +26,8 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/string.h>
+
 #include <asm/arch/imxfb.h>
 #include <asm/hardware.h>
 #include <asm/arch/imx-regs.h>
diff --git a/arch/arm/mach-integrator/clock.c b/arch/arm/mach-integrator/clock.c
index 56200594db3c..73c360685cad 100644
--- a/arch/arm/mach-integrator/clock.c
+++ b/arch/arm/mach-integrator/clock.c
@@ -13,6 +13,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/string.h>
 
 #include <asm/semaphore.h>
 #include <asm/hardware/clock.h>
diff --git a/arch/arm/mach-integrator/integrator_ap.c b/arch/arm/mach-integrator/integrator_ap.c
index f368b85f0447..764ceb49470a 100644
--- a/arch/arm/mach-integrator/integrator_ap.c
+++ b/arch/arm/mach-integrator/integrator_ap.c
@@ -30,6 +30,7 @@
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <asm/setup.h>
+#include <asm/param.h>		/* HZ */
 #include <asm/mach-types.h>
 #include <asm/hardware/amba.h>
 #include <asm/hardware/amba_kmi.h>
diff --git a/arch/arm/mach-integrator/lm.c b/arch/arm/mach-integrator/lm.c
index c5f19d160598..5b41e3a724e1 100644
--- a/arch/arm/mach-integrator/lm.c
+++ b/arch/arm/mach-integrator/lm.c
@@ -10,6 +10,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/device.h>
+#include <linux/slab.h>
 
 #include <asm/arch/lm.h>
 
diff --git a/arch/arm/mach-iop3xx/iq31244-pci.c b/arch/arm/mach-iop3xx/iq31244-pci.c
index f997daa800bf..c6a973ba8fc6 100644
--- a/arch/arm/mach-iop3xx/iq31244-pci.c
+++ b/arch/arm/mach-iop3xx/iq31244-pci.c
@@ -14,6 +14,8 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include <asm/hardware.h>
 #include <asm/irq.h>
diff --git a/arch/arm/mach-iop3xx/iq80321-pci.c b/arch/arm/mach-iop3xx/iq80321-pci.c
index 79fea3d20b66..802f6d091b75 100644
--- a/arch/arm/mach-iop3xx/iq80321-pci.c
+++ b/arch/arm/mach-iop3xx/iq80321-pci.c
@@ -14,6 +14,8 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include <asm/hardware.h>
 #include <asm/irq.h>
diff --git a/arch/arm/mach-iop3xx/iq80331-pci.c b/arch/arm/mach-iop3xx/iq80331-pci.c
index f37a0e26b466..654e450a1311 100644
--- a/arch/arm/mach-iop3xx/iq80331-pci.c
+++ b/arch/arm/mach-iop3xx/iq80331-pci.c
@@ -13,6 +13,8 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include <asm/hardware.h>
 #include <asm/irq.h>
diff --git a/arch/arm/mach-iop3xx/iq80332-pci.c b/arch/arm/mach-iop3xx/iq80332-pci.c
index b9807aa2aade..65951ffe4631 100644
--- a/arch/arm/mach-iop3xx/iq80332-pci.c
+++ b/arch/arm/mach-iop3xx/iq80332-pci.c
@@ -13,6 +13,8 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include <asm/hardware.h>
 #include <asm/irq.h>
diff --git a/arch/arm/mach-pxa/generic.c b/arch/arm/mach-pxa/generic.c
index 3248bc9b9495..9c0289333301 100644
--- a/arch/arm/mach-pxa/generic.c
+++ b/arch/arm/mach-pxa/generic.c
@@ -23,6 +23,7 @@
 #include <linux/device.h>
 #include <linux/ioport.h>
 #include <linux/pm.h>
+#include <linux/string.h>
 
 #include <asm/hardware.h>
 #include <asm/irq.h>
diff --git a/arch/arm/mach-sa1100/generic.c b/arch/arm/mach-sa1100/generic.c
index f94b0fbcdcc8..83eba8b54816 100644
--- a/arch/arm/mach-sa1100/generic.c
+++ b/arch/arm/mach-sa1100/generic.c
@@ -17,6 +17,7 @@
 #include <linux/pm.h>
 #include <linux/cpufreq.h>
 #include <linux/ioport.h>
+#include <linux/sched.h>	/* just for sched_clock() - funny that */
 
 #include <asm/div64.h>
 #include <asm/hardware.h>
diff --git a/arch/arm/mach-versatile/clock.c b/arch/arm/mach-versatile/clock.c
index 48025c2b9987..b96a2ea15d41 100644
--- a/arch/arm/mach-versatile/clock.c
+++ b/arch/arm/mach-versatile/clock.c
@@ -13,6 +13,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/string.h>
 
 #include <asm/semaphore.h>
 #include <asm/hardware/clock.h>
diff --git a/arch/arm/plat-omap/clock.c b/arch/arm/plat-omap/clock.c
index 52a58b2da288..a020fe16428f 100644
--- a/arch/arm/plat-omap/clock.c
+++ b/arch/arm/plat-omap/clock.c
@@ -13,6 +13,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/string.h>
 
 #include <asm/io.h>
 #include <asm/semaphore.h>
diff --git a/arch/cris/arch-v10/drivers/axisflashmap.c b/arch/cris/arch-v10/drivers/axisflashmap.c
index 11ab3836aac6..56b038c8d482 100644
--- a/arch/cris/arch-v10/drivers/axisflashmap.c
+++ b/arch/cris/arch-v10/drivers/axisflashmap.c
@@ -140,6 +140,7 @@
 #include <linux/kernel.h>
 #include <linux/config.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/concat.h>
 #include <linux/mtd/map.h>
diff --git a/arch/cris/arch-v32/drivers/axisflashmap.c b/arch/cris/arch-v32/drivers/axisflashmap.c
index 78ed52b1cdac..b679f983b90a 100644
--- a/arch/cris/arch-v32/drivers/axisflashmap.c
+++ b/arch/cris/arch-v32/drivers/axisflashmap.c
@@ -20,6 +20,7 @@
 #include <linux/kernel.h>
 #include <linux/config.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/concat.h>
 #include <linux/mtd/map.h>
diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c
index b863815de78d..66ba8898db07 100644
--- a/arch/cris/kernel/time.c
+++ b/arch/cris/kernel/time.c
@@ -31,6 +31,7 @@
 #include <linux/timex.h>
 #include <linux/init.h>
 #include <linux/profile.h>
+#include <linux/sched.h>	/* just for sched_clock() - funny that */
 
 int have_rtc;  /* used to remember if we have an RTC or not */;
 
diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
index 822c8ce9d1f1..caa9f7711343 100644
--- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -32,6 +32,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/compiler.h>
+#include <linux/sched.h>	/* current */
 #include <asm/io.h>
 #include <asm/delay.h>
 #include <asm/uaccess.h>
diff --git a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
index aa622d52c6e5..270f2188d68b 100644
--- a/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/i386/kernel/cpu/cpufreq/p4-clockmod.c
@@ -28,6 +28,7 @@
 #include <linux/cpufreq.h>
 #include <linux/slab.h>
 #include <linux/cpumask.h>
+#include <linux/sched.h>	/* current / set_cpus_allowed() */
 
 #include <asm/processor.h> 
 #include <asm/msr.h>
diff --git a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
index 58ca98fdc2ca..2d5c9adba0cd 100644
--- a/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/i386/kernel/cpu/cpufreq/powernow-k8.c
@@ -32,6 +32,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/cpumask.h>
+#include <linux/sched.h>	/* for current / set_cpus_allowed() */
 
 #include <asm/msr.h>
 #include <asm/io.h>
diff --git a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
index c397b6220430..1465974256c9 100644
--- a/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/i386/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/cpufreq.h>
 #include <linux/config.h>
+#include <linux/sched.h>	/* current */
 #include <linux/delay.h>
 #include <linux/compiler.h>
 
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index f0839334881c..4dc42a189ae5 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -11,6 +11,7 @@
 #include <linux/device.h>
 #include <linux/compiler.h>
 #include <linux/cpu.h>
+#include <linux/sched.h>
 
 #include <asm/processor.h>
 #include <asm/smp.h>
diff --git a/arch/ia64/kernel/cyclone.c b/arch/ia64/kernel/cyclone.c
index 768c7e46957c..6ade3790ce07 100644
--- a/arch/ia64/kernel/cyclone.c
+++ b/arch/ia64/kernel/cyclone.c
@@ -2,6 +2,7 @@
 #include <linux/smp.h>
 #include <linux/time.h>
 #include <linux/errno.h>
+#include <linux/timex.h>
 #include <asm/io.h>
 
 /* IBM Summit (EXA) Cyclone counter code*/
diff --git a/arch/m32r/lib/csum_partial_copy.c b/arch/m32r/lib/csum_partial_copy.c
index ddb16a83a8ce..3d5f06145854 100644
--- a/arch/m32r/lib/csum_partial_copy.c
+++ b/arch/m32r/lib/csum_partial_copy.c
@@ -18,10 +18,10 @@
 
 #include <linux/module.h>
 #include <linux/types.h>
+#include <linux/string.h>
 
 #include <net/checksum.h>
 #include <asm/byteorder.h>
-#include <asm/string.h>
 #include <asm/uaccess.h>
 
 /*
diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c
index e1829a5d3b19..07631a97670b 100644
--- a/arch/mips/sgi-ip27/ip27-berr.c
+++ b/arch/mips/sgi-ip27/ip27-berr.c
@@ -10,6 +10,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/signal.h>	/* for SIGBUS */
 
 #include <asm/module.h>
 #include <asm/sn/addrs.h>
diff --git a/arch/ppc/syslib/of_device.c b/arch/ppc/syslib/of_device.c
index 93c7231ea709..85b821251635 100644
--- a/arch/ppc/syslib/of_device.c
+++ b/arch/ppc/syslib/of_device.c
@@ -4,6 +4,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
+#include <linux/slab.h>
+
 #include <asm/errno.h>
 #include <asm/of_device.h>
 
diff --git a/arch/ppc64/kernel/hvcserver.c b/arch/ppc64/kernel/hvcserver.c
index bde8f42da854..4d584172055a 100644
--- a/arch/ppc64/kernel/hvcserver.c
+++ b/arch/ppc64/kernel/hvcserver.c
@@ -22,6 +22,8 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/module.h>
+#include <linux/slab.h>
+
 #include <asm/hvcall.h>
 #include <asm/hvcserver.h>
 #include <asm/io.h>
diff --git a/arch/ppc64/kernel/of_device.c b/arch/ppc64/kernel/of_device.c
index 9f200f0f2ad5..3aabfd0d3dda 100644
--- a/arch/ppc64/kernel/of_device.c
+++ b/arch/ppc64/kernel/of_device.c
@@ -4,6 +4,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mod_devicetable.h>
+#include <linux/slab.h>
+
 #include <asm/errno.h>
 #include <asm/of_device.h>
 
diff --git a/arch/ppc64/lib/locks.c b/arch/ppc64/lib/locks.c
index 033643ab69e0..d622c1d58e4e 100644
--- a/arch/ppc64/lib/locks.c
+++ b/arch/ppc64/lib/locks.c
@@ -17,6 +17,8 @@
 #include <linux/spinlock.h>
 #include <linux/module.h>
 #include <linux/stringify.h>
+#include <linux/smp.h>
+
 #include <asm/hvcall.h>
 #include <asm/iSeries/HvCall.h>
 
diff --git a/arch/sh/drivers/dma/dma-sysfs.c b/arch/sh/drivers/dma/dma-sysfs.c
index 71a6d4e7809f..6e3b58bd8795 100644
--- a/arch/sh/drivers/dma/dma-sysfs.c
+++ b/arch/sh/drivers/dma/dma-sysfs.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/sysdev.h>
 #include <linux/module.h>
+#include <linux/string.h>
 #include <asm/dma.h>
 
 static struct sysdev_class dma_sysclass = {
diff --git a/arch/sh/kernel/cpufreq.c b/arch/sh/kernel/cpufreq.c
index e0b384bef55f..47abf6e49dfb 100644
--- a/arch/sh/kernel/cpufreq.c
+++ b/arch/sh/kernel/cpufreq.c
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/cpumask.h>
 #include <linux/smp.h>
+#include <linux/sched.h>	/* set_cpus_allowed() */
 
 #include <asm/processor.h>
 #include <asm/watchdog.h>
diff --git a/arch/xtensa/kernel/platform.c b/arch/xtensa/kernel/platform.c
index 03674daabc66..a17930747f20 100644
--- a/arch/xtensa/kernel/platform.c
+++ b/arch/xtensa/kernel/platform.c
@@ -18,6 +18,7 @@
 #include <linux/time.h>
 #include <asm/platform.h>
 #include <asm/timex.h>
+#include <asm/param.h>		/* HZ */
 
 #define _F(r,f,a,b)							\
 	r __platform_##f a b;                                   	\
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 26a3a4016115..161db4acfb91 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -37,6 +37,7 @@
 #include <linux/acpi.h>
 #include <linux/dmi.h>
 #include <linux/moduleparam.h>
+#include <linux/sched.h>	/* need_resched() */
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
diff --git a/drivers/base/class.c b/drivers/base/class.c
index c3e569730afe..db65fd0babe9 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -17,6 +17,7 @@
 #include <linux/string.h>
 #include <linux/kdev_t.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 #include "base.h"
 
 #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr)
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 75ce8711bca5..08d9cc99c7de 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -16,6 +16,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/bootmem.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 #include "base.h"
 
diff --git a/drivers/base/sys.c b/drivers/base/sys.c
index 3431eb6004c3..66ed8f2fece5 100644
--- a/drivers/base/sys.c
+++ b/drivers/base/sys.c
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/pm.h>
+#include <asm/semaphore.h>
 
 extern struct subsystem devices_subsys;
 
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index e183a3ef7839..ec27976a57da 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -28,13 +28,17 @@
    through the array controller.  Note in particular, neither 
    physical nor logical disks are presented through the scsi layer. */
 
+#include <linux/timer.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include <asm/atomic.h>
+
 #include <scsi/scsi.h> 
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h> 
-#include <asm/atomic.h>
-#include <linux/timer.h>
-#include <linux/completion.h>
 
 #include "cciss_scsi.h"
 
diff --git a/drivers/block/paride/paride.c b/drivers/block/paride/paride.c
index 1fef136c0e41..ce94aa11f6a7 100644
--- a/drivers/block/paride/paride.c
+++ b/drivers/block/paride/paride.c
@@ -29,6 +29,7 @@
 #include <linux/string.h>
 #include <linux/spinlock.h>
 #include <linux/wait.h>
+#include <linux/sched.h>	/* TASK_* */
 
 #ifdef CONFIG_PARPORT_MODULE
 #define CONFIG_PARPORT
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index 82f2d6d2eeef..6f5df0fad703 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -162,6 +162,8 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
 #include <linux/mtio.h>
 #include <linux/pg.h>
 #include <linux/device.h>
+#include <linux/sched.h>	/* current, TASK_* */
+#include <linux/jiffies.h>
 
 #include <asm/uaccess.h>
 
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index 686c95573452..715ae5dc88fb 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -146,6 +146,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
 #include <linux/slab.h>
 #include <linux/mtio.h>
 #include <linux/device.h>
+#include <linux/sched.h>	/* current, TASK_*, schedule_timeout() */
 
 #include <asm/uaccess.h>
 
diff --git a/drivers/char/agp/ali-agp.c b/drivers/char/agp/ali-agp.c
index 9c9c9c2247ce..b02fc2267159 100644
--- a/drivers/char/agp/ali-agp.c
+++ b/drivers/char/agp/ali-agp.c
@@ -7,6 +7,7 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/agp_backend.h>
+#include <asm/page.h>		/* PAGE_SIZE */
 #include "agp.h"
 
 #define ALI_AGPCTRL	0xb8
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 0a7624a9b1c1..0e6c3a31d344 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -13,6 +13,7 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/agp_backend.h>
+#include <asm/page.h>		/* PAGE_SIZE */
 #include "agp.h"
 
 /* Will need to be increased if AMD64 ever goes >8-way. */
diff --git a/drivers/char/agp/ati-agp.c b/drivers/char/agp/ati-agp.c
index e572ced9100a..0b6e72642d6e 100644
--- a/drivers/char/agp/ati-agp.c
+++ b/drivers/char/agp/ati-agp.c
@@ -6,6 +6,8 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/agp_backend.h>
 #include <asm/agp.h>
 #include "agp.h"
diff --git a/drivers/char/agp/i460-agp.c b/drivers/char/agp/i460-agp.c
index 94943298c03e..a2d9e5e48bbe 100644
--- a/drivers/char/agp/i460-agp.c
+++ b/drivers/char/agp/i460-agp.c
@@ -10,6 +10,8 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/agp_backend.h>
 
 #include "agp.h"
diff --git a/drivers/char/agp/isoch.c b/drivers/char/agp/isoch.c
index c9ac731504f2..40083241804e 100644
--- a/drivers/char/agp/isoch.c
+++ b/drivers/char/agp/isoch.c
@@ -6,6 +6,7 @@
 #include <linux/pci.h>
 #include <linux/agp_backend.h>
 #include <linux/module.h>
+#include <linux/slab.h>
 
 #include "agp.h"
 
diff --git a/drivers/char/agp/sworks-agp.c b/drivers/char/agp/sworks-agp.c
index a9fb12c20eb7..71ea59a1dbeb 100644
--- a/drivers/char/agp/sworks-agp.c
+++ b/drivers/char/agp/sworks-agp.c
@@ -5,6 +5,8 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/agp_backend.h>
 #include "agp.h"
 
diff --git a/drivers/char/drm/drm_sysfs.c b/drivers/char/drm/drm_sysfs.c
index 475cc5e555e1..6d3449761914 100644
--- a/drivers/char/drm/drm_sysfs.c
+++ b/drivers/char/drm/drm_sysfs.c
@@ -15,6 +15,8 @@
 #include <linux/device.h>
 #include <linux/kdev_t.h>
 #include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 
 #include "drm_core.h"
 #include "drmP.h"
diff --git a/drivers/char/mwave/3780i.c b/drivers/char/mwave/3780i.c
index 613aed9e1840..d1fe05e83882 100644
--- a/drivers/char/mwave/3780i.c
+++ b/drivers/char/mwave/3780i.c
@@ -53,6 +53,8 @@
 #include <linux/ioport.h>
 #include <linux/init.h>
 #include <linux/bitops.h>
+#include <linux/sched.h>	/* cond_resched() */
+
 #include <asm/io.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/drivers/char/watchdog/cpu5wdt.c b/drivers/char/watchdog/cpu5wdt.c
index 2865dac0a813..e75045fe2641 100644
--- a/drivers/char/watchdog/cpu5wdt.c
+++ b/drivers/char/watchdog/cpu5wdt.c
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/timer.h>
+#include <linux/jiffies.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/char/watchdog/mixcomwd.c b/drivers/char/watchdog/mixcomwd.c
index 7fc2188386d9..d8dede575402 100644
--- a/drivers/char/watchdog/mixcomwd.c
+++ b/drivers/char/watchdog/mixcomwd.c
@@ -45,6 +45,8 @@
 #include <linux/fs.h>
 #include <linux/reboot.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/timer.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
diff --git a/drivers/char/watchdog/pcwd.c b/drivers/char/watchdog/pcwd.c
index 427ad51b7a35..37c9e13ad3ac 100644
--- a/drivers/char/watchdog/pcwd.c
+++ b/drivers/char/watchdog/pcwd.c
@@ -66,7 +66,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/reboot.h>
-
+#include <linux/sched.h>	/* TASK_INTERRUPTIBLE, set_current_state() and friends */
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
diff --git a/drivers/char/watchdog/sc520_wdt.c b/drivers/char/watchdog/sc520_wdt.c
index 72501be79b0c..4ee9974ad8cb 100644
--- a/drivers/char/watchdog/sc520_wdt.c
+++ b/drivers/char/watchdog/sc520_wdt.c
@@ -63,6 +63,7 @@
 #include <linux/notifier.h>
 #include <linux/reboot.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
diff --git a/drivers/char/watchdog/softdog.c b/drivers/char/watchdog/softdog.c
index 20e5eb8667f2..a91edaf3a350 100644
--- a/drivers/char/watchdog/softdog.c
+++ b/drivers/char/watchdog/softdog.c
@@ -47,6 +47,8 @@
 #include <linux/notifier.h>
 #include <linux/reboot.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
+
 #include <asm/uaccess.h>
 
 #define PFX "SoftDog: "
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index f014e639088c..c57a3871184c 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -38,6 +38,7 @@
 #include <linux/module.h>
 #include <linux/errno.h>
 #include <linux/slab.h>
+#include <linux/sched.h>	/* INIT_WORK, schedule_work(), flush_scheduled_work() */
 
 #include <rdma/ib_cache.h>
 
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 89ce9dc210d4..acda7d63d6fe 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -43,6 +43,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/kref.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #include <rdma/ib_pack.h>
 #include <rdma/ib_sa.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
index 889e85096736..22fdc446f25c 100644
--- a/drivers/infiniband/hw/mthca/mthca_av.c
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -34,6 +34,8 @@
  */
 
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
index 8561b297a19b..1229c604c6e0 100644
--- a/drivers/infiniband/hw/mthca/mthca_mad.c
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -34,6 +34,9 @@
  * $Id: mthca_mad.c 1349 2004-12-16 21:09:43Z roland $
  */
 
+#include <linux/string.h>
+#include <linux/slab.h>
+
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_mad.h>
 #include <rdma/ib_smi.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c
index b47ea7daf088..2fc449da418d 100644
--- a/drivers/infiniband/hw/mthca/mthca_mcg.c
+++ b/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -33,6 +33,8 @@
  */
 
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c
index 0576056b34f4..bd1338682074 100644
--- a/drivers/infiniband/hw/mthca/mthca_profile.c
+++ b/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -35,6 +35,8 @@
 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "mthca_profile.h"
 
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
index 62ff091505da..7c9afde5ace5 100644
--- a/drivers/infiniband/hw/mthca/mthca_qp.c
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -36,6 +36,8 @@
  */
 
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c
index 4f995391dd1d..df5e494a9d38 100644
--- a/drivers/infiniband/hw/mthca/mthca_reset.c
+++ b/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -37,6 +37,7 @@
 #include <linux/errno.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c
index 1c8791ded6ff..8e9219842be4 100644
--- a/drivers/infiniband/hw/mthca/mthca_uar.c
+++ b/drivers/infiniband/hw/mthca/mthca_uar.c
@@ -32,6 +32,8 @@
  * $Id$
  */
 
+#include <asm/page.h>		/* PAGE_SHIFT */
+
 #include "mthca_dev.h"
 #include "mthca_memfree.h"
 
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index ab09cf4093e3..0506934244f0 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/sched.h>	/* HZ */
 
 /*#include <asm/io.h>*/
 
diff --git a/drivers/input/joystick/a3d.c b/drivers/input/joystick/a3d.c
index bf65430181fa..4571ea3a4b92 100644
--- a/drivers/input/joystick/a3d.c
+++ b/drivers/input/joystick/a3d.c
@@ -34,6 +34,7 @@
 #include <linux/init.h>
 #include <linux/gameport.h>
 #include <linux/input.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"FP-Gaming Assasin 3D joystick driver"
 
diff --git a/drivers/input/joystick/adi.c b/drivers/input/joystick/adi.c
index 9d95459f4bcb..704bf70f1db7 100644
--- a/drivers/input/joystick/adi.c
+++ b/drivers/input/joystick/adi.c
@@ -34,6 +34,7 @@
 #include <linux/input.h>
 #include <linux/gameport.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"Logitech ADI joystick family driver"
 
diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index c75ac6eb1ffb..3121961e3e7c 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -38,6 +38,7 @@
 #include <linux/init.h>
 #include <linux/input.h>
 #include <linux/gameport.h>
+#include <linux/jiffies.h>
 #include <asm/timex.h>
 
 #define DRIVER_DESC	"Analog joystick and gamepad driver"
diff --git a/drivers/input/joystick/cobra.c b/drivers/input/joystick/cobra.c
index 9a3dfc724a41..1909f7ef340c 100644
--- a/drivers/input/joystick/cobra.c
+++ b/drivers/input/joystick/cobra.c
@@ -34,6 +34,7 @@
 #include <linux/init.h>
 #include <linux/gameport.h>
 #include <linux/input.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"Creative Labs Blaster GamePad Cobra driver"
 
diff --git a/drivers/input/joystick/gf2k.c b/drivers/input/joystick/gf2k.c
index e151f8c5bcb9..8a3ad455eb38 100644
--- a/drivers/input/joystick/gf2k.c
+++ b/drivers/input/joystick/gf2k.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/input.h>
 #include <linux/gameport.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"Genius Flight 2000 joystick driver"
 
diff --git a/drivers/input/joystick/grip.c b/drivers/input/joystick/grip.c
index e206bb56e53c..a936e7aedb10 100644
--- a/drivers/input/joystick/grip.c
+++ b/drivers/input/joystick/grip.c
@@ -34,6 +34,7 @@
 #include <linux/slab.h>
 #include <linux/gameport.h>
 #include <linux/input.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"Gravis GrIP protocol joystick driver"
 
diff --git a/drivers/input/joystick/grip_mp.c b/drivers/input/joystick/grip_mp.c
index a0ba93ccac72..51a912222e85 100644
--- a/drivers/input/joystick/grip_mp.c
+++ b/drivers/input/joystick/grip_mp.c
@@ -19,6 +19,7 @@
 #include <linux/input.h>
 #include <linux/delay.h>
 #include <linux/proc_fs.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"Gravis Grip Multiport driver"
 
diff --git a/drivers/input/joystick/guillemot.c b/drivers/input/joystick/guillemot.c
index c528473c09d8..6e2c721c26ba 100644
--- a/drivers/input/joystick/guillemot.c
+++ b/drivers/input/joystick/guillemot.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/gameport.h>
 #include <linux/input.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"Guillemot Digital joystick driver"
 
diff --git a/drivers/input/joystick/interact.c b/drivers/input/joystick/interact.c
index 8511ee7bb263..c4ed01758226 100644
--- a/drivers/input/joystick/interact.c
+++ b/drivers/input/joystick/interact.c
@@ -38,6 +38,7 @@
 #include <linux/init.h>
 #include <linux/gameport.h>
 #include <linux/input.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"InterAct digital joystick driver"
 
diff --git a/drivers/input/joystick/joydump.c b/drivers/input/joystick/joydump.c
index 4234ccaf9146..88ec5a918f2e 100644
--- a/drivers/input/joystick/joydump.c
+++ b/drivers/input/joystick/joydump.c
@@ -34,6 +34,7 @@
 #include <linux/kernel.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #define DRIVER_DESC	"Gameport data dumper module"
 
diff --git a/drivers/input/joystick/sidewinder.c b/drivers/input/joystick/sidewinder.c
index eaaad45cc750..78dd163cd702 100644
--- a/drivers/input/joystick/sidewinder.c
+++ b/drivers/input/joystick/sidewinder.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/input.h>
 #include <linux/gameport.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"Microsoft SideWinder joystick family driver"
 
diff --git a/drivers/input/joystick/tmdc.c b/drivers/input/joystick/tmdc.c
index 3a7d1bb46472..60e2aac7d06e 100644
--- a/drivers/input/joystick/tmdc.c
+++ b/drivers/input/joystick/tmdc.c
@@ -38,6 +38,7 @@
 #include <linux/init.h>
 #include <linux/gameport.h>
 #include <linux/input.h>
+#include <linux/jiffies.h>
 
 #define DRIVER_DESC	"ThrustMaster DirectConnect joystick driver"
 
diff --git a/drivers/input/serio/hp_sdc_mlc.c b/drivers/input/serio/hp_sdc_mlc.c
index e3c44ffae674..1c9426fd5205 100644
--- a/drivers/input/serio/hp_sdc_mlc.c
+++ b/drivers/input/serio/hp_sdc_mlc.c
@@ -40,6 +40,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/string.h>
+#include <asm/semaphore.h>
 
 #define PREFIX "HP SDC MLC: "
 
diff --git a/drivers/isdn/capi/capifs.c b/drivers/isdn/capi/capifs.c
index 3abd7fc6e5ef..7b564c0dd996 100644
--- a/drivers/isdn/capi/capifs.c
+++ b/drivers/isdn/capi/capifs.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/ctype.h>
+#include <linux/sched.h>	/* current */
 
 MODULE_DESCRIPTION("CAPI4Linux: /dev/capi/ filesystem");
 MODULE_AUTHOR("Carsten Paeth");
diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c
index 1ee003346923..c34c96d18907 100644
--- a/drivers/macintosh/macio_asic.c
+++ b/drivers/macintosh/macio_asic.c
@@ -17,6 +17,8 @@
 #include <linux/pci_ids.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/slab.h>
+
 #include <asm/machdep.h>
 #include <asm/macio.h>
 #include <asm/pmac_feature.h>
diff --git a/drivers/mca/mca-device.c b/drivers/mca/mca-device.c
index 76d430aa243f..e7adf89fae41 100644
--- a/drivers/mca/mca-device.c
+++ b/drivers/mca/mca-device.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/mca.h>
+#include <linux/string.h>
 
 /**
  *	mca_device_read_stored_pos - read POS register from stored data
diff --git a/drivers/media/common/ir-common.c b/drivers/media/common/ir-common.c
index 06f4d4686a6c..31fccb4f05d6 100644
--- a/drivers/media/common/ir-common.c
+++ b/drivers/media/common/ir-common.c
@@ -22,6 +22,7 @@
 
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/string.h>
 #include <media/ir-common.h>
 
 /* -------------------------------------------------------------------------- */
diff --git a/drivers/media/dvb/dvb-core/dvb_ca_en50221.c b/drivers/media/dvb/dvb-core/dvb_ca_en50221.c
index 88757e2634e5..2aa767f9bd7d 100644
--- a/drivers/media/dvb/dvb-core/dvb_ca_en50221.c
+++ b/drivers/media/dvb/dvb-core/dvb_ca_en50221.c
@@ -36,6 +36,7 @@
 #include <linux/vmalloc.h>
 #include <linux/delay.h>
 #include <linux/rwsem.h>
+#include <linux/sched.h>
 
 #include "dvb_ca_en50221.h"
 #include "dvb_ringbuffer.h"
diff --git a/drivers/media/dvb/frontends/bcm3510.c b/drivers/media/dvb/frontends/bcm3510.c
index f5fdc5c3e605..f6d4ee78bdd4 100644
--- a/drivers/media/dvb/frontends/bcm3510.c
+++ b/drivers/media/dvb/frontends/bcm3510.c
@@ -36,6 +36,9 @@
 #include <linux/moduleparam.h>
 #include <linux/device.h>
 #include <linux/firmware.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "dvb_frontend.h"
 #include "bcm3510.h"
diff --git a/drivers/media/dvb/frontends/dib3000mb.c b/drivers/media/dvb/frontends/dib3000mb.c
index 21433e1831e7..6b0553608610 100644
--- a/drivers/media/dvb/frontends/dib3000mb.c
+++ b/drivers/media/dvb/frontends/dib3000mb.c
@@ -27,6 +27,8 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "dib3000-common.h"
 #include "dib3000mb_priv.h"
diff --git a/drivers/media/dvb/frontends/dib3000mc.c b/drivers/media/dvb/frontends/dib3000mc.c
index 441de665fec3..c024fad17337 100644
--- a/drivers/media/dvb/frontends/dib3000mc.c
+++ b/drivers/media/dvb/frontends/dib3000mc.c
@@ -26,6 +26,8 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "dib3000-common.h"
 #include "dib3000mc_priv.h"
diff --git a/drivers/media/dvb/frontends/dvb_dummy_fe.c b/drivers/media/dvb/frontends/dvb_dummy_fe.c
index cff93b9d8ab2..794be520d590 100644
--- a/drivers/media/dvb/frontends/dvb_dummy_fe.c
+++ b/drivers/media/dvb/frontends/dvb_dummy_fe.c
@@ -22,6 +22,8 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "dvb_frontend.h"
 #include "dvb_dummy_fe.h"
diff --git a/drivers/media/dvb/frontends/lgdt330x.c b/drivers/media/dvb/frontends/lgdt330x.c
index 7142b9c51dd2..8dde72bd1046 100644
--- a/drivers/media/dvb/frontends/lgdt330x.c
+++ b/drivers/media/dvb/frontends/lgdt330x.c
@@ -37,6 +37,8 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/byteorder.h>
 
 #include "dvb_frontend.h"
diff --git a/drivers/media/dvb/frontends/mt312.c b/drivers/media/dvb/frontends/mt312.c
index e455aecd76b2..e38454901dd1 100644
--- a/drivers/media/dvb/frontends/mt312.c
+++ b/drivers/media/dvb/frontends/mt312.c
@@ -29,6 +29,8 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "dvb_frontend.h"
 #include "mt312_priv.h"
diff --git a/drivers/media/dvb/frontends/mt352.c b/drivers/media/dvb/frontends/mt352.c
index cc1bc0edd65e..f0c610f2c2df 100644
--- a/drivers/media/dvb/frontends/mt352.c
+++ b/drivers/media/dvb/frontends/mt352.c
@@ -35,6 +35,8 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "dvb_frontend.h"
 #include "mt352_priv.h"
diff --git a/drivers/media/dvb/frontends/nxt2002.c b/drivers/media/dvb/frontends/nxt2002.c
index 35a1d60f1927..30786b1911bd 100644
--- a/drivers/media/dvb/frontends/nxt2002.c
+++ b/drivers/media/dvb/frontends/nxt2002.c
@@ -32,6 +32,8 @@
 #include <linux/moduleparam.h>
 #include <linux/device.h>
 #include <linux/firmware.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "dvb_frontend.h"
 #include "nxt2002.h"
diff --git a/drivers/media/dvb/frontends/or51132.c b/drivers/media/dvb/frontends/or51132.c
index b6d0eecc59eb..817b044c7fd1 100644
--- a/drivers/media/dvb/frontends/or51132.c
+++ b/drivers/media/dvb/frontends/or51132.c
@@ -36,6 +36,8 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/byteorder.h>
 
 #include "dvb_frontend.h"
diff --git a/drivers/media/dvb/frontends/or51211.c b/drivers/media/dvb/frontends/or51211.c
index ad56a9958404..8a9db23dd1b7 100644
--- a/drivers/media/dvb/frontends/or51211.c
+++ b/drivers/media/dvb/frontends/or51211.c
@@ -34,6 +34,8 @@
 #include <linux/moduleparam.h>
 #include <linux/device.h>
 #include <linux/firmware.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include <asm/byteorder.h>
 
 #include "dvb_frontend.h"
diff --git a/drivers/media/dvb/frontends/s5h1420.c b/drivers/media/dvb/frontends/s5h1420.c
index c7fe27fd530c..f265418e3261 100644
--- a/drivers/media/dvb/frontends/s5h1420.c
+++ b/drivers/media/dvb/frontends/s5h1420.c
@@ -26,6 +26,8 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <asm/div64.h>
 
 #include "dvb_frontend.h"
 #include "s5h1420.h"
diff --git a/drivers/media/dvb/frontends/sp8870.c b/drivers/media/dvb/frontends/sp8870.c
index 764a95a2e212..1c6b2e9264bc 100644
--- a/drivers/media/dvb/frontends/sp8870.c
+++ b/drivers/media/dvb/frontends/sp8870.c
@@ -32,6 +32,8 @@
 #include <linux/device.h>
 #include <linux/firmware.h>
 #include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "dvb_frontend.h"
 #include "sp8870.h"
diff --git a/drivers/media/dvb/frontends/sp887x.c b/drivers/media/dvb/frontends/sp887x.c
index d868a6927a16..73384e75625e 100644
--- a/drivers/media/dvb/frontends/sp887x.c
+++ b/drivers/media/dvb/frontends/sp887x.c
@@ -14,6 +14,8 @@
 #include <linux/moduleparam.h>
 #include <linux/device.h>
 #include <linux/firmware.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #include "dvb_frontend.h"
 #include "sp887x.h"
diff --git a/drivers/media/dvb/frontends/stv0297.c b/drivers/media/dvb/frontends/stv0297.c
index 8d09afd7545d..6122ba754bc5 100644
--- a/drivers/media/dvb/frontends/stv0297.c
+++ b/drivers/media/dvb/frontends/stv0297.c
@@ -24,6 +24,8 @@
 #include <linux/module.h>
 #include <linux/string.h>
 #include <linux/delay.h>
+#include <linux/jiffies.h>
+#include <linux/slab.h>
 
 #include "dvb_frontend.h"
 #include "stv0297.h"
diff --git a/drivers/media/dvb/frontends/stv0299.c b/drivers/media/dvb/frontends/stv0299.c
index 2d62931f20b5..889d9257215d 100644
--- a/drivers/media/dvb/frontends/stv0299.c
+++ b/drivers/media/dvb/frontends/stv0299.c
@@ -48,6 +48,7 @@
 #include <linux/moduleparam.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/jiffies.h>
 #include <asm/div64.h>
 
 #include "dvb_frontend.h"
diff --git a/drivers/media/dvb/frontends/tda1004x.c b/drivers/media/dvb/frontends/tda1004x.c
index 74cea9f8d721..3529c618f828 100644
--- a/drivers/media/dvb/frontends/tda1004x.c
+++ b/drivers/media/dvb/frontends/tda1004x.c
@@ -32,6 +32,10 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/device.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
 #include "dvb_frontend.h"
 #include "tda1004x.h"
 
diff --git a/drivers/media/dvb/frontends/tda8083.c b/drivers/media/dvb/frontends/tda8083.c
index 168e013d23bd..c05cf1861051 100644
--- a/drivers/media/dvb/frontends/tda8083.c
+++ b/drivers/media/dvb/frontends/tda8083.c
@@ -30,6 +30,7 @@
 #include <linux/moduleparam.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/jiffies.h>
 #include "dvb_frontend.h"
 #include "tda8083.h"
 
diff --git a/drivers/media/radio/miropcm20-rds.c b/drivers/media/radio/miropcm20-rds.c
index df79d5e0aaed..e09214082e01 100644
--- a/drivers/media/radio/miropcm20-rds.c
+++ b/drivers/media/radio/miropcm20-rds.c
@@ -14,6 +14,7 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/miscdevice.h>
+#include <linux/sched.h>	/* current, TASK_*, schedule_timeout() */
 #include <linux/delay.h>
 #include <asm/uaccess.h>
 #include "miropcm20-rds-core.h"
diff --git a/drivers/message/i2o/device.c b/drivers/message/i2o/device.c
index d9879965eb50..8eb50cdb8ae1 100644
--- a/drivers/message/i2o/device.c
+++ b/drivers/message/i2o/device.c
@@ -16,6 +16,8 @@
 #include <linux/module.h>
 #include <linux/i2o.h>
 #include <linux/delay.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include "core.h"
 
 /**
diff --git a/drivers/message/i2o/driver.c b/drivers/message/i2o/driver.c
index 0079a4be0af2..0fb9c4e2ad4c 100644
--- a/drivers/message/i2o/driver.c
+++ b/drivers/message/i2o/driver.c
@@ -17,6 +17,9 @@
 #include <linux/module.h>
 #include <linux/rwsem.h>
 #include <linux/i2o.h>
+#include <linux/workqueue.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include "core.h"
 
 #define OSM_NAME	"i2o"
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index bda2c62648ba..b675b4ebbebd 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -30,6 +30,10 @@
 #include <linux/module.h>
 #include <linux/i2o.h>
 #include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <asm/param.h>		/* HZ */
 #include "core.h"
 
 #define OSM_NAME "exec-osm"
diff --git a/drivers/message/i2o/iop.c b/drivers/message/i2o/iop.c
index 361da8d1d5e7..61b837de4b6a 100644
--- a/drivers/message/i2o/iop.c
+++ b/drivers/message/i2o/iop.c
@@ -28,6 +28,7 @@
 #include <linux/module.h>
 #include <linux/i2o.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include "core.h"
 
 #define OSM_NAME	"i2o"
diff --git a/drivers/mtd/chips/jedec.c b/drivers/mtd/chips/jedec.c
index 62d235a9a4e2..4f6778f3ee3e 100644
--- a/drivers/mtd/chips/jedec.c
+++ b/drivers/mtd/chips/jedec.c
@@ -17,6 +17,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/slab.h>
 #include <linux/mtd/jedec.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/mtd.h>
diff --git a/drivers/mtd/devices/lart.c b/drivers/mtd/devices/lart.c
index dfd335e4a2a8..df987a53ed9c 100644
--- a/drivers/mtd/devices/lart.c
+++ b/drivers/mtd/devices/lart.c
@@ -44,6 +44,7 @@
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/errno.h>
+#include <linux/string.h>
 #include <linux/mtd/mtd.h>
 #ifdef HAVE_PARTITIONS
 #include <linux/mtd/partitions.h>
diff --git a/drivers/mtd/devices/phram.c b/drivers/mtd/devices/phram.c
index a423a382095a..765c0179c8df 100644
--- a/drivers/mtd/devices/phram.c
+++ b/drivers/mtd/devices/phram.c
@@ -22,6 +22,7 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/slab.h>
 #include <linux/mtd/mtd.h>
 
 #define ERROR(fmt, args...) printk(KERN_ERR "phram: " fmt , ## args)
diff --git a/drivers/mtd/maps/bast-flash.c b/drivers/mtd/maps/bast-flash.c
index 0ba0ff7d43b9..63104c73ca3c 100644
--- a/drivers/mtd/maps/bast-flash.c
+++ b/drivers/mtd/maps/bast-flash.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <linux/ioport.h>
 #include <linux/device.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
diff --git a/drivers/mtd/maps/ceiva.c b/drivers/mtd/maps/ceiva.c
index da8584a662f4..c68b31dc7e6d 100644
--- a/drivers/mtd/maps/ceiva.c
+++ b/drivers/mtd/maps/ceiva.c
@@ -20,6 +20,7 @@
 #include <linux/ioport.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
diff --git a/drivers/mtd/maps/dc21285.c b/drivers/mtd/maps/dc21285.c
index 938c41f2f056..e5b74169fde6 100644
--- a/drivers/mtd/maps/dc21285.c
+++ b/drivers/mtd/maps/dc21285.c
@@ -13,6 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
diff --git a/drivers/mtd/maps/dilnetpc.c b/drivers/mtd/maps/dilnetpc.c
index 0bc79c93a584..f99519692cb7 100644
--- a/drivers/mtd/maps/dilnetpc.c
+++ b/drivers/mtd/maps/dilnetpc.c
@@ -30,12 +30,15 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <asm/io.h>
+#include <linux/string.h>
+
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
 #include <linux/mtd/concat.h>
 
+#include <asm/io.h>
+
 /*
 ** The DIL/NetPC keeps its BIOS in two distinct flash blocks.
 ** Destroying any of these blocks transforms the DNPC into
diff --git a/drivers/mtd/maps/epxa10db-flash.c b/drivers/mtd/maps/epxa10db-flash.c
index ab6dbe2b8cce..1df6188926b3 100644
--- a/drivers/mtd/maps/epxa10db-flash.c
+++ b/drivers/mtd/maps/epxa10db-flash.c
@@ -27,12 +27,15 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <asm/io.h>
+#include <linux/slab.h>
+
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
 
+#include <asm/io.h>
 #include <asm/hardware.h>
+
 #ifdef CONFIG_EPXA10DB
 #define BOARD_NAME "EPXA10DB"
 #else
diff --git a/drivers/mtd/maps/fortunet.c b/drivers/mtd/maps/fortunet.c
index 068bb6a54520..00f7bbe5479e 100644
--- a/drivers/mtd/maps/fortunet.c
+++ b/drivers/mtd/maps/fortunet.c
@@ -7,11 +7,14 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <asm/io.h>
+#include <linux/string.h>
+
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
 
+#include <asm/io.h>
+
 #define MAX_NUM_REGIONS		4
 #define MAX_NUM_PARTITIONS	8
 
diff --git a/drivers/mtd/maps/ixp2000.c b/drivers/mtd/maps/ixp2000.c
index a9f86c7fbd52..1e5d6e1d05f3 100644
--- a/drivers/mtd/maps/ixp2000.c
+++ b/drivers/mtd/maps/ixp2000.c
@@ -22,11 +22,13 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/device.h>
+
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
-#include <linux/ioport.h>
-#include <linux/device.h>
 
 #include <asm/io.h>
 #include <asm/hardware.h>
diff --git a/drivers/mtd/maps/ixp4xx.c b/drivers/mtd/maps/ixp4xx.c
index 3fcc32884074..da316e543237 100644
--- a/drivers/mtd/maps/ixp4xx.c
+++ b/drivers/mtd/maps/ixp4xx.c
@@ -20,11 +20,14 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/device.h>
+
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
-#include <linux/ioport.h>
-#include <linux/device.h>
+
 #include <asm/io.h>
 #include <asm/mach/flash.h>
 
diff --git a/drivers/mtd/maps/lubbock-flash.c b/drivers/mtd/maps/lubbock-flash.c
index 1298de475c9a..2337e0c46750 100644
--- a/drivers/mtd/maps/lubbock-flash.c
+++ b/drivers/mtd/maps/lubbock-flash.c
@@ -15,10 +15,13 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/slab.h>
+
 #include <linux/dma-mapping.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
+
 #include <asm/io.h>
 #include <asm/hardware.h>
 #include <asm/arch/pxa-regs.h>
diff --git a/drivers/mtd/maps/mainstone-flash.c b/drivers/mtd/maps/mainstone-flash.c
index 87e93fa60588..da0f8a692628 100644
--- a/drivers/mtd/maps/mainstone-flash.c
+++ b/drivers/mtd/maps/mainstone-flash.c
@@ -16,9 +16,12 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/dma-mapping.h>
+#include <linux/slab.h>
+
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
+
 #include <asm/io.h>
 #include <asm/hardware.h>
 #include <asm/arch/pxa-regs.h>
diff --git a/drivers/mtd/maps/omap-toto-flash.c b/drivers/mtd/maps/omap-toto-flash.c
index 496109071cb1..da36e8dddd17 100644
--- a/drivers/mtd/maps/omap-toto-flash.c
+++ b/drivers/mtd/maps/omap-toto-flash.c
@@ -12,9 +12,9 @@
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/kernel.h>
-
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
diff --git a/drivers/mtd/maps/omap_nor.c b/drivers/mtd/maps/omap_nor.c
index b17bca657daf..fa84566245a7 100644
--- a/drivers/mtd/maps/omap_nor.c
+++ b/drivers/mtd/maps/omap_nor.c
@@ -36,6 +36,8 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/ioport.h>
+#include <linux/slab.h>
+
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
diff --git a/drivers/mtd/maps/pci.c b/drivers/mtd/maps/pci.c
index 18dbd3af1eaa..d9c64e99ee32 100644
--- a/drivers/mtd/maps/pci.c
+++ b/drivers/mtd/maps/pci.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
diff --git a/drivers/mtd/maps/plat-ram.c b/drivers/mtd/maps/plat-ram.c
index 118b04544cad..a0577ea00c3c 100644
--- a/drivers/mtd/maps/plat-ram.c
+++ b/drivers/mtd/maps/plat-ram.c
@@ -30,6 +30,7 @@
 #include <linux/string.h>
 #include <linux/ioport.h>
 #include <linux/device.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
diff --git a/drivers/mtd/maps/tqm8xxl.c b/drivers/mtd/maps/tqm8xxl.c
index 995e9991cb8d..4e28b977f224 100644
--- a/drivers/mtd/maps/tqm8xxl.c
+++ b/drivers/mtd/maps/tqm8xxl.c
@@ -27,12 +27,14 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
-#include <asm/io.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/map.h>
 #include <linux/mtd/partitions.h>
 
+#include <asm/io.h>
+
 #define FLASH_ADDR 0x40000000
 #define FLASH_SIZE 0x00800000
 #define FLASH_BANK_MAX 4
diff --git a/drivers/mtd/mtdblock.c b/drivers/mtd/mtdblock.c
index b7c32c242bc7..400dd9c89883 100644
--- a/drivers/mtd/mtdblock.c
+++ b/drivers/mtd/mtdblock.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
+#include <linux/sched.h>	/* TASK_* */
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/blktrans.h>
 
diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
index c534fd5d95cb..16df1e4fb0e9 100644
--- a/drivers/mtd/mtdchar.c
+++ b/drivers/mtd/mtdchar.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/fs.h>
+#include <linux/sched.h>	/* TASK_* */
 #include <asm/uaccess.h>
 
 #include <linux/device.h>
diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c
index 8f66d093c80d..f3e65af33a9c 100644
--- a/drivers/mtd/mtdconcat.c
+++ b/drivers/mtd/mtdconcat.c
@@ -14,7 +14,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
-
+#include <linux/sched.h>	/* TASK_* */
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/concat.h>
 
diff --git a/drivers/mtd/nand/s3c2410.c b/drivers/mtd/nand/s3c2410.c
index b47ebcb31e0f..b58ba236a9eb 100644
--- a/drivers/mtd/nand/s3c2410.c
+++ b/drivers/mtd/nand/s3c2410.c
@@ -51,6 +51,7 @@
 #include <linux/device.h>
 #include <linux/delay.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/nand.h>
diff --git a/drivers/pci/hotplug/cpcihp_generic.c b/drivers/pci/hotplug/cpcihp_generic.c
index a62a4345b466..2d4639d6841f 100644
--- a/drivers/pci/hotplug/cpcihp_generic.c
+++ b/drivers/pci/hotplug/cpcihp_generic.c
@@ -39,6 +39,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/pci.h>
+#include <linux/string.h>
 #include "cpci_hotplug.h"
 
 #define DRIVER_VERSION	"0.1"
diff --git a/drivers/pci/hotplug/cpcihp_zt5550.c b/drivers/pci/hotplug/cpcihp_zt5550.c
index 790abadd816c..f7cb00da38df 100644
--- a/drivers/pci/hotplug/cpcihp_zt5550.c
+++ b/drivers/pci/hotplug/cpcihp_zt5550.c
@@ -36,6 +36,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/pci.h>
+#include <linux/signal.h>	/* SA_SHIRQ */
 #include "cpci_hotplug.h"
 #include "cpcihp_zt5550.h"
 
diff --git a/drivers/pci/hotplug/fakephp.c b/drivers/pci/hotplug/fakephp.c
index 8e47fa66e25e..060d74775d7b 100644
--- a/drivers/pci/hotplug/fakephp.c
+++ b/drivers/pci/hotplug/fakephp.c
@@ -37,6 +37,8 @@
 #include <linux/module.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include "pci_hotplug.h"
 #include "../pci.h"
 
diff --git a/drivers/pci/hotplug/pciehprm_nonacpi.c b/drivers/pci/hotplug/pciehprm_nonacpi.c
index 3622965f8961..33b2c69a0829 100644
--- a/drivers/pci/hotplug/pciehprm_nonacpi.c
+++ b/drivers/pci/hotplug/pciehprm_nonacpi.c
@@ -33,10 +33,13 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/slab.h>
+
 #include <asm/uaccess.h>
 #ifdef CONFIG_IA64
 #include <asm/iosapic.h>
 #endif
+
 #include "pciehp.h"
 #include "pciehprm.h"
 #include "pciehprm_nonacpi.h"
diff --git a/drivers/pci/hotplug/rpadlpar_core.c b/drivers/pci/hotplug/rpadlpar_core.c
index ad1017da8656..fcb66b9a0e28 100644
--- a/drivers/pci/hotplug/rpadlpar_core.c
+++ b/drivers/pci/hotplug/rpadlpar_core.c
@@ -16,10 +16,13 @@
  */
 #include <linux/init.h>
 #include <linux/pci.h>
+#include <linux/string.h>
+
 #include <asm/pci-bridge.h>
 #include <asm/semaphore.h>
 #include <asm/rtas.h>
 #include <asm/vio.h>
+
 #include "../pci.h"
 #include "rpaphp.h"
 #include "rpadlpar.h"
diff --git a/drivers/pci/hotplug/rpaphp_pci.c b/drivers/pci/hotplug/rpaphp_pci.c
index 46c157d26a2f..f7c12d7dfcfc 100644
--- a/drivers/pci/hotplug/rpaphp_pci.c
+++ b/drivers/pci/hotplug/rpaphp_pci.c
@@ -23,11 +23,13 @@
  *
  */
 #include <linux/pci.h>
+#include <linux/string.h>
+
 #include <asm/pci-bridge.h>
 #include <asm/rtas.h>
 #include <asm/machdep.h>
-#include "../pci.h"		/* for pci_add_new_bus */
 
+#include "../pci.h"		/* for pci_add_new_bus */
 #include "rpaphp.h"
 
 static struct pci_bus *find_bus_among_children(struct pci_bus *bus,
diff --git a/drivers/pci/hotplug/rpaphp_slot.c b/drivers/pci/hotplug/rpaphp_slot.c
index 0e8815495083..daa89ae57123 100644
--- a/drivers/pci/hotplug/rpaphp_slot.c
+++ b/drivers/pci/hotplug/rpaphp_slot.c
@@ -27,6 +27,9 @@
 #include <linux/kobject.h>
 #include <linux/sysfs.h>
 #include <linux/pci.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
 #include <asm/rtas.h>
 #include "rpaphp.h"
 
diff --git a/drivers/pci/hotplug/shpchp.h b/drivers/pci/hotplug/shpchp.h
index abe2cf411e68..08ad26a0cae7 100644
--- a/drivers/pci/hotplug/shpchp.h
+++ b/drivers/pci/hotplug/shpchp.h
@@ -32,6 +32,8 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 #include <linux/delay.h>
+#include <linux/sched.h>	/* signal_pending(), struct timer_list */
+
 #include "pci_hotplug.h"
 
 #if !defined(MODULE)
diff --git a/drivers/pci/hotplug/shpchprm_nonacpi.c b/drivers/pci/hotplug/shpchprm_nonacpi.c
index d70fe5408417..c6b40998eeb3 100644
--- a/drivers/pci/hotplug/shpchprm_nonacpi.c
+++ b/drivers/pci/hotplug/shpchprm_nonacpi.c
@@ -32,6 +32,8 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/pci.h>
+#include <linux/slab.h>
+
 #include "shpchp.h"
 
 int shpchprm_get_physical_slot_number(struct controller *ctrl, u32 *sun, u8 busnum, u8 devnum)
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 8972e6a3aac0..ae986e590b48 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -8,6 +8,8 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/mempolicy.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include "pci.h"
 
 /*
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 61b855c99e39..e74d75843047 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -15,6 +15,7 @@
 #include <linux/pci.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
+#include <linux/string.h>
 #include <asm/dma.h>	/* isa_dma_bridge_buggy */
 #include "pci.h"
 
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 14f05d22bb70..467a4ceccf10 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -11,6 +11,8 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/pm.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 #include <linux/pcieport_if.h>
 
 #include "portdrv.h"
diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index 3c565ce7f77b..02260141dc81 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -12,6 +12,7 @@
 #include <linux/errno.h>
 #include <linux/pm.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/pcieport_if.h>
 
 #include "portdrv.h"
diff --git a/drivers/pci/rom.c b/drivers/pci/rom.c
index 49bd21702314..598a115cd00e 100644
--- a/drivers/pci/rom.c
+++ b/drivers/pci/rom.c
@@ -9,6 +9,7 @@
 #include <linux/config.h>
 #include <linux/kernel.h>
 #include <linux/pci.h>
+#include <linux/slab.h>
 
 #include "pci.h"
 
diff --git a/drivers/pnp/manager.c b/drivers/pnp/manager.c
index 94442ffd4aed..cbb2749db178 100644
--- a/drivers/pnp/manager.c
+++ b/drivers/pnp/manager.c
@@ -12,6 +12,8 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/pnp.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
 #include "base.h"
 
 DECLARE_MUTEX(pnp_res_mutex);
diff --git a/drivers/pnp/pnpbios/rsparser.c b/drivers/pnp/pnpbios/rsparser.c
index b0ca65b68645..5e38cd7335f7 100644
--- a/drivers/pnp/pnpbios/rsparser.c
+++ b/drivers/pnp/pnpbios/rsparser.c
@@ -7,6 +7,8 @@
 #include <linux/ctype.h>
 #include <linux/pnp.h>
 #include <linux/pnpbios.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 #ifdef CONFIG_PCI
 #include <linux/pci.h>
diff --git a/drivers/s390/cio/cmf.c b/drivers/s390/cio/cmf.c
index 8cc4f1a940dc..c05b069c2996 100644
--- a/drivers/s390/cio/cmf.c
+++ b/drivers/s390/cio/cmf.c
@@ -30,10 +30,13 @@
 #include <linux/list.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/timex.h>	/* get_clock() */
 
 #include <asm/ccwdev.h>
 #include <asm/cio.h>
 #include <asm/cmb.h>
+#include <asm/div64.h>
 
 #include "cio.h"
 #include "css.h"
diff --git a/drivers/s390/cio/device.c b/drivers/s390/cio/device.c
index 1c2659766c09..811c9d150637 100644
--- a/drivers/s390/cio/device.c
+++ b/drivers/s390/cio/device.c
@@ -22,6 +22,7 @@
 
 #include <asm/ccwdev.h>
 #include <asm/cio.h>
+#include <asm/param.h>		/* HZ */
 
 #include "cio.h"
 #include "css.h"
diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c
index fbe4202a3f6f..c1c89f4fd4e3 100644
--- a/drivers/s390/cio/device_fsm.c
+++ b/drivers/s390/cio/device_fsm.c
@@ -11,6 +11,8 @@
 #include <linux/module.h>
 #include <linux/config.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/string.h>
 
 #include <asm/ccwdev.h>
 #include <asm/cio.h>
diff --git a/drivers/scsi/scsi_transport_fc.c b/drivers/scsi/scsi_transport_fc.c
index 771e97ef136e..b856e140e65f 100644
--- a/drivers/scsi/scsi_transport_fc.c
+++ b/drivers/scsi/scsi_transport_fc.c
@@ -26,6 +26,7 @@
  */
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/sched.h>	/* workqueue stuff, HZ */
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_transport.h>
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 8bb8222ea589..d2caa35059d9 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -19,6 +19,9 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  */
 #include <linux/module.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
 #include <scsi/scsi.h>
 #include <scsi/scsi_host.h>
 #include <scsi/scsi_device.h>
diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.c b/drivers/scsi/sym53c8xx_2/sym_hipd.c
index e753ba27dc59..a1a58e1d5ad3 100644
--- a/drivers/scsi/sym53c8xx_2/sym_hipd.c
+++ b/drivers/scsi/sym53c8xx_2/sym_hipd.c
@@ -37,6 +37,9 @@
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
+
+#include <linux/slab.h>
+
 #include "sym_glue.h"
 #include "sym_nvram.h"
 
diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.h b/drivers/scsi/sym53c8xx_2/sym_hipd.h
index 3131a6bf7ab7..3a264a408216 100644
--- a/drivers/scsi/sym53c8xx_2/sym_hipd.h
+++ b/drivers/scsi/sym53c8xx_2/sym_hipd.h
@@ -37,6 +37,8 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 
+#include <linux/gfp.h>
+
 #ifndef SYM_HIPD_H
 #define SYM_HIPD_H
 
diff --git a/drivers/sh/superhyway/superhyway.c b/drivers/sh/superhyway/superhyway.c
index f056276b08a1..28757cb9d246 100644
--- a/drivers/sh/superhyway/superhyway.c
+++ b/drivers/sh/superhyway/superhyway.c
@@ -16,6 +16,8 @@
 #include <linux/types.h>
 #include <linux/list.h>
 #include <linux/superhyway.h>
+#include <linux/string.h>
+#include <linux/slab.h>
 
 static int superhyway_devices;
 
diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c
index 45efeed1fcc3..49815ec4b842 100644
--- a/drivers/usb/host/ohci-omap.c
+++ b/drivers/usb/host/ohci-omap.c
@@ -14,6 +14,9 @@
  * This file is licenced under the GPL.
  */
 
+#include <linux/signal.h>	/* SA_INTERRUPT */
+#include <linux/jiffies.h>
+
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/mach-types.h>
diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c
index bf1d5ab4aa3a..7ce1d9ef0289 100644
--- a/drivers/usb/host/ohci-pci.c
+++ b/drivers/usb/host/ohci-pci.c
@@ -14,6 +14,8 @@
  * This file is licenced under the GPL.
  */
  
+#include <linux/jiffies.h>
+
 #ifdef CONFIG_PPC_PMAC
 #include <asm/machdep.h>
 #include <asm/pmac_feature.h>
diff --git a/drivers/usb/host/ohci-pxa27x.c b/drivers/usb/host/ohci-pxa27x.c
index d287dcccd415..f4a4aeda40b7 100644
--- a/drivers/usb/host/ohci-pxa27x.c
+++ b/drivers/usb/host/ohci-pxa27x.c
@@ -20,6 +20,7 @@
  */
 
 #include <linux/device.h>
+#include <linux/signal.h>
 #include <asm/mach-types.h>
 #include <asm/hardware.h>
 #include <asm/arch/pxa-regs.h>
diff --git a/drivers/w1/w1_family.c b/drivers/w1/w1_family.c
index 88c517a4c178..9e293e139a0e 100644
--- a/drivers/w1/w1_family.c
+++ b/drivers/w1/w1_family.c
@@ -21,6 +21,7 @@
 
 #include <linux/spinlock.h>
 #include <linux/list.h>
+#include <linux/sched.h>	/* schedule_timeout() */
 #include <linux/delay.h>
 
 #include "w1_family.h"
diff --git a/drivers/zorro/zorro-sysfs.c b/drivers/zorro/zorro-sysfs.c
index 04ca8840acf1..87c29d7b6c17 100644
--- a/drivers/zorro/zorro-sysfs.c
+++ b/drivers/zorro/zorro-sysfs.c
@@ -14,6 +14,7 @@
 #include <linux/kernel.h>
 #include <linux/zorro.h>
 #include <linux/stat.h>
+#include <linux/string.h>
 
 #include "zorro.h"
 
diff --git a/drivers/zorro/zorro.c b/drivers/zorro/zorro.c
index d3c05dfe20d2..0f2b40605b06 100644
--- a/drivers/zorro/zorro.c
+++ b/drivers/zorro/zorro.c
@@ -16,6 +16,8 @@
 #include <linux/init.h>
 #include <linux/zorro.h>
 #include <linux/bitops.h>
+#include <linux/string.h>
+
 #include <asm/setup.h>
 #include <asm/amigahw.h>
 
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 44082bfdfec9..9f1072836c8e 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -12,6 +12,7 @@
 #include <linux/kmod.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/sched.h>	/* for 'current' */
 #include <asm/uaccess.h>
 
 /*
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 0f224384f176..8210ac16a368 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -15,6 +15,7 @@
 #include <linux/jffs2.h>
 #include <linux/mtd/mtd.h>
 #include <linux/completion.h>
+#include <linux/sched.h>
 #include "nodelist.h"
 
 
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 996d922e503e..316133c626b7 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -18,6 +18,8 @@
 #include <linux/mtd/mtd.h>
 #include <linux/crc32.h>
 #include <linux/mtd/nand.h>
+#include <linux/jiffies.h>
+
 #include "nodelist.h"
 
 /* For testing write failures */
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index ff7f80f48df1..d068176b7ad7 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -23,6 +23,7 @@
 #include <linux/completion.h>
 #include <linux/workqueue.h>
 #include <linux/cpumask.h>
+#include <asm/div64.h>
 
 #define CPUFREQ_NAME_LEN 16
 
diff --git a/include/linux/gameport.h b/include/linux/gameport.h
index cd623eccdbea..2401dea2b867 100644
--- a/include/linux/gameport.h
+++ b/include/linux/gameport.h
@@ -12,6 +12,7 @@
 #include <asm/io.h>
 #include <linux/list.h>
 #include <linux/device.h>
+#include <linux/timer.h>
 
 struct gameport {
 
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index f88577ca3b3a..5e19a7ba69b2 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -31,6 +31,7 @@
 #include <linux/i2c-id.h>
 #include <linux/mod_devicetable.h>
 #include <linux/device.h>	/* for struct device */
+#include <linux/sched.h>	/* for completion */
 #include <asm/semaphore.h>
 
 /* --- For i2c-isa ---------------------------------------------------- */
diff --git a/include/linux/i2o.h b/include/linux/i2o.h
index 92300325dbcd..d79c8a4bc4f8 100644
--- a/include/linux/i2o.h
+++ b/include/linux/i2o.h
@@ -25,10 +25,14 @@
 /* How many different OSM's are we allowing */
 #define I2O_MAX_DRIVERS		8
 
-#include <asm/io.h>
-#include <asm/semaphore.h>	/* Needed for MUTEX init macros */
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>	/* work_struct */
+
+#include <asm/io.h>
+#include <asm/semaphore.h>	/* Needed for MUTEX init macros */
 
 /* message queue empty */
 #define I2O_QUEUE_EMPTY		0xffffffff
diff --git a/include/linux/kobj_map.h b/include/linux/kobj_map.h
index b6cc10bf8dfc..cbe7d8008042 100644
--- a/include/linux/kobj_map.h
+++ b/include/linux/kobj_map.h
@@ -1,5 +1,7 @@
 #ifdef __KERNEL__
 
+#include <asm/semaphore.h>
+
 typedef struct kobject *kobj_probe_t(dev_t, int *, void *);
 struct kobj_map;
 
diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h
index 142963f01d29..fc28841f3409 100644
--- a/include/linux/mtd/map.h
+++ b/include/linux/mtd/map.h
@@ -8,7 +8,10 @@
 #include <linux/config.h>
 #include <linux/types.h>
 #include <linux/list.h>
+#include <linux/string.h>
+
 #include <linux/mtd/compatmac.h>
+
 #include <asm/unaligned.h>
 #include <asm/system.h>
 #include <asm/io.h>
diff --git a/include/linux/serial.h b/include/linux/serial.h
index 12cd9cf65e8f..33fc8cb8ddfb 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -11,6 +11,7 @@
 #define _LINUX_SERIAL_H
 
 #ifdef __KERNEL__
+#include <linux/types.h>
 #include <asm/page.h>
 
 /*
diff --git a/include/linux/textsearch.h b/include/linux/textsearch.h
index fc5bb4e91a58..7dac8f04d28e 100644
--- a/include/linux/textsearch.h
+++ b/include/linux/textsearch.h
@@ -8,6 +8,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 struct ts_config;
 
diff --git a/include/pcmcia/ss.h b/include/pcmcia/ss.h
index c8592c7e8eaa..e788bbc5657d 100644
--- a/include/pcmcia/ss.h
+++ b/include/pcmcia/ss.h
@@ -17,6 +17,7 @@
 
 #include <linux/config.h>
 #include <linux/device.h>
+#include <linux/sched.h>	/* task_struct, completion */
 
 #include <pcmcia/cs_types.h>
 #include <pcmcia/cs.h>
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index e6b61fab66dd..7529f4388bb4 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -4,6 +4,7 @@
 #include <linux/dma-mapping.h>
 #include <linux/list.h>
 #include <linux/types.h>
+#include <linux/timer.h>
 
 struct request;
 struct scatterlist;
diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h
index b0d445437372..c04405bead2d 100644
--- a/include/scsi/scsi_transport_fc.h
+++ b/include/scsi/scsi_transport_fc.h
@@ -28,6 +28,7 @@
 #define SCSI_TRANSPORT_FC_H
 
 #include <linux/config.h>
+#include <linux/sched.h>
 
 struct scsi_transport_template;
 
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 13bcec151b57..39277dd6bf90 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -18,6 +18,7 @@
 #include <linux/fs.h>
 #include <linux/err.h>
 #include <linux/proc_fs.h>
+#include <linux/sched.h>	/* for cond_resched */
 #include <linux/mm.h>
 
 #include <asm/sections.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f3ea492ab44d..ce4915dd683a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -35,6 +35,7 @@
 #include <linux/spinlock.h>
 #include <linux/hash.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
 #include <asm-generic/sections.h>
diff --git a/kernel/params.c b/kernel/params.c
index 1a8614bac5d5..47ba69547945 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/device.h>
 #include <linux/err.h>
+#include <linux/slab.h>
 
 #if 0
 #define DEBUGP printk
diff --git a/lib/kobject.c b/lib/kobject.c
index 253d3004ace9..a181abed89f6 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -14,6 +14,7 @@
 #include <linux/string.h>
 #include <linux/module.h>
 #include <linux/stat.h>
+#include <linux/slab.h>
 
 /**
  *	populate_dir - populate directory with attributes.
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 42c08ef828c5..eddc9b3d3876 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -5,6 +5,7 @@
  */
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/sched.h>
 
 unsigned int debug_smp_processor_id(void)
 {
diff --git a/lib/sort.c b/lib/sort.c
index ddc4d35df289..5f3b51ffa1dc 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -7,6 +7,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/sort.h>
+#include <linux/slab.h>
 
 static void u32_swap(void *a, void *b, int size)
 {
diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index e4e9031dd9c3..b07db5ca3f66 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -23,6 +23,7 @@
 #include <linux/ctype.h>
 #include <linux/kernel.h>
 
+#include <asm/page.h>		/* for PAGE_SIZE */
 #include <asm/div64.h>
 
 /**
diff --git a/sound/oss/ac97_codec.c b/sound/oss/ac97_codec.c
index 3ecef4689f1b..fd25aca25120 100644
--- a/sound/oss/ac97_codec.c
+++ b/sound/oss/ac97_codec.c
@@ -55,6 +55,7 @@
 #include <linux/pci.h>
 #include <linux/ac97_codec.h>
 #include <asm/uaccess.h>
+#include <asm/semaphore.h>
 
 #define CODEC_ID_BUFSZ 14
 
-- 
cgit v1.2.3


From 889dfafe836ac9bb711f73d07a4c044cae177c0b Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Fri, 4 Nov 2005 18:54:30 +0300
Subject: [PATCH] improve scheduler fairness a bit

Do not transfer remaining time slice to another cpu on process exit.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 340dd238c16d..b4f4eb613537 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1468,7 +1468,7 @@ void fastcall sched_exit(task_t *p)
 	 * the sleep_avg of the parent as well.
 	 */
 	rq = task_rq_lock(p->parent, &flags);
-	if (p->first_time_slice) {
+	if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) {
 		p->parent->time_slice += p->time_slice;
 		if (unlikely(p->parent->time_slice > task_timeslice(p)))
 			p->parent->time_slice = task_timeslice(p);
-- 
cgit v1.2.3


From 7fd93cf30c531fd8b014e827e7a85fcfc010b2c6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 7 Nov 2005 00:57:59 -0800
Subject: [PATCH] posix-timers `unlikely' rejig

!unlikely(expr) hurts my brain.   likely(!expr) is more straightforward.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 91a894264941..84af54c39e1b 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -497,7 +497,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
 				   nthreads);
 		do {
-			if (!unlikely(t->flags & PF_EXITING)) {
+			if (likely(!(t->flags & PF_EXITING))) {
 				ticks = cputime_add(prof_ticks(t), left);
 				if (cputime_eq(t->it_prof_expires,
 					       cputime_zero) ||
@@ -512,7 +512,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		left = cputime_div(cputime_sub(expires.cpu, val.cpu),
 				   nthreads);
 		do {
-			if (!unlikely(t->flags & PF_EXITING)) {
+			if (likely(!(t->flags & PF_EXITING))) {
 				ticks = cputime_add(virt_ticks(t), left);
 				if (cputime_eq(t->it_virt_expires,
 					       cputime_zero) ||
@@ -527,7 +527,7 @@ static void process_timer_rebalance(struct task_struct *p,
 		nsleft = expires.sched - val.sched;
 		do_div(nsleft, nthreads);
 		do {
-			if (!unlikely(t->flags & PF_EXITING)) {
+			if (likely(!(t->flags & PF_EXITING))) {
 				ns = t->sched_time + nsleft;
 				if (t->it_sched_expires == 0 ||
 				    t->it_sched_expires > ns) {
-- 
cgit v1.2.3


From a4c4af7c8dc1eccdfb8c57e1684f08179b4407e6 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 7 Nov 2005 00:58:38 -0800
Subject: [PATCH] cpu hoptlug: avoid usage of smp_processor_id() in preemptible
 code

Replace smp_processor_id() with any_online_cpu(cpu_online_map) in order to
avoid lots of "BUG: using smp_processor_id() in preemptible [00000001]
code:..." messages in case taking a cpu online fails.

All the traces start at the last notifier_call_chain(...) in kernel/cpu.c.
Since we hold the cpu_control semaphore it shouldn't be any problem to access
cpu_online_map.

The reason why cpu_up failed is simply that the cpu that was supposed to be
taken online wasn't even there.  That is because on s390 we never know when a
new cpu comes and therefore cpu_possible_map consists of only ones and doesn't
reflect reality.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c      | 3 ++-
 kernel/softirq.c    | 3 ++-
 kernel/softlockup.c | 3 ++-
 kernel/workqueue.c  | 2 +-
 4 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b4f4eb613537..013f1448006b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4680,7 +4680,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
 		/* Unbind it from offline cpu so it can run.  Fall thru. */
-		kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
+		kthread_bind(cpu_rq(cpu)->migration_thread,
+			     any_online_cpu(cpu_online_map));
 		kthread_stop(cpu_rq(cpu)->migration_thread);
 		cpu_rq(cpu)->migration_thread = NULL;
 		break;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f766b2fc48be..ad3295cdded5 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -470,7 +470,8 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
 		/* Unbind so it can run.  Fall thru. */
-		kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id());
+		kthread_bind(per_cpu(ksoftirqd, hotcpu),
+			     any_online_cpu(cpu_online_map));
 	case CPU_DEAD:
 		p = per_cpu(ksoftirqd, hotcpu);
 		per_cpu(ksoftirqd, hotcpu) = NULL;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 75976209cea7..a2dcceb9437d 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -123,7 +123,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
 		/* Unbind so it can run.  Fall thru. */
-		kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id());
+		kthread_bind(per_cpu(watchdog_task, hotcpu),
+			     any_online_cpu(cpu_online_map));
 	case CPU_DEAD:
 		p = per_cpu(watchdog_task, hotcpu);
 		per_cpu(watchdog_task, hotcpu) = NULL;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 7cee222231bc..42df83d7fad2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -524,7 +524,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		list_for_each_entry(wq, &workqueues, list) {
 			/* Unbind so it can run. */
 			kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
-				     smp_processor_id());
+				     any_online_cpu(cpu_online_map));
 			cleanup_workqueue_thread(wq, hotcpu);
 		}
 		break;
-- 
cgit v1.2.3


From dc19d507b17135069d9c5d6093d4458dc60e1861 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Mon, 7 Nov 2005 00:58:40 -0800
Subject: [PATCH] swsusp cleanups

This cleans spaces between * and pointer up, and adds "int" in "unsigned
int".

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/snapshot.c | 19 +++++++++----------
 kernel/power/swsusp.c   | 36 ++++++++++++++++++------------------
 2 files changed, 27 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 42a628704398..723f5179883e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -168,9 +168,8 @@ static unsigned count_data_pages(void)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
-	unsigned n;
+	unsigned int n = 0;
 
-	n = 0;
 	for_each_zone (zone) {
 		if (is_highmem(zone))
 			continue;
@@ -250,10 +249,10 @@ static inline void fill_pb_page(struct pbe *pbpage)
  *	of memory pages allocated with alloc_pagedir()
  */
 
-void create_pbe_list(struct pbe *pblist, unsigned nr_pages)
+void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 {
 	struct pbe *pbpage, *p;
-	unsigned num = PBES_PER_PAGE;
+	unsigned int num = PBES_PER_PAGE;
 
 	for_each_pb_page (pbpage, pblist) {
 		if (num >= nr_pages)
@@ -293,9 +292,9 @@ static void *alloc_image_page(void)
  *	On each page we set up a list of struct_pbe elements.
  */
 
-struct pbe *alloc_pagedir(unsigned nr_pages)
+struct pbe *alloc_pagedir(unsigned int nr_pages)
 {
-	unsigned num;
+	unsigned int num;
 	struct pbe *pblist, *pbe;
 
 	if (!nr_pages)
@@ -329,7 +328,7 @@ void swsusp_free(void)
 	for_each_zone(zone) {
 		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
 			if (pfn_valid(zone_pfn + zone->zone_start_pfn)) {
-				struct page * page;
+				struct page *page;
 				page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
 				if (PageNosave(page) && PageNosaveFree(page)) {
 					ClearPageNosave(page);
@@ -348,7 +347,7 @@ void swsusp_free(void)
  *	free pages.
  */
 
-static int enough_free_mem(unsigned nr_pages)
+static int enough_free_mem(unsigned int nr_pages)
 {
 	pr_debug("swsusp: available memory: %u pages\n", nr_free_pages());
 	return nr_free_pages() > (nr_pages + PAGES_FOR_IO +
@@ -356,7 +355,7 @@ static int enough_free_mem(unsigned nr_pages)
 }
 
 
-static struct pbe *swsusp_alloc(unsigned nr_pages)
+static struct pbe *swsusp_alloc(unsigned int nr_pages)
 {
 	struct pbe *pblist, *p;
 
@@ -380,7 +379,7 @@ static struct pbe *swsusp_alloc(unsigned nr_pages)
 
 asmlinkage int swsusp_save(void)
 {
-	unsigned nr_pages;
+	unsigned int nr_pages;
 
 	pr_debug("swsusp: critical section: \n");
 	if (save_highmem()) {
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 12db1d2ad61f..a1300717dc3f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -122,8 +122,8 @@ static struct swsusp_info swsusp_info;
 static unsigned short swapfile_used[MAX_SWAPFILES];
 static unsigned short root_swap;
 
-static int write_page(unsigned long addr, swp_entry_t * loc);
-static int bio_read_page(pgoff_t page_off, void * page);
+static int write_page(unsigned long addr, swp_entry_t *loc);
+static int bio_read_page(pgoff_t page_off, void *page);
 
 static u8 key_iv[MAXKEY+MAXIV];
 
@@ -355,7 +355,7 @@ static void lock_swapdevices(void)
  *	This is a partial improvement, since we will at least return other
  *	errors, though we need to eventually fix the damn code.
  */
-static int write_page(unsigned long addr, swp_entry_t * loc)
+static int write_page(unsigned long addr, swp_entry_t *loc)
 {
 	swp_entry_t entry;
 	int error = 0;
@@ -383,7 +383,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
 static void data_free(void)
 {
 	swp_entry_t entry;
-	struct pbe * p;
+	struct pbe *p;
 
 	for_each_pbe(p, pagedir_nosave) {
 		entry = p->swap_address;
@@ -492,8 +492,8 @@ static void free_pagedir_entries(void)
 static int write_pagedir(void)
 {
 	int error = 0;
-	unsigned n = 0;
-	struct pbe * pbe;
+	unsigned int n = 0;
+	struct pbe *pbe;
 
 	printk( "Writing pagedir...");
 	for_each_pb_page (pbe, pagedir_nosave) {
@@ -543,7 +543,7 @@ static int write_suspend_image(void)
  *	We should only consider resume_device.
  */
 
-int enough_swap(unsigned nr_pages)
+int enough_swap(unsigned int nr_pages)
 {
 	struct sysinfo i;
 
@@ -694,7 +694,7 @@ static int check_pagedir(struct pbe *pblist)
  *	restore from the loaded pages later.  We relocate them here.
  */
 
-static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
+static struct pbe *swsusp_pagedir_relocate(struct pbe *pblist)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
@@ -770,7 +770,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
 
 static atomic_t io_done = ATOMIC_INIT(0);
 
-static int end_io(struct bio * bio, unsigned int num, int err)
+static int end_io(struct bio *bio, unsigned int num, int err)
 {
 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		panic("I/O error reading memory image");
@@ -778,7 +778,7 @@ static int end_io(struct bio * bio, unsigned int num, int err)
 	return 0;
 }
 
-static struct block_device * resume_bdev;
+static struct block_device *resume_bdev;
 
 /**
  *	submit - submit BIO request.
@@ -791,10 +791,10 @@ static struct block_device * resume_bdev;
  *	Then submit it and wait.
  */
 
-static int submit(int rw, pgoff_t page_off, void * page)
+static int submit(int rw, pgoff_t page_off, void *page)
 {
 	int error = 0;
-	struct bio * bio;
+	struct bio *bio;
 
 	bio = bio_alloc(GFP_ATOMIC, 1);
 	if (!bio)
@@ -823,12 +823,12 @@ static int submit(int rw, pgoff_t page_off, void * page)
 	return error;
 }
 
-static int bio_read_page(pgoff_t page_off, void * page)
+static int bio_read_page(pgoff_t page_off, void *page)
 {
 	return submit(READ, page_off, page);
 }
 
-static int bio_write_page(pgoff_t page_off, void * page)
+static int bio_write_page(pgoff_t page_off, void *page)
 {
 	return submit(WRITE, page_off, page);
 }
@@ -838,7 +838,7 @@ static int bio_write_page(pgoff_t page_off, void * page)
  * I really don't think that it's foolproof but more than nothing..
  */
 
-static const char * sanity_check(void)
+static const char *sanity_check(void)
 {
 	dump_info();
 	if (swsusp_info.version_code != LINUX_VERSION_CODE)
@@ -864,7 +864,7 @@ static const char * sanity_check(void)
 
 static int check_header(void)
 {
-	const char * reason = NULL;
+	const char *reason = NULL;
 	int error;
 
 	if ((error = bio_read_page(swp_offset(swsusp_header.swsusp_info), &swsusp_info)))
@@ -912,7 +912,7 @@ static int check_sig(void)
 
 static int data_read(struct pbe *pblist)
 {
-	struct pbe * p;
+	struct pbe *p;
 	int error = 0;
 	int i = 0;
 	int mod = swsusp_info.image_pages / 100;
@@ -950,7 +950,7 @@ static int data_read(struct pbe *pblist)
 static int read_pagedir(struct pbe *pblist)
 {
 	struct pbe *pbpage, *p;
-	unsigned i = 0;
+	unsigned int i = 0;
 	int error;
 
 	if (!pblist)
-- 
cgit v1.2.3


From 47b90ffe5c10ab9b5cfd14087b28b13109673ee5 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Mon, 7 Nov 2005 00:58:41 -0800
Subject: [PATCH] swsusp: remove unused variable

Remove unused variable, and make code less evil that way.  Fix whitespace
around for-loop-like macro.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/swsusp.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a1300717dc3f..e1ab28b9b217 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -85,18 +85,11 @@ unsigned int nr_copy_pages __nosavedata = 0;
 /* Suspend pagedir is allocated before final copy, therefore it
    must be freed after resume
 
-   Warning: this is evil. There are actually two pagedirs at time of
-   resume. One is "pagedir_save", which is empty frame allocated at
-   time of suspend, that must be freed. Second is "pagedir_nosave",
-   allocated at time of resume, that travels through memory not to
-   collide with anything.
-
    Warning: this is even more evil than it seems. Pagedirs this file
    talks about are completely different from page directories used by
    MMU hardware.
  */
 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
-suspend_pagedir_t *pagedir_save;
 
 #define SWSUSP_SIG	"S1SUSPEND"
 
@@ -385,7 +378,7 @@ static void data_free(void)
 	swp_entry_t entry;
 	struct pbe *p;
 
-	for_each_pbe(p, pagedir_nosave) {
+	for_each_pbe (p, pagedir_nosave) {
 		entry = p->swap_address;
 		if (entry.val)
 			swap_free(entry);
-- 
cgit v1.2.3


From 9f46080c41d5f3f7c00b4e169ba4b0b2865258bf Mon Sep 17 00:00:00 2001
From: Matt Helsley <matthltc@us.ibm.com>
Date: Mon, 7 Nov 2005 00:59:16 -0800
Subject: [PATCH] Process Events Connector

This patch adds a connector that reports fork, exec, id change, and exit
events for all processes to userspace.  It replaces the fork_advisor patch
that ELSA is currently using.  Applications that may find these events
useful include accounting/auditing (e.g.  ELSA), system activity monitoring
(e.g.  top), security, and resource management (e.g.  CKRM).

Signed-off-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/connector/Kconfig   |   8 ++
 drivers/connector/Makefile  |   1 +
 drivers/connector/cn_proc.c | 222 ++++++++++++++++++++++++++++++++++++++++++++
 fs/exec.c                   |   2 +
 include/linux/cn_proc.h     | 127 +++++++++++++++++++++++++
 include/linux/connector.h   |   6 ++
 kernel/exit.c               |   2 +
 kernel/fork.c               |   2 +
 kernel/sys.c                |   9 ++
 9 files changed, 379 insertions(+)
 create mode 100644 drivers/connector/cn_proc.c
 create mode 100644 include/linux/cn_proc.h

(limited to 'kernel')

diff --git a/drivers/connector/Kconfig b/drivers/connector/Kconfig
index 0bc2059c1e08..e0bdc0db9640 100644
--- a/drivers/connector/Kconfig
+++ b/drivers/connector/Kconfig
@@ -10,4 +10,12 @@ config CONNECTOR
 	  Connector support can also be built as a module.  If so, the module
 	  will be called cn.ko.
 
+config PROC_EVENTS
+	boolean "Report process events to userspace"
+	depends on CONNECTOR=y
+	default y
+	---help---
+	  Provide a connector that reports process events to userspace. Send
+	  events such as fork, exec, id change (uid, gid, suid, etc), and exit.
+
 endmenu
diff --git a/drivers/connector/Makefile b/drivers/connector/Makefile
index 12ca79e8234d..1f255e46e916 100644
--- a/drivers/connector/Makefile
+++ b/drivers/connector/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_CONNECTOR)		+= cn.o
+obj-$(CONFIG_PROC_EVENTS)	+= cn_proc.o
 
 cn-y				+= cn_queue.o connector.o
diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
new file mode 100644
index 000000000000..fcdf0fff13a6
--- /dev/null
+++ b/drivers/connector/cn_proc.c
@@ -0,0 +1,222 @@
+/*
+ * cn_proc.c - process events connector
+ *
+ * Copyright (C) Matt Helsley, IBM Corp. 2005
+ * Based on cn_fork.c by Guillaume Thouvenin <guillaume.thouvenin@bull.net>
+ * Original copyright notice follows:
+ * Copyright (C) 2005 BULL SA.
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <asm/atomic.h>
+
+#include <linux/cn_proc.h>
+
+#define CN_PROC_MSG_SIZE (sizeof(struct cn_msg) + sizeof(struct proc_event))
+
+static atomic_t proc_event_num_listeners = ATOMIC_INIT(0);
+static struct cb_id cn_proc_event_id = { CN_IDX_PROC, CN_VAL_PROC };
+
+/* proc_counts is used as the sequence number of the netlink message */
+static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
+
+static inline void get_seq(__u32 *ts, int *cpu)
+{
+	*ts = get_cpu_var(proc_event_counts)++;
+	*cpu = smp_processor_id();
+	put_cpu_var(proc_counts);
+}
+
+void proc_fork_connector(struct task_struct *task)
+{
+	struct cn_msg *msg;
+	struct proc_event *ev;
+	__u8 buffer[CN_PROC_MSG_SIZE];
+
+	if (atomic_read(&proc_event_num_listeners) < 1)
+		return;
+
+	msg = (struct cn_msg*)buffer;
+	ev = (struct proc_event*)msg->data;
+	get_seq(&msg->seq, &ev->cpu);
+	ev->what = PROC_EVENT_FORK;
+	ev->event_data.fork.parent_pid = task->real_parent->pid;
+	ev->event_data.fork.parent_tgid = task->real_parent->tgid;
+	ev->event_data.fork.child_pid = task->pid;
+	ev->event_data.fork.child_tgid = task->tgid;
+
+	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+	msg->ack = 0; /* not used */
+	msg->len = sizeof(*ev);
+	/*  If cn_netlink_send() failed, the data is not sent */
+	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+}
+
+void proc_exec_connector(struct task_struct *task)
+{
+	struct cn_msg *msg;
+	struct proc_event *ev;
+	__u8 buffer[CN_PROC_MSG_SIZE];
+
+	if (atomic_read(&proc_event_num_listeners) < 1)
+		return;
+
+	msg = (struct cn_msg*)buffer;
+	ev = (struct proc_event*)msg->data;
+	get_seq(&msg->seq, &ev->cpu);
+	ev->what = PROC_EVENT_EXEC;
+	ev->event_data.exec.process_pid = task->pid;
+	ev->event_data.exec.process_tgid = task->tgid;
+
+	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+	msg->ack = 0; /* not used */
+	msg->len = sizeof(*ev);
+	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+}
+
+void proc_id_connector(struct task_struct *task, int which_id)
+{
+	struct cn_msg *msg;
+	struct proc_event *ev;
+	__u8 buffer[CN_PROC_MSG_SIZE];
+
+	if (atomic_read(&proc_event_num_listeners) < 1)
+		return;
+
+	msg = (struct cn_msg*)buffer;
+	ev = (struct proc_event*)msg->data;
+	ev->what = which_id;
+	ev->event_data.id.process_pid = task->pid;
+	ev->event_data.id.process_tgid = task->tgid;
+	if (which_id == PROC_EVENT_UID) {
+	 	ev->event_data.id.r.ruid = task->uid;
+	 	ev->event_data.id.e.euid = task->euid;
+	} else if (which_id == PROC_EVENT_GID) {
+	   	ev->event_data.id.r.rgid = task->gid;
+	   	ev->event_data.id.e.egid = task->egid;
+	} else
+	     	return;
+	get_seq(&msg->seq, &ev->cpu);
+
+	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+	msg->ack = 0; /* not used */
+	msg->len = sizeof(*ev);
+	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+}
+
+void proc_exit_connector(struct task_struct *task)
+{
+	struct cn_msg *msg;
+	struct proc_event *ev;
+	__u8 buffer[CN_PROC_MSG_SIZE];
+
+	if (atomic_read(&proc_event_num_listeners) < 1)
+		return;
+
+	msg = (struct cn_msg*)buffer;
+	ev = (struct proc_event*)msg->data;
+	get_seq(&msg->seq, &ev->cpu);
+	ev->what = PROC_EVENT_EXIT;
+	ev->event_data.exit.process_pid = task->pid;
+	ev->event_data.exit.process_tgid = task->tgid;
+	ev->event_data.exit.exit_code = task->exit_code;
+	ev->event_data.exit.exit_signal = task->exit_signal;
+
+	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+	msg->ack = 0; /* not used */
+	msg->len = sizeof(*ev);
+	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+}
+
+/*
+ * Send an acknowledgement message to userspace
+ *
+ * Use 0 for success, EFOO otherwise.
+ * Note: this is the negative of conventional kernel error
+ * values because it's not being returned via syscall return
+ * mechanisms.
+ */
+static void cn_proc_ack(int err, int rcvd_seq, int rcvd_ack)
+{
+	struct cn_msg *msg;
+	struct proc_event *ev;
+	__u8 buffer[CN_PROC_MSG_SIZE];
+
+	if (atomic_read(&proc_event_num_listeners) < 1)
+		return;
+
+	msg = (struct cn_msg*)buffer;
+	ev = (struct proc_event*)msg->data;
+	msg->seq = rcvd_seq;
+	ev->cpu = -1;
+	ev->what = PROC_EVENT_NONE;
+	ev->event_data.ack.err = err;
+	memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+	msg->ack = rcvd_ack + 1;
+	msg->len = sizeof(*ev);
+	cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL);
+}
+
+/**
+ * cn_proc_mcast_ctl
+ * @data: message sent from userspace via the connector
+ */
+static void cn_proc_mcast_ctl(void *data)
+{
+	struct cn_msg *msg = data;
+	enum proc_cn_mcast_op *mc_op = NULL;
+	int err = 0;
+
+	if (msg->len != sizeof(*mc_op))
+		return;
+
+	mc_op = (enum proc_cn_mcast_op*)msg->data;
+	switch (*mc_op) {
+	case PROC_CN_MCAST_LISTEN:
+		atomic_inc(&proc_event_num_listeners);
+		break;
+	case PROC_CN_MCAST_IGNORE:
+		atomic_dec(&proc_event_num_listeners);
+		break;
+	default:
+		err = EINVAL;
+		break;
+	}
+	cn_proc_ack(err, msg->seq, msg->ack);
+}
+
+/*
+ * cn_proc_init - initialization entry point
+ *
+ * Adds the connector callback to the connector driver.
+ */
+static int __init cn_proc_init(void)
+{
+	int err;
+
+	if ((err = cn_add_callback(&cn_proc_event_id, "cn_proc",
+	 			   &cn_proc_mcast_ctl))) {
+		printk(KERN_WARNING "cn_proc failed to register\n");
+		return err;
+	}
+	return 0;
+}
+
+module_init(cn_proc_init);
diff --git a/fs/exec.c b/fs/exec.c
index 10d493fea7ce..ce76b33f25ac 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -48,6 +48,7 @@
 #include <linux/syscalls.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
+#include <linux/cn_proc.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1096,6 +1097,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
 					fput(bprm->file);
 				bprm->file = NULL;
 				current->did_exec = 1;
+				proc_exec_connector(current);
 				return retval;
 			}
 			read_lock(&binfmt_lock);
diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h
new file mode 100644
index 000000000000..70ab56317380
--- /dev/null
+++ b/include/linux/cn_proc.h
@@ -0,0 +1,127 @@
+/*
+ * cn_proc.h - process events connector
+ *
+ * Copyright (C) Matt Helsley, IBM Corp. 2005
+ * Based on cn_fork.h by Nguyen Anh Quynh and Guillaume Thouvenin
+ * Original copyright notice follows:
+ * Copyright (C) 2005 Nguyen Anh Quynh <aquynh@gmail.com>
+ * Copyright (C) 2005 Guillaume Thouvenin <guillaume.thouvenin@bull.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef CN_PROC_H
+#define CN_PROC_H
+
+#include <linux/types.h>
+#include <linux/connector.h>
+
+/*
+ * Userspace sends this enum to register with the kernel that it is listening
+ * for events on the connector.
+ */
+enum proc_cn_mcast_op {
+	PROC_CN_MCAST_LISTEN = 1,
+	PROC_CN_MCAST_IGNORE = 2
+};
+
+/*
+ * From the user's point of view, the process
+ * ID is the thread group ID and thread ID is the internal
+ * kernel "pid". So, fields are assigned as follow:
+ *
+ *  In user space     -  In  kernel space
+ *
+ * parent process ID  =  parent->tgid
+ * parent thread  ID  =  parent->pid
+ * child  process ID  =  child->tgid
+ * child  thread  ID  =  child->pid
+ */
+
+struct proc_event {
+	enum what {
+		/* Use successive bits so the enums can be used to record
+		 * sets of events as well
+		 */
+		PROC_EVENT_NONE = 0x00000000,
+		PROC_EVENT_FORK = 0x00000001,
+		PROC_EVENT_EXEC = 0x00000002,
+		PROC_EVENT_UID  = 0x00000004,
+		PROC_EVENT_GID  = 0x00000040,
+		/* "next" should be 0x00000400 */
+		/* "last" is the last process event: exit */
+		PROC_EVENT_EXIT = 0x80000000
+	} what;
+	__u32 cpu;
+	union { /* must be last field of proc_event struct */
+		struct {
+			__u32 err;
+		} ack;
+
+		struct fork_proc_event {
+			pid_t parent_pid;
+			pid_t parent_tgid;
+			pid_t child_pid;
+			pid_t child_tgid;
+		} fork;
+
+		struct exec_proc_event {
+			pid_t process_pid;
+			pid_t process_tgid;
+		} exec;
+
+		struct id_proc_event {
+			pid_t process_pid;
+			pid_t process_tgid;
+			union {
+				uid_t ruid; /* current->uid */
+				gid_t rgid; /* current->gid */
+			} r;
+			union {
+				uid_t euid;
+				gid_t egid;
+			} e;
+		} id;
+
+		struct exit_proc_event {
+			pid_t process_pid;
+			pid_t process_tgid;
+			__u32 exit_code, exit_signal;
+		} exit;
+	} event_data;
+};
+
+#ifdef __KERNEL__
+#ifdef CONFIG_PROC_EVENTS
+void proc_fork_connector(struct task_struct *task);
+void proc_exec_connector(struct task_struct *task);
+void proc_id_connector(struct task_struct *task, int which_id);
+void proc_exit_connector(struct task_struct *task);
+#else
+static inline void proc_fork_connector(struct task_struct *task)
+{}
+
+static inline void proc_exec_connector(struct task_struct *task)
+{}
+
+static inline void proc_id_connector(struct task_struct *task,
+				     int which_id)
+{}
+
+static inline void proc_exit_connector(struct task_struct *task)
+{}
+#endif	/* CONFIG_PROC_EVENTS */
+#endif	/* __KERNEL__ */
+#endif	/* CN_PROC_H */
diff --git a/include/linux/connector.h b/include/linux/connector.h
index 95952cc1f525..c5769c6585f4 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -27,6 +27,12 @@
 #define CN_IDX_CONNECTOR		0xffffffff
 #define CN_VAL_CONNECTOR		0xffffffff
 
+/*
+ * Process Events connector unique ids -- used for message routing
+ */
+#define CN_IDX_PROC			0x1
+#define CN_VAL_PROC			0x1
+
 #define CN_NETLINK_USERS		1
 
 /*
diff --git a/kernel/exit.c b/kernel/exit.c
index 537394b25e8d..452a1d116178 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -28,6 +28,7 @@
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
+#include <linux/cn_proc.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -863,6 +864,7 @@ fastcall NORET_TYPE void do_exit(long code)
 		module_put(tsk->binfmt->module);
 
 	tsk->exit_code = code;
+	proc_exit_connector(tsk);
 	exit_notify(tsk);
 #ifdef CONFIG_NUMA
 	mpol_free(tsk->mempolicy);
diff --git a/kernel/fork.c b/kernel/fork.c
index 8a069612eac3..efac2c58ec7d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -42,6 +42,7 @@
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
+#include <linux/cn_proc.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1143,6 +1144,7 @@ static task_t *copy_process(unsigned long clone_flags,
 			__get_cpu_var(process_counts)++;
 	}
 
+	proc_fork_connector(p);
 	if (!current->signal->tty && p->signal->tty)
 		p->signal->tty = NULL;
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 2fa1ed18123c..1e1f41b3fdf6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -28,6 +28,7 @@
 #include <linux/suspend.h>
 #include <linux/tty.h>
 #include <linux/signal.h>
+#include <linux/cn_proc.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -623,6 +624,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
 	current->egid = new_egid;
 	current->gid = new_rgid;
 	key_fsgid_changed(current);
+	proc_id_connector(current, PROC_EVENT_GID);
 	return 0;
 }
 
@@ -662,6 +664,7 @@ asmlinkage long sys_setgid(gid_t gid)
 		return -EPERM;
 
 	key_fsgid_changed(current);
+	proc_id_connector(current, PROC_EVENT_GID);
 	return 0;
 }
   
@@ -751,6 +754,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
 	current->fsuid = current->euid;
 
 	key_fsuid_changed(current);
+	proc_id_connector(current, PROC_EVENT_UID);
 
 	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RE);
 }
@@ -798,6 +802,7 @@ asmlinkage long sys_setuid(uid_t uid)
 	current->suid = new_suid;
 
 	key_fsuid_changed(current);
+	proc_id_connector(current, PROC_EVENT_UID);
 
 	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_ID);
 }
@@ -846,6 +851,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
 		current->suid = suid;
 
 	key_fsuid_changed(current);
+	proc_id_connector(current, PROC_EVENT_UID);
 
 	return security_task_post_setuid(old_ruid, old_euid, old_suid, LSM_SETID_RES);
 }
@@ -898,6 +904,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
 		current->sgid = sgid;
 
 	key_fsgid_changed(current);
+	proc_id_connector(current, PROC_EVENT_GID);
 	return 0;
 }
 
@@ -940,6 +947,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
 	}
 
 	key_fsuid_changed(current);
+	proc_id_connector(current, PROC_EVENT_UID);
 
 	security_task_post_setuid(old_fsuid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS);
 
@@ -968,6 +976,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
 		}
 		current->fsgid = gid;
 		key_fsgid_changed(current);
+		proc_id_connector(current, PROC_EVENT_GID);
 	}
 	return old_fsgid;
 }
-- 
cgit v1.2.3


From d55b5fdaf40846221d543937b786956e27837fda Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Mon, 7 Nov 2005 00:59:31 -0800
Subject: [PATCH] aio: remove aio_max_nr accounting race

AIO was adding a new context's max requests to the global total before
testing if that resulting total was over the global limit.  This let
innocent tasks get their new limit tested along with a racing guilty task
that was crossing the limit.  This serializes the _nr accounting with a
spinlock It also switches to using unsigned long for the global totals.
Individual contexts are still limited to an unsigned int's worth of
requests by the syscall interface.

The problem and fix were verified with a simple program that spun creating
and destroying a context while holding on to another long lived context.
Before the patch a task creating a tiny context could get a spurious EAGAIN
if it raced with a task creating a very large context that overran the
limit.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c            | 31 +++++++++++++++++++++----------
 include/linux/aio.h |  5 +++--
 kernel/sysctl.c     |  4 ++--
 3 files changed, 26 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/fs/aio.c b/fs/aio.c
index edfca5b75535..20bb919eb195 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -42,8 +42,9 @@
 #endif
 
 /*------ sysctl variables----*/
-atomic_t aio_nr = ATOMIC_INIT(0);	/* current system wide number of aio requests */
-unsigned aio_max_nr = 0x10000;	/* system wide maximum number of aio requests */
+static DEFINE_SPINLOCK(aio_nr_lock);
+unsigned long aio_nr;		/* current system wide number of aio requests */
+unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
 /*----end sysctl variables---*/
 
 static kmem_cache_t	*kiocb_cachep;
@@ -208,7 +209,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (nr_events > aio_max_nr)
+	if ((unsigned long)nr_events > aio_max_nr)
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_alloc(kioctx_cachep, GFP_KERNEL);
@@ -233,8 +234,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		goto out_freectx;
 
 	/* limit the number of system wide aios */
-	atomic_add(ctx->max_reqs, &aio_nr);	/* undone by __put_ioctx */
-	if (unlikely(atomic_read(&aio_nr) > aio_max_nr))
+	spin_lock(&aio_nr_lock);
+	if (aio_nr + ctx->max_reqs > aio_max_nr ||
+	    aio_nr + ctx->max_reqs < aio_nr)
+		ctx->max_reqs = 0;
+	else
+		aio_nr += ctx->max_reqs;
+	spin_unlock(&aio_nr_lock);
+	if (ctx->max_reqs == 0)
 		goto out_cleanup;
 
 	/* now link into global list.  kludge.  FIXME */
@@ -248,8 +255,6 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	return ctx;
 
 out_cleanup:
-	atomic_sub(ctx->max_reqs, &aio_nr);
-	ctx->max_reqs = 0;	/* prevent __put_ioctx from sub'ing aio_nr */
 	__put_ioctx(ctx);
 	return ERR_PTR(-EAGAIN);
 
@@ -374,7 +379,12 @@ void fastcall __put_ioctx(struct kioctx *ctx)
 	pr_debug("__put_ioctx: freeing %p\n", ctx);
 	kmem_cache_free(kioctx_cachep, ctx);
 
-	atomic_sub(nr_events, &aio_nr);
+	if (nr_events) {
+		spin_lock(&aio_nr_lock);
+		BUG_ON(aio_nr - nr_events > aio_nr);
+		aio_nr -= nr_events;
+		spin_unlock(&aio_nr_lock);
+	}
 }
 
 /* aio_get_req
@@ -1258,8 +1268,9 @@ asmlinkage long sys_io_setup(unsigned nr_events, aio_context_t __user *ctxp)
 		goto out;
 
 	ret = -EINVAL;
-	if (unlikely(ctx || (int)nr_events <= 0)) {
-		pr_debug("EINVAL: io_setup: ctx or nr_events > max\n");
+	if (unlikely(ctx || nr_events == 0)) {
+		pr_debug("EINVAL: io_setup: ctx %lu nr_events %u\n",
+		         ctx, nr_events);
 		goto out;
 	}
 
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 0decf66117c1..403d71dcb7c8 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -183,6 +183,7 @@ struct kioctx {
 	struct list_head	active_reqs;	/* used for cancellation */
 	struct list_head	run_list;	/* used for kicked reqs */
 
+	/* sys_io_setup currently limits this to an unsigned int */
 	unsigned		max_reqs;
 
 	struct aio_ring_info	ring_info;
@@ -234,7 +235,7 @@ static inline struct kiocb *list_kiocb(struct list_head *h)
 }
 
 /* for sysctl: */
-extern atomic_t aio_nr;
-extern unsigned aio_max_nr;
+extern unsigned long aio_nr;
+extern unsigned long aio_max_nr;
 
 #endif /* __LINUX__AIO_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8e56e2495542..e1351200ce85 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -952,7 +952,7 @@ static ctl_table fs_table[] = {
 		.data		= &aio_nr,
 		.maxlen		= sizeof(aio_nr),
 		.mode		= 0444,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_doulongvec_minmax,
 	},
 	{
 		.ctl_name	= FS_AIO_MAX_NR,
@@ -960,7 +960,7 @@ static ctl_table fs_table[] = {
 		.data		= &aio_max_nr,
 		.maxlen		= sizeof(aio_max_nr),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_doulongvec_minmax,
 	},
 #ifdef CONFIG_INOTIFY
 	{
-- 
cgit v1.2.3


From 796f8d9b98fc92a5e9aaea8cf932957850332f51 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Mon, 7 Nov 2005 00:59:33 -0800
Subject: [PATCH] FUTEX_WAKE_OP: enhanced error handling

The code for FUTEX_WAKE_OP calls an arch callback,
futex_atomic_op_inuser().  That callback can return an error code, but
currently the caller assumes any error is EFAULT, and will try various
things to resolve the fault before eventually giving up with EFAULT
(regardless of the original error code).  This is not a theoretical case -
arch callbacks currently return -ENOSYS if the opcode they are given is
bogus.

This patch alters the code to detect non-EFAULT errors and return them
directly to the user.

Of course, whether -ENOSYS is the correct return value for the bogus opcode
case, or whether EINVAL would be more appropriate is another question.

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jamie Lokier <jamie@shareable.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 3b4d5ad44cc6..aca8d10704f6 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -365,6 +365,11 @@ retry:
 		if (bh1 != bh2)
 			spin_unlock(&bh2->lock);
 
+		if (unlikely(op_ret != -EFAULT)) {
+			ret = op_ret;
+			goto out;
+		}
+
 		/* futex_atomic_op_inuser needs to both read and write
 		 * *(int __user *)uaddr2, but we can't modify it
 		 * non-atomically.  Therefore, if get_user below is not
-- 
cgit v1.2.3


From 8c65b4a60450590e79a28e9717ceffa9e4debb3f Mon Sep 17 00:00:00 2001
From: Tim Schmielau <tim@physik3.uni-rostock.de>
Date: Mon, 7 Nov 2005 00:59:43 -0800
Subject: [PATCH] fix remaining missing includes

Fix more include file problems that surfaced since I submitted the previous
fix-missing-includes.patch.  This should now allow not to include sched.h
from module.h, which is done by a followup patch.

Signed-off-by: Tim Schmielau <tim@physik3.uni-rostock.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mach-aaec2000/clock.c            | 1 +
 arch/arm/mach-epxa10db/mm.c               | 1 +
 arch/arm/mach-pxa/corgi_lcd.c             | 1 +
 arch/ppc/syslib/ppc_sys.c                 | 1 +
 drivers/base/power/sysfs.c                | 1 +
 drivers/char/agp/amd64-agp.c              | 1 +
 drivers/hwmon/hwmon.c                     | 1 +
 drivers/infiniband/core/agent.c           | 3 +++
 drivers/infiniband/core/packer.c          | 2 ++
 drivers/infiniband/core/sysfs.c           | 3 +++
 drivers/infiniband/core/ud_header.c       | 1 +
 drivers/infiniband/core/verbs.c           | 1 +
 drivers/infiniband/hw/mthca/mthca_catas.c | 3 +++
 drivers/infiniband/hw/mthca/mthca_srq.c   | 3 +++
 drivers/media/dvb/frontends/cx24110.c     | 1 +
 drivers/message/i2o/exec-osm.c            | 1 +
 drivers/mfd/mcp-core.c                    | 2 ++
 drivers/pci/hotplug/pciehprm_nonacpi.c    | 1 +
 drivers/pci/pci-driver.c                  | 1 +
 drivers/scsi/atari_dma_emul.c             | 2 ++
 drivers/scsi/raid_class.c                 | 2 ++
 drivers/scsi/scsi_transport_sas.c         | 2 ++
 drivers/scsi/sym53c8xx_2/sym_hipd.c       | 1 +
 fs/9p/error.c                             | 1 +
 include/asm-alpha/pgtable.h               | 3 +++
 include/asm-cris/processor.h              | 2 ++
 include/asm-frv/pgtable.h                 | 2 ++
 include/asm-generic/pgtable.h             | 1 +
 include/asm-i386/elf.h                    | 2 ++
 include/asm-i386/pgtable.h                | 3 +++
 include/asm-ia64/pgtable.h                | 1 +
 include/asm-m32r/pgtable.h                | 3 +++
 include/asm-mips/elf.h                    | 2 ++
 include/asm-mips/pgtable.h                | 3 +++
 include/asm-parisc/pgtable.h              | 3 ++-
 include/asm-powerpc/elf.h                 | 2 ++
 include/asm-ppc/pgtable.h                 | 1 +
 include/asm-ppc64/pgtable.h               | 1 +
 include/asm-s390/elf.h                    | 1 +
 include/asm-s390/pgtable.h                | 1 +
 include/asm-sh/elf.h                      | 1 +
 include/asm-sh/pgtable.h                  | 2 ++
 include/asm-sh64/pgtable.h                | 2 ++
 include/asm-x86_64/elf.h                  | 2 ++
 include/asm-x86_64/pgtable.h              | 2 ++
 include/asm-xtensa/elf.h                  | 2 ++
 include/asm-xtensa/pgtable.h              | 3 +++
 include/linux/irq.h                       | 1 +
 include/linux/memory.h                    | 3 +++
 include/linux/sem.h                       | 2 ++
 include/linux/wait.h                      | 1 +
 kernel/module.c                           | 1 +
 52 files changed, 89 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-aaec2000/clock.c b/arch/arm/mach-aaec2000/clock.c
index 99e019169dda..0340ddc4824e 100644
--- a/arch/arm/mach-aaec2000/clock.c
+++ b/arch/arm/mach-aaec2000/clock.c
@@ -14,6 +14,7 @@
 #include <linux/list.h>
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/string.h>
 
 #include <asm/semaphore.h>
 #include <asm/hardware/clock.h>
diff --git a/arch/arm/mach-epxa10db/mm.c b/arch/arm/mach-epxa10db/mm.c
index e8832d0910ee..cfd0d2182d44 100644
--- a/arch/arm/mach-epxa10db/mm.c
+++ b/arch/arm/mach-epxa10db/mm.c
@@ -25,6 +25,7 @@
 #include <asm/hardware.h>
 #include <asm/io.h>
 #include <asm/sizes.h>
+#include <asm/page.h>
  
 #include <asm/mach/map.h>
 
diff --git a/arch/arm/mach-pxa/corgi_lcd.c b/arch/arm/mach-pxa/corgi_lcd.c
index 54162ba95414..698eb06545c4 100644
--- a/arch/arm/mach-pxa/corgi_lcd.c
+++ b/arch/arm/mach-pxa/corgi_lcd.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/platform_device.h>
 #include <linux/module.h>
+#include <linux/string.h>
 #include <asm/arch/akita.h>
 #include <asm/arch/corgi.h>
 #include <asm/arch/hardware.h>
diff --git a/arch/ppc/syslib/ppc_sys.c b/arch/ppc/syslib/ppc_sys.c
index 62ee86e80711..603f01190816 100644
--- a/arch/ppc/syslib/ppc_sys.c
+++ b/arch/ppc/syslib/ppc_sys.c
@@ -14,6 +14,7 @@
  * option) any later version.
  */
 
+#include <linux/string.h>
 #include <asm/ppc_sys.h>
 
 int (*ppc_sys_device_fixup) (struct platform_device * pdev);
diff --git a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
index 89c57875f3e5..f3a0c562bcb5 100644
--- a/drivers/base/power/sysfs.c
+++ b/drivers/base/power/sysfs.c
@@ -3,6 +3,7 @@
  */
 
 #include <linux/device.h>
+#include <linux/string.h>
 #include "power.h"
 
 
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 0e6c3a31d344..78ce98a69f37 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -13,6 +13,7 @@
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/agp_backend.h>
+#include <linux/mmzone.h>
 #include <asm/page.h>		/* PAGE_SIZE */
 #include "agp.h"
 
diff --git a/drivers/hwmon/hwmon.c b/drivers/hwmon/hwmon.c
index 6f48579799b5..dddd3eb9b387 100644
--- a/drivers/hwmon/hwmon.c
+++ b/drivers/hwmon/hwmon.c
@@ -16,6 +16,7 @@
 #include <linux/kdev_t.h>
 #include <linux/idr.h>
 #include <linux/hwmon.h>
+#include <linux/gfp.h>
 
 #define HWMON_ID_PREFIX "hwmon"
 #define HWMON_ID_FORMAT HWMON_ID_PREFIX "%d"
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
index 7545775d38ef..34b724afd28d 100644
--- a/drivers/infiniband/core/agent.c
+++ b/drivers/infiniband/core/agent.c
@@ -37,6 +37,9 @@
  * $Id: agent.c 1389 2004-12-27 22:56:47Z roland $
  */
 
+#include <linux/slab.h>
+#include <linux/string.h>
+
 #include "agent.h"
 #include "smi.h"
 
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
index 35df5010e723..c972d7235764 100644
--- a/drivers/infiniband/core/packer.c
+++ b/drivers/infiniband/core/packer.c
@@ -33,6 +33,8 @@
  * $Id: packer.c 1349 2004-12-16 21:09:43Z roland $
  */
 
+#include <linux/string.h>
+
 #include <rdma/ib_pack.h>
 
 static u64 value_read(int offset, int size, void *structure)
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index b8120650e711..08648b1a387e 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -36,6 +36,9 @@
 
 #include "core_priv.h"
 
+#include <linux/slab.h>
+#include <linux/string.h>
+
 #include <rdma/ib_mad.h>
 
 struct ib_port {
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
index 527b23450ab3..997c07db6d8f 100644
--- a/drivers/infiniband/core/ud_header.c
+++ b/drivers/infiniband/core/ud_header.c
@@ -34,6 +34,7 @@
  */
 
 #include <linux/errno.h>
+#include <linux/string.h>
 
 #include <rdma/ib_pack.h>
 
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 72d3ef786db5..4186cc888ea5 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -40,6 +40,7 @@
 
 #include <linux/errno.h>
 #include <linux/err.h>
+#include <linux/string.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c
index 7ac52af43b99..25ebab64bc42 100644
--- a/drivers/infiniband/hw/mthca/mthca_catas.c
+++ b/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -32,6 +32,9 @@
  * $Id$
  */
 
+#include <linux/jiffies.h>
+#include <linux/timer.h>
+
 #include "mthca_dev.h"
 
 enum {
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index 292f55be8cbd..26d5161fde07 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -32,6 +32,9 @@
  * $Id: mthca_srq.c 3047 2005-08-10 03:59:35Z roland $
  */
 
+#include <linux/slab.h>
+#include <linux/string.h>
+
 #include "mthca_dev.h"
 #include "mthca_cmd.h"
 #include "mthca_memfree.h"
diff --git a/drivers/media/dvb/frontends/cx24110.c b/drivers/media/dvb/frontends/cx24110.c
index d4b97989e3ed..654d7dc879d9 100644
--- a/drivers/media/dvb/frontends/cx24110.c
+++ b/drivers/media/dvb/frontends/cx24110.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/init.h>
+#include <linux/jiffies.h>
 
 #include "dvb_frontend.h"
 #include "cx24110.h"
diff --git a/drivers/message/i2o/exec-osm.c b/drivers/message/i2o/exec-osm.c
index b675b4ebbebd..9c339a2505b0 100644
--- a/drivers/message/i2o/exec-osm.c
+++ b/drivers/message/i2o/exec-osm.c
@@ -33,6 +33,7 @@
 #include <linux/workqueue.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/sched.h>   /* wait_event_interruptible_timeout() needs this */
 #include <asm/param.h>		/* HZ */
 #include "core.h"
 
diff --git a/drivers/mfd/mcp-core.c b/drivers/mfd/mcp-core.c
index c75d713c01e4..55ba23075c90 100644
--- a/drivers/mfd/mcp-core.c
+++ b/drivers/mfd/mcp-core.c
@@ -15,6 +15,8 @@
 #include <linux/errno.h>
 #include <linux/smp.h>
 #include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 
 #include <asm/dma.h>
 #include <asm/system.h>
diff --git a/drivers/pci/hotplug/pciehprm_nonacpi.c b/drivers/pci/hotplug/pciehprm_nonacpi.c
index 33b2c69a0829..76c727c74cc0 100644
--- a/drivers/pci/hotplug/pciehprm_nonacpi.c
+++ b/drivers/pci/hotplug/pciehprm_nonacpi.c
@@ -31,6 +31,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
+#include <linux/sched.h>
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/slab.h>
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index ae986e590b48..94e68c54d273 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -10,6 +10,7 @@
 #include <linux/mempolicy.h>
 #include <linux/string.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 #include "pci.h"
 
 /*
diff --git a/drivers/scsi/atari_dma_emul.c b/drivers/scsi/atari_dma_emul.c
index 7026045527fd..8d5d2a5da961 100644
--- a/drivers/scsi/atari_dma_emul.c
+++ b/drivers/scsi/atari_dma_emul.c
@@ -19,6 +19,8 @@
  * this code.
  */
 
+#include <linux/compiler.h>
+#include <asm/thread_info.h>
 #include <asm/uaccess.h>
 
 #define hades_dma_ctrl		(*(unsigned char *) 0xffff8717)
diff --git a/drivers/scsi/raid_class.c b/drivers/scsi/raid_class.c
index f1ea5027865f..caa0c3629626 100644
--- a/drivers/scsi/raid_class.c
+++ b/drivers/scsi/raid_class.c
@@ -4,6 +4,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 #include <linux/raid_class.h>
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h>
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 0cc766a9aa65..edabbd05d258 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -26,6 +26,8 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/string.h>
 
 #include <scsi/scsi_device.h>
 #include <scsi/scsi_host.h>
diff --git a/drivers/scsi/sym53c8xx_2/sym_hipd.c b/drivers/scsi/sym53c8xx_2/sym_hipd.c
index a1a58e1d5ad3..a7420cad4547 100644
--- a/drivers/scsi/sym53c8xx_2/sym_hipd.c
+++ b/drivers/scsi/sym53c8xx_2/sym_hipd.c
@@ -39,6 +39,7 @@
  */
 
 #include <linux/slab.h>
+#include <asm/param.h>		/* for timeouts in units of HZ */
 
 #include "sym_glue.h"
 #include "sym_nvram.h"
diff --git a/fs/9p/error.c b/fs/9p/error.c
index fee5d19179c5..834cb179e388 100644
--- a/fs/9p/error.c
+++ b/fs/9p/error.c
@@ -33,6 +33,7 @@
 
 #include <linux/list.h>
 #include <linux/jhash.h>
+#include <linux/string.h>
 
 #include "debug.h"
 #include "error.h"
diff --git a/include/asm-alpha/pgtable.h b/include/asm-alpha/pgtable.h
index 8393bf374b2b..a985cd29b6db 100644
--- a/include/asm-alpha/pgtable.h
+++ b/include/asm-alpha/pgtable.h
@@ -17,6 +17,9 @@
 #include <asm/processor.h>	/* For TASK_SIZE */
 #include <asm/machvec.h>
 
+struct mm_struct;
+struct vm_area_struct;
+
 /* Certain architectures need to do special things when PTEs
  * within a page table are directly modified.  Thus, the following
  * hook is made available.
diff --git a/include/asm-cris/processor.h b/include/asm-cris/processor.h
index e8b2abb2ae59..dce41009eeb0 100644
--- a/include/asm-cris/processor.h
+++ b/include/asm-cris/processor.h
@@ -16,6 +16,8 @@
 #include <asm/ptrace.h>
 #include <asm/arch/processor.h>
 
+struct task_struct;
+
 /* This decides where the kernel will search for a free chunk of vm
  * space during mmap's.
  */
diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h
index b247e99dff49..844666377dcb 100644
--- a/include/asm-frv/pgtable.h
+++ b/include/asm-frv/pgtable.h
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
+struct mm_struct;
+struct vm_area_struct;
 #endif
 
 #ifndef __ASSEMBLY__
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 7dca30a26c53..358e4d309ceb 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -128,6 +128,7 @@ do {									\
 #endif
 
 #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT
+struct mm_struct;
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
 {
 	pte_t old_pte = *ptep;
diff --git a/include/asm-i386/elf.h b/include/asm-i386/elf.h
index fa11117d3cfa..4153d80e4d2b 100644
--- a/include/asm-i386/elf.h
+++ b/include/asm-i386/elf.h
@@ -119,6 +119,8 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
  */
 #define elf_read_implies_exec(ex, executable_stack)	(executable_stack != EXSTACK_DISABLE_X)
 
+struct task_struct;
+
 extern int dump_task_regs (struct task_struct *, elf_gregset_t *);
 extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *);
 extern int dump_task_extended_fpu (struct task_struct *, struct user_fxsr_struct *);
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 03f3c8ac6383..088a945bf26b 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -25,6 +25,9 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 
+struct mm_struct;
+struct vm_area_struct;
+
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index 21e32a06bc82..c34ba80c1c31 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -127,6 +127,7 @@
 
 # ifndef __ASSEMBLY__
 
+#include <linux/sched.h>	/* for mm_struct */
 #include <asm/bitops.h>
 #include <asm/cacheflush.h>
 #include <asm/mmu_context.h>
diff --git a/include/asm-m32r/pgtable.h b/include/asm-m32r/pgtable.h
index 1cd5fd4a5b2c..75740debcd01 100644
--- a/include/asm-m32r/pgtable.h
+++ b/include/asm-m32r/pgtable.h
@@ -27,6 +27,9 @@
 #include <asm/bitops.h>
 #include <asm/page.h>
 
+struct mm_struct;
+struct vm_area_struct;
+
 extern pgd_t swapper_pg_dir[1024];
 extern void paging_init(void);
 
diff --git a/include/asm-mips/elf.h b/include/asm-mips/elf.h
index 7420f12742bb..d2c9a25f8459 100644
--- a/include/asm-mips/elf.h
+++ b/include/asm-mips/elf.h
@@ -275,6 +275,8 @@ do {									\
 
 #endif /* CONFIG_64BIT */
 
+struct task_struct;
+
 extern void dump_regs(elf_greg_t *, struct pt_regs *regs);
 extern int dump_task_regs (struct task_struct *, elf_gregset_t *);
 extern int dump_task_fpu(struct task_struct *, elf_fpregset_t *);
diff --git a/include/asm-mips/pgtable.h b/include/asm-mips/pgtable.h
index 34facd996503..702a28fa7a34 100644
--- a/include/asm-mips/pgtable.h
+++ b/include/asm-mips/pgtable.h
@@ -19,6 +19,9 @@
 #include <asm/io.h>
 #include <asm/pgtable-bits.h>
 
+struct mm_struct;
+struct vm_area_struct;
+
 #define PAGE_NONE	__pgprot(_PAGE_PRESENT | _CACHE_CACHABLE_NONCOHERENT)
 #define PAGE_SHARED	__pgprot(_PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE | \
 			PAGE_CACHABLE_DEFAULT)
diff --git a/include/asm-parisc/pgtable.h b/include/asm-parisc/pgtable.h
index c28fb6f48c6c..b4554711c3e7 100644
--- a/include/asm-parisc/pgtable.h
+++ b/include/asm-parisc/pgtable.h
@@ -12,6 +12,7 @@
  */
 
 #include <linux/spinlock.h>
+#include <linux/mm.h>		/* for vm_area_struct */
 #include <asm/processor.h>
 #include <asm/cache.h>
 #include <asm/bitops.h>
@@ -418,7 +419,6 @@ extern void paging_init (void);
 
 #define PG_dcache_dirty         PG_arch_1
 
-struct vm_area_struct; /* forward declaration (include/linux/mm.h) */
 extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t);
 
 /* Encode and de-code a swap entry */
@@ -464,6 +464,7 @@ static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned
 
 extern spinlock_t pa_dbit_lock;
 
+struct mm_struct;
 static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
 	pte_t old_pte;
diff --git a/include/asm-powerpc/elf.h b/include/asm-powerpc/elf.h
index d140577d0a05..feac3458d71f 100644
--- a/include/asm-powerpc/elf.h
+++ b/include/asm-powerpc/elf.h
@@ -1,11 +1,13 @@
 #ifndef _ASM_POWERPC_ELF_H
 #define _ASM_POWERPC_ELF_H
 
+#include <linux/sched.h>	/* for task_struct */
 #include <asm/types.h>
 #include <asm/ptrace.h>
 #include <asm/cputable.h>
 #include <asm/auxvec.h>
 #include <asm/page.h>
+#include <asm/string.h>
 
 /* PowerPC relocations defined by the ABIs */
 #define R_PPC_NONE		0
diff --git a/include/asm-ppc/pgtable.h b/include/asm-ppc/pgtable.h
index b28a713ba862..6d1c39e8a6af 100644
--- a/include/asm-ppc/pgtable.h
+++ b/include/asm-ppc/pgtable.h
@@ -12,6 +12,7 @@
 #include <asm/processor.h>		/* For TASK_SIZE */
 #include <asm/mmu.h>
 #include <asm/page.h>
+struct mm_struct;
 
 extern unsigned long va_to_phys(unsigned long address);
 extern pte_t *va_to_pte(unsigned long address);
diff --git a/include/asm-ppc64/pgtable.h b/include/asm-ppc64/pgtable.h
index fde93ec36abc..a9783ba7fe98 100644
--- a/include/asm-ppc64/pgtable.h
+++ b/include/asm-ppc64/pgtable.h
@@ -13,6 +13,7 @@
 #include <asm/mmu.h>
 #include <asm/page.h>
 #include <asm/tlbflush.h>
+struct mm_struct;
 #endif /* __ASSEMBLY__ */
 
 #ifdef CONFIG_PPC_64K_PAGES
diff --git a/include/asm-s390/elf.h b/include/asm-s390/elf.h
index 3b8bd46832a1..372d51cccd53 100644
--- a/include/asm-s390/elf.h
+++ b/include/asm-s390/elf.h
@@ -96,6 +96,7 @@
  * ELF register definitions..
  */
 
+#include <linux/sched.h>	/* for task_struct */
 #include <asm/ptrace.h>
 #include <asm/user.h>
 #include <asm/system.h>		/* for save_access_regs */
diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h
index df94f89038cc..9be741bb1496 100644
--- a/include/asm-s390/pgtable.h
+++ b/include/asm-s390/pgtable.h
@@ -36,6 +36,7 @@
 #include <linux/threads.h>
 
 struct vm_area_struct; /* forward declaration (include/linux/mm.h) */
+struct mm_struct;
 
 extern pgd_t swapper_pg_dir[] __attribute__ ((aligned (4096)));
 extern void paging_init(void);
diff --git a/include/asm-sh/elf.h b/include/asm-sh/elf.h
index 8fe00a1981ce..1b63dfeea4f2 100644
--- a/include/asm-sh/elf.h
+++ b/include/asm-sh/elf.h
@@ -111,6 +111,7 @@ typedef struct user_fpu_struct elf_fpregset_t;
 
 #ifdef __KERNEL__
 #define SET_PERSONALITY(ex, ibcs2) set_personality(PER_LINUX_32BIT)
+struct task_struct;
 extern int dump_task_regs (struct task_struct *, elf_gregset_t *);
 extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *);
 
diff --git a/include/asm-sh/pgtable.h b/include/asm-sh/pgtable.h
index dee36bcbcf98..bb0efb31a8cb 100644
--- a/include/asm-sh/pgtable.h
+++ b/include/asm-sh/pgtable.h
@@ -284,6 +284,8 @@ typedef pte_t *pte_addr_t;
 #define GET_IOSPACE(pfn)		0
 #define GET_PFN(pfn)			(pfn)
 
+struct mm_struct;
+
 /*
  * No page table caches to initialise
  */
diff --git a/include/asm-sh64/pgtable.h b/include/asm-sh64/pgtable.h
index 51b05818e4eb..a1906a772df9 100644
--- a/include/asm-sh64/pgtable.h
+++ b/include/asm-sh64/pgtable.h
@@ -24,6 +24,8 @@
 #include <linux/threads.h>
 #include <linux/config.h>
 
+struct vm_area_struct;
+
 extern void paging_init(void);
 
 /* We provide our own get_unmapped_area to avoid cache synonym issue */
diff --git a/include/asm-x86_64/elf.h b/include/asm-x86_64/elf.h
index a60a35e79222..43862cd6a569 100644
--- a/include/asm-x86_64/elf.h
+++ b/include/asm-x86_64/elf.h
@@ -149,6 +149,8 @@ extern void set_personality_64bit(void);
  */
 #define elf_read_implies_exec(ex, executable_stack)	(executable_stack != EXSTACK_DISABLE_X)
 
+struct task_struct;
+
 extern int dump_task_regs (struct task_struct *, elf_gregset_t *);
 extern int dump_task_fpu (struct task_struct *, elf_fpregset_t *);
 
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 7a07196a7202..7309fffeec9a 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -105,6 +105,8 @@ static inline void pgd_clear (pgd_t * pgd)
 
 #define ptep_get_and_clear(mm,addr,xp)	__pte(xchg(&(xp)->pte, 0))
 
+struct mm_struct;
+
 static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
 {
 	pte_t pte;
diff --git a/include/asm-xtensa/elf.h b/include/asm-xtensa/elf.h
index 64f1f53874fe..de0667453b2e 100644
--- a/include/asm-xtensa/elf.h
+++ b/include/asm-xtensa/elf.h
@@ -209,6 +209,8 @@ extern void xtensa_elf_core_copy_regs (xtensa_gregset_t *, struct pt_regs *);
 
 #define SET_PERSONALITY(ex, ibcs2) set_personality(PER_LINUX_32BIT)
 
+struct task_struct;
+
 extern void do_copy_regs (xtensa_gregset_t*, struct pt_regs*,
 			  struct task_struct*);
 extern void do_restore_regs (xtensa_gregset_t*, struct pt_regs*,
diff --git a/include/asm-xtensa/pgtable.h b/include/asm-xtensa/pgtable.h
index 987e3b802313..7b15afb70c56 100644
--- a/include/asm-xtensa/pgtable.h
+++ b/include/asm-xtensa/pgtable.h
@@ -278,6 +278,8 @@ static inline void update_pte(pte_t *ptep, pte_t pteval)
 #endif
 }
 
+struct mm_struct;
+
 static inline void
 set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval)
 {
@@ -294,6 +296,7 @@ set_pmd(pmd_t *pmdp, pmd_t pmdval)
 #endif
 }
 
+struct vm_area_struct;
 
 static inline int
 ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr,
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 69681c3b1f05..c516382fbec2 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -10,6 +10,7 @@
  */
 
 #include <linux/config.h>
+#include <asm/smp.h>		/* cpu_online_map */
 
 #if !defined(CONFIG_ARCH_S390)
 
diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0def328ab5cf..9a424383e6c6 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -54,6 +54,9 @@ struct memory_block {
  */
 #define	MEM_MAPPING_INVALID	(1<<3)
 
+struct notifier_block;
+struct mem_section;
+
 #ifndef CONFIG_MEMORY_HOTPLUG
 static inline int memory_dev_init(void)
 {
diff --git a/include/linux/sem.h b/include/linux/sem.h
index 106f9757339a..3c1f1120fe88 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -79,6 +79,8 @@ struct  seminfo {
 
 #ifdef __KERNEL__
 
+struct task_struct;
+
 /* One semaphore structure for each semaphore in the system. */
 struct sem {
 	int	semval;		/* current value */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index d38c9fecdc36..d28518236b62 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -54,6 +54,7 @@ struct __wait_queue_head {
 };
 typedef struct __wait_queue_head wait_queue_head_t;
 
+struct task_struct;
 
 /*
  * Macros for declaration and initialisaton of the datatypes
diff --git a/kernel/module.c b/kernel/module.c
index ff5c500ab625..2ea929d51ad0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -37,6 +37,7 @@
 #include <linux/stop_machine.h>
 #include <linux/device.h>
 #include <linux/string.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
 #include <asm/cacheflush.h>
-- 
cgit v1.2.3


From 481bed454247538e9f57d4ea37b153ccba24ba7b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 7 Nov 2005 00:59:47 -0800
Subject: [PATCH] consolidate sys_ptrace()

The sys_ptrace boilerplate code (everything outside the big switch
statement for the arch-specific requests) is shared by most architectures.
This patch moves it to kernel/ptrace.c and leaves the arch-specific code as
arch_ptrace.

Some architectures have a too different ptrace so we have to exclude them.
They continue to keep their implementations.  For sh64 I had to add a
sh64_ptrace wrapper because it does some initialization on the first call.
For um I removed an ifdefed SUBARCH_PTRACE_SPECIAL block, but
SUBARCH_PTRACE_SPECIAL isn't defined anywhere in the tree.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Paul Mackerras <paulus@samba.org>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-By: David Howells <dhowells@redhat.com>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Acked-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/kernel/ptrace.c           | 49 +---------------------
 arch/arm26/kernel/ptrace.c         | 49 +---------------------
 arch/cris/arch-v10/kernel/ptrace.c | 51 +----------------------
 arch/cris/arch-v32/kernel/ptrace.c | 51 +----------------------
 arch/frv/kernel/ptrace.c           | 43 +-------------------
 arch/h8300/kernel/ptrace.c         | 39 +-----------------
 arch/i386/kernel/ptrace.c          | 44 +-------------------
 arch/m68k/kernel/ptrace.c          | 47 ++-------------------
 arch/m68knommu/kernel/ptrace.c     | 39 +-----------------
 arch/mips/kernel/ptrace.c          | 55 +++----------------------
 arch/parisc/kernel/ptrace.c        | 50 ++---------------------
 arch/powerpc/kernel/ptrace.c       | 43 +-------------------
 arch/sh/kernel/ptrace.c            | 44 +-------------------
 arch/sh64/kernel/ptrace.c          | 83 +++++++++++++-------------------------
 arch/sh64/kernel/syscalls.S        |  2 +-
 arch/um/kernel/ptrace.c            | 50 +----------------------
 arch/v850/kernel/ptrace.c          | 43 +-------------------
 arch/x86_64/kernel/ptrace.c        | 43 +-------------------
 arch/xtensa/kernel/ptrace.c        | 55 +------------------------
 include/asm-alpha/ptrace.h         |  3 ++
 include/asm-ia64/ptrace.h          |  3 ++
 include/asm-m32r/ptrace.h          |  3 ++
 include/asm-s390/ptrace.h          |  2 +
 include/asm-sparc/ptrace.h         |  3 ++
 include/asm-sparc64/ptrace.h       |  3 ++
 include/linux/ptrace.h             |  2 +
 kernel/ptrace.c                    | 82 +++++++++++++++++++++++++++++++++++++
 27 files changed, 163 insertions(+), 818 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 9bd8609a2926..9a340e790da5 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -648,7 +648,7 @@ static int ptrace_setwmmxregs(struct task_struct *tsk, void __user *ufp)
 
 #endif
 
-static int do_ptrace(int request, struct task_struct *child, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
 	unsigned long tmp;
 	int ret;
@@ -782,53 +782,6 @@ static int do_ptrace(int request, struct task_struct *child, long addr, long dat
 	return ret;
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
-{
-	struct task_struct *child;
-	int ret;
-
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret == 0)
-		ret = do_ptrace(request, child, addr, data);
-
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
-	return ret;
-}
-
 asmlinkage void syscall_trace(int why, struct pt_regs *regs)
 {
 	unsigned long ip;
diff --git a/arch/arm26/kernel/ptrace.c b/arch/arm26/kernel/ptrace.c
index cf7e977d18c8..4e6b7356a722 100644
--- a/arch/arm26/kernel/ptrace.c
+++ b/arch/arm26/kernel/ptrace.c
@@ -546,7 +546,7 @@ static int ptrace_setfpregs(struct task_struct *tsk, void *ufp)
 			      sizeof(struct user_fp)) ? -EFAULT : 0;
 }
 
-static int do_ptrace(int request, struct task_struct *child, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
 	unsigned long tmp;
 	int ret;
@@ -665,53 +665,6 @@ static int do_ptrace(int request, struct task_struct *child, long addr, long dat
 	return ret;
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
-{
-	struct task_struct *child;
-	int ret;
-
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret == 0)
-		ret = do_ptrace(request, child, addr, data);
-
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
-	return ret;
-}
-
 asmlinkage void syscall_trace(int why, struct pt_regs *regs)
 {
 	unsigned long ip;
diff --git a/arch/cris/arch-v10/kernel/ptrace.c b/arch/cris/arch-v10/kernel/ptrace.c
index 130dd214e41d..6cbd34a27b90 100644
--- a/arch/cris/arch-v10/kernel/ptrace.c
+++ b/arch/cris/arch-v10/kernel/ptrace.c
@@ -76,55 +76,11 @@ ptrace_disable(struct task_struct *child)
  * (in user space) where the result of the ptrace call is written (instead of
  * being returned).
  */
-asmlinkage int 
-sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	int ret;
 	unsigned long __user *datap = (unsigned long __user *)data;
 
-	lock_kernel();
-	ret = -EPERM;
-	
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	
-	if (child)
-		get_task_struct(child);
-	
-	read_unlock(&tasklist_lock);
-	
-	if (!child)
-		goto out;
-	
-	ret = -EPERM;
-	
-	if (pid == 1)		/* Leave the init process alone! */
-		goto out_tsk;
-	
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-	
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 		/* Read word at location address. */ 
 		case PTRACE_PEEKTEXT:
@@ -289,10 +245,7 @@ sys_ptrace(long request, long pid, long addr, long data)
 			ret = ptrace_request(child, request, addr, data);
 			break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
+
 	return ret;
 }
 
diff --git a/arch/cris/arch-v32/kernel/ptrace.c b/arch/cris/arch-v32/kernel/ptrace.c
index 208489da2a87..5528b83a622b 100644
--- a/arch/cris/arch-v32/kernel/ptrace.c
+++ b/arch/cris/arch-v32/kernel/ptrace.c
@@ -99,55 +99,11 @@ ptrace_disable(struct task_struct *child)
 }
 
 
-asmlinkage int
-sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	int ret;
 	unsigned long __user *datap = (unsigned long __user *)data;
 
-	lock_kernel();
-	ret = -EPERM;
-
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-
-	if (child)
-		get_task_struct(child);
-
-	read_unlock(&tasklist_lock);
-
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-
-	if (pid == 1)		/* Leave the init process alone! */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 		/* Read word at location address. */
 		case PTRACE_PEEKTEXT:
@@ -347,10 +303,7 @@ sys_ptrace(long request, long pid, long addr, long data)
 			ret = ptrace_request(child, request, addr, data);
 			break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
+
 	return ret;
 }
 
diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c
index cb335a14a315..f953484e7d59 100644
--- a/arch/frv/kernel/ptrace.c
+++ b/arch/frv/kernel/ptrace.c
@@ -106,48 +106,11 @@ void ptrace_enable(struct task_struct *child)
 	child->thread.frame0->__status |= REG__STATUS_STEP;
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	unsigned long tmp;
 	int ret;
 
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 		/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
@@ -351,10 +314,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 		ret = -EIO;
 		break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
 	return ret;
 }
 
diff --git a/arch/h8300/kernel/ptrace.c b/arch/h8300/kernel/ptrace.c
index a569fe4aa284..0ff6f79b0fed 100644
--- a/arch/h8300/kernel/ptrace.c
+++ b/arch/h8300/kernel/ptrace.c
@@ -57,43 +57,10 @@ void ptrace_disable(struct task_struct *child)
 	h8300_disable_trace(child);
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	int ret;
 
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 		case PTRACE_PEEKTEXT: /* read word at location addr. */ 
 		case PTRACE_PEEKDATA: {
@@ -251,10 +218,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 			ret = -EIO;
 			break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
 	return ret;
 }
 
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index efd11f09c996..5ffbb4b7ad05 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -354,49 +354,12 @@ ptrace_set_thread_area(struct task_struct *child,
 	return 0;
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	struct user * dummy = NULL;
 	int i, ret;
 	unsigned long __user *datap = (unsigned long __user *)data;
 
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
@@ -663,10 +626,7 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
+ out_tsk:
 	return ret;
 }
 
diff --git a/arch/m68k/kernel/ptrace.c b/arch/m68k/kernel/ptrace.c
index f7f1d2e5b90b..7e54422685cf 100644
--- a/arch/m68k/kernel/ptrace.c
+++ b/arch/m68k/kernel/ptrace.c
@@ -121,48 +121,11 @@ void ptrace_disable(struct task_struct *child)
 	child->thread.work.syscall_trace = 0;
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	unsigned long tmp;
 	int i, ret = 0;
 
-	lock_kernel();
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED) {
-			ret = -EPERM;
-			goto out;
-		}
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		goto out;
-	}
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (unlikely(!child)) {
-		ret = -ESRCH;
-		goto out;
-	}
-
-	/* you may not mess with init */
-	if (unlikely(pid == 1)) {
-		ret = -EPERM;
-		goto out_tsk;
-	}
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret)
-		goto out_tsk;
-
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT:	/* read word at location addr. */
@@ -317,14 +280,10 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
+
 	return ret;
 out_eio:
-	ret = -EIO;
-	goto out_tsk;
+	return -EIO;
 }
 
 asmlinkage void syscall_trace(void)
diff --git a/arch/m68knommu/kernel/ptrace.c b/arch/m68knommu/kernel/ptrace.c
index 621d7b91ccfe..262ab8c72e5f 100644
--- a/arch/m68knommu/kernel/ptrace.c
+++ b/arch/m68knommu/kernel/ptrace.c
@@ -101,43 +101,10 @@ void ptrace_disable(struct task_struct *child)
 	put_reg(child, PT_SR, tmp);
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(truct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	int ret;
 
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 		/* when I and D space are separate, these will need to be fixed. */
 		case PTRACE_PEEKTEXT: /* read word at location addr. */ 
@@ -357,10 +324,6 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 			ret = -EIO;
 			break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
 	return ret;
 }
 
diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c
index f1b0f3e1f95b..510da5fda567 100644
--- a/arch/mips/kernel/ptrace.c
+++ b/arch/mips/kernel/ptrace.c
@@ -174,51 +174,10 @@ int ptrace_setfpregs (struct task_struct *child, __u32 __user *data)
 	return 0;
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	int ret;
 
-#if 0
-	printk("ptrace(r=%d,pid=%d,addr=%08lx,data=%08lx)\n",
-	       (int) request, (int) pid, (unsigned long) addr,
-	       (unsigned long) data);
-#endif
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		if ((ret = security_ptrace(current->parent, current)))
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
@@ -319,7 +278,7 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 			if (!cpu_has_dsp) {
 				tmp = 0;
 				ret = -EIO;
-				goto out_tsk;
+				goto out;
 			}
 			if (child->thread.dsp.used_dsp) {
 				dregs = __get_dsp_regs(child);
@@ -333,14 +292,14 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 			if (!cpu_has_dsp) {
 				tmp = 0;
 				ret = -EIO;
-				goto out_tsk;
+				goto out;
 			}
 			tmp = child->thread.dsp.dspcontrol;
 			break;
 		default:
 			tmp = 0;
 			ret = -EIO;
-			goto out_tsk;
+			goto out;
 		}
 		ret = put_user(tmp, (unsigned long __user *) data);
 		break;
@@ -495,11 +454,7 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
-
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
+ out:
 	return ret;
 }
 
diff --git a/arch/parisc/kernel/ptrace.c b/arch/parisc/kernel/ptrace.c
index 18130c3748f3..b6fe202a620d 100644
--- a/arch/parisc/kernel/ptrace.c
+++ b/arch/parisc/kernel/ptrace.c
@@ -78,52 +78,13 @@ void ptrace_disable(struct task_struct *child)
 	pa_psw(child)->l = 0;
 }
 
-long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	long ret;
 #ifdef DEBUG_PTRACE
 	long oaddr=addr, odata=data;
 #endif
 
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-
-		ret = security_ptrace(current->parent, current);
-		if (ret) 
-			goto out;
-
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-	ret = -EPERM;
-	if (pid == 1)		/* no messing around with init! */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
 	case PTRACE_PEEKDATA: {
@@ -383,11 +344,11 @@ long sys_ptrace(long request, long pid, long addr, long data)
 
 	case PTRACE_GETEVENTMSG:
                 ret = put_user(child->ptrace_message, (unsigned int __user *) data);
-		goto out_tsk;
+		goto out;
 
 	default:
 		ret = ptrace_request(child, request, addr, data);
-		goto out_tsk;
+		goto out;
 	}
 
 out_wake_notrap:
@@ -396,10 +357,7 @@ out_wake:
 	wake_up_process(child);
 	ret = 0;
 out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
-	DBG("sys_ptrace(%ld, %d, %lx, %lx) returning %ld\n",
+	DBG("arch_ptrace(%ld, %d, %lx, %lx) returning %ld\n",
 		request, pid, oaddr, odata, ret);
 	return ret;
 }
diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c
index 568ea335d616..3d2abd95c7ae 100644
--- a/arch/powerpc/kernel/ptrace.c
+++ b/arch/powerpc/kernel/ptrace.c
@@ -248,46 +248,10 @@ void ptrace_disable(struct task_struct *child)
 	clear_single_step(child);
 }
 
-long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	int ret = -EPERM;
 
-	lock_kernel();
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
@@ -540,10 +504,7 @@ long sys_ptrace(long request, long pid, long addr, long data)
 		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
+
 	return ret;
 }
 
diff --git a/arch/sh/kernel/ptrace.c b/arch/sh/kernel/ptrace.c
index 1fbe5a428e31..1a8be06519ec 100644
--- a/arch/sh/kernel/ptrace.c
+++ b/arch/sh/kernel/ptrace.c
@@ -80,48 +80,11 @@ void ptrace_disable(struct task_struct *child)
 	/* nothing to do.. */
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	struct user * dummy = NULL;
 	int ret;
 
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
@@ -289,10 +252,7 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
+
 	return ret;
 }
 
diff --git a/arch/sh64/kernel/ptrace.c b/arch/sh64/kernel/ptrace.c
index 71f2eec00b99..cd22e9471316 100644
--- a/arch/sh64/kernel/ptrace.c
+++ b/arch/sh64/kernel/ptrace.c
@@ -28,6 +28,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/signal.h>
+#include <linux/syscalls.h>
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
@@ -121,61 +122,11 @@ put_fpu_long(struct task_struct *task, unsigned long addr, unsigned long data)
 	return 0;
 }
 
-asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
-	extern void poke_real_address_q(unsigned long long addr, unsigned long long data);
-#define WPC_DBRMODE 0x0d104008
-	static int first_call = 1;
 	int ret;
 
-	lock_kernel();
-
-	if (first_call) {
-		/* Set WPC.DBRMODE to 0.  This makes all debug events get
-		 * delivered through RESVEC, i.e. into the handlers in entry.S.
-		 * (If the kernel was downloaded using a remote gdb, WPC.DBRMODE
-		 * would normally be left set to 1, which makes debug events get
-		 * delivered through DBRVEC, i.e. into the remote gdb's
-		 * handlers.  This prevents ptrace getting them, and confuses
-		 * the remote gdb.) */
-		printk("DBRMODE set to 0 to permit native debugging\n");
-		poke_real_address_q(WPC_DBRMODE, 0);
-		first_call = 0;
-	}
-
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-			goto out_tsk;
-		}
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
@@ -313,13 +264,33 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
 	return ret;
 }
 
+asmlinkage int sh64_ptrace(long request, long pid, long addr, long data)
+{
+	extern void poke_real_address_q(unsigned long long addr, unsigned long long data);
+#define WPC_DBRMODE 0x0d104008
+	static int first_call = 1;
+
+	lock_kernel();
+	if (first_call) {
+		/* Set WPC.DBRMODE to 0.  This makes all debug events get
+		 * delivered through RESVEC, i.e. into the handlers in entry.S.
+		 * (If the kernel was downloaded using a remote gdb, WPC.DBRMODE
+		 * would normally be left set to 1, which makes debug events get
+		 * delivered through DBRVEC, i.e. into the remote gdb's
+		 * handlers.  This prevents ptrace getting them, and confuses
+		 * the remote gdb.) */
+		printk("DBRMODE set to 0 to permit native debugging\n");
+		poke_real_address_q(WPC_DBRMODE, 0);
+		first_call = 0;
+	}
+	unlock_kernel();
+
+	return sys_ptrace(request, pid, addr, data);
+}
+
 asmlinkage void syscall_trace(void)
 {
 	struct task_struct *tsk = current;
diff --git a/arch/sh64/kernel/syscalls.S b/arch/sh64/kernel/syscalls.S
index a3d037805f1c..c0079d54c850 100644
--- a/arch/sh64/kernel/syscalls.S
+++ b/arch/sh64/kernel/syscalls.S
@@ -46,7 +46,7 @@ sys_call_table:
 	.long sys_setuid16
 	.long sys_getuid16
 	.long sys_stime			/* 25 */
-	.long sys_ptrace
+	.long sh64_ptrace
 	.long sys_alarm
 	.long sys_fstat
 	.long sys_pause
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 71af4d503899..98e09395c093 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -43,53 +43,10 @@ void ptrace_disable(struct task_struct *child)
 extern int peek_user(struct task_struct * child, long addr, long data);
 extern int poke_user(struct task_struct * child, long addr, long data);
 
-long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	int i, ret;
 
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-
-		ret = security_ptrace(current->parent, current);
-		if (ret)
- 			goto out;
-
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-#ifdef SUBACH_PTRACE_SPECIAL
-        SUBARCH_PTRACE_SPECIAL(child,request,addr,data);
-#endif
-
-	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (ret < 0)
-		goto out_tsk;
-
 	switch (request) {
 		/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
@@ -282,10 +239,7 @@ long sys_ptrace(long request, long pid, long addr, long data)
 		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
- out_tsk:
-	put_task_struct(child);
- out:
-	unlock_kernel();
+
 	return ret;
 }
 
diff --git a/arch/v850/kernel/ptrace.c b/arch/v850/kernel/ptrace.c
index d6077ff47d22..18492d02aaf6 100644
--- a/arch/v850/kernel/ptrace.c
+++ b/arch/v850/kernel/ptrace.c
@@ -113,45 +113,10 @@ static int set_single_step (struct task_struct *t, int val)
 	return 1;
 }
 
-long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	int rval;
 
-	lock_kernel();
-
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED) {
-			rval = -EPERM;
-			goto out;
-		}
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		rval = 0;
-		goto out;
-	}
-	rval = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	rval = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		rval = ptrace_attach(child);
-		goto out_tsk;
-	}
-	rval = ptrace_check_attach(child, request == PTRACE_KILL);
-	if (rval < 0)
-		goto out_tsk;
-
 	switch (request) {
 		unsigned long val, copied;
 
@@ -248,11 +213,7 @@ long sys_ptrace(long request, long pid, long addr, long data)
 		rval = -EIO;
 		goto out;
 	}
-
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
+ out:
 	return rval;
 }
 
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index bbf64b59a21e..a87b6cebe80f 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -313,48 +313,11 @@ static unsigned long getreg(struct task_struct *child, unsigned long regno)
 
 }
 
-asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	long i, ret;
 	unsigned ui;
 
-	/* This lock_kernel fixes a subtle race with suid exec */
-	lock_kernel();
-	ret = -EPERM;
-	if (request == PTRACE_TRACEME) {
-		/* are we already being traced? */
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-		ret = security_ptrace(current->parent, current);
-		if (ret)
-			goto out;
-		/* set the ptrace bit in the process flags. */
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out_tsk;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-	ret = ptrace_check_attach(child, request == PTRACE_KILL); 
-	if (ret < 0) 
-		goto out_tsk;
-
 	switch (request) {
 	/* when I and D space are separate, these will need to be fixed. */
 	case PTRACE_PEEKTEXT: /* read word at location addr. */ 
@@ -608,10 +571,6 @@ asmlinkage long sys_ptrace(long request, long pid, unsigned long addr, long data
 		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
 	return ret;
 }
 
diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c
index 14460743de07..ab5c4c65b5c4 100644
--- a/arch/xtensa/kernel/ptrace.c
+++ b/arch/xtensa/kernel/ptrace.c
@@ -45,58 +45,10 @@ void ptrace_disable(struct task_struct *child)
 	/* Nothing to do.. */
 }
 
-long sys_ptrace(long request, long pid, long addr, long data)
+long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct task_struct *child;
 	int ret = -EPERM;
 
-	lock_kernel();
-
-#if 0
-	if ((int)request != 1)
-	printk("ptrace(r=%d,pid=%d,addr=%08lx,data=%08lx)\n",
-	       (int) request, (int) pid, (unsigned long) addr,
-	       (unsigned long) data);
-#endif
-
-	if (request == PTRACE_TRACEME) {
-
-		/* Are we already being traced? */
-
-		if (current->ptrace & PT_PTRACED)
-			goto out;
-
-		if ((ret = security_ptrace(current->parent, current)))
-			goto out;
-
-		/* Set the ptrace bit in the process flags. */
-
-		current->ptrace |= PT_PTRACED;
-		ret = 0;
-		goto out;
-	}
-
-	ret = -ESRCH;
-	read_lock(&tasklist_lock);
-	child = find_task_by_pid(pid);
-	if (child)
-		get_task_struct(child);
-	read_unlock(&tasklist_lock);
-	if (!child)
-		goto out;
-
-	ret = -EPERM;
-	if (pid == 1)		/* you may not mess with init */
-		goto out;
-
-	if (request == PTRACE_ATTACH) {
-		ret = ptrace_attach(child);
-		goto out_tsk;
-	}
-
-	if ((ret = ptrace_check_attach(child, request == PTRACE_KILL)) < 0)
-		goto out_tsk;
-
 	switch (request) {
 	case PTRACE_PEEKTEXT: /* read word at location addr. */
 	case PTRACE_PEEKDATA:
@@ -375,10 +327,7 @@ long sys_ptrace(long request, long pid, long addr, long data)
 		ret = ptrace_request(child, request, addr, data);
 		goto out;
 	}
-out_tsk:
-	put_task_struct(child);
-out:
-	unlock_kernel();
+ out:
 	return ret;
 }
 
diff --git a/include/asm-alpha/ptrace.h b/include/asm-alpha/ptrace.h
index d462c5e14c13..072375c135b4 100644
--- a/include/asm-alpha/ptrace.h
+++ b/include/asm-alpha/ptrace.h
@@ -67,6 +67,9 @@ struct switch_stack {
 };
 
 #ifdef __KERNEL__
+
+#define __ARCH_SYS_PTRACE	1
+
 #define user_mode(regs) (((regs)->ps & 8) != 0)
 #define instruction_pointer(regs) ((regs)->pc)
 #define profile_pc(regs) instruction_pointer(regs)
diff --git a/include/asm-ia64/ptrace.h b/include/asm-ia64/ptrace.h
index a79d1a7ecc77..2c703d6e0c86 100644
--- a/include/asm-ia64/ptrace.h
+++ b/include/asm-ia64/ptrace.h
@@ -229,6 +229,9 @@ struct switch_stack {
 };
 
 #ifdef __KERNEL__
+
+#define __ARCH_SYS_PTRACE	1
+
 /*
  * We use the ia64_psr(regs)->ri to determine which of the three
  * instructions in bundle (16 bytes) took the sample. Generate
diff --git a/include/asm-m32r/ptrace.h b/include/asm-m32r/ptrace.h
index 976417126b2d..55cd7ecfde43 100644
--- a/include/asm-m32r/ptrace.h
+++ b/include/asm-m32r/ptrace.h
@@ -145,6 +145,9 @@ struct pt_regs {
 #define PTRACE_O_TRACESYSGOOD	0x00000001
 
 #ifdef __KERNEL__
+
+#define __ARCH_SYS_PTRACE	1
+
 #if defined(CONFIG_ISA_M32R2) || defined(CONFIG_CHIP_VDEC2)
 #define user_mode(regs) ((M32R_PSW_BPM & (regs)->psw) != 0)
 #elif defined(CONFIG_ISA_M32R)
diff --git a/include/asm-s390/ptrace.h b/include/asm-s390/ptrace.h
index fc7c96edc697..a949cc077cc7 100644
--- a/include/asm-s390/ptrace.h
+++ b/include/asm-s390/ptrace.h
@@ -468,6 +468,8 @@ struct user_regs_struct
 };
 
 #ifdef __KERNEL__
+#define __ARCH_SYS_PTRACE	1
+
 #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0)
 #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN)
 #define profile_pc(regs) instruction_pointer(regs)
diff --git a/include/asm-sparc/ptrace.h b/include/asm-sparc/ptrace.h
index a8ecb2d6977a..714497099a42 100644
--- a/include/asm-sparc/ptrace.h
+++ b/include/asm-sparc/ptrace.h
@@ -60,6 +60,9 @@ struct sparc_stackf {
 #define STACKFRAME_SZ sizeof(struct sparc_stackf)
 
 #ifdef __KERNEL__
+
+#define __ARCH_SYS_PTRACE	1
+
 #define user_mode(regs) (!((regs)->psr & PSR_PS))
 #define instruction_pointer(regs) ((regs)->pc)
 unsigned long profile_pc(struct pt_regs *);
diff --git a/include/asm-sparc64/ptrace.h b/include/asm-sparc64/ptrace.h
index 6194f771e9fc..7eba90c6c753 100644
--- a/include/asm-sparc64/ptrace.h
+++ b/include/asm-sparc64/ptrace.h
@@ -94,6 +94,9 @@ struct sparc_trapf {
 #define STACKFRAME32_SZ	sizeof(struct sparc_stackf32)
 
 #ifdef __KERNEL__
+
+#define __ARCH_SYS_PTRACE	1
+
 #define force_successful_syscall_return()	    \
 do {	current_thread_info()->syscall_noerror = 1; \
 } while (0)
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index dc6f3647bfbc..b2b3dba1298d 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -78,6 +78,8 @@
 #include <linux/compiler.h>		/* For unlikely.  */
 #include <linux/sched.h>		/* For struct task_struct.  */
 
+
+extern long arch_ptrace(struct task_struct *child, long request, long addr, long data);
 extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len);
 extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len);
 extern int ptrace_attach(struct task_struct *tsk);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 863eee8bff47..5b8dd98a230e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -406,3 +406,85 @@ int ptrace_request(struct task_struct *child, long request,
 
 	return ret;
 }
+
+#ifndef __ARCH_SYS_PTRACE
+static int ptrace_get_task_struct(long request, long pid,
+		struct task_struct **childp)
+{
+	struct task_struct *child;
+	int ret;
+
+	/*
+	 * Callers use child == NULL as an indication to exit early even
+	 * when the return value is 0, so make sure it is non-NULL here.
+	 */
+	*childp = NULL;
+
+	if (request == PTRACE_TRACEME) {
+		/*
+		 * Are we already being traced?
+		 */
+		if (current->ptrace & PT_PTRACED)
+			return -EPERM;
+		ret = security_ptrace(current->parent, current);
+		if (ret)
+			return -EPERM;
+		/*
+		 * Set the ptrace bit in the process ptrace flags.
+		 */
+		current->ptrace |= PT_PTRACED;
+		return 0;
+	}
+
+	/*
+	 * You may not mess with init
+	 */
+	if (pid == 1)
+		return -EPERM;
+
+	ret = -ESRCH;
+	read_lock(&tasklist_lock);
+	child = find_task_by_pid(pid);
+	if (child)
+		get_task_struct(child);
+	read_unlock(&tasklist_lock);
+	if (!child)
+		return -ESRCH;
+
+	*childp = child;
+	return 0;
+}
+
+asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
+{
+	struct task_struct *child;
+	long ret;
+
+	/*
+	 * This lock_kernel fixes a subtle race with suid exec
+	 */
+	lock_kernel();
+	ret = ptrace_get_task_struct(request, pid, &child);
+	if (!child)
+		goto out;
+
+	if (request == PTRACE_ATTACH) {
+		ret = ptrace_attach(child);
+		goto out;
+	}
+
+	ret = ptrace_check_attach(child, request == PTRACE_KILL);
+	if (ret < 0)
+		goto out_put_task_struct;
+
+	ret = arch_ptrace(child, request, addr, data);
+	if (ret < 0)
+		goto out_put_task_struct;
+
+ out_put_task_struct:
+	put_task_struct(child);
+ out:
+	unlock_kernel();
+	return ret;
+}
+#endif /* __ARCH_SYS_PTRACE */
-- 
cgit v1.2.3


From e65845235c8120be63001fc1a4ac00c819194bbe Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 7 Nov 2005 01:00:07 -0800
Subject: [PATCH] Kprobes: Track kprobe on a per_cpu basis - base changes

Changes to the base kprobe infrastructure to track kprobe execution on a
per-cpu basis.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kprobes.h | 31 ++++++++++++++++++++++---------
 kernel/kprobes.c        | 43 ++++++++++++++++++++++++++++---------------
 2 files changed, 50 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index e30afdca7917..6720305a31e8 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -33,6 +33,7 @@
 #include <linux/list.h>
 #include <linux/notifier.h>
 #include <linux/smp.h>
+#include <linux/percpu.h>
 
 #include <asm/kprobes.h>
 
@@ -106,6 +107,9 @@ struct jprobe {
 	kprobe_opcode_t *entry;	/* probe handling code to jump to */
 };
 
+DECLARE_PER_CPU(struct kprobe *, current_kprobe);
+DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
+
 #ifdef ARCH_SUPPORTS_KRETPROBES
 extern void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs);
 #else /* ARCH_SUPPORTS_KRETPROBES */
@@ -146,13 +150,6 @@ struct kretprobe_instance {
 void lock_kprobes(void);
 void unlock_kprobes(void);
 
-/* kprobe running now on this CPU? */
-static inline int kprobe_running(void)
-{
-	extern unsigned int kprobe_cpu;
-	return kprobe_cpu == smp_processor_id();
-}
-
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_copy_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
@@ -167,6 +164,22 @@ extern void free_insn_slot(kprobe_opcode_t *slot);
 struct kprobe *get_kprobe(void *addr);
 struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk);
 
+/* kprobe_running() will just return the current_kprobe on this CPU */
+static inline struct kprobe *kprobe_running(void)
+{
+	return (__get_cpu_var(current_kprobe));
+}
+
+static inline void reset_current_kprobe(void)
+{
+	__get_cpu_var(current_kprobe) = NULL;
+}
+
+static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void)
+{
+	return (&__get_cpu_var(kprobe_ctlblk));
+}
+
 int register_kprobe(struct kprobe *p);
 void unregister_kprobe(struct kprobe *p);
 int setjmp_pre_handler(struct kprobe *, struct pt_regs *);
@@ -183,9 +196,9 @@ void add_rp_inst(struct kretprobe_instance *ri);
 void kprobe_flush_task(struct task_struct *tk);
 void recycle_rp_inst(struct kretprobe_instance *ri);
 #else /* CONFIG_KPROBES */
-static inline int kprobe_running(void)
+static inline struct kprobe *kprobe_running(void)
 {
-	return 0;
+	return NULL;
 }
 static inline int register_kprobe(struct kprobe *p)
 {
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ce4915dd683a..6da8f9b33d1e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -51,7 +51,7 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
 unsigned int kprobe_cpu = NR_CPUS;
 static DEFINE_SPINLOCK(kprobe_lock);
-static struct kprobe *curr_kprobe;
+static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
 /*
  * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -188,6 +188,17 @@ void __kprobes unlock_kprobes(void)
  	local_irq_restore(flags);
 }
 
+/* We have preemption disabled.. so it is safe to use __ versions */
+static inline void set_kprobe_instance(struct kprobe *kp)
+{
+	__get_cpu_var(kprobe_instance) = kp;
+}
+
+static inline void reset_kprobe_instance(void)
+{
+	__get_cpu_var(kprobe_instance) = NULL;
+}
+
 /* You have to be holding the kprobe_lock */
 struct kprobe __kprobes *get_kprobe(void *addr)
 {
@@ -213,11 +224,11 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 
 	list_for_each_entry(kp, &p->list, list) {
 		if (kp->pre_handler) {
-			curr_kprobe = kp;
+			set_kprobe_instance(kp);
 			if (kp->pre_handler(kp, regs))
 				return 1;
 		}
-		curr_kprobe = NULL;
+		reset_kprobe_instance();
 	}
 	return 0;
 }
@@ -229,9 +240,9 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 
 	list_for_each_entry(kp, &p->list, list) {
 		if (kp->post_handler) {
-			curr_kprobe = kp;
+			set_kprobe_instance(kp);
 			kp->post_handler(kp, regs, flags);
-			curr_kprobe = NULL;
+			reset_kprobe_instance();
 		}
 	}
 	return;
@@ -240,12 +251,14 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
 					int trapnr)
 {
+	struct kprobe *cur = __get_cpu_var(kprobe_instance);
+
 	/*
 	 * if we faulted "during" the execution of a user specified
 	 * probe handler, invoke just that probe's fault handler
 	 */
-	if (curr_kprobe && curr_kprobe->fault_handler) {
-		if (curr_kprobe->fault_handler(curr_kprobe, regs, trapnr))
+	if (cur && cur->fault_handler) {
+		if (cur->fault_handler(cur, regs, trapnr))
 			return 1;
 	}
 	return 0;
@@ -253,15 +266,15 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
 
 static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 {
-	struct kprobe *kp = curr_kprobe;
-	if (curr_kprobe && kp->break_handler) {
-		if (kp->break_handler(kp, regs)) {
-			curr_kprobe = NULL;
-			return 1;
-		}
+	struct kprobe *cur = __get_cpu_var(kprobe_instance);
+	int ret = 0;
+
+	if (cur && cur->break_handler) {
+		if (cur->break_handler(cur, regs))
+			ret = 1;
 	}
-	curr_kprobe = NULL;
-	return 0;
+	reset_kprobe_instance();
+	return ret;
 }
 
 struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
-- 
cgit v1.2.3


From 3516a46042508a495fac13c2e73530d936ebe015 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 7 Nov 2005 01:00:13 -0800
Subject: [PATCH] Kprobes: Use RCU for (un)register synchronization - base
 changes

Changes to the base kprobes infrastructure to use RCU for synchronization
during kprobe registration and unregistration.  These changes coupled with the
arch kprobe changes (next in series):

a. serialize registration and unregistration of kprobes.
b. enable lockless execution of handlers. Handlers can now run in parallel.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/kprobes.h |   9 ++---
 kernel/kprobes.c        | 103 ++++++++++++++++++++----------------------------
 2 files changed, 46 insertions(+), 66 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 6720305a31e8..cff281cf70cf 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -34,6 +34,8 @@
 #include <linux/notifier.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
 
 #include <asm/kprobes.h>
 
@@ -146,10 +148,7 @@ struct kretprobe_instance {
 };
 
 #ifdef CONFIG_KPROBES
-/* Locks kprobe: irq must be disabled */
-void lock_kprobes(void);
-void unlock_kprobes(void);
-
+extern spinlock_t kretprobe_lock;
 extern int arch_prepare_kprobe(struct kprobe *p);
 extern void arch_copy_kprobe(struct kprobe *p);
 extern void arch_arm_kprobe(struct kprobe *p);
@@ -160,7 +159,7 @@ extern void show_registers(struct pt_regs *regs);
 extern kprobe_opcode_t *get_insn_slot(void);
 extern void free_insn_slot(kprobe_opcode_t *slot);
 
-/* Get the kprobe at this addr (if any).  Must have called lock_kprobes */
+/* Get the kprobe at this addr (if any) - called under a rcu_read_lock() */
 struct kprobe *get_kprobe(void *addr);
 struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6da8f9b33d1e..cfef426e4cdc 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -32,7 +32,6 @@
  *		<prasanna@in.ibm.com> added function-return probes.
  */
 #include <linux/kprobes.h>
-#include <linux/spinlock.h>
 #include <linux/hash.h>
 #include <linux/init.h>
 #include <linux/slab.h>
@@ -49,8 +48,8 @@
 static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
-unsigned int kprobe_cpu = NR_CPUS;
-static DEFINE_SPINLOCK(kprobe_lock);
+static DEFINE_SPINLOCK(kprobe_lock);	/* Protects kprobe_table */
+DEFINE_SPINLOCK(kretprobe_lock);	/* Protects kretprobe_inst_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
 
 /*
@@ -153,41 +152,6 @@ void __kprobes free_insn_slot(kprobe_opcode_t *slot)
 	}
 }
 
-/* Locks kprobe: irqs must be disabled */
-void __kprobes lock_kprobes(void)
-{
-	unsigned long flags = 0;
-
-	/* Avoiding local interrupts to happen right after we take the kprobe_lock
-	 * and before we get a chance to update kprobe_cpu, this to prevent
-	 * deadlock when we have a kprobe on ISR routine and a kprobe on task
-	 * routine
-	 */
-	local_irq_save(flags);
-
-	spin_lock(&kprobe_lock);
-	kprobe_cpu = smp_processor_id();
-
- 	local_irq_restore(flags);
-}
-
-void __kprobes unlock_kprobes(void)
-{
-	unsigned long flags = 0;
-
-	/* Avoiding local interrupts to happen right after we update
-	 * kprobe_cpu and before we get a a chance to release kprobe_lock,
-	 * this to prevent deadlock when we have a kprobe on ISR routine and
-	 * a kprobe on task routine
-	 */
-	local_irq_save(flags);
-
-	kprobe_cpu = NR_CPUS;
-	spin_unlock(&kprobe_lock);
-
- 	local_irq_restore(flags);
-}
-
 /* We have preemption disabled.. so it is safe to use __ versions */
 static inline void set_kprobe_instance(struct kprobe *kp)
 {
@@ -199,15 +163,20 @@ static inline void reset_kprobe_instance(void)
 	__get_cpu_var(kprobe_instance) = NULL;
 }
 
-/* You have to be holding the kprobe_lock */
+/*
+ * This routine is called either:
+ * 	- under the kprobe_lock spinlock - during kprobe_[un]register()
+ * 				OR
+ * 	- under an rcu_read_lock() - from arch/xxx/kernel/kprobes.c
+ */
 struct kprobe __kprobes *get_kprobe(void *addr)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
+	struct kprobe *p;
 
 	head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)];
-	hlist_for_each(node, head) {
-		struct kprobe *p = hlist_entry(node, struct kprobe, hlist);
+	hlist_for_each_entry_rcu(p, node, head, hlist) {
 		if (p->addr == addr)
 			return p;
 	}
@@ -222,7 +191,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 {
 	struct kprobe *kp;
 
-	list_for_each_entry(kp, &p->list, list) {
+	list_for_each_entry_rcu(kp, &p->list, list) {
 		if (kp->pre_handler) {
 			set_kprobe_instance(kp);
 			if (kp->pre_handler(kp, regs))
@@ -238,7 +207,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 {
 	struct kprobe *kp;
 
-	list_for_each_entry(kp, &p->list, list) {
+	list_for_each_entry_rcu(kp, &p->list, list) {
 		if (kp->post_handler) {
 			set_kprobe_instance(kp);
 			kp->post_handler(kp, regs, flags);
@@ -277,6 +246,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
 	return ret;
 }
 
+/* Called with kretprobe_lock held */
 struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
 {
 	struct hlist_node *node;
@@ -286,6 +256,7 @@ struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp)
 	return NULL;
 }
 
+/* Called with kretprobe_lock held */
 static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
 							      *rp)
 {
@@ -296,6 +267,7 @@ static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe
 	return NULL;
 }
 
+/* Called with kretprobe_lock held */
 void __kprobes add_rp_inst(struct kretprobe_instance *ri)
 {
 	/*
@@ -314,6 +286,7 @@ void __kprobes add_rp_inst(struct kretprobe_instance *ri)
 	hlist_add_head(&ri->uflist, &ri->rp->used_instances);
 }
 
+/* Called with kretprobe_lock held */
 void __kprobes recycle_rp_inst(struct kretprobe_instance *ri)
 {
 	/* remove rp inst off the rprobe_inst_table */
@@ -347,13 +320,13 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
 	struct hlist_node *node, *tmp;
 	unsigned long flags = 0;
 
-	spin_lock_irqsave(&kprobe_lock, flags);
+	spin_lock_irqsave(&kretprobe_lock, flags);
         head = kretprobe_inst_table_head(current);
         hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
                 if (ri->task == tk)
                         recycle_rp_inst(ri);
         }
-	spin_unlock_irqrestore(&kprobe_lock, flags);
+	spin_unlock_irqrestore(&kretprobe_lock, flags);
 }
 
 /*
@@ -364,9 +337,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 					   struct pt_regs *regs)
 {
 	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+	unsigned long flags = 0;
 
 	/*TODO: consider to only swap the RA after the last pre_handler fired */
+	spin_lock_irqsave(&kretprobe_lock, flags);
 	arch_prepare_kretprobe(rp, regs);
+	spin_unlock_irqrestore(&kretprobe_lock, flags);
 	return 0;
 }
 
@@ -397,13 +373,13 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
         struct kprobe *kp;
 
 	if (p->break_handler) {
-		list_for_each_entry(kp, &old_p->list, list) {
+		list_for_each_entry_rcu(kp, &old_p->list, list) {
 			if (kp->break_handler)
 				return -EEXIST;
 		}
-		list_add_tail(&p->list, &old_p->list);
+		list_add_tail_rcu(&p->list, &old_p->list);
 	} else
-		list_add(&p->list, &old_p->list);
+		list_add_rcu(&p->list, &old_p->list);
 	return 0;
 }
 
@@ -421,18 +397,18 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 	ap->break_handler = aggr_break_handler;
 
 	INIT_LIST_HEAD(&ap->list);
-	list_add(&p->list, &ap->list);
+	list_add_rcu(&p->list, &ap->list);
 
 	INIT_HLIST_NODE(&ap->hlist);
-	hlist_del(&p->hlist);
-	hlist_add_head(&ap->hlist,
+	hlist_del_rcu(&p->hlist);
+	hlist_add_head_rcu(&ap->hlist,
 		&kprobe_table[hash_ptr(ap->addr, KPROBE_HASH_BITS)]);
 }
 
 /*
  * This is the second or subsequent kprobe at the address - handle
  * the intricacies
- * TODO: Move kcalloc outside the spinlock
+ * TODO: Move kcalloc outside the spin_lock
  */
 static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 					  struct kprobe *p)
@@ -458,7 +434,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
 {
 	arch_disarm_kprobe(p);
-	hlist_del(&p->hlist);
+	hlist_del_rcu(&p->hlist);
 	spin_unlock_irqrestore(&kprobe_lock, flags);
 	arch_remove_kprobe(p);
 }
@@ -466,11 +442,10 @@ static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
 static inline void cleanup_aggr_kprobe(struct kprobe *old_p,
 		struct kprobe *p, unsigned long flags)
 {
-	list_del(&p->list);
-	if (list_empty(&old_p->list)) {
+	list_del_rcu(&p->list);
+	if (list_empty(&old_p->list))
 		cleanup_kprobe(old_p, flags);
-		kfree(old_p);
-	} else
+	else
 		spin_unlock_irqrestore(&kprobe_lock, flags);
 }
 
@@ -493,9 +468,9 @@ int __kprobes register_kprobe(struct kprobe *p)
 	if ((ret = arch_prepare_kprobe(p)) != 0)
 		goto rm_kprobe;
 
+	p->nmissed = 0;
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
-	p->nmissed = 0;
 	if (old_p) {
 		ret = register_aggr_kprobe(old_p, p);
 		goto out;
@@ -503,7 +478,7 @@ int __kprobes register_kprobe(struct kprobe *p)
 
 	arch_copy_kprobe(p);
 	INIT_HLIST_NODE(&p->hlist);
-	hlist_add_head(&p->hlist,
+	hlist_add_head_rcu(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
   	arch_arm_kprobe(p);
@@ -524,10 +499,16 @@ void __kprobes unregister_kprobe(struct kprobe *p)
 	spin_lock_irqsave(&kprobe_lock, flags);
 	old_p = get_kprobe(p->addr);
 	if (old_p) {
+		/* cleanup_*_kprobe() does the spin_unlock_irqrestore */
 		if (old_p->pre_handler == aggr_pre_handler)
 			cleanup_aggr_kprobe(old_p, p, flags);
 		else
 			cleanup_kprobe(p, flags);
+
+		synchronize_sched();
+		if (old_p->pre_handler == aggr_pre_handler &&
+				list_empty(&old_p->list))
+			kfree(old_p);
 	} else
 		spin_unlock_irqrestore(&kprobe_lock, flags);
 }
@@ -604,13 +585,13 @@ void __kprobes unregister_kretprobe(struct kretprobe *rp)
 
 	unregister_kprobe(&rp->kp);
 	/* No race here */
-	spin_lock_irqsave(&kprobe_lock, flags);
+	spin_lock_irqsave(&kretprobe_lock, flags);
 	free_rp_inst(rp);
 	while ((ri = get_used_rp_inst(rp)) != NULL) {
 		ri->rp = NULL;
 		hlist_del(&ri->uflist);
 	}
-	spin_unlock_irqrestore(&kprobe_lock, flags);
+	spin_unlock_irqrestore(&kretprobe_lock, flags);
 }
 
 static int __init init_kprobes(void)
-- 
cgit v1.2.3


From d217d5450f11d8c907c0458d175b0dc999b4d06d Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Mon, 7 Nov 2005 01:00:14 -0800
Subject: [PATCH] Kprobes: preempt_disable/enable() simplification

Reorganize the preempt_disable/enable calls to eliminate the extra preempt
depth.  Changes based on Paul McKenney's review suggestions for the kprobes
RCU changeset.

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Signed-off-by: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/kernel/kprobes.c    | 25 +++++++++++++++----------
 arch/ia64/kernel/kprobes.c    | 37 ++++++++++++++++++++-----------------
 arch/ppc64/kernel/kprobes.c   | 25 +++++++++++++++----------
 arch/sparc64/kernel/kprobes.c | 21 +++++++++++++--------
 arch/x86_64/kernel/kprobes.c  | 29 +++++++++++++++--------------
 include/linux/kprobes.h       |  2 +-
 kernel/kprobes.c              |  2 +-
 7 files changed, 80 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/kernel/kprobes.c b/arch/i386/kernel/kprobes.c
index ad469299267a..32b0c24ab9a6 100644
--- a/arch/i386/kernel/kprobes.c
+++ b/arch/i386/kernel/kprobes.c
@@ -153,7 +153,14 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 	int ret = 0;
 	kprobe_opcode_t *addr = NULL;
 	unsigned long *lp;
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	struct kprobe_ctlblk *kcb;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
 
 	/* Check if the application is using LDT entry for its code segment and
 	 * calculate the address by reading the base address from the LDT entry.
@@ -221,11 +228,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 		goto no_kprobe;
 	}
 
-	/*
-	 * This preempt_disable() matches the preempt_enable_no_resched()
-	 * in post_kprobe_handler()
-	 */
-	preempt_disable();
 	set_current_kprobe(p, regs, kcb);
 	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 
@@ -239,6 +241,7 @@ ss_probe:
 	return 1;
 
 no_kprobe:
+	preempt_enable_no_resched();
 	return ret;
 }
 
@@ -310,8 +313,8 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 
 	/*
 	 * By returning a non-zero value, we are telling
-	 * kprobe_handler() that we have handled unlocking
-	 * and re-enabling preemption
+	 * kprobe_handler() that we don't want the post_handler
+	 * to run (and have re-enabled preemption)
 	 */
         return 1;
 }
@@ -455,7 +458,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
 
-	rcu_read_lock();
 	switch (val) {
 	case DIE_INT3:
 		if (kprobe_handler(args->regs))
@@ -467,14 +469,16 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 		break;
 	case DIE_GPF:
 	case DIE_PAGE_FAULT:
+		/* kprobe_running() needs smp_processor_id() */
+		preempt_disable();
 		if (kprobe_running() &&
 		    kprobe_fault_handler(args->regs, args->trapnr))
 			ret = NOTIFY_STOP;
+		preempt_enable();
 		break;
 	default:
 		break;
 	}
-	rcu_read_unlock();
 	return ret;
 }
 
@@ -537,6 +541,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 		*regs = kcb->jprobe_saved_regs;
 		memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
 		       MIN_STACK_SIZE(stack_addr));
+		preempt_enable_no_resched();
 		return 1;
 	}
 	return 0;
diff --git a/arch/ia64/kernel/kprobes.c b/arch/ia64/kernel/kprobes.c
index fddbac32d44a..96736a119c91 100644
--- a/arch/ia64/kernel/kprobes.c
+++ b/arch/ia64/kernel/kprobes.c
@@ -389,11 +389,11 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 	spin_unlock_irqrestore(&kretprobe_lock, flags);
 	preempt_enable_no_resched();
 
-        /*
-         * By returning a non-zero value, we are telling
-         * kprobe_handler() that we have handled unlocking
-	 * and re-enabling preemption
-         */
+	/*
+	 * By returning a non-zero value, we are telling
+	 * kprobe_handler() that we don't want the post_handler
+	 * to run (and have re-enabled preemption)
+	 */
         return 1;
 }
 
@@ -604,7 +604,14 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 	int ret = 0;
 	struct pt_regs *regs = args->regs;
 	kprobe_opcode_t *addr = (kprobe_opcode_t *)instruction_pointer(regs);
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	struct kprobe_ctlblk *kcb;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
 
 	/* Handle recursion cases */
 	if (kprobe_running()) {
@@ -659,11 +666,6 @@ static int __kprobes pre_kprobes_handler(struct die_args *args)
 		goto no_kprobe;
 	}
 
-	/*
-	 * This preempt_disable() matches the preempt_enable_no_resched()
-	 * in post_kprobes_handler()
-	 */
-	preempt_disable();
 	set_current_kprobe(p, kcb);
 	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 
@@ -681,6 +683,7 @@ ss_probe:
 	return 1;
 
 no_kprobe:
+	preempt_enable_no_resched();
 	return ret;
 }
 
@@ -716,9 +719,6 @@ static int __kprobes kprobes_fault_handler(struct pt_regs *regs, int trapnr)
 	struct kprobe *cur = kprobe_running();
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
-	if (!cur)
-		return 0;
-
 	if (cur->fault_handler && cur->fault_handler(cur, regs, trapnr))
 		return 1;
 
@@ -737,7 +737,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
 
-	rcu_read_lock();
 	switch(val) {
 	case DIE_BREAK:
 		if (pre_kprobes_handler(args))
@@ -748,12 +747,15 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 			ret = NOTIFY_STOP;
 		break;
 	case DIE_PAGE_FAULT:
-		if (kprobes_fault_handler(args->regs, args->trapnr))
+		/* kprobe_running() needs smp_processor_id() */
+		preempt_disable();
+		if (kprobe_running() &&
+			kprobes_fault_handler(args->regs, args->trapnr))
 			ret = NOTIFY_STOP;
+		preempt_enable();
 	default:
 		break;
 	}
-	rcu_read_unlock();
 	return ret;
 }
 
@@ -785,6 +787,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
 
 	*regs = kcb->jprobe_saved_regs;
+	preempt_enable_no_resched();
 	return 1;
 }
 
diff --git a/arch/ppc64/kernel/kprobes.c b/arch/ppc64/kernel/kprobes.c
index e0a25b35437f..511af54e6230 100644
--- a/arch/ppc64/kernel/kprobes.c
+++ b/arch/ppc64/kernel/kprobes.c
@@ -148,7 +148,14 @@ static inline int kprobe_handler(struct pt_regs *regs)
 	struct kprobe *p;
 	int ret = 0;
 	unsigned int *addr = (unsigned int *)regs->nip;
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	struct kprobe_ctlblk *kcb;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
 
 	/* Check we're not actually recursing */
 	if (kprobe_running()) {
@@ -207,11 +214,6 @@ static inline int kprobe_handler(struct pt_regs *regs)
 		goto no_kprobe;
 	}
 
-	/*
-	 * This preempt_disable() matches the preempt_enable_no_resched()
-	 * in post_kprobe_handler().
-	 */
-	preempt_disable();
 	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 	set_current_kprobe(p, regs, kcb);
 	if (p->pre_handler && p->pre_handler(p, regs))
@@ -224,6 +226,7 @@ ss_probe:
 	return 1;
 
 no_kprobe:
+	preempt_enable_no_resched();
 	return ret;
 }
 
@@ -296,8 +299,8 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 
         /*
          * By returning a non-zero value, we are telling
-         * kprobe_handler() that we have handled unlocking
-         * and re-enabling preemption.
+         * kprobe_handler() that we don't want the post_handler
+         * to run (and have re-enabled preemption)
          */
         return 1;
 }
@@ -385,7 +388,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
 
-	rcu_read_lock();
 	switch (val) {
 	case DIE_BPT:
 		if (kprobe_handler(args->regs))
@@ -396,14 +398,16 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 			ret = NOTIFY_STOP;
 		break;
 	case DIE_PAGE_FAULT:
+		/* kprobe_running() needs smp_processor_id() */
+		preempt_disable();
 		if (kprobe_running() &&
 		    kprobe_fault_handler(args->regs, args->trapnr))
 			ret = NOTIFY_STOP;
+		preempt_enable();
 		break;
 	default:
 		break;
 	}
-	rcu_read_unlock();
 	return ret;
 }
 
@@ -440,6 +444,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 	 * saved regs...
 	 */
 	memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs));
+	preempt_enable_no_resched();
 	return 1;
 }
 
diff --git a/arch/sparc64/kernel/kprobes.c b/arch/sparc64/kernel/kprobes.c
index 58a815e90373..96bd09b098f4 100644
--- a/arch/sparc64/kernel/kprobes.c
+++ b/arch/sparc64/kernel/kprobes.c
@@ -113,7 +113,14 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 	struct kprobe *p;
 	void *addr = (void *) regs->tpc;
 	int ret = 0;
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	struct kprobe_ctlblk *kcb;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
 
 	if (kprobe_running()) {
 		p = get_kprobe(addr);
@@ -159,11 +166,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
 		goto no_kprobe;
 	}
 
-	/*
-	 * This preempt_disable() matches the preempt_enable_no_resched()
-	 * in post_kprobes_handler()
-	 */
-	preempt_disable();
 	set_current_kprobe(p, regs, kcb);
 	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 	if (p->pre_handler && p->pre_handler(p, regs))
@@ -175,6 +177,7 @@ ss_probe:
 	return 1;
 
 no_kprobe:
+	preempt_enable_no_resched();
 	return ret;
 }
 
@@ -321,7 +324,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
 
-	rcu_read_lock();
 	switch (val) {
 	case DIE_DEBUG:
 		if (kprobe_handler(args->regs))
@@ -333,14 +335,16 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 		break;
 	case DIE_GPF:
 	case DIE_PAGE_FAULT:
+		/* kprobe_running() needs smp_processor_id() */
+		preempt_disable();
 		if (kprobe_running() &&
 		    kprobe_fault_handler(args->regs, args->trapnr))
 			ret = NOTIFY_STOP;
+		preempt_enable();
 		break;
 	default:
 		break;
 	}
-	rcu_read_unlock();
 	return ret;
 }
 
@@ -426,6 +430,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 		       &(kcb->jprobe_saved_stack),
 		       sizeof(kcb->jprobe_saved_stack));
 
+		preempt_enable_no_resched();
 		return 1;
 	}
 	return 0;
diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c
index 9bef2c8dc12c..dddeb678b440 100644
--- a/arch/x86_64/kernel/kprobes.c
+++ b/arch/x86_64/kernel/kprobes.c
@@ -286,16 +286,19 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe *rp,
         }
 }
 
-/*
- * Interrupts are disabled on entry as trap3 is an interrupt gate and they
- * remain disabled thorough out this function.
- */
 int __kprobes kprobe_handler(struct pt_regs *regs)
 {
 	struct kprobe *p;
 	int ret = 0;
 	kprobe_opcode_t *addr = (kprobe_opcode_t *)(regs->rip - sizeof(kprobe_opcode_t));
-	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+	struct kprobe_ctlblk *kcb;
+
+	/*
+	 * We don't want to be preempted for the entire
+	 * duration of kprobe processing
+	 */
+	preempt_disable();
+	kcb = get_kprobe_ctlblk();
 
 	/* Check we're not actually recursing */
 	if (kprobe_running()) {
@@ -359,11 +362,6 @@ int __kprobes kprobe_handler(struct pt_regs *regs)
 		goto no_kprobe;
 	}
 
-	/*
-	 * This preempt_disable() matches the preempt_enable_no_resched()
-	 * in post_kprobe_handler()
-	 */
-	preempt_disable();
 	set_current_kprobe(p, regs, kcb);
 	kcb->kprobe_status = KPROBE_HIT_ACTIVE;
 
@@ -377,6 +375,7 @@ ss_probe:
 	return 1;
 
 no_kprobe:
+	preempt_enable_no_resched();
 	return ret;
 }
 
@@ -448,8 +447,8 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs)
 
         /*
          * By returning a non-zero value, we are telling
-         * kprobe_handler() that we have handled unlocking
-	 * and re-enabling preemption
+         * kprobe_handler() that we don't want the post_handler
+	 * to run (and have re-enabled preemption)
          */
         return 1;
 }
@@ -594,7 +593,6 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 	struct die_args *args = (struct die_args *)data;
 	int ret = NOTIFY_DONE;
 
-	rcu_read_lock();
 	switch (val) {
 	case DIE_INT3:
 		if (kprobe_handler(args->regs))
@@ -606,14 +604,16 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
 		break;
 	case DIE_GPF:
 	case DIE_PAGE_FAULT:
+		/* kprobe_running() needs smp_processor_id() */
+		preempt_disable();
 		if (kprobe_running() &&
 		    kprobe_fault_handler(args->regs, args->trapnr))
 			ret = NOTIFY_STOP;
+		preempt_enable();
 		break;
 	default:
 		break;
 	}
-	rcu_read_unlock();
 	return ret;
 }
 
@@ -675,6 +675,7 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
 		*regs = kcb->jprobe_saved_regs;
 		memcpy((kprobe_opcode_t *) stack_addr, kcb->jprobes_stack,
 		       MIN_STACK_SIZE(stack_addr));
+		preempt_enable_no_resched();
 		return 1;
 	}
 	return 0;
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index cff281cf70cf..e373c4a9de53 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -159,7 +159,7 @@ extern void show_registers(struct pt_regs *regs);
 extern kprobe_opcode_t *get_insn_slot(void);
 extern void free_insn_slot(kprobe_opcode_t *slot);
 
-/* Get the kprobe at this addr (if any) - called under a rcu_read_lock() */
+/* Get the kprobe at this addr (if any) - called with preemption disabled */
 struct kprobe *get_kprobe(void *addr);
 struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk);
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index cfef426e4cdc..5beda378cc75 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -167,7 +167,7 @@ static inline void reset_kprobe_instance(void)
  * This routine is called either:
  * 	- under the kprobe_lock spinlock - during kprobe_[un]register()
  * 				OR
- * 	- under an rcu_read_lock() - from arch/xxx/kernel/kprobes.c
+ * 	- with preemption disabled - from arch/xxx/kernel/kprobes.c
  */
 struct kprobe __kprobes *get_kprobe(void *addr)
 {
-- 
cgit v1.2.3


From 1e5d533142c1c178a31d4cc81837eb078f9269bc Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Mon, 7 Nov 2005 01:01:06 -0800
Subject: [PATCH] more kernel-doc cleanups, additions

Various core kernel-doc cleanups:
- add missing function parameters in ipc, irq/manage, kernel/sys,
  kernel/sysctl, and mm/slab;
- move description to just above function for kernel_restart()

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 ipc/util.c          |  9 +++++----
 kernel/irq/manage.c |  1 +
 kernel/sys.c        | 15 +++++++++------
 kernel/sysctl.c     |  1 +
 mm/slab.c           |  1 +
 5 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/ipc/util.c b/ipc/util.c
index 10e836d0d89e..23f1cec150c1 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -410,7 +410,8 @@ void ipc_rcu_getref(void *ptr)
 }
 
 /**
- *	ipc_schedule_free	- free ipc + rcu space
+ * ipc_schedule_free - free ipc + rcu space
+ * @head: RCU callback structure for queued work
  * 
  * Since RCU callback function is called in bh,
  * we need to defer the vfree to schedule_work
@@ -427,10 +428,10 @@ static void ipc_schedule_free(struct rcu_head *head)
 }
 
 /**
- *	ipc_immediate_free	- free ipc + rcu space
- *
- *	Free from the RCU callback context
+ * ipc_immediate_free - free ipc + rcu space
+ * @head: RCU callback structure that contains pointer to be freed
  *
+ * Free from the RCU callback context
  */
 static void ipc_immediate_free(struct rcu_head *head)
 {
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1cfdb08ddf20..3bd7226d15fa 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -24,6 +24,7 @@ cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
 
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ *	@irq: interrupt number to wait for
  *
  *	This function waits for any pending IRQ handlers for this interrupt
  *	to complete before returning. If you use this function while
diff --git a/kernel/sys.c b/kernel/sys.c
index 1e1f41b3fdf6..3e332131000e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -376,18 +376,21 @@ void emergency_restart(void)
 }
 EXPORT_SYMBOL_GPL(emergency_restart);
 
-/**
- *	kernel_restart - reboot the system
- *
- *	Shutdown everything and perform a clean reboot.
- *	This is not safe to call in interrupt context.
- */
 void kernel_restart_prepare(char *cmd)
 {
 	notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
 	device_shutdown();
 }
+
+/**
+ *	kernel_restart - reboot the system
+ *	@cmd: pointer to buffer containing command to execute for restart
+ *		or NULL
+ *
+ *	Shutdown everything and perform a clean reboot.
+ *	This is not safe to call in interrupt context.
+ */
 void kernel_restart(char *cmd)
 {
 	kernel_restart_prepare(cmd);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e1351200ce85..c4f35f96884d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1997,6 +1997,7 @@ int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
  * @filp: the file structure
  * @buffer: the user buffer
  * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
  *
  * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
  * values from/to the user buffer, treated as an ASCII string. 
diff --git a/mm/slab.c b/mm/slab.c
index 1db4d7313853..e291f5e1afbb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3263,6 +3263,7 @@ static void drain_array_locked(kmem_cache_t *cachep,
 
 /**
  * cache_reap - Reclaim memory from caches.
+ * @unused: unused parameter
  *
  * Called from workqueue/eventd every few seconds.
  * Purpose:
-- 
cgit v1.2.3


From b8887e6e8c04bcefb512cdb08fc7e9c310ac847e Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Mon, 7 Nov 2005 01:01:07 -0800
Subject: [PATCH] kernel-docs: fix kernel-doc format problems

Convert to proper kernel-doc format.

Some have extra blank lines (not allowed immed.  after the function name)
or need blank lines (after all parameters).  Function summary must be only
one line.

Colon (":") in a function description does weird things (causes kernel-doc
to think that it's a new section head sadly).

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/block/ll_rw_blk.c | 1 -
 fs/fs-writeback.c         | 5 +++--
 include/linux/kernel.h    | 1 -
 kernel/sys.c              | 2 +-
 4 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 2747741677fb..5f52e30b43f8 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -706,7 +706,6 @@ EXPORT_SYMBOL(blk_queue_dma_alignment);
 
 /**
  * blk_queue_find_tag - find a request by its tag and queue
- *
  * @q:	 The request queue for the device
  * @tag: The tag of the request
  *
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1361a4a64157..785c7213a54f 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -606,7 +606,7 @@ EXPORT_SYMBOL(sync_inode);
  * O_SYNC flag set, to flush dirty writes to disk.
  *
  * @what is a bitmask, specifying which part of the inode's data should be
- * written and waited upon:
+ * written and waited upon.
  *
  *    OSYNC_DATA:     i_mapping's dirty data
  *    OSYNC_METADATA: the buffers at i_mapping->private_list
@@ -672,8 +672,9 @@ int writeback_acquire(struct backing_dev_info *bdi)
 
 /**
  * writeback_in_progress: determine whether there is writeback in progress
- *                        against a backing device.
  * @bdi: the device's backing_dev_info structure.
+ *
+ * Determine whether there is writeback in progress against a backing device.
  */
 int writeback_in_progress(struct backing_dev_info *bdi)
 {
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f1925ccc9fe1..b6419489b27b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -266,7 +266,6 @@ extern void dump_stack(void);
 
 /**
  * container_of - cast a member of a structure out to the containing structure
- *
  * @ptr:	the pointer to the member.
  * @type:	the type of the container struct this is embedded in.
  * @member:	the name of the member within the struct.
diff --git a/kernel/sys.c b/kernel/sys.c
index 3e332131000e..bce933ebb29f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -386,7 +386,7 @@ void kernel_restart_prepare(char *cmd)
 /**
  *	kernel_restart - reboot the system
  *	@cmd: pointer to buffer containing command to execute for restart
- *		or NULL
+ *		or %NULL
  *
  *	Shutdown everything and perform a clean reboot.
  *	This is not safe to call in interrupt context.
-- 
cgit v1.2.3


From 47bdfb96de47d25bea423b5adbfe1c2e1ceaa296 Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 7 Nov 2005 01:01:39 -0800
Subject: [PATCH] unexport console_unblank

I didn't find any possible modular usage of console_unblank in the
kernel.

This patch was already ACK'ed by Alan Cox.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 3cb9708209bc..e9be027bc930 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -806,7 +806,6 @@ void console_unblank(void)
 			c->unblank();
 	release_console_sem();
 }
-EXPORT_SYMBOL(console_unblank);
 
 /*
  * Return the console tty driver structure and its associated index
-- 
cgit v1.2.3


From 4664957b8ec78533f542900cecf7c38fbdc0d8da Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 7 Nov 2005 01:01:42 -0800
Subject: [PATCH] unexport idle_cpu

I didn't find any possible modular usage in the kernel.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 013f1448006b..3ce26954be12 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3563,8 +3563,6 @@ int idle_cpu(int cpu)
 	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
 }
 
-EXPORT_SYMBOL_GPL(idle_cpu);
-
 /**
  * idle_task - return the idle task for a given cpu.
  * @cpu: the processor in question.
-- 
cgit v1.2.3


From b26b9bc58263acda274f82a9dde8b6d96559878a Mon Sep 17 00:00:00 2001
From: Adrian Bunk <bunk@stusta.de>
Date: Mon, 7 Nov 2005 01:01:43 -0800
Subject: [PATCH] unexport uts_sem

I didn't find any possible modular usage in the kernel.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sys.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index bce933ebb29f..c43b3e22bbda 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1497,8 +1497,6 @@ EXPORT_SYMBOL(in_egroup_p);
 
 DECLARE_RWSEM(uts_sem);
 
-EXPORT_SYMBOL(uts_sem);
-
 asmlinkage long sys_newuname(struct new_utsname __user * name)
 {
 	int errno = 0;
-- 
cgit v1.2.3


From dedeb0029b9c83420fc1337d4ee53daa7b2a0ad4 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 7 Nov 2005 14:09:01 -0800
Subject: [SPARC64] mm: context switch ptlock

sparc64 is unique among architectures in taking the page_table_lock in
its context switch (well, cris does too, but erroneously, and it's not
yet SMP anyway).

This seems to be a private affair between switch_mm and activate_mm,
using page_table_lock as a per-mm lock, without any relation to its uses
elsewhere.  That's fine, but comment it as such; and unlock sooner in
switch_mm, more like in activate_mm (preemption is disabled here).

There is a block of "if (0)"ed code in smp_flush_tlb_pending which would
have liked to rely on the page_table_lock, in switch_mm and elsewhere;
but its comment explains how dup_mmap's flush_tlb_mm defeated it.  And
though that could have been changed at any time over the past few years,
now the chance vanishes as we push the page_table_lock downwards, and
perhaps split it per page table page.  Just delete that block of code.

Which leaves the mysterious spin_unlock_wait(&oldmm->page_table_lock)
in kernel/fork.c copy_mm.  Textual analysis (supported by Nick Piggin)
suggests that the comment was written by DaveM, and that it relates to
the defeated approach in the sparc64 smp_flush_tlb_pending.  Just delete
this block too.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/sparc64/kernel/smp.c         | 31 +++++---------------------
 include/asm-sparc64/mmu_context.h | 46 ++++++++++++++++++++-------------------
 kernel/fork.c                     |  7 ------
 3 files changed, 29 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index b137fd63f5e1..a9089e2140e9 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -883,34 +883,13 @@ void smp_flush_tlb_pending(struct mm_struct *mm, unsigned long nr, unsigned long
 	u32 ctx = CTX_HWBITS(mm->context);
 	int cpu = get_cpu();
 
-	if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1) {
+	if (mm == current->active_mm && atomic_read(&mm->mm_users) == 1)
 		mm->cpu_vm_mask = cpumask_of_cpu(cpu);
-		goto local_flush_and_out;
-	} else {
-		/* This optimization is not valid.  Normally
-		 * we will be holding the page_table_lock, but
-		 * there is an exception which is copy_page_range()
-		 * when forking.  The lock is held during the individual
-		 * page table updates in the parent, but not at the
-		 * top level, which is where we are invoked.
-		 */
-		if (0) {
-			cpumask_t this_cpu_mask = cpumask_of_cpu(cpu);
-
-			/* By virtue of running under the mm->page_table_lock,
-			 * and mmu_context.h:switch_mm doing the same, the
-			 * following operation is safe.
-			 */
-			if (cpus_equal(mm->cpu_vm_mask, this_cpu_mask))
-				goto local_flush_and_out;
-		}
-	}
-
-	smp_cross_call_masked(&xcall_flush_tlb_pending,
-			      ctx, nr, (unsigned long) vaddrs,
-			      mm->cpu_vm_mask);
+	else
+		smp_cross_call_masked(&xcall_flush_tlb_pending,
+				      ctx, nr, (unsigned long) vaddrs,
+				      mm->cpu_vm_mask);
 
-local_flush_and_out:
 	__flush_tlb_pending(ctx, nr, vaddrs);
 
 	put_cpu();
diff --git a/include/asm-sparc64/mmu_context.h b/include/asm-sparc64/mmu_context.h
index 87c43c67866e..08ba72d7722c 100644
--- a/include/asm-sparc64/mmu_context.h
+++ b/include/asm-sparc64/mmu_context.h
@@ -87,37 +87,35 @@ extern void __flush_tlb_mm(unsigned long, unsigned long);
 static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, struct task_struct *tsk)
 {
 	unsigned long ctx_valid;
+	int cpu;
 
+	/* Note: page_table_lock is used here to serialize switch_mm
+	 * and activate_mm, and their calls to get_new_mmu_context.
+	 * This use of page_table_lock is unrelated to its other uses.
+	 */ 
 	spin_lock(&mm->page_table_lock);
-	if (CTX_VALID(mm->context))
-		ctx_valid = 1;
-        else
-		ctx_valid = 0;
+	ctx_valid = CTX_VALID(mm->context);
+	if (!ctx_valid)
+		get_new_mmu_context(mm);
+	spin_unlock(&mm->page_table_lock);
 
 	if (!ctx_valid || (old_mm != mm)) {
-		if (!ctx_valid)
-			get_new_mmu_context(mm);
-
 		load_secondary_context(mm);
 		reload_tlbmiss_state(tsk, mm);
 	}
 
-	{
-		int cpu = smp_processor_id();
-
-		/* Even if (mm == old_mm) we _must_ check
-		 * the cpu_vm_mask.  If we do not we could
-		 * corrupt the TLB state because of how
-		 * smp_flush_tlb_{page,range,mm} on sparc64
-		 * and lazy tlb switches work. -DaveM
-		 */
-		if (!ctx_valid || !cpu_isset(cpu, mm->cpu_vm_mask)) {
-			cpu_set(cpu, mm->cpu_vm_mask);
-			__flush_tlb_mm(CTX_HWBITS(mm->context),
-				       SECONDARY_CONTEXT);
-		}
+	/* Even if (mm == old_mm) we _must_ check
+	 * the cpu_vm_mask.  If we do not we could
+	 * corrupt the TLB state because of how
+	 * smp_flush_tlb_{page,range,mm} on sparc64
+	 * and lazy tlb switches work. -DaveM
+	 */
+	cpu = smp_processor_id();
+	if (!ctx_valid || !cpu_isset(cpu, mm->cpu_vm_mask)) {
+		cpu_set(cpu, mm->cpu_vm_mask);
+		__flush_tlb_mm(CTX_HWBITS(mm->context),
+			       SECONDARY_CONTEXT);
 	}
-	spin_unlock(&mm->page_table_lock);
 }
 
 #define deactivate_mm(tsk,mm)	do { } while (0)
@@ -127,6 +125,10 @@ static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm
 {
 	int cpu;
 
+	/* Note: page_table_lock is used here to serialize switch_mm
+	 * and activate_mm, and their calls to get_new_mmu_context.
+	 * This use of page_table_lock is unrelated to its other uses.
+	 */ 
 	spin_lock(&mm->page_table_lock);
 	if (!CTX_VALID(mm->context))
 		get_new_mmu_context(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index efac2c58ec7d..158710d22566 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -470,13 +470,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 	if (clone_flags & CLONE_VM) {
 		atomic_inc(&oldmm->mm_users);
 		mm = oldmm;
-		/*
-		 * There are cases where the PTL is held to ensure no
-		 * new threads start up in user mode using an mm, which
-		 * allows optimizing out ipis; the tlb_gather_mmu code
-		 * is an example.
-		 */
-		spin_unlock_wait(&oldmm->page_table_lock);
 		goto good_mm;
 	}
 
-- 
cgit v1.2.3


From 7b7b1ace2d9d06d76bce7481a045c22ed75e35dd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Nov 2005 17:13:39 -0500
Subject: [PATCH] saner handling of auto_acct_off() and DQUOT_OFF() in umount

The way we currently deal with quota and process accounting that might
keep vfsmount busy at umount time is inherently broken; we try to turn
them off just in case (not quite correctly, at that) and

  a) pray umount doesn't fail (otherwise they'll stay turned off)
  b) pray nobody doesn anything funny just as we turn quota off

Moreover, LSM provides hooks for doing the same sort of broken logics.

The proper way to deal with that is to introduce the second kind of
reference to vfsmount.  Semantics:

 - when the last normal reference is dropped, all special ones are
   converted to normal ones and if there had been any, cleanup is done.
 - normal reference can be cloned into a special one
 - special reference can be converted to normal one; that's a no-op if
   we'd already passed the point of no return (i.e.  mntput() had
   converted special references to normal and started cleanup).

The way it works: e.g. starting process accounting converts the vfsmount
reference pinned by the opened file into special one and turns it back
to normal when it gets shut down; acct_auto_close() is done when no
normal references are left.  That way it does *not* obstruct umount(2)
and it silently gets turned off when the last normal reference to
vfsmount is gone.  Which is exactly what we want...

The same should be done by LSM module that holds some internal
references to vfsmount and wants to shut them down on umount - it should
make them special and security_sb_umount_close() will be called exactly
when the last normal reference to vfsmount is gone.

quota handling is even simpler - we don't use normal file IO anymore, so
there's no need to hold vfsmounts at all.  DQUOT_OFF() is done from
deactivate_super(), where it really belongs.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/dquot.c            | 18 ++--------
 fs/namespace.c        | 64 ++++++++++++++++++++++-------------
 fs/super.c            |  1 +
 include/linux/acct.h  |  3 ++
 include/linux/mount.h | 13 +++-----
 include/linux/quota.h |  1 -
 kernel/acct.c         | 92 ++++++++++++++++++++++++++++++++++-----------------
 7 files changed, 113 insertions(+), 79 deletions(-)

(limited to 'kernel')

diff --git a/fs/dquot.c b/fs/dquot.c
index afa06a893468..05b60283c9c2 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -1321,13 +1321,11 @@ int vfs_quota_off(struct super_block *sb, int type)
 	int cnt;
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *toputinode[MAXQUOTAS];
-	struct vfsmount *toputmnt[MAXQUOTAS];
 
 	/* We need to serialize quota_off() for device */
 	down(&dqopt->dqonoff_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		toputinode[cnt] = NULL;
-		toputmnt[cnt] = NULL;
 		if (type != -1 && cnt != type)
 			continue;
 		if (!sb_has_quota_enabled(sb, cnt))
@@ -1348,9 +1346,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 		put_quota_format(dqopt->info[cnt].dqi_format);
 
 		toputinode[cnt] = dqopt->files[cnt];
-		toputmnt[cnt] = dqopt->mnt[cnt];
 		dqopt->files[cnt] = NULL;
-		dqopt->mnt[cnt] = NULL;
 		dqopt->info[cnt].dqi_flags = 0;
 		dqopt->info[cnt].dqi_igrace = 0;
 		dqopt->info[cnt].dqi_bgrace = 0;
@@ -1358,10 +1354,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 	}
 	up(&dqopt->dqonoff_sem);
 	/* Sync the superblock so that buffers with quota data are written to
-	 * disk (and so userspace sees correct data afterwards).
-	 * The reference to vfsmnt we are still holding protects us from
-	 * umount (we don't have it only when quotas are turned on/off for
-	 * journal replay but in that case we are guarded by the fs anyway). */
+	 * disk (and so userspace sees correct data afterwards). */
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, 1);
 	sync_blockdev(sb->s_bdev);
@@ -1385,10 +1378,6 @@ int vfs_quota_off(struct super_block *sb, int type)
 				iput(toputinode[cnt]);
 			}
 			up(&dqopt->dqonoff_sem);
-			/* We don't hold the reference when we turned on quotas
-			 * just for the journal replay... */
-			if (toputmnt[cnt])
-				mntput(toputmnt[cnt]);
 		}
 	if (sb->s_bdev)
 		invalidate_bdev(sb->s_bdev, 0);
@@ -1503,11 +1492,8 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
 	/* Quota file not on the same filesystem? */
 	if (nd.mnt->mnt_sb != sb)
 		error = -EXDEV;
-	else {
+	else
 		error = vfs_quota_on_inode(nd.dentry->d_inode, type, format_id);
-		if (!error)
-			sb_dqopt(sb)->mnt[type] = mntget(nd.mnt);
-	}
 out_path:
 	path_release(&nd);
 	return error;
diff --git a/fs/namespace.c b/fs/namespace.c
index 2fa9fdf7d6f5..1d83302f30c3 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -172,7 +172,7 @@ clone_mnt(struct vfsmount *old, struct dentry *root)
 	return mnt;
 }
 
-void __mntput(struct vfsmount *mnt)
+static inline void __mntput(struct vfsmount *mnt)
 {
 	struct super_block *sb = mnt->mnt_sb;
 	dput(mnt->mnt_root);
@@ -180,7 +180,46 @@ void __mntput(struct vfsmount *mnt)
 	deactivate_super(sb);
 }
 
-EXPORT_SYMBOL(__mntput);
+void mntput_no_expire(struct vfsmount *mnt)
+{
+repeat:
+	if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
+		if (likely(!mnt->mnt_pinned)) {
+			spin_unlock(&vfsmount_lock);
+			__mntput(mnt);
+			return;
+		}
+		atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+		mnt->mnt_pinned = 0;
+		spin_unlock(&vfsmount_lock);
+		acct_auto_close_mnt(mnt);
+		security_sb_umount_close(mnt);
+		goto repeat;
+	}
+}
+
+EXPORT_SYMBOL(mntput_no_expire);
+
+void mnt_pin(struct vfsmount *mnt)
+{
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_pinned++;
+	spin_unlock(&vfsmount_lock);
+}
+
+EXPORT_SYMBOL(mnt_pin);
+
+void mnt_unpin(struct vfsmount *mnt)
+{
+	spin_lock(&vfsmount_lock);
+	if (mnt->mnt_pinned) {
+		atomic_inc(&mnt->mnt_count);
+		mnt->mnt_pinned--;
+	}
+	spin_unlock(&vfsmount_lock);
+}
+
+EXPORT_SYMBOL(mnt_unpin);
 
 /* iterator */
 static void *m_start(struct seq_file *m, loff_t *pos)
@@ -435,16 +474,6 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	down_write(&current->namespace->sem);
 	spin_lock(&vfsmount_lock);
 
-	if (atomic_read(&sb->s_active) == 1) {
-		/* last instance - try to be smart */
-		spin_unlock(&vfsmount_lock);
-		lock_kernel();
-		DQUOT_OFF(sb);
-		acct_auto_close(sb);
-		unlock_kernel();
-		security_sb_umount_close(mnt);
-		spin_lock(&vfsmount_lock);
-	}
 	retval = -EBUSY;
 	if (atomic_read(&mnt->mnt_count) == 2 || flags & MNT_DETACH) {
 		if (!list_empty(&mnt->mnt_list))
@@ -850,17 +879,6 @@ static void expire_mount(struct vfsmount *mnt, struct list_head *mounts)
 		detach_mnt(mnt, &old_nd);
 		spin_unlock(&vfsmount_lock);
 		path_release(&old_nd);
-
-		/*
-		 * Now lay it to rest if this was the last ref on the superblock
-		 */
-		if (atomic_read(&mnt->mnt_sb->s_active) == 1) {
-			/* last instance - try to be smart */
-			lock_kernel();
-			DQUOT_OFF(mnt->mnt_sb);
-			acct_auto_close(mnt->mnt_sb);
-			unlock_kernel();
-		}
 		mntput(mnt);
 	} else {
 		/*
diff --git a/fs/super.c b/fs/super.c
index eed6c3132905..6689dded3c84 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -171,6 +171,7 @@ void deactivate_super(struct super_block *s)
 	if (atomic_dec_and_lock(&s->s_active, &sb_lock)) {
 		s->s_count -= S_BIAS-1;
 		spin_unlock(&sb_lock);
+		DQUOT_OFF(s);
 		down_write(&s->s_umount);
 		fs->kill_sb(s);
 		put_filesystem(fs);
diff --git a/include/linux/acct.h b/include/linux/acct.h
index 19f70462b3be..93c5b3cdf951 100644
--- a/include/linux/acct.h
+++ b/include/linux/acct.h
@@ -117,12 +117,15 @@ struct acct_v3
 #include <linux/config.h>
 
 #ifdef CONFIG_BSD_PROCESS_ACCT
+struct vfsmount;
 struct super_block;
+extern void acct_auto_close_mnt(struct vfsmount *m);
 extern void acct_auto_close(struct super_block *sb);
 extern void acct_process(long exitcode);
 extern void acct_update_integrals(struct task_struct *tsk);
 extern void acct_clear_integrals(struct task_struct *tsk);
 #else
+#define acct_auto_close_mnt(x)	do { } while (0)
 #define acct_auto_close(x)	do { } while (0)
 #define acct_process(x)		do { } while (0)
 #define acct_update_integrals(x)		do { } while (0)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index f8f39937e301..ffb0b5089880 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -37,6 +37,7 @@ struct vfsmount
 	struct list_head mnt_list;
 	struct list_head mnt_expire;	/* link in fs-specific expiry list */
 	struct namespace *mnt_namespace; /* containing namespace */
+	int mnt_pinned;
 };
 
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
@@ -46,15 +47,9 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt)
 	return mnt;
 }
 
-extern void __mntput(struct vfsmount *mnt);
-
-static inline void mntput_no_expire(struct vfsmount *mnt)
-{
-	if (mnt) {
-		if (atomic_dec_and_test(&mnt->mnt_count))
-			__mntput(mnt);
-	}
-}
+extern void mntput_no_expire(struct vfsmount *mnt);
+extern void mnt_pin(struct vfsmount *mnt);
+extern void mnt_unpin(struct vfsmount *mnt);
 
 static inline void mntput(struct vfsmount *mnt)
 {
diff --git a/include/linux/quota.h b/include/linux/quota.h
index 700ead45084f..f33aeb22c26a 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -289,7 +289,6 @@ struct quota_info {
 	struct semaphore dqonoff_sem;		/* Serialize quotaon & quotaoff */
 	struct rw_semaphore dqptr_sem;		/* serialize ops using quota_info struct, pointers from inode to dquots */
 	struct inode *files[MAXQUOTAS];		/* inodes of quotafiles */
-	struct vfsmount *mnt[MAXQUOTAS];	/* mountpoint entries of filesystems with quota files */
 	struct mem_dqinfo info[MAXQUOTAS];	/* Information for each quota type */
 	struct quota_format_ops *ops[MAXQUOTAS];	/* Operations for each type */
 };
diff --git a/kernel/acct.c b/kernel/acct.c
index 2e3f4a47e7d0..6312d6bd43e3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -54,6 +54,7 @@
 #include <linux/jiffies.h>
 #include <linux/times.h>
 #include <linux/syscalls.h>
+#include <linux/mount.h>
 #include <asm/uaccess.h>
 #include <asm/div64.h>
 #include <linux/blkdev.h> /* sector_div */
@@ -192,6 +193,7 @@ static void acct_file_reopen(struct file *file)
 		add_timer(&acct_globals.timer);
 	}
 	if (old_acct) {
+		mnt_unpin(old_acct->f_vfsmnt);
 		spin_unlock(&acct_globals.lock);
 		do_acct_process(0, old_acct);
 		filp_close(old_acct, NULL);
@@ -199,6 +201,42 @@ static void acct_file_reopen(struct file *file)
 	}
 }
 
+static int acct_on(char *name)
+{
+	struct file *file;
+	int error;
+
+	/* Difference from BSD - they don't do O_APPEND */
+	file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
+		filp_close(file, NULL);
+		return -EACCES;
+	}
+
+	if (!file->f_op->write) {
+		filp_close(file, NULL);
+		return -EIO;
+	}
+
+	error = security_acct(file);
+	if (error) {
+		filp_close(file, NULL);
+		return error;
+	}
+
+	spin_lock(&acct_globals.lock);
+	mnt_pin(file->f_vfsmnt);
+	acct_file_reopen(file);
+	spin_unlock(&acct_globals.lock);
+
+	mntput(file->f_vfsmnt);	/* it's pinned, now give up active reference */
+
+	return 0;
+}
+
 /**
  * sys_acct - enable/disable process accounting
  * @name: file name for accounting records or NULL to shutdown accounting
@@ -212,47 +250,41 @@ static void acct_file_reopen(struct file *file)
  */
 asmlinkage long sys_acct(const char __user *name)
 {
-	struct file *file = NULL;
-	char *tmp;
 	int error;
 
 	if (!capable(CAP_SYS_PACCT))
 		return -EPERM;
 
 	if (name) {
-		tmp = getname(name);
-		if (IS_ERR(tmp)) {
+		char *tmp = getname(name);
+		if (IS_ERR(tmp))
 			return (PTR_ERR(tmp));
-		}
-		/* Difference from BSD - they don't do O_APPEND */
-		file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+		error = acct_on(tmp);
 		putname(tmp);
-		if (IS_ERR(file)) {
-			return (PTR_ERR(file));
-		}
-		if (!S_ISREG(file->f_dentry->d_inode->i_mode)) {
-			filp_close(file, NULL);
-			return (-EACCES);
-		}
-
-		if (!file->f_op->write) {
-			filp_close(file, NULL);
-			return (-EIO);
+	} else {
+		error = security_acct(NULL);
+		if (!error) {
+			spin_lock(&acct_globals.lock);
+			acct_file_reopen(NULL);
+			spin_unlock(&acct_globals.lock);
 		}
 	}
+	return error;
+}
 
-	error = security_acct(file);
-	if (error) {
-		if (file)
-			filp_close(file, NULL);
-		return error;
-	}
-
+/**
+ * acct_auto_close - turn off a filesystem's accounting if it is on
+ * @m: vfsmount being shut down
+ *
+ * If the accounting is turned on for a file in the subtree pointed to
+ * to by m, turn accounting off.  Done when m is about to die.
+ */
+void acct_auto_close_mnt(struct vfsmount *m)
+{
 	spin_lock(&acct_globals.lock);
-	acct_file_reopen(file);
+	if (acct_globals.file && acct_globals.file->f_vfsmnt == m)
+		acct_file_reopen(NULL);
 	spin_unlock(&acct_globals.lock);
-
-	return (0);
 }
 
 /**
@@ -266,8 +298,8 @@ void acct_auto_close(struct super_block *sb)
 {
 	spin_lock(&acct_globals.lock);
 	if (acct_globals.file &&
-	    acct_globals.file->f_dentry->d_inode->i_sb == sb) {
-		acct_file_reopen((struct file *)NULL);
+	    acct_globals.file->f_vfsmnt->mnt_sb == sb) {
+		acct_file_reopen(NULL);
 	}
 	spin_unlock(&acct_globals.lock);
 }
-- 
cgit v1.2.3


From 330d57fb98a916fa8e1363846540dd420e99499a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Nov 2005 10:18:40 +0000
Subject: [PATCH] Fix sysctl unregistration oops (CVE-2005-2709)

You could open the /proc/sys/net/ipv4/conf/<if>/<whatever> file, then
wait for interface to go away, try to grab as much memory as possible in
hope to hit the (kfreed) ctl_table.  Then fill it with pointers to your
function.  Then do read from file you've opened and if you are lucky,
you'll get it called as ->proc_handler() in kernel mode.

So this is at least an Oops and possibly more.  It does depend on an
interface going away though, so less of a security risk than it would
otherwise be.

Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/s390/appldata/appldata_base.c |   7 +-
 include/linux/proc_fs.h            |   1 +
 include/linux/sysctl.h             |   3 +
 kernel/sysctl.c                    | 136 +++++++++++++++++++++++++++++--------
 4 files changed, 116 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c
index c9f2f60cfa58..dee6ab54984d 100644
--- a/arch/s390/appldata/appldata_base.c
+++ b/arch/s390/appldata/appldata_base.c
@@ -592,12 +592,15 @@ int appldata_register_ops(struct appldata_ops *ops)
  */
 void appldata_unregister_ops(struct appldata_ops *ops)
 {
+	void *table;
 	spin_lock(&appldata_ops_lock);
-	unregister_sysctl_table(ops->sysctl_header);
 	list_del(&ops->list);
-	kfree(ops->ctl_table);
+	/* at that point any incoming access will fail */
+	table = ops->ctl_table;
 	ops->ctl_table = NULL;
 	spin_unlock(&appldata_ops_lock);
+	unregister_sysctl_table(ops->sysctl_header);
+	kfree(table);
 	P_INFO("%s-ops unregistered!\n", ops->name);
 }
 /********************** module-ops management <END> **************************/
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 65ceeaa30652..74488e49166d 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -66,6 +66,7 @@ struct proc_dir_entry {
 	write_proc_t *write_proc;
 	atomic_t count;		/* use count */
 	int deleted;		/* delete flag */
+	void *set;
 };
 
 struct kcore_list {
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index fc8e367f671e..fc131d6602b9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -24,6 +24,7 @@
 #include <linux/compiler.h>
 
 struct file;
+struct completion;
 
 #define CTL_MAXNAME 10		/* how many path components do we allow in a
 				   call to sysctl?   In other words, what is
@@ -925,6 +926,8 @@ struct ctl_table_header
 {
 	ctl_table *ctl_table;
 	struct list_head ctl_entry;
+	int used;
+	struct completion *unregistering;
 };
 
 struct ctl_table_header * register_sysctl_table(ctl_table * table, 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c4f35f96884d..9990e10192e8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -169,7 +169,7 @@ struct file_operations proc_sys_file_operations = {
 
 extern struct proc_dir_entry *proc_sys_root;
 
-static void register_proc_table(ctl_table *, struct proc_dir_entry *);
+static void register_proc_table(ctl_table *, struct proc_dir_entry *, void *);
 static void unregister_proc_table(ctl_table *, struct proc_dir_entry *);
 #endif
 
@@ -992,10 +992,51 @@ static ctl_table dev_table[] = {
 
 extern void init_irq_proc (void);
 
+static DEFINE_SPINLOCK(sysctl_lock);
+
+/* called under sysctl_lock */
+static int use_table(struct ctl_table_header *p)
+{
+	if (unlikely(p->unregistering))
+		return 0;
+	p->used++;
+	return 1;
+}
+
+/* called under sysctl_lock */
+static void unuse_table(struct ctl_table_header *p)
+{
+	if (!--p->used)
+		if (unlikely(p->unregistering))
+			complete(p->unregistering);
+}
+
+/* called under sysctl_lock, will reacquire if has to wait */
+static void start_unregistering(struct ctl_table_header *p)
+{
+	/*
+	 * if p->used is 0, nobody will ever touch that entry again;
+	 * we'll eliminate all paths to it before dropping sysctl_lock
+	 */
+	if (unlikely(p->used)) {
+		struct completion wait;
+		init_completion(&wait);
+		p->unregistering = &wait;
+		spin_unlock(&sysctl_lock);
+		wait_for_completion(&wait);
+		spin_lock(&sysctl_lock);
+	}
+	/*
+	 * do not remove from the list until nobody holds it; walking the
+	 * list in do_sysctl() relies on that.
+	 */
+	list_del_init(&p->ctl_entry);
+}
+
 void __init sysctl_init(void)
 {
 #ifdef CONFIG_PROC_FS
-	register_proc_table(root_table, proc_sys_root);
+	register_proc_table(root_table, proc_sys_root, &root_table_header);
 	init_irq_proc();
 #endif
 }
@@ -1004,6 +1045,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 	       void __user *newval, size_t newlen)
 {
 	struct list_head *tmp;
+	int error = -ENOTDIR;
 
 	if (nlen <= 0 || nlen >= CTL_MAXNAME)
 		return -ENOTDIR;
@@ -1012,20 +1054,30 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
 		if (!oldlenp || get_user(old_len, oldlenp))
 			return -EFAULT;
 	}
+	spin_lock(&sysctl_lock);
 	tmp = &root_table_header.ctl_entry;
 	do {
 		struct ctl_table_header *head =
 			list_entry(tmp, struct ctl_table_header, ctl_entry);
 		void *context = NULL;
-		int error = parse_table(name, nlen, oldval, oldlenp, 
+
+		if (!use_table(head))
+			continue;
+
+		spin_unlock(&sysctl_lock);
+
+		error = parse_table(name, nlen, oldval, oldlenp, 
 					newval, newlen, head->ctl_table,
 					&context);
 		kfree(context);
+
+		spin_lock(&sysctl_lock);
+		unuse_table(head);
 		if (error != -ENOTDIR)
-			return error;
-		tmp = tmp->next;
-	} while (tmp != &root_table_header.ctl_entry);
-	return -ENOTDIR;
+			break;
+	} while ((tmp = tmp->next) != &root_table_header.ctl_entry);
+	spin_unlock(&sysctl_lock);
+	return error;
 }
 
 asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
@@ -1236,12 +1288,16 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
 		return NULL;
 	tmp->ctl_table = table;
 	INIT_LIST_HEAD(&tmp->ctl_entry);
+	tmp->used = 0;
+	tmp->unregistering = NULL;
+	spin_lock(&sysctl_lock);
 	if (insert_at_head)
 		list_add(&tmp->ctl_entry, &root_table_header.ctl_entry);
 	else
 		list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
+	spin_unlock(&sysctl_lock);
 #ifdef CONFIG_PROC_FS
-	register_proc_table(table, proc_sys_root);
+	register_proc_table(table, proc_sys_root, tmp);
 #endif
 	return tmp;
 }
@@ -1255,10 +1311,13 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
  */
 void unregister_sysctl_table(struct ctl_table_header * header)
 {
-	list_del(&header->ctl_entry);
+	might_sleep();
+	spin_lock(&sysctl_lock);
+	start_unregistering(header);
 #ifdef CONFIG_PROC_FS
 	unregister_proc_table(header->ctl_table, proc_sys_root);
 #endif
+	spin_unlock(&sysctl_lock);
 	kfree(header);
 }
 
@@ -1269,7 +1328,7 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 #ifdef CONFIG_PROC_FS
 
 /* Scan the sysctl entries in table and add them all into /proc */
-static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
+static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
 {
 	struct proc_dir_entry *de;
 	int len;
@@ -1305,13 +1364,14 @@ static void register_proc_table(ctl_table * table, struct proc_dir_entry *root)
 			de = create_proc_entry(table->procname, mode, root);
 			if (!de)
 				continue;
+			de->set = set;
 			de->data = (void *) table;
 			if (table->proc_handler)
 				de->proc_fops = &proc_sys_file_operations;
 		}
 		table->de = de;
 		if (de->mode & S_IFDIR)
-			register_proc_table(table->child, de);
+			register_proc_table(table->child, de, set);
 	}
 }
 
@@ -1336,6 +1396,13 @@ static void unregister_proc_table(ctl_table * table, struct proc_dir_entry *root
 				continue;
 		}
 
+		/*
+		 * In any case, mark the entry as goner; we'll keep it
+		 * around if it's busy, but we'll know to do nothing with
+		 * its fields.  We are under sysctl_lock here.
+		 */
+		de->data = NULL;
+
 		/* Don't unregister proc entries that are still being used.. */
 		if (atomic_read(&de->count))
 			continue;
@@ -1349,27 +1416,38 @@ static ssize_t do_rw_proc(int write, struct file * file, char __user * buf,
 			  size_t count, loff_t *ppos)
 {
 	int op;
-	struct proc_dir_entry *de;
+	struct proc_dir_entry *de = PDE(file->f_dentry->d_inode);
 	struct ctl_table *table;
 	size_t res;
-	ssize_t error;
-	
-	de = PDE(file->f_dentry->d_inode);
-	if (!de || !de->data)
-		return -ENOTDIR;
-	table = (struct ctl_table *) de->data;
-	if (!table || !table->proc_handler)
-		return -ENOTDIR;
-	op = (write ? 002 : 004);
-	if (ctl_perm(table, op))
-		return -EPERM;
+	ssize_t error = -ENOTDIR;
 	
-	res = count;
-
-	error = (*table->proc_handler) (table, write, file, buf, &res, ppos);
-	if (error)
-		return error;
-	return res;
+	spin_lock(&sysctl_lock);
+	if (de && de->data && use_table(de->set)) {
+		/*
+		 * at that point we know that sysctl was not unregistered
+		 * and won't be until we finish
+		 */
+		spin_unlock(&sysctl_lock);
+		table = (struct ctl_table *) de->data;
+		if (!table || !table->proc_handler)
+			goto out;
+		error = -EPERM;
+		op = (write ? 002 : 004);
+		if (ctl_perm(table, op))
+			goto out;
+		
+		/* careful: calling conventions are nasty here */
+		res = count;
+		error = (*table->proc_handler)(table, write, file,
+						buf, &res, ppos);
+		if (!error)
+			error = res;
+	out:
+		spin_lock(&sysctl_lock);
+		unuse_table(de->set);
+	}
+	spin_unlock(&sysctl_lock);
+	return error;
 }
 
 static int proc_opensys(struct inode *inode, struct file *file)
-- 
cgit v1.2.3


From 90d45d17f3e68608ac7ba8fc3d7acce022a19c8e Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Tue, 8 Nov 2005 21:34:24 -0800
Subject: [PATCH] cpu hotplug: fix locking in cpufreq drivers

When calling target drivers to set frequency, we take cpucontrol lock.
When we modified the code to accomodate CPU hotplug, there was an attempt
to take a double lock of cpucontrol leading to a deadlock.  Since the
current thread context is already holding the cpucontrol lock, we dont need
to make another attempt to acquire it.

Now we leave a trace in current->flags indicating current thread already is
under cpucontrol lock held, so we dont attempt to do this another time.

Thanks to Andrew Morton for the beating:-)

From: Brice Goglin <Brice.Goglin@ens-lyon.org>

  Build fix

(akpm: this patch is still unpleasant.  Ashok continues to look for a cleaner
solution, doesn't he?  ;))

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Signed-off-by: Brice Goglin <Brice.Goglin@ens-lyon.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/cpufreq/cpufreq.c | 24 ++++++++++--------------
 include/linux/cpu.h       |  5 +++++
 include/linux/sched.h     |  1 +
 kernel/cpu.c              | 33 +++++++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 25acf478c9e8..23a63207d747 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -38,7 +38,6 @@ static struct cpufreq_driver   	*cpufreq_driver;
 static struct cpufreq_policy	*cpufreq_cpu_data[NR_CPUS];
 static DEFINE_SPINLOCK(cpufreq_driver_lock);
 
-
 /* internal prototypes */
 static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event);
 static void handle_update(void *data);
@@ -1115,24 +1114,21 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
 	int retval = -EINVAL;
 
 	/*
-	 * Converted the lock_cpu_hotplug to preempt_disable()
-	 * and preempt_enable(). This is a bit kludgy and relies on how cpu
-	 * hotplug works. All we need is a guarantee that cpu hotplug won't make
-	 * progress on any cpu. Once we do preempt_disable(), this would ensure
-	 * that hotplug threads don't get onto this cpu, thereby delaying
-	 * the cpu remove process.
-	 *
-	 * We removed the lock_cpu_hotplug since we need to call this function
-	 * via cpu hotplug callbacks, which result in locking the cpu hotplug
-	 * thread itself. Agree this is not very clean, cpufreq community
-	 * could improve this if required. - Ashok Raj <ashok.raj@intel.com>
+	 * If we are already in context of hotplug thread, we dont need to
+	 * acquire the hotplug lock. Otherwise acquire cpucontrol to prevent
+	 * hotplug from removing this cpu that we are working on.
 	 */
-	preempt_disable();
+	if (!current_in_cpu_hotplug())
+		lock_cpu_hotplug();
+
 	dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu,
 		target_freq, relation);
 	if (cpu_online(policy->cpu) && cpufreq_driver->target)
 		retval = cpufreq_driver->target(policy, target_freq, relation);
-	preempt_enable();
+
+	if (!current_in_cpu_hotplug())
+		unlock_cpu_hotplug();
+
 	return retval;
 }
 EXPORT_SYMBOL_GPL(__cpufreq_driver_target);
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 1f7b2c097503..43c44530ef9d 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -42,6 +42,7 @@ struct notifier_block;
 /* Need to know about CPUs going up/down? */
 extern int register_cpu_notifier(struct notifier_block *nb);
 extern void unregister_cpu_notifier(struct notifier_block *nb);
+extern int current_in_cpu_hotplug(void);
 
 int cpu_up(unsigned int cpu);
 
@@ -54,6 +55,10 @@ static inline int register_cpu_notifier(struct notifier_block *nb)
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
 {
 }
+static inline int current_in_cpu_hotplug(void)
+{
+	return 0;
+}
 
 #endif /* CONFIG_SMP */
 extern struct sysdev_class cpu_sysdev_class;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 03b68a7b4b82..2bbf968b23d9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -909,6 +909,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
+#define PF_HOTPLUG_CPU	0x01000000	/* Currently performing CPU hotplug */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3619e939182e..d61ba88f34e5 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -21,6 +21,24 @@ EXPORT_SYMBOL_GPL(cpucontrol);
 
 static struct notifier_block *cpu_chain;
 
+/*
+ * Used to check by callers if they need to acquire the cpucontrol
+ * or not to protect a cpu from being removed. Its sometimes required to
+ * call these functions both for normal operations, and in response to
+ * a cpu being added/removed. If the context of the call is in the same
+ * thread context as a CPU hotplug thread, we dont need to take the lock
+ * since its already protected
+ * check drivers/cpufreq/cpufreq.c for its usage - Ashok Raj
+ */
+
+int current_in_cpu_hotplug(void)
+{
+	return (current->flags & PF_HOTPLUG_CPU);
+}
+
+EXPORT_SYMBOL_GPL(current_in_cpu_hotplug);
+
+
 /* Need to know about CPUs going up/down? */
 int register_cpu_notifier(struct notifier_block *nb)
 {
@@ -94,6 +112,13 @@ int cpu_down(unsigned int cpu)
 		goto out;
 	}
 
+	/*
+	 * Leave a trace in current->flags indicating we are already in
+	 * process of performing CPU hotplug. Callers can check if cpucontrol
+	 * is already acquired by current thread, and if so not cause
+	 * a dead lock by not acquiring the lock
+	 */
+	current->flags |= PF_HOTPLUG_CPU;
 	err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
@@ -146,6 +171,7 @@ out_thread:
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
 out:
+	current->flags &= ~PF_HOTPLUG_CPU;
 	unlock_cpu_hotplug();
 	return err;
 }
@@ -163,6 +189,12 @@ int __devinit cpu_up(unsigned int cpu)
 		ret = -EINVAL;
 		goto out;
 	}
+
+	/*
+	 * Leave a trace in current->flags indicating we are already in
+	 * process of performing CPU hotplug.
+	 */
+	current->flags |= PF_HOTPLUG_CPU;
 	ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
 		printk("%s: attempt to bring up CPU %u failed\n",
@@ -185,6 +217,7 @@ out_notify:
 	if (ret != 0)
 		notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
 out:
+	current->flags &= ~PF_HOTPLUG_CPU;
 	up(&cpucontrol);
 	return ret;
 }
-- 
cgit v1.2.3


From 3aef1bde147a503aacb59b767826720a996aea6d Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Tue, 8 Nov 2005 21:34:25 -0800
Subject: [PATCH] quieten softlockup at boot

On a large SMP box we get a lot of softlockup thread XX started lines.

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/softlockup.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index a2dcceb9437d..c67189a25d52 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -73,9 +73,6 @@ void softlockup_tick(struct pt_regs *regs)
 static int watchdog(void * __bind_cpu)
 {
 	struct sched_param param = { .sched_priority = 99 };
-	int this_cpu = (long) __bind_cpu;
-
-	printk("softlockup thread %d started up.\n", this_cpu);
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
 	current->flags |= PF_NOFREEZE;
-- 
cgit v1.2.3


From 969e9afd489514252a680914c6d8b9322c713eb4 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 8 Nov 2005 21:34:30 -0800
Subject: [PATCH] sleep: Fix oops in enter_state

If ACPI sleep is not configured, but someone still wants to run swsusp,
he'd get oops in enter_state.  This is regression since 2.6.14 and this
fixes it.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 18d7d693fbba..6ee2cad530e8 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -167,7 +167,7 @@ static int enter_state(suspend_state_t state)
 {
 	int error;
 
-	if (pm_ops->valid && !pm_ops->valid(state))
+	if (pm_ops && pm_ops->valid && !pm_ops->valid(state))
 		return -ENODEV;
 	if (down_trylock(&pm_sem))
 		return -EBUSY;
-- 
cgit v1.2.3


From 054bd4c18853f3a3851bd97aa90e11022a69dc42 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 8 Nov 2005 21:34:39 -0800
Subject: [PATCH] swsusp: reduce code duplication
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The changes made by this patch are necessary for the pagedir relocation
simplification in the next patch.  �Additionally, these changes allow us to
drop check_pagedir() and make get_safe_page() be a one-line wrapper around
alloc_image_page() (get_safe_page() goes to snapshot.c, because
alloc_image_page() is static and it does not make sense to export it).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h    |  3 ++-
 kernel/power/snapshot.c | 62 ++++++++++++++++++++++++++++++++++++++-----------
 kernel/power/swsusp.c   | 57 ++-------------------------------------------
 3 files changed, 52 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index d4fd96a135ab..c98923e13e75 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -66,7 +66,8 @@ extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern int restore_highmem(void);
-extern struct pbe * alloc_pagedir(unsigned nr_pages);
+extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
 extern void swsusp_free(void);
+extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
 extern int enough_swap(unsigned nr_pages);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 723f5179883e..96cc3e21e97d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -269,9 +269,30 @@ void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
 	pr_debug("create_pbe_list(): initialized %d PBEs\n", num);
 }
 
-static void *alloc_image_page(void)
+/**
+ *	@safe_needed - on resume, for storing the PBE list and the image,
+ *	we can only use memory pages that do not conflict with the pages
+ *	which had been used before suspend.
+ *
+ *	The unsafe pages are marked with the PG_nosave_free flag
+ *
+ *	Allocated but unusable (ie eaten) memory pages should be marked
+ *	so that swsusp_free() can release them
+ */
+
+static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
 {
-	void *res = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
+	void *res;
+
+	if (safe_needed)
+		do {
+			res = (void *)get_zeroed_page(gfp_mask);
+			if (res && PageNosaveFree(virt_to_page(res)))
+				/* This is for swsusp_free() */
+				SetPageNosave(virt_to_page(res));
+		} while (res && PageNosaveFree(virt_to_page(res)));
+	else
+		res = (void *)get_zeroed_page(gfp_mask);
 	if (res) {
 		SetPageNosave(virt_to_page(res));
 		SetPageNosaveFree(virt_to_page(res));
@@ -279,6 +300,11 @@ static void *alloc_image_page(void)
 	return res;
 }
 
+unsigned long get_safe_page(gfp_t gfp_mask)
+{
+	return (unsigned long)alloc_image_page(gfp_mask, 1);
+}
+
 /**
  *	alloc_pagedir - Allocate the page directory.
  *
@@ -292,7 +318,7 @@ static void *alloc_image_page(void)
  *	On each page we set up a list of struct_pbe elements.
  */
 
-struct pbe *alloc_pagedir(unsigned int nr_pages)
+struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed)
 {
 	unsigned int num;
 	struct pbe *pblist, *pbe;
@@ -301,12 +327,12 @@ struct pbe *alloc_pagedir(unsigned int nr_pages)
 		return NULL;
 
 	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
-	pblist = alloc_image_page();
+	pblist = alloc_image_page(gfp_mask, safe_needed);
 	/* FIXME: rewrite this ugly loop */
 	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
         		pbe = pbe->next, num += PBES_PER_PAGE) {
 		pbe += PB_PAGE_SKIP;
-		pbe->next = alloc_image_page();
+		pbe->next = alloc_image_page(gfp_mask, safe_needed);
 	}
 	if (!pbe) { /* get_zeroed_page() failed */
 		free_pagedir(pblist);
@@ -354,24 +380,32 @@ static int enough_free_mem(unsigned int nr_pages)
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
+int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
+{
+	struct pbe *p;
+
+	for_each_pbe (p, pblist) {
+		p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
+		if (!p->address)
+			return -ENOMEM;
+	}
+	return 0;
+}
 
 static struct pbe *swsusp_alloc(unsigned int nr_pages)
 {
-	struct pbe *pblist, *p;
+	struct pbe *pblist;
 
-	if (!(pblist = alloc_pagedir(nr_pages))) {
+	if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) {
 		printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
 		return NULL;
 	}
 	create_pbe_list(pblist, nr_pages);
 
-	for_each_pbe (p, pblist) {
-		p->address = (unsigned long)alloc_image_page();
-		if (!p->address) {
-			printk(KERN_ERR "suspend: Allocating image pages failed.\n");
-			swsusp_free();
-			return NULL;
-		}
+	if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) {
+		printk(KERN_ERR "suspend: Allocating image pages failed.\n");
+		swsusp_free();
+		return NULL;
 	}
 
 	return pblist;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index e1ab28b9b217..a456ffe7a3c8 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -628,59 +628,6 @@ int swsusp_resume(void)
 	return error;
 }
 
-/**
- *	On resume, for storing the PBE list and the image,
- *	we can only use memory pages that do not conflict with the pages
- *	which had been used before suspend.
- *
- *	We don't know which pages are usable until we allocate them.
- *
- *	Allocated but unusable (ie eaten) memory pages are marked so that
- *	swsusp_free() can release them
- */
-
-unsigned long get_safe_page(gfp_t gfp_mask)
-{
-	unsigned long m;
-
-	do {
-		m = get_zeroed_page(gfp_mask);
-		if (m && PageNosaveFree(virt_to_page(m)))
-			/* This is for swsusp_free() */
-			SetPageNosave(virt_to_page(m));
-	} while (m && PageNosaveFree(virt_to_page(m)));
-	if (m) {
-		/* This is for swsusp_free() */
-		SetPageNosave(virt_to_page(m));
-		SetPageNosaveFree(virt_to_page(m));
-	}
-	return m;
-}
-
-/**
- *	check_pagedir - We ensure here that pages that the PBEs point to
- *	won't collide with pages where we're going to restore from the loaded
- *	pages later
- */
-
-static int check_pagedir(struct pbe *pblist)
-{
-	struct pbe *p;
-
-	/* This is necessary, so that we can free allocated pages
-	 * in case of failure
-	 */
-	for_each_pbe (p, pblist)
-		p->address = 0UL;
-
-	for_each_pbe (p, pblist) {
-		p->address = get_safe_page(GFP_ATOMIC);
-		if (!p->address)
-			return -ENOMEM;
-	}
-	return 0;
-}
-
 /**
  *	swsusp_pagedir_relocate - It is possible, that some memory pages
  *	occupied by the list of PBEs collide with pages where we're going to
@@ -990,7 +937,7 @@ static int read_suspend_image(void)
 	int error = 0;
 	struct pbe *p;
 
-	if (!(p = alloc_pagedir(nr_copy_pages)))
+	if (!(p = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 0)))
 		return -ENOMEM;
 
 	if ((error = read_pagedir(p)))
@@ -1003,7 +950,7 @@ static int read_suspend_image(void)
 
 	/* Allocate memory for the image and read the data from swap */
 
-	error = check_pagedir(pagedir_nosave);
+	error = alloc_data_pages(pagedir_nosave, GFP_ATOMIC, 1);
 
 	if (!error)
 		error = data_read(pagedir_nosave);
-- 
cgit v1.2.3


From ed14b52701e6ef5a5aaf7bdb75932d5ea5dd7387 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 8 Nov 2005 21:34:40 -0800
Subject: [PATCH] swsusp: simplify pagedir relocation

This patch simplifies the relocation of the page backup list (aka pagedir)
during resume.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h    |  1 +
 kernel/power/snapshot.c |  2 +-
 kernel/power/swsusp.c   | 78 +++++++++++++++++--------------------------------
 3 files changed, 28 insertions(+), 53 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index c98923e13e75..893ee655085c 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -66,6 +66,7 @@ extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern int restore_highmem(void);
+extern void free_pagedir(struct pbe *pblist);
 extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
 extern void swsusp_free(void);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 96cc3e21e97d..b8a2e9a63206 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -216,7 +216,7 @@ static void copy_data_pages(struct pbe *pblist)
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
 
-static void free_pagedir(struct pbe *pblist)
+void free_pagedir(struct pbe *pblist)
 {
 	struct pbe *pbe;
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a456ffe7a3c8..8511c7f3386a 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -629,74 +629,43 @@ int swsusp_resume(void)
 }
 
 /**
- *	swsusp_pagedir_relocate - It is possible, that some memory pages
- *	occupied by the list of PBEs collide with pages where we're going to
- *	restore from the loaded pages later.  We relocate them here.
+ *	mark_unsafe_pages - mark the pages that cannot be used for storing
+ *	the image during resume, because they conflict with the pages that
+ *	had been used before suspend
  */
 
-static struct pbe *swsusp_pagedir_relocate(struct pbe *pblist)
+static void mark_unsafe_pages(struct pbe *pblist)
 {
 	struct zone *zone;
 	unsigned long zone_pfn;
-	struct pbe *pbpage, *tail, *p;
-	void *m;
-	int rel = 0;
+	struct pbe *p;
 
 	if (!pblist) /* a sanity check */
-		return NULL;
-
-	pr_debug("swsusp: Relocating pagedir (%lu pages to check)\n",
-			swsusp_info.pagedir_pages);
+		return;
 
 	/* Clear page flags */
-
 	for_each_zone (zone) {
-        	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-        		if (pfn_valid(zone_pfn + zone->zone_start_pfn))
-                		ClearPageNosaveFree(pfn_to_page(zone_pfn +
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
+				ClearPageNosaveFree(pfn_to_page(zone_pfn +
 					zone->zone_start_pfn));
 	}
 
 	/* Mark orig addresses */
-
 	for_each_pbe (p, pblist)
 		SetPageNosaveFree(virt_to_page(p->orig_address));
 
-	tail = pblist + PB_PAGE_SKIP;
-
-	/* Relocate colliding pages */
-
-	for_each_pb_page (pbpage, pblist) {
-		if (PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
-			m = (void *)get_safe_page(GFP_ATOMIC | __GFP_COLD);
-			if (!m)
-				return NULL;
-			memcpy(m, (void *)pbpage, PAGE_SIZE);
-			if (pbpage == pblist)
-				pblist = (struct pbe *)m;
-			else
-				tail->next = (struct pbe *)m;
-			pbpage = (struct pbe *)m;
-
-			/* We have to link the PBEs again */
-			for (p = pbpage; p < pbpage + PB_PAGE_SKIP; p++)
-				if (p->next) /* needed to save the end */
-					p->next = p + 1;
-
-			rel++;
-		}
-		tail = pbpage + PB_PAGE_SKIP;
-	}
+}
 
-	/* This is for swsusp_free() */
-	for_each_pb_page (pbpage, pblist) {
-		SetPageNosave(virt_to_page(pbpage));
-		SetPageNosaveFree(virt_to_page(pbpage));
+static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+{
+	/* We assume both lists contain the same number of elements */
+	while (src) {
+		dst->orig_address = src->orig_address;
+		dst->swap_address = src->swap_address;
+		dst = dst->next;
+		src = src->next;
 	}
-
-	printk("swsusp: Relocated %d pages\n", rel);
-
-	return pblist;
 }
 
 /*
@@ -942,10 +911,15 @@ static int read_suspend_image(void)
 
 	if ((error = read_pagedir(p)))
 		return error;
-
 	create_pbe_list(p, nr_copy_pages);
-
-	if (!(pagedir_nosave = swsusp_pagedir_relocate(p)))
+	mark_unsafe_pages(p);
+	pagedir_nosave = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
+	if (pagedir_nosave) {
+		create_pbe_list(pagedir_nosave, nr_copy_pages);
+		copy_page_backup_list(pagedir_nosave, p);
+	}
+	free_pagedir(p);
+	if (!pagedir_nosave)
 		return -ENOMEM;
 
 	/* Allocate memory for the image and read the data from swap */
-- 
cgit v1.2.3


From 0fbeb5a45dccd493c35a68a5548e6a9d9882a791 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 8 Nov 2005 21:34:41 -0800
Subject: [PATCH] swsusp: rework swsusp_suspend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch makes only the functions in swsusp.c call functions in snapshot.c
and not both ways.  �It also moves the check for available swap out of
swsusp_suspend() which is necessary for separating the swap-handling functions
in swsusp from the core code.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/power.h    |  2 --
 kernel/power/snapshot.c | 19 ++-----------
 kernel/power/swsusp.c   | 75 +++++++++++++++++++++++++++++--------------------
 3 files changed, 47 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/power.h b/kernel/power/power.h
index 893ee655085c..6c042b5ee14b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -65,10 +65,8 @@ extern suspend_pagedir_t *pagedir_save;
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
-extern int restore_highmem(void);
 extern void free_pagedir(struct pbe *pblist);
 extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void create_pbe_list(struct pbe *pblist, unsigned nr_pages);
 extern void swsusp_free(void);
 extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
-extern int enough_swap(unsigned nr_pages);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index b8a2e9a63206..4a6dbcefd378 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -88,8 +88,7 @@ static int save_highmem_zone(struct zone *zone)
 	return 0;
 }
 
-
-static int save_highmem(void)
+int save_highmem(void)
 {
 	struct zone *zone;
 	int res = 0;
@@ -120,11 +119,7 @@ int restore_highmem(void)
 	}
 	return 0;
 }
-#else
-static int save_highmem(void) { return 0; }
-int restore_highmem(void) { return 0; }
-#endif /* CONFIG_HIGHMEM */
-
+#endif
 
 static int pfn_is_nosave(unsigned long pfn)
 {
@@ -416,11 +411,6 @@ asmlinkage int swsusp_save(void)
 	unsigned int nr_pages;
 
 	pr_debug("swsusp: critical section: \n");
-	if (save_highmem()) {
-		printk(KERN_CRIT "swsusp: Not enough free pages for highmem\n");
-		restore_highmem();
-		return -ENOMEM;
-	}
 
 	drain_local_pages();
 	nr_pages = count_data_pages();
@@ -440,11 +430,6 @@ asmlinkage int swsusp_save(void)
 		return -ENOMEM;
 	}
 
-	if (!enough_swap(nr_pages)) {
-		printk(KERN_ERR "swsusp: Not enough free swap\n");
-		return -ENOSPC;
-	}
-
 	pagedir_nosave = swsusp_alloc(nr_pages);
 	if (!pagedir_nosave)
 		return -ENOMEM;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 8511c7f3386a..c05f46e7348f 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -73,6 +73,14 @@
 
 #include "power.h"
 
+#ifdef CONFIG_HIGHMEM
+int save_highmem(void);
+int restore_highmem(void);
+#else
+static int save_highmem(void) { return 0; }
+static int restore_highmem(void) { return 0; }
+#endif
+
 #define CIPHER "aes"
 #define MAXKEY 32
 #define MAXIV  32
@@ -499,6 +507,26 @@ static int write_pagedir(void)
 	return error;
 }
 
+/**
+ *	enough_swap - Make sure we have enough swap to save the image.
+ *
+ *	Returns TRUE or FALSE after checking the total amount of swap
+ *	space avaiable.
+ *
+ *	FIXME: si_swapinfo(&i) returns all swap devices information.
+ *	We should only consider resume_device.
+ */
+
+static int enough_swap(unsigned int nr_pages)
+{
+	struct sysinfo i;
+
+	si_swapinfo(&i);
+	pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
+	return i.freeswap > (nr_pages + PAGES_FOR_IO +
+		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
+}
+
 /**
  *	write_suspend_image - Write entire image and metadata.
  *
@@ -507,6 +535,11 @@ static int write_suspend_image(void)
 {
 	int error;
 
+	if (!enough_swap(nr_copy_pages)) {
+		printk(KERN_ERR "swsusp: Not enough free swap\n");
+		return -ENOSPC;
+	}
+
 	init_header();
 	if ((error = data_write()))
 		goto FreeData;
@@ -526,27 +559,6 @@ static int write_suspend_image(void)
 	goto Done;
 }
 
-/**
- *	enough_swap - Make sure we have enough swap to save the image.
- *
- *	Returns TRUE or FALSE after checking the total amount of swap
- *	space avaiable.
- *
- *	FIXME: si_swapinfo(&i) returns all swap devices information.
- *	We should only consider resume_device.
- */
-
-int enough_swap(unsigned int nr_pages)
-{
-	struct sysinfo i;
-
-	si_swapinfo(&i);
-	pr_debug("swsusp: available swap: %lu pages\n", i.freeswap);
-	return i.freeswap > (nr_pages + PAGES_FOR_IO +
-		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
-}
-
-
 /* It is important _NOT_ to umount filesystems at this point. We want
  * them synced (in case something goes wrong) but we DO not want to mark
  * filesystem clean: it is not. (And it does not matter, if we resume
@@ -556,12 +568,15 @@ int swsusp_write(void)
 {
 	int error;
 
+	if ((error = swsusp_swap_check())) {
+		printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
+		return error;
+	}
 	lock_swapdevices();
 	error = write_suspend_image();
 	/* This will unlock ignored swap devices since writing is finished */
 	lock_swapdevices();
 	return error;
-
 }
 
 
@@ -569,6 +584,7 @@ int swsusp_write(void)
 int swsusp_suspend(void)
 {
 	int error;
+
 	if ((error = arch_prepare_suspend()))
 		return error;
 	local_irq_disable();
@@ -580,15 +596,12 @@ int swsusp_suspend(void)
 	 */
 	if ((error = device_power_down(PMSG_FREEZE))) {
 		printk(KERN_ERR "Some devices failed to power down, aborting suspend\n");
-		local_irq_enable();
-		return error;
+		goto Enable_irqs;
 	}
 
-	if ((error = swsusp_swap_check())) {
-		printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n");
-		device_power_up();
-		local_irq_enable();
-		return error;
+	if ((error = save_highmem())) {
+		printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
+		goto Restore_highmem;
 	}
 
 	save_processor_state();
@@ -596,8 +609,10 @@ int swsusp_suspend(void)
 		printk(KERN_ERR "Error %d suspending\n", error);
 	/* Restore control flow magically appears here */
 	restore_processor_state();
+Restore_highmem:
 	restore_highmem();
 	device_power_up();
+Enable_irqs:
 	local_irq_enable();
 	return error;
 }
@@ -804,7 +819,7 @@ static int check_sig(void)
 		 * Reset swap signature now.
 		 */
 		error = bio_write_page(0, &swsusp_header);
-	} else { 
+	} else {
 		return -EINVAL;
 	}
 	if (!error)
-- 
cgit v1.2.3


From b910472dd3b7c1d51af9a594a759f642520c33e1 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Tue, 8 Nov 2005 21:38:55 -0800
Subject: [PATCH] sched: implement nice support across physical cpus on SMP

This patch implements 'nice' support across physical cpus on SMP.

It introduces an extra runqueue variable prio_bias which is the sum of the
(inverted) static priorities of all the tasks on the runqueue.

This is then used to bias busy rebalancing between runqueues to obtain good
distribution of tasks of different nice values.  By biasing the balancing only
during busy rebalancing we can avoid having any significant loss of throughput
by not affecting the carefully tuned idle balancing already in place.  If all
tasks are running at the same nice level this code should also have minimal
effect.  The code is optimised out in the !CONFIG_SMP case.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 101 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 83 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3ce26954be12..c6827f94e156 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -206,6 +206,7 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
+	unsigned long prio_bias;
 	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
@@ -659,13 +660,45 @@ static int effective_prio(task_t *p)
 	return prio;
 }
 
+#ifdef CONFIG_SMP
+static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+{
+	rq->prio_bias += MAX_PRIO - static_prio;
+}
+
+static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+{
+	rq->prio_bias -= MAX_PRIO - static_prio;
+}
+#else
+static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+{
+}
+
+static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+{
+}
+#endif
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running++;
+	inc_prio_bias(rq, p->static_prio);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	dec_prio_bias(rq, p->static_prio);
+}
+
 /*
  * __activate_task - move a task to the runqueue.
  */
 static inline void __activate_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -674,7 +707,7 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
 static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
 {
 	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
+	inc_nr_running(p, rq);
 }
 
 static int recalc_task_prio(task_t *p, unsigned long long now)
@@ -793,7 +826,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
  */
 static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	dec_nr_running(p, rq);
 	dequeue_task(p, p->array);
 	p->array = NULL;
 }
@@ -930,27 +963,54 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu, int type)
+static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	unsigned long cpu_load = rq->cpu_load[type-1],
+		load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
+	if (idle == NOT_IDLE) {
+		/*
+		 * If we are balancing busy runqueues the load is biased by
+		 * priority to create 'nice' support across cpus.
+		 */
+		cpu_load *= rq->prio_bias;
+		load_now *= rq->prio_bias;
+	}
+
 	if (type == 0)
 		return load_now;
 
-	return min(rq->cpu_load[type-1], load_now);
+	return min(cpu_load, load_now);
+}
+
+static inline unsigned long source_load(int cpu, int type)
+{
+	return __source_load(cpu, type, NOT_IDLE);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu, int type)
+static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+	unsigned long cpu_load = rq->cpu_load[type-1],
+		load_now = rq->nr_running * SCHED_LOAD_SCALE;
+
 	if (type == 0)
 		return load_now;
 
-	return max(rq->cpu_load[type-1], load_now);
+	if (idle == NOT_IDLE) {
+		cpu_load *= rq->prio_bias;
+		load_now *= rq->prio_bias;
+	}
+	return max(cpu_load, load_now);
+}
+
+static inline unsigned long target_load(int cpu, int type)
+{
+	return __target_load(cpu, type, NOT_IDLE);
 }
 
 /*
@@ -1411,7 +1471,7 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
 				list_add_tail(&p->run_list, &current->run_list);
 				p->array = current->array;
 				p->array->nr_active++;
-				rq->nr_running++;
+				inc_nr_running(p, rq);
 			}
 			set_need_resched();
 		} else
@@ -1756,9 +1816,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
 	       runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	dec_nr_running(p, src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
 	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
 				+ this_rq->timestamp_last_tick;
@@ -1937,9 +1997,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 			/* Bias balancing toward cpus of our domain */
 			if (local_group)
-				load = target_load(i, load_idx);
+				load = __target_load(i, load_idx, idle);
 			else
-				load = source_load(i, load_idx);
+				load = __source_load(i, load_idx, idle);
 
 			avg_load += load;
 		}
@@ -2044,14 +2104,15 @@ out_balanced:
 /*
  * find_busiest_queue - find the busiest runqueue among the cpus in group.
  */
-static runqueue_t *find_busiest_queue(struct sched_group *group)
+static runqueue_t *find_busiest_queue(struct sched_group *group,
+	enum idle_type idle)
 {
 	unsigned long load, max_load = 0;
 	runqueue_t *busiest = NULL;
 	int i;
 
 	for_each_cpu_mask(i, group->cpumask) {
-		load = source_load(i, 0);
+		load = __source_load(i, 0, idle);
 
 		if (load > max_load) {
 			max_load = load;
@@ -2095,7 +2156,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, idle);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[idle]);
 		goto out_balanced;
@@ -2218,7 +2279,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
 		goto out_balanced;
 	}
 
-	busiest = find_busiest_queue(group);
+	busiest = find_busiest_queue(group, NEWLY_IDLE);
 	if (!busiest) {
 		schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
 		goto out_balanced;
@@ -3447,7 +3508,9 @@ void set_user_nice(task_t *p, long nice)
 	 * not SCHED_NORMAL:
 	 */
 	if (rt_task(p)) {
+		dec_prio_bias(rq, p->static_prio);
 		p->static_prio = NICE_TO_PRIO(nice);
+		inc_prio_bias(rq, p->static_prio);
 		goto out_unlock;
 	}
 	array = p->array;
@@ -3457,7 +3520,9 @@ void set_user_nice(task_t *p, long nice)
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
+	dec_prio_bias(rq, p->static_prio);
 	p->static_prio = NICE_TO_PRIO(nice);
+	inc_prio_bias(rq, p->static_prio);
 	p->prio += delta;
 
 	if (array) {
-- 
cgit v1.2.3


From 738a2ccbcf8c2c1b039f1e76662dce60b22b694b Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Tue, 8 Nov 2005 21:38:56 -0800
Subject: [PATCH] sched: change prio bias only if queued

prio_bias should only be adjusted in set_user_nice if p is actually currently
queued.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c6827f94e156..e1f57bd5aaa6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3508,25 +3508,24 @@ void set_user_nice(task_t *p, long nice)
 	 * not SCHED_NORMAL:
 	 */
 	if (rt_task(p)) {
-		dec_prio_bias(rq, p->static_prio);
 		p->static_prio = NICE_TO_PRIO(nice);
-		inc_prio_bias(rq, p->static_prio);
 		goto out_unlock;
 	}
 	array = p->array;
-	if (array)
+	if (array) {
 		dequeue_task(p, array);
+		dec_prio_bias(rq, p->static_prio);
+	}
 
 	old_prio = p->prio;
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
-	dec_prio_bias(rq, p->static_prio);
 	p->static_prio = NICE_TO_PRIO(nice);
-	inc_prio_bias(rq, p->static_prio);
 	p->prio += delta;
 
 	if (array) {
 		enqueue_task(p, array);
+		inc_prio_bias(rq, p->static_prio);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
-- 
cgit v1.2.3


From dad1c65c8000f4485d8602e1875ded77e0d72133 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Tue, 8 Nov 2005 21:38:57 -0800
Subject: [PATCH] sched: account rt tasks in prio_bias()

Real time tasks' effect on prio_bias should be based on their real time
priority level instead of their static_prio which is based on nice.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e1f57bd5aaa6..d9dbf8ee6ca4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -661,21 +661,21 @@ static int effective_prio(task_t *p)
 }
 
 #ifdef CONFIG_SMP
-static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
 {
-	rq->prio_bias += MAX_PRIO - static_prio;
+	rq->prio_bias += MAX_PRIO - prio;
 }
 
-static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
 {
-	rq->prio_bias -= MAX_PRIO - static_prio;
+	rq->prio_bias -= MAX_PRIO - prio;
 }
 #else
-static inline void inc_prio_bias(runqueue_t *rq, int static_prio)
+static inline void inc_prio_bias(runqueue_t *rq, int prio)
 {
 }
 
-static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
+static inline void dec_prio_bias(runqueue_t *rq, int prio)
 {
 }
 #endif
@@ -683,13 +683,19 @@ static inline void dec_prio_bias(runqueue_t *rq, int static_prio)
 static inline void inc_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running++;
-	inc_prio_bias(rq, p->static_prio);
+	if (rt_task(p))
+		inc_prio_bias(rq, p->prio);
+	else
+		inc_prio_bias(rq, p->static_prio);
 }
 
 static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running--;
-	dec_prio_bias(rq, p->static_prio);
+	if (rt_task(p))
+		dec_prio_bias(rq, p->prio);
+	else
+		dec_prio_bias(rq, p->static_prio);
 }
 
 /*
-- 
cgit v1.2.3


From 3b0bd9bc6f3b8a47853d1b1de4520de3878e8941 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Tue, 8 Nov 2005 21:38:58 -0800
Subject: [PATCH] sched: smp nice bias busy queues on idle rebalance

To intensify the 'nice' support across physical cpus on SMP we can bias the
loads on idle rebalancing. To prevent idle rebalance from trying to pull tasks
from queues that appear heavily loaded we only bias the load if there is more
than one task running.

Add some minor micro-optimisations and have only one return from __source_load
and __target_load functions.

Fix the fact that target_load was not biased by priority when type == 0.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d9dbf8ee6ca4..ec9ea9119b98 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -972,22 +972,26 @@ void kick_process(task_t *p)
 static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long cpu_load = rq->cpu_load[type-1],
+	unsigned long source_load, cpu_load = rq->cpu_load[type-1],
 		load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
-	if (idle == NOT_IDLE) {
+	if (type == 0)
+		source_load = load_now;
+	else
+		source_load = min(cpu_load, load_now);
+
+	if (idle == NOT_IDLE || rq->nr_running > 1)
 		/*
-		 * If we are balancing busy runqueues the load is biased by
-		 * priority to create 'nice' support across cpus.
+		 * If we are busy rebalancing the load is biased by
+		 * priority to create 'nice' support across cpus. When
+		 * idle rebalancing we should only bias the source_load if
+		 * there is more than one task running on that queue to
+		 * prevent idle rebalance from trying to pull tasks from a
+		 * queue with only one running task.
 		 */
-		cpu_load *= rq->prio_bias;
-		load_now *= rq->prio_bias;
-	}
+		source_load *= rq->prio_bias;
 
-	if (type == 0)
-		return load_now;
-
-	return min(cpu_load, load_now);
+	return source_load;
 }
 
 static inline unsigned long source_load(int cpu, int type)
@@ -1001,17 +1005,18 @@ static inline unsigned long source_load(int cpu, int type)
 static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
-	unsigned long cpu_load = rq->cpu_load[type-1],
+	unsigned long target_load, cpu_load = rq->cpu_load[type-1],
 		load_now = rq->nr_running * SCHED_LOAD_SCALE;
 
 	if (type == 0)
-		return load_now;
+		target_load = load_now;
+	else
+		target_load = max(cpu_load, load_now);
 
-	if (idle == NOT_IDLE) {
-		cpu_load *= rq->prio_bias;
-		load_now *= rq->prio_bias;
-	}
-	return max(cpu_load, load_now);
+	if (idle == NOT_IDLE || rq->nr_running > 1)
+		target_load *= rq->prio_bias;
+
+	return target_load;
 }
 
 static inline unsigned long target_load(int cpu, int type)
-- 
cgit v1.2.3


From 6dd4a85bb3ee0715415892c8b0f2a9bd08d31ca4 Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Tue, 8 Nov 2005 21:38:59 -0800
Subject: [PATCH] sched: correct smp_nice_bias

The priority biasing was off by mutliplying the total load by the total
priority bias and this ruins the ratio of loads between runqueues. This
patch should correct the ratios of loads between runqueues to be proportional
to overall load. -2nd attempt.

From: Dave Kleikamp <shaggy@austin.ibm.com>

  This patch fixes a divide-by-zero error that I hit on a two-way i386
  machine.  rq->nr_running is tested to be non-zero, but may change by the
  time it is used in the division.  Saving the value to a local variable
  ensures that the same value that is checked is used in the division.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ec9ea9119b98..502d47c883b6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -972,15 +972,16 @@ void kick_process(task_t *p)
 static inline unsigned long __source_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long running = rq->nr_running;
 	unsigned long source_load, cpu_load = rq->cpu_load[type-1],
-		load_now = rq->nr_running * SCHED_LOAD_SCALE;
+		load_now = running * SCHED_LOAD_SCALE;
 
 	if (type == 0)
 		source_load = load_now;
 	else
 		source_load = min(cpu_load, load_now);
 
-	if (idle == NOT_IDLE || rq->nr_running > 1)
+	if (running > 1 || (idle == NOT_IDLE && running))
 		/*
 		 * If we are busy rebalancing the load is biased by
 		 * priority to create 'nice' support across cpus. When
@@ -989,7 +990,7 @@ static inline unsigned long __source_load(int cpu, int type, enum idle_type idle
 		 * prevent idle rebalance from trying to pull tasks from a
 		 * queue with only one running task.
 		 */
-		source_load *= rq->prio_bias;
+		source_load = source_load * rq->prio_bias / running;
 
 	return source_load;
 }
@@ -1005,16 +1006,17 @@ static inline unsigned long source_load(int cpu, int type)
 static inline unsigned long __target_load(int cpu, int type, enum idle_type idle)
 {
 	runqueue_t *rq = cpu_rq(cpu);
+	unsigned long running = rq->nr_running;
 	unsigned long target_load, cpu_load = rq->cpu_load[type-1],
-		load_now = rq->nr_running * SCHED_LOAD_SCALE;
+		load_now = running * SCHED_LOAD_SCALE;
 
 	if (type == 0)
 		target_load = load_now;
 	else
 		target_load = max(cpu_load, load_now);
 
-	if (idle == NOT_IDLE || rq->nr_running > 1)
-		target_load *= rq->prio_bias;
+	if (running > 1 || (idle == NOT_IDLE && running))
+		target_load = target_load * rq->prio_bias / running;
 
 	return target_load;
 }
-- 
cgit v1.2.3


From ede3d0fba99520f268067917b50858d788bc41da Mon Sep 17 00:00:00 2001
From: Con Kolivas <kernel@kolivas.org>
Date: Tue, 8 Nov 2005 21:39:00 -0800
Subject: [PATCH] sched: consider migration thread with smp nice

The intermittent scheduling of the migration thread at ultra high priority
makes the smp nice handling see that runqueue as being heavily loaded.  The
migration thread itself actually handles the balancing so its influence on
priority balancing should be ignored.

Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 502d47c883b6..0f2def822296 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -670,6 +670,31 @@ static inline void dec_prio_bias(runqueue_t *rq, int prio)
 {
 	rq->prio_bias -= MAX_PRIO - prio;
 }
+
+static inline void inc_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running++;
+	if (rt_task(p)) {
+		if (p != rq->migration_thread)
+			/*
+			 * The migration thread does the actual balancing. Do
+			 * not bias by its priority as the ultra high priority
+			 * will skew balancing adversely.
+			 */
+			inc_prio_bias(rq, p->prio);
+	} else
+		inc_prio_bias(rq, p->static_prio);
+}
+
+static inline void dec_nr_running(task_t *p, runqueue_t *rq)
+{
+	rq->nr_running--;
+	if (rt_task(p)) {
+		if (p != rq->migration_thread)
+			dec_prio_bias(rq, p->prio);
+	} else
+		dec_prio_bias(rq, p->static_prio);
+}
 #else
 static inline void inc_prio_bias(runqueue_t *rq, int prio)
 {
@@ -678,25 +703,17 @@ static inline void inc_prio_bias(runqueue_t *rq, int prio)
 static inline void dec_prio_bias(runqueue_t *rq, int prio)
 {
 }
-#endif
 
 static inline void inc_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running++;
-	if (rt_task(p))
-		inc_prio_bias(rq, p->prio);
-	else
-		inc_prio_bias(rq, p->static_prio);
 }
 
 static inline void dec_nr_running(task_t *p, runqueue_t *rq)
 {
 	rq->nr_running--;
-	if (rt_task(p))
-		dec_prio_bias(rq, p->prio);
-	else
-		dec_prio_bias(rq, p->static_prio);
 }
+#endif
 
 /*
  * __activate_task - move a task to the runqueue.
-- 
cgit v1.2.3


From 64c7c8f88559624abdbe12b5da6502e8879f8d28 Mon Sep 17 00:00:00 2001
From: Nick Piggin <nickpiggin@yahoo.com.au>
Date: Tue, 8 Nov 2005 21:39:04 -0800
Subject: [PATCH] sched: resched and cpu_idle rework

Make some changes to the NEED_RESCHED and POLLING_NRFLAG to reduce
confusion, and make their semantics rigid.  Improves efficiency of
resched_task and some cpu_idle routines.

* In resched_task:
- TIF_NEED_RESCHED is only cleared with the task's runqueue lock held,
  and as we hold it during resched_task, then there is no need for an
  atomic test and set there. The only other time this should be set is
  when the task's quantum expires, in the timer interrupt - this is
  protected against because the rq lock is irq-safe.

- If TIF_NEED_RESCHED is set, then we don't need to do anything. It
  won't get unset until the task get's schedule()d off.

- If we are running on the same CPU as the task we resched, then set
  TIF_NEED_RESCHED and no further action is required.

- If we are running on another CPU, and TIF_POLLING_NRFLAG is *not* set
  after TIF_NEED_RESCHED has been set, then we need to send an IPI.

Using these rules, we are able to remove the test and set operation in
resched_task, and make clear the previously vague semantics of
POLLING_NRFLAG.

* In idle routines:
- Enter cpu_idle with preempt disabled. When the need_resched() condition
  becomes true, explicitly call schedule(). This makes things a bit clearer
  (IMO), but haven't updated all architectures yet.

- Many do a test and clear of TIF_NEED_RESCHED for some reason. According
  to the resched_task rules, this isn't needed (and actually breaks the
  assumption that TIF_NEED_RESCHED is only cleared with the runqueue lock
  held). So remove that. Generally one less locked memory op when switching
  to the idle thread.

- Many idle routines clear TIF_POLLING_NRFLAG, and only set it in the inner
  most polling idle loops. The above resched_task semantics allow it to be
  set until before the last time need_resched() is checked before going into
  a halt requiring interrupt wakeup.

  Many idle routines simply never enter such a halt, and so POLLING_NRFLAG
  can be always left set, completely eliminating resched IPIs when rescheduling
  the idle task.

  POLLING_NRFLAG width can be increased, to reduce the chance of resched IPIs.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/sched-arch.txt           | 89 ++++++++++++++++++++++++++++++++++
 arch/alpha/kernel/process.c            | 10 ++--
 arch/arm/kernel/process.c              | 14 ++++--
 arch/i386/kernel/apm.c                 | 20 +++++++-
 arch/i386/kernel/process.c             | 64 +++++++++++-------------
 arch/ia64/kernel/process.c             | 32 ++++++------
 arch/parisc/kernel/process.c           |  2 +
 arch/powerpc/platforms/iseries/setup.c | 10 +---
 arch/powerpc/platforms/pseries/setup.c | 12 ++---
 arch/ppc/kernel/idle.c                 | 20 ++++----
 arch/ppc64/kernel/idle.c               | 11 +----
 arch/s390/kernel/process.c             | 13 ++---
 arch/sh/kernel/process.c               | 12 ++---
 arch/sh64/kernel/process.c             | 14 ++----
 arch/sparc/kernel/process.c            | 35 ++++++-------
 arch/sparc64/kernel/process.c          | 20 +++++---
 arch/sparc64/kernel/smp.c              | 13 +----
 arch/x86_64/kernel/process.c           | 67 ++++++++++++-------------
 drivers/acpi/processor_idle.c          | 37 ++++++++------
 kernel/sched.c                         | 21 +++++---
 20 files changed, 296 insertions(+), 220 deletions(-)
 create mode 100644 Documentation/sched-arch.txt

(limited to 'kernel')

diff --git a/Documentation/sched-arch.txt b/Documentation/sched-arch.txt
new file mode 100644
index 000000000000..941615a9769b
--- /dev/null
+++ b/Documentation/sched-arch.txt
@@ -0,0 +1,89 @@
+	CPU Scheduler implementation hints for architecture specific code
+
+	Nick Piggin, 2005
+
+Context switch
+==============
+1. Runqueue locking
+By default, the switch_to arch function is called with the runqueue
+locked. This is usually not a problem unless switch_to may need to
+take the runqueue lock. This is usually due to a wake up operation in
+the context switch. See include/asm-ia64/system.h for an example.
+
+To request the scheduler call switch_to with the runqueue unlocked,
+you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file
+(typically the one where switch_to is defined).
+
+Unlocked context switches introduce only a very minor performance
+penalty to the core scheduler implementation in the CONFIG_SMP case.
+
+2. Interrupt status
+By default, the switch_to arch function is called with interrupts
+disabled. Interrupts may be enabled over the call if it is likely to
+introduce a significant interrupt latency by adding the line
+`#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
+unlocked context switches. This define also implies
+`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an
+example.
+
+
+CPU idle
+========
+Your cpu_idle routines need to obey the following rules:
+
+1. Preempt should now disabled over idle routines. Should only
+   be enabled to call schedule() then disabled again.
+
+2. need_resched/TIF_NEED_RESCHED is only ever set, and will never
+   be cleared until the running task has called schedule(). Idle
+   threads need only ever query need_resched, and may never set or
+   clear it.
+
+3. When cpu_idle finds (need_resched() == 'true'), it should call
+   schedule(). It should not call schedule() otherwise.
+
+4. The only time interrupts need to be disabled when checking
+   need_resched is if we are about to sleep the processor until
+   the next interrupt (this doesn't provide any protection of
+   need_resched, it prevents losing an interrupt).
+
+	4a. Common problem with this type of sleep appears to be:
+	        local_irq_disable();
+	        if (!need_resched()) {
+	                local_irq_enable();
+	                *** resched interrupt arrives here ***
+	                __asm__("sleep until next interrupt");
+	        }
+
+5. TIF_POLLING_NRFLAG can be set by idle routines that do not
+   need an interrupt to wake them up when need_resched goes high.
+   In other words, they must be periodically polling need_resched,
+   although it may be reasonable to do some background work or enter
+   a low CPU priority.
+
+   	5a. If TIF_POLLING_NRFLAG is set, and we do decide to enter
+	    an interrupt sleep, it needs to be cleared then a memory
+	    barrier issued (followed by a test of need_resched with
+	    interrupts disabled, as explained in 3).
+
+arch/i386/kernel/process.c has examples of both polling and
+sleeping idle functions.
+
+
+Possible arch/ problems
+=======================
+
+Possible arch problems I found (and either tried to fix or didn't):
+
+h8300 - Is such sleeping racy vs interrupts? (See #4a).
+        The H8/300 manual I found indicates yes, however disabling IRQs
+        over the sleep mean only NMIs can wake it up, so can't fix easily
+        without doing spin waiting.
+
+ia64 - is safe_halt call racy vs interrupts? (does it sleep?) (See #4a)
+
+sh64 - Is sleeping racy vs interrupts? (See #4a)
+
+sparc - IRQs on at this point(?), change local_irq_save to _disable.
+      - TODO: needs secondary CPUs to disable preempt (See #1)
+
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index eb20c3afff58..a8682612abc0 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -43,21 +43,17 @@
 #include "proto.h"
 #include "pci_impl.h"
 
-void default_idle(void)
-{
-	barrier();
-}
-
 void
 cpu_idle(void)
 {
+	set_thread_flag(TIF_POLLING_NRFLAG);
+
 	while (1) {
-		void (*idle)(void) = default_idle;
 		/* FIXME -- EV6 and LCA45 know how to power down
 		   the CPU.  */
 
 		while (!need_resched())
-			idle();
+			cpu_relax();
 		schedule();
 	}
 }
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 93dd92cc12f8..c0f6a119de3b 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -86,12 +86,16 @@ EXPORT_SYMBOL(pm_power_off);
  */
 void default_idle(void)
 {
-	local_irq_disable();
-	if (!need_resched() && !hlt_counter) {
-		timer_dyn_reprogram();
-		arch_idle();
+	if (hlt_counter)
+		cpu_relax();
+	else {
+		local_irq_disable();
+		if (!need_resched()) {
+			timer_dyn_reprogram();
+			arch_idle();
+		}
+		local_irq_enable();
 	}
-	local_irq_enable();
 }
 
 /*
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 86e80c551478..003548b8735f 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -769,8 +769,26 @@ static int set_system_power_state(u_short state)
 static int apm_do_idle(void)
 {
 	u32	eax;
+	u8	ret = 0;
+	int	idled = 0;
+	int	polling;
+
+	polling = test_thread_flag(TIF_POLLING_NRFLAG);
+	if (polling) {
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+		smp_mb__after_clear_bit();
+	}
+	if (!need_resched()) {
+		idled = 1;
+		ret = apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax);
+	}
+	if (polling)
+		set_thread_flag(TIF_POLLING_NRFLAG);
+
+	if (!idled)
+		return 0;
 
-	if (apm_bios_call_simple(APM_FUNC_IDLE, 0, 0, &eax)) {
+	if (ret) {
 		static unsigned long t;
 
 		/* This always fails on some SMP boards running UP kernels.
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 5296e284ea36..1cb261f225d5 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -99,14 +99,22 @@ EXPORT_SYMBOL(enable_hlt);
  */
 void default_idle(void)
 {
+	local_irq_enable();
+
 	if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
-		local_irq_disable();
-		if (!need_resched())
-			safe_halt();
-		else
-			local_irq_enable();
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+		smp_mb__after_clear_bit();
+		while (!need_resched()) {
+			local_irq_disable();
+			if (!need_resched())
+				safe_halt();
+			else
+				local_irq_enable();
+		}
+		set_thread_flag(TIF_POLLING_NRFLAG);
 	} else {
-		cpu_relax();
+		while (!need_resched())
+			cpu_relax();
 	}
 }
 #ifdef CONFIG_APM_MODULE
@@ -120,29 +128,14 @@ EXPORT_SYMBOL(default_idle);
  */
 static void poll_idle (void)
 {
-	int oldval;
-
 	local_irq_enable();
 
-	/*
-	 * Deal with another CPU just having chosen a thread to
-	 * run here:
-	 */
-	oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
-	if (!oldval) {
-		set_thread_flag(TIF_POLLING_NRFLAG);
-		asm volatile(
-			"2:"
-			"testl %0, %1;"
-			"rep; nop;"
-			"je 2b;"
-			: : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
-
-		clear_thread_flag(TIF_POLLING_NRFLAG);
-	} else {
-		set_need_resched();
-	}
+	asm volatile(
+		"2:"
+		"testl %0, %1;"
+		"rep; nop;"
+		"je 2b;"
+		: : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -181,6 +174,8 @@ void cpu_idle(void)
 {
 	int cpu = smp_processor_id();
 
+	set_thread_flag(TIF_POLLING_NRFLAG);
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		while (!need_resched()) {
@@ -246,15 +241,12 @@ static void mwait_idle(void)
 {
 	local_irq_enable();
 
-	if (!need_resched()) {
-		set_thread_flag(TIF_POLLING_NRFLAG);
-		do {
-			__monitor((void *)&current_thread_info()->flags, 0, 0);
-			if (need_resched())
-				break;
-			__mwait(0, 0);
-		} while (!need_resched());
-		clear_thread_flag(TIF_POLLING_NRFLAG);
+	while (!need_resched()) {
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (need_resched())
+			break;
+		__mwait(0, 0);
 	}
 }
 
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index 4c621fc3c3b9..640d6908f8ec 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -197,11 +197,15 @@ void
 default_idle (void)
 {
 	local_irq_enable();
-	while (!need_resched())
-		if (can_do_pal_halt)
-			safe_halt();
-		else
+	while (!need_resched()) {
+		if (can_do_pal_halt) {
+			local_irq_disable();
+			if (!need_resched())
+				safe_halt();
+			local_irq_enable();
+		} else
 			cpu_relax();
+	}
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -263,16 +267,16 @@ void __attribute__((noreturn))
 cpu_idle (void)
 {
 	void (*mark_idle)(int) = ia64_mark_idle;
+  	int cpu = smp_processor_id();
+	set_thread_flag(TIF_POLLING_NRFLAG);
 
 	/* endless idle loop with no priority at all */
 	while (1) {
+		if (!need_resched()) {
+			void (*idle)(void);
 #ifdef CONFIG_SMP
-		if (!need_resched())
 			min_xtp();
 #endif
-		while (!need_resched()) {
-			void (*idle)(void);
-
 			if (__get_cpu_var(cpu_idle_state))
 				__get_cpu_var(cpu_idle_state) = 0;
 
@@ -284,19 +288,17 @@ cpu_idle (void)
 			if (!idle)
 				idle = default_idle;
 			(*idle)();
-		}
-
-		if (mark_idle)
-			(*mark_idle)(0);
-
+			if (mark_idle)
+				(*mark_idle)(0);
 #ifdef CONFIG_SMP
-		normal_xtp();
+			normal_xtp();
 #endif
+		}
 		preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
 		check_pgt_cache();
-		if (cpu_is_offline(smp_processor_id()))
+		if (cpu_is_offline(cpu))
 			play_dead();
 	}
 }
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index f482f78de435..fee4f1f09adc 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -88,6 +88,8 @@ void default_idle(void)
  */
 void cpu_idle(void)
 {
+	set_thread_flag(TIF_POLLING_NRFLAG);
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		while (!need_resched())
diff --git a/arch/powerpc/platforms/iseries/setup.c b/arch/powerpc/platforms/iseries/setup.c
index 0130f2619dac..7f8f0cda6a74 100644
--- a/arch/powerpc/platforms/iseries/setup.c
+++ b/arch/powerpc/platforms/iseries/setup.c
@@ -703,13 +703,10 @@ static void iseries_shared_idle(void)
 static void iseries_dedicated_idle(void)
 {
 	long oldval;
+	set_thread_flag(TIF_POLLING_NRFLAG);
 
 	while (1) {
-		oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
-		if (!oldval) {
-			set_thread_flag(TIF_POLLING_NRFLAG);
-
+		if (!need_resched()) {
 			while (!need_resched()) {
 				ppc64_runlatch_off();
 				HMT_low();
@@ -722,9 +719,6 @@ static void iseries_dedicated_idle(void)
 			}
 
 			HMT_medium();
-			clear_thread_flag(TIF_POLLING_NRFLAG);
-		} else {
-			set_need_resched();
 		}
 
 		ppc64_runlatch_on();
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 4854f5eb5c3d..a093a0d4dd69 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -469,6 +469,7 @@ static inline void dedicated_idle_sleep(unsigned int cpu)
 		 * more.
 		 */
 		clear_thread_flag(TIF_POLLING_NRFLAG);
+		smp_mb__after_clear_bit();
 
 		/*
 		 * SMT dynamic mode. Cede will result in this thread going
@@ -481,6 +482,7 @@ static inline void dedicated_idle_sleep(unsigned int cpu)
 			cede_processor();
 		else
 			local_irq_enable();
+		set_thread_flag(TIF_POLLING_NRFLAG);
 	} else {
 		/*
 		 * Give the HV an opportunity at the processor, since we are
@@ -492,11 +494,11 @@ static inline void dedicated_idle_sleep(unsigned int cpu)
 
 static void pseries_dedicated_idle(void)
 { 
-	long oldval;
 	struct paca_struct *lpaca = get_paca();
 	unsigned int cpu = smp_processor_id();
 	unsigned long start_snooze;
 	unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
+	set_thread_flag(TIF_POLLING_NRFLAG);
 
 	while (1) {
 		/*
@@ -505,10 +507,7 @@ static void pseries_dedicated_idle(void)
 		 */
 		lpaca->lppaca.idle = 1;
 
-		oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-		if (!oldval) {
-			set_thread_flag(TIF_POLLING_NRFLAG);
-
+		if (!need_resched()) {
 			start_snooze = __get_tb() +
 				*smt_snooze_delay * tb_ticks_per_usec;
 
@@ -531,9 +530,6 @@ static void pseries_dedicated_idle(void)
 			}
 
 			HMT_medium();
-			clear_thread_flag(TIF_POLLING_NRFLAG);
-		} else {
-			set_need_resched();
 		}
 
 		lpaca->lppaca.idle = 0;
diff --git a/arch/ppc/kernel/idle.c b/arch/ppc/kernel/idle.c
index a6141f05c919..3c4e4cb61074 100644
--- a/arch/ppc/kernel/idle.c
+++ b/arch/ppc/kernel/idle.c
@@ -63,18 +63,18 @@ void cpu_idle(void)
 	int cpu = smp_processor_id();
 
 	for (;;) {
-		if (ppc_md.idle != NULL)
-			ppc_md.idle();
-		else
-			default_idle();
-		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
-			cpu_die();
-		if (need_resched()) {
-			preempt_enable_no_resched();
-			schedule();
-			preempt_disable();
+		while (need_resched()) {
+			if (ppc_md.idle != NULL)
+				ppc_md.idle();
+			else
+				default_idle();
 		}
 
+		if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
+			cpu_die();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
 	}
 }
 
diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c
index 909ea669af91..715bc0e71e0f 100644
--- a/arch/ppc64/kernel/idle.c
+++ b/arch/ppc64/kernel/idle.c
@@ -34,15 +34,11 @@ extern void power4_idle(void);
 
 void default_idle(void)
 {
-	long oldval;
 	unsigned int cpu = smp_processor_id();
+	set_thread_flag(TIF_POLLING_NRFLAG);
 
 	while (1) {
-		oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
-		if (!oldval) {
-			set_thread_flag(TIF_POLLING_NRFLAG);
-
+		if (!need_resched()) {
 			while (!need_resched() && !cpu_is_offline(cpu)) {
 				ppc64_runlatch_off();
 
@@ -55,9 +51,6 @@ void default_idle(void)
 			}
 
 			HMT_medium();
-			clear_thread_flag(TIF_POLLING_NRFLAG);
-		} else {
-			set_need_resched();
 		}
 
 		ppc64_runlatch_on();
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 66ca5757e368..78b64fe5e7c2 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -99,14 +99,15 @@ void default_idle(void)
 {
 	int cpu, rc;
 
+	/* CPU is going idle. */
+	cpu = smp_processor_id();
+
 	local_irq_disable();
-        if (need_resched()) {
+	if (need_resched()) {
 		local_irq_enable();
-                return;
-        }
+		return;
+	}
 
-	/* CPU is going idle. */
-	cpu = smp_processor_id();
 	rc = notifier_call_chain(&idle_chain, CPU_IDLE, (void *)(long) cpu);
 	if (rc != NOTIFY_OK && rc != NOTIFY_DONE)
 		BUG();
@@ -119,7 +120,7 @@ void default_idle(void)
 	__ctl_set_bit(8, 15);
 
 #ifdef CONFIG_HOTPLUG_CPU
-	if (cpu_is_offline(smp_processor_id()))
+	if (cpu_is_offline(cpu))
 		cpu_die();
 #endif
 
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index 1cbc26b796ad..fd4f240b833d 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -51,14 +51,13 @@ void enable_hlt(void)
 
 EXPORT_SYMBOL(enable_hlt);
 
-void default_idle(void)
+void cpu_idle(void)
 {
 	/* endless idle loop with no priority at all */
 	while (1) {
 		if (hlt_counter) {
-			while (1)
-				if (need_resched())
-					break;
+			while (!need_resched())
+				cpu_relax();
 		} else {
 			while (!need_resched())
 				cpu_sleep();
@@ -70,11 +69,6 @@ void default_idle(void)
 	}
 }
 
-void cpu_idle(void)
-{
-	default_idle();
-}
-
 void machine_restart(char * __unused)
 {
 	/* SR.BL=1 and invoke address error to let CPU reset (manual reset) */
diff --git a/arch/sh64/kernel/process.c b/arch/sh64/kernel/process.c
index 0c09537449b3..b95d04141855 100644
--- a/arch/sh64/kernel/process.c
+++ b/arch/sh64/kernel/process.c
@@ -307,23 +307,19 @@ __setup("hlt", hlt_setup);
 
 static inline void hlt(void)
 {
-	if (hlt_counter)
-		return;
-
 	__asm__ __volatile__ ("sleep" : : : "memory");
 }
 
 /*
  * The idle loop on a uniprocessor SH..
  */
-void default_idle(void)
+void cpu_idle(void)
 {
 	/* endless idle loop with no priority at all */
 	while (1) {
 		if (hlt_counter) {
-			while (1)
-				if (need_resched())
-					break;
+			while (!need_resched())
+				cpu_relax();
 		} else {
 			local_irq_disable();
 			while (!need_resched()) {
@@ -338,11 +334,7 @@ void default_idle(void)
 		schedule();
 		preempt_disable();
 	}
-}
 
-void cpu_idle(void)
-{
-	default_idle();
 }
 
 void machine_restart(char * __unused)
diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c
index c39f4d01096d..ea8647411462 100644
--- a/arch/sparc/kernel/process.c
+++ b/arch/sparc/kernel/process.c
@@ -67,13 +67,6 @@ extern void fpsave(unsigned long *, unsigned long *, void *, unsigned long *);
 struct task_struct *last_task_used_math = NULL;
 struct thread_info *current_set[NR_CPUS];
 
-/*
- * default_idle is new in 2.5. XXX Review, currently stolen from sparc64.
- */
-void default_idle(void)
-{
-}
-
 #ifndef CONFIG_SMP
 
 #define SUN4C_FAULT_HIGH 100
@@ -92,12 +85,11 @@ void cpu_idle(void)
 			static unsigned long fps;
 			unsigned long now;
 			unsigned long faults;
-			unsigned long flags;
 
 			extern unsigned long sun4c_kernel_faults;
 			extern void sun4c_grow_kernel_ring(void);
 
-			local_irq_save(flags);
+			local_irq_disable();
 			now = jiffies;
 			count -= (now - last_jiffies);
 			last_jiffies = now;
@@ -113,13 +105,16 @@ void cpu_idle(void)
 					sun4c_grow_kernel_ring();
 				}
 			}
-			local_irq_restore(flags);
+			local_irq_enable();
 		}
 
-		while((!need_resched()) && pm_idle) {
-			(*pm_idle)();
+		if (pm_idle) {
+			while (!need_resched())
+				(*pm_idle)();
+		} else {
+			while (!need_resched())
+				cpu_relax();
 		}
-
 		preempt_enable_no_resched();
 		schedule();
 		preempt_disable();
@@ -132,15 +127,15 @@ void cpu_idle(void)
 /* This is being executed in task 0 'user space'. */
 void cpu_idle(void)
 {
+        set_thread_flag(TIF_POLLING_NRFLAG);
 	/* endless idle loop with no priority at all */
 	while(1) {
-		if(need_resched()) {
-			preempt_enable_no_resched();
-			schedule();
-			preempt_disable();
-			check_pgt_cache();
-		}
-		barrier(); /* or else gcc optimizes... */
+		while (!need_resched())
+			cpu_relax();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
+		check_pgt_cache();
 	}
 }
 
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 2f89206e008f..02f9dec1d459 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -85,23 +85,31 @@ void cpu_idle(void)
 
 /*
  * the idle loop on a UltraMultiPenguin...
+ *
+ * TIF_POLLING_NRFLAG is set because we do not sleep the cpu
+ * inside of the idler task, so an interrupt is not needed
+ * to get a clean fast response.
+ *
+ * XXX Reverify this assumption... -DaveM
+ *
+ * Addendum: We do want it to do something for the signal
+ *           delivery case, we detect that by just seeing
+ *           if we are trying to send this to an idler or not.
  */
-#define idle_me_harder()	(cpu_data(smp_processor_id()).idle_volume += 1)
-#define unidle_me()		(cpu_data(smp_processor_id()).idle_volume = 0)
 void cpu_idle(void)
 {
+	cpuinfo_sparc *cpuinfo = &local_cpu_data();
 	set_thread_flag(TIF_POLLING_NRFLAG);
+
 	while(1) {
 		if (need_resched()) {
-			unidle_me();
-			clear_thread_flag(TIF_POLLING_NRFLAG);
+			cpuinfo->idle_volume = 0;
 			preempt_enable_no_resched();
 			schedule();
 			preempt_disable();
-			set_thread_flag(TIF_POLLING_NRFLAG);
 			check_pgt_cache();
 		}
-		idle_me_harder();
+		cpuinfo->idle_volume++;
 
 		/* The store ordering is so that IRQ handlers on
 		 * other cpus see our increasing idleness for the buddy
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index 8aca4b1dc04e..797a65493fb8 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -1152,20 +1152,9 @@ void __init smp_cpus_done(unsigned int max_cpus)
 	       (bogosum/(5000/HZ))%100);
 }
 
-/* This needn't do anything as we do not sleep the cpu
- * inside of the idler task, so an interrupt is not needed
- * to get a clean fast response.
- *
- * XXX Reverify this assumption... -DaveM
- *
- * Addendum: We do want it to do something for the signal
- *           delivery case, we detect that by just seeing
- *           if we are trying to send this to an idler or not.
- */
 void smp_send_reschedule(int cpu)
 {
-	if (cpu_data(cpu).idle_volume == 0)
-		smp_receive_signal(cpu);
+	smp_receive_signal(cpu);
 }
 
 /* This is a nop because we capture all other cpus
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 571f9fe490ce..59be85d9a4bc 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -86,12 +86,22 @@ EXPORT_SYMBOL(enable_hlt);
  */
 void default_idle(void)
 {
+	local_irq_enable();
+
 	if (!atomic_read(&hlt_counter)) {
-		local_irq_disable();
-		if (!need_resched())
-			safe_halt();
-		else
-			local_irq_enable();
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+		smp_mb__after_clear_bit();
+		while (!need_resched()) {
+			local_irq_disable();
+			if (!need_resched())
+				safe_halt();
+			else
+				local_irq_enable();
+		}
+		set_thread_flag(TIF_POLLING_NRFLAG);
+	} else {
+		while (!need_resched())
+			cpu_relax();
 	}
 }
 
@@ -102,30 +112,16 @@ void default_idle(void)
  */
 static void poll_idle (void)
 {
-	int oldval;
-
 	local_irq_enable();
 
-	/*
-	 * Deal with another CPU just having chosen a thread to
-	 * run here:
-	 */
-	oldval = test_and_clear_thread_flag(TIF_NEED_RESCHED);
-
-	if (!oldval) {
-		set_thread_flag(TIF_POLLING_NRFLAG); 
-		asm volatile(
-			"2:"
-			"testl %0,%1;"
-			"rep; nop;"
-			"je 2b;"
-			: :
-			"i" (_TIF_NEED_RESCHED), 
-			"m" (current_thread_info()->flags));
-		clear_thread_flag(TIF_POLLING_NRFLAG);
-	} else {
-		set_need_resched();
-	}
+	asm volatile(
+		"2:"
+		"testl %0,%1;"
+		"rep; nop;"
+		"je 2b;"
+		: :
+		"i" (_TIF_NEED_RESCHED),
+		"m" (current_thread_info()->flags));
 }
 
 void cpu_idle_wait(void)
@@ -187,6 +183,8 @@ static inline void play_dead(void)
  */
 void cpu_idle (void)
 {
+	set_thread_flag(TIF_POLLING_NRFLAG);
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		while (!need_resched()) {
@@ -221,15 +219,12 @@ static void mwait_idle(void)
 {
 	local_irq_enable();
 
-	if (!need_resched()) {
-		set_thread_flag(TIF_POLLING_NRFLAG);
-		do {
-			__monitor((void *)&current_thread_info()->flags, 0, 0);
-			if (need_resched())
-				break;
-			__mwait(0, 0);
-		} while (!need_resched());
-		clear_thread_flag(TIF_POLLING_NRFLAG);
+	while (!need_resched()) {
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (need_resched())
+			break;
+		__mwait(0, 0);
 	}
 }
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 161db4acfb91..573b6a97bb1f 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -167,6 +167,19 @@ acpi_processor_power_activate(struct acpi_processor *pr,
 	return;
 }
 
+static void acpi_safe_halt(void)
+{
+	int polling = test_thread_flag(TIF_POLLING_NRFLAG);
+	if (polling) {
+		clear_thread_flag(TIF_POLLING_NRFLAG);
+		smp_mb__after_clear_bit();
+	}
+	if (!need_resched())
+		safe_halt();
+	if (polling)
+		set_thread_flag(TIF_POLLING_NRFLAG);
+}
+
 static atomic_t c3_cpu_count;
 
 static void acpi_processor_idle(void)
@@ -177,7 +190,7 @@ static void acpi_processor_idle(void)
 	int sleep_ticks = 0;
 	u32 t1, t2 = 0;
 
-	pr = processors[raw_smp_processor_id()];
+	pr = processors[smp_processor_id()];
 	if (!pr)
 		return;
 
@@ -197,8 +210,13 @@ static void acpi_processor_idle(void)
 	}
 
 	cx = pr->power.state;
-	if (!cx)
-		goto easy_out;
+	if (!cx) {
+		if (pm_idle_save)
+			pm_idle_save();
+		else
+			acpi_safe_halt();
+		return;
+	}
 
 	/*
 	 * Check BM Activity
@@ -278,7 +296,8 @@ static void acpi_processor_idle(void)
 		if (pm_idle_save)
 			pm_idle_save();
 		else
-			safe_halt();
+			acpi_safe_halt();
+
 		/*
 		 * TBD: Can't get time duration while in C1, as resumes
 		 *      go to an ISR rather than here.  Need to instrument
@@ -414,16 +433,6 @@ static void acpi_processor_idle(void)
 	 */
 	if (next_state != pr->power.state)
 		acpi_processor_power_activate(pr, next_state);
-
-	return;
-
-      easy_out:
-	/* do C1 instead of busy loop */
-	if (pm_idle_save)
-		pm_idle_save();
-	else
-		safe_halt();
-	return;
 }
 
 static int acpi_processor_set_power_policy(struct acpi_processor *pr)
diff --git a/kernel/sched.c b/kernel/sched.c
index 0f2def822296..ac3f5cc3bb51 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -864,21 +864,28 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
 #ifdef CONFIG_SMP
 static void resched_task(task_t *p)
 {
-	int need_resched, nrpolling;
+	int cpu;
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	/* minimise the chance of sending an interrupt to poll_idle() */
-	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
-	need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
-	nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
+	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+		return;
+
+	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+
+	cpu = task_cpu(p);
+	if (cpu == smp_processor_id())
+		return;
 
-	if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
-		smp_send_reschedule(task_cpu(p));
+	/* NEED_RESCHED must be visible before we test POLLING_NRFLAG */
+	smp_mb();
+	if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG))
+		smp_send_reschedule(cpu);
 }
 #else
 static inline void resched_task(task_t *p)
 {
+	assert_spin_locked(&task_rq(p)->lock);
 	set_tsk_need_resched(p);
 }
 #endif
-- 
cgit v1.2.3


From 28d838cc4dfea980cb6eda0a7409cbf91889ca74 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@g5.osdl.org>
Date: Wed, 9 Nov 2005 11:33:07 -0800
Subject: Fix ptrace self-attach rule

Before we did CLONE_THREAD, the way to check whether we were attaching
to ourselves was to just check "current == task", but with CLONE_THREAD
we should check that the thread group ID matches instead.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 5b8dd98a230e..b88d4186cd7a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -155,7 +155,7 @@ int ptrace_attach(struct task_struct *task)
 	retval = -EPERM;
 	if (task->pid <= 1)
 		goto bad;
-	if (task == current)
+	if (task->tgid == current->tgid)
 		goto bad;
 	/* the same process cannot be attached many times */
 	if (task->ptrace & PT_PTRACED)
-- 
cgit v1.2.3


From a47ab9371e664952b1104a70ec8e9b74db3f7a5f Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Wed, 9 Nov 2005 15:45:29 -0800
Subject: [PATCH] optimize activate_task()

recalc_task_prio() is called from activate_task() to calculate dynamic
priority and interactive credit for the activating task.  For real-time
scheduling process, all that dynamic calculation is thrown away at the end
because rt priority is fixed.  Patch to optimize recalc_task_prio() away
for rt processes.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <piggin@cyberone.com.au>
Cc: Con Kolivas <kernel@kolivas.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ac3f5cc3bb51..b6506671b2be 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -815,7 +815,8 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
 	}
 #endif
 
-	p->prio = recalc_task_prio(p, now);
+	if (!rt_task(p))
+		p->prio = recalc_task_prio(p, now);
 
 	/*
 	 * This checks to make sure it's not an uninterruptible task
-- 
cgit v1.2.3


From 7ed0175a462c4c30f6df6fac1cccac058f997739 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Thu, 10 Nov 2005 17:22:18 +0300
Subject: [PATCH] Don't auto-reap traced children

If a task is being traced we never auto-reap it even if it might look
like its parent doesn't care. The tracer obviously _does_ care.

Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 1bf3c39d6109..80789a59b4db 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1499,7 +1499,7 @@ void do_notify_parent(struct task_struct *tsk, int sig)
 
 	psig = tsk->parent->sighand;
 	spin_lock_irqsave(&psig->siglock, flags);
-	if (sig == SIGCHLD &&
+	if (!tsk->ptrace && sig == SIGCHLD &&
 	    (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
 	     (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
 		/*
-- 
cgit v1.2.3


From 393b07258766130146b962bc294d66615a47468a Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@sunset.davemloft.net>
Date: Thu, 10 Nov 2005 12:47:50 -0800
Subject: [SPARC64]: Re-export uts_sem for solaris compat module.

Revert: b26b9bc58263acda274f82a9dde8b6d96559878a

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 kernel/sys.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index c43b3e22bbda..bce933ebb29f 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1497,6 +1497,8 @@ EXPORT_SYMBOL(in_egroup_p);
 
 DECLARE_RWSEM(uts_sem);
 
+EXPORT_SYMBOL(uts_sem);
+
 asmlinkage long sys_newuname(struct new_utsname __user * name)
 {
 	int errno = 0;
-- 
cgit v1.2.3


From bca73e4bf8563d83f7856164caa44d5f42e44cca Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jgarzik@pobox.com>
Date: Sun, 13 Nov 2005 16:06:25 -0800
Subject: [PATCH] move pm_register/etc. to CONFIG_PM_LEGACY, pm_legacy.h

Since few people need the support anymore, this moves the legacy
pm_xxx functions to CONFIG_PM_LEGACY, and include/linux/pm_legacy.h.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/kernel/apm.c           |  1 +
 arch/frv/kernel/pm.c            |  1 +
 arch/i386/Kconfig               |  2 +-
 arch/i386/kernel/apm.c          |  1 +
 arch/mips/au1000/common/power.c |  1 +
 drivers/acpi/bus.c              |  3 ++-
 drivers/net/3c509.c             | 13 +++++-----
 drivers/net/irda/ali-ircc.c     |  1 +
 drivers/net/irda/nsc-ircc.c     |  1 +
 drivers/serial/68328serial.c    |  7 +++---
 include/linux/pm.h              | 49 ------------------------------------
 include/linux/pm_legacy.h       | 56 +++++++++++++++++++++++++++++++++++++++++
 kernel/power/Kconfig            |  9 +++++++
 kernel/power/Makefile           |  3 ++-
 kernel/power/pm.c               |  1 +
 sound/oss/ad1848.c              |  1 +
 sound/oss/cs4281/cs4281m.c      |  1 +
 sound/oss/maestro.c             |  1 +
 sound/oss/nm256_audio.c         |  1 +
 sound/oss/opl3sa2.c             | 18 +++++++------
 20 files changed, 102 insertions(+), 69 deletions(-)
 create mode 100644 include/linux/pm_legacy.h

(limited to 'kernel')

diff --git a/arch/arm/kernel/apm.c b/arch/arm/kernel/apm.c
index b0bbd1e62ebb..a2843be05557 100644
--- a/arch/arm/kernel/apm.c
+++ b/arch/arm/kernel/apm.c
@@ -20,6 +20,7 @@
 #include <linux/apm_bios.h>
 #include <linux/sched.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c
index 1a1e8a119c3d..712c3c24c954 100644
--- a/arch/frv/kernel/pm.c
+++ b/arch/frv/kernel/pm.c
@@ -14,6 +14,7 @@
 #include <linux/config.h>
 #include <linux/init.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/sysctl.h>
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index dbf90ad6eac3..6004bb0795e0 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -699,7 +699,7 @@ depends on PM && !X86_VISWS
 
 config APM
 	tristate "APM (Advanced Power Management) BIOS support"
-	depends on PM
+	depends on PM && PM_LEGACY
 	---help---
 	  APM is a BIOS specification for saving power using several different
 	  techniques. This is mostly useful for battery powered laptops with
diff --git a/arch/i386/kernel/apm.c b/arch/i386/kernel/apm.c
index 003548b8735f..1e60acbed3c1 100644
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -218,6 +218,7 @@
 #include <linux/time.h>
 #include <linux/sched.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/device.h>
 #include <linux/kernel.h>
 #include <linux/smp.h>
diff --git a/arch/mips/au1000/common/power.c b/arch/mips/au1000/common/power.c
index f85093b8d54d..f4926315fb68 100644
--- a/arch/mips/au1000/common/power.c
+++ b/arch/mips/au1000/common/power.c
@@ -32,6 +32,7 @@
 #include <linux/config.h>
 #include <linux/init.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
 #include <linux/jiffies.h>
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 6a4da417c16b..606f8733a776 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -28,6 +28,7 @@
 #include <linux/list.h>
 #include <linux/sched.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/device.h>
 #include <linux/proc_fs.h>
 #ifdef CONFIG_X86
@@ -754,7 +755,7 @@ static int __init acpi_init(void)
 	result = acpi_bus_init();
 
 	if (!result) {
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 		if (!PM_IS_ACTIVE())
 			pm_active = 1;
 		else {
diff --git a/drivers/net/3c509.c b/drivers/net/3c509.c
index 977935a3d898..824e430486c2 100644
--- a/drivers/net/3c509.c
+++ b/drivers/net/3c509.c
@@ -84,6 +84,7 @@ static int max_interrupt_work = 10;
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/skbuff.h>
 #include <linux/delay.h>	/* for udelay() */
 #include <linux/spinlock.h>
@@ -173,7 +174,7 @@ struct el3_private {
 	/* skb send-queue */
 	int head, size;
 	struct sk_buff *queue[SKB_QUEUE_SIZE];
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 	struct pm_dev *pmdev;
 #endif
 	enum {
@@ -200,7 +201,7 @@ static void el3_tx_timeout (struct net_device *dev);
 static void el3_down(struct net_device *dev);
 static void el3_up(struct net_device *dev);
 static struct ethtool_ops ethtool_ops;
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 static int el3_suspend(struct pm_dev *pdev);
 static int el3_resume(struct pm_dev *pdev);
 static int el3_pm_callback(struct pm_dev *pdev, pm_request_t rqst, void *data);
@@ -361,7 +362,7 @@ static void el3_common_remove (struct net_device *dev)
 	struct el3_private *lp = netdev_priv(dev);
 
 	(void) lp;				/* Keep gcc quiet... */
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 	if (lp->pmdev)
 		pm_unregister(lp->pmdev);
 #endif
@@ -571,7 +572,7 @@ no_pnp:
 	if (err)
 		goto out1;
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 	/* register power management */
 	lp->pmdev = pm_register(PM_ISA_DEV, card_idx, el3_pm_callback);
 	if (lp->pmdev) {
@@ -1479,7 +1480,7 @@ el3_up(struct net_device *dev)
 }
 
 /* Power Management support functions */
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 
 static int
 el3_suspend(struct pm_dev *pdev)
@@ -1548,7 +1549,7 @@ el3_pm_callback(struct pm_dev *pdev, pm_request_t rqst, void *data)
 	return 0;
 }
 
-#endif /* CONFIG_PM */
+#endif /* CONFIG_PM_LEGACY */
 
 /* Parameters that may be passed into the module. */
 static int debug = -1;
diff --git a/drivers/net/irda/ali-ircc.c b/drivers/net/irda/ali-ircc.c
index 9bf34681d3df..2e7882eb7d6f 100644
--- a/drivers/net/irda/ali-ircc.c
+++ b/drivers/net/irda/ali-ircc.c
@@ -40,6 +40,7 @@
 #include <asm/byteorder.h>
 
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 
 #include <net/irda/wrapper.h>
 #include <net/irda/irda.h>
diff --git a/drivers/net/irda/nsc-ircc.c b/drivers/net/irda/nsc-ircc.c
index 805714ec9a8a..ee717d0e939e 100644
--- a/drivers/net/irda/nsc-ircc.c
+++ b/drivers/net/irda/nsc-ircc.c
@@ -59,6 +59,7 @@
 #include <asm/byteorder.h>
 
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 
 #include <net/irda/wrapper.h>
 #include <net/irda/irda.h>
diff --git a/drivers/serial/68328serial.c b/drivers/serial/68328serial.c
index 2efb317153ce..67e9afa000c1 100644
--- a/drivers/serial/68328serial.c
+++ b/drivers/serial/68328serial.c
@@ -34,6 +34,7 @@
 #include <linux/keyboard.h>
 #include <linux/init.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
 
@@ -1343,7 +1344,7 @@ static void show_serial_version(void)
 	printk("MC68328 serial driver version 1.00\n");
 }
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 /* Serial Power management
  *  The console (currently fixed at line 0) is a special case for power
  *  management because the kernel is so chatty. The console will be 
@@ -1393,7 +1394,7 @@ void startup_console(void)
 	struct m68k_serial *info = &m68k_soft[0];
 	startup(info);
 }
-#endif
+#endif /* CONFIG_PM_LEGACY */
 
 
 static struct tty_operations rs_ops = {
@@ -1486,7 +1487,7 @@ rs68328_init(void)
 			    IRQ_FLG_STD,
 			    "M68328_UART", NULL))
                 panic("Unable to attach 68328 serial interrupt\n");
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 	    serial_pm[i] = pm_register(PM_SYS_DEV, PM_SYS_COM, serial_pm_callback);
 	    if (serial_pm[i])
 		    serial_pm[i]->data = info;
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 1514098d156d..5be87ba3b7ac 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -94,55 +94,6 @@ struct pm_dev
 	struct list_head entry;
 };
 
-#ifdef CONFIG_PM
-
-extern int pm_active;
-
-#define PM_IS_ACTIVE() (pm_active != 0)
-
-/*
- * Register a device with power management
- */
-struct pm_dev __deprecated *
-pm_register(pm_dev_t type, unsigned long id, pm_callback callback);
-
-/*
- * Unregister a device with power management
- */
-void __deprecated pm_unregister(struct pm_dev *dev);
-
-/*
- * Unregister all devices with matching callback
- */
-void __deprecated pm_unregister_all(pm_callback callback);
-
-/*
- * Send a request to all devices
- */
-int __deprecated pm_send_all(pm_request_t rqst, void *data);
-
-#else /* CONFIG_PM */
-
-#define PM_IS_ACTIVE() 0
-
-static inline struct pm_dev *pm_register(pm_dev_t type,
-					 unsigned long id,
-					 pm_callback callback)
-{
-	return NULL;
-}
-
-static inline void pm_unregister(struct pm_dev *dev) {}
-
-static inline void pm_unregister_all(pm_callback callback) {}
-
-static inline int pm_send_all(pm_request_t rqst, void *data)
-{
-	return 0;
-}
-
-#endif /* CONFIG_PM */
-
 /* Functions above this comment are list-based old-style power
  * managment. Please avoid using them.  */
 
diff --git a/include/linux/pm_legacy.h b/include/linux/pm_legacy.h
new file mode 100644
index 000000000000..1252b45face1
--- /dev/null
+++ b/include/linux/pm_legacy.h
@@ -0,0 +1,56 @@
+#ifndef __LINUX_PM_LEGACY_H__
+#define __LINUX_PM_LEGACY_H__
+
+#include <linux/config.h>
+
+#ifdef CONFIG_PM_LEGACY
+
+extern int pm_active;
+
+#define PM_IS_ACTIVE() (pm_active != 0)
+
+/*
+ * Register a device with power management
+ */
+struct pm_dev __deprecated *
+pm_register(pm_dev_t type, unsigned long id, pm_callback callback);
+
+/*
+ * Unregister a device with power management
+ */
+void __deprecated pm_unregister(struct pm_dev *dev);
+
+/*
+ * Unregister all devices with matching callback
+ */
+void __deprecated pm_unregister_all(pm_callback callback);
+
+/*
+ * Send a request to all devices
+ */
+int __deprecated pm_send_all(pm_request_t rqst, void *data);
+
+#else /* CONFIG_PM_LEGACY */
+
+#define PM_IS_ACTIVE() 0
+
+static inline struct pm_dev *pm_register(pm_dev_t type,
+					 unsigned long id,
+					 pm_callback callback)
+{
+	return NULL;
+}
+
+static inline void pm_unregister(struct pm_dev *dev) {}
+
+static inline void pm_unregister_all(pm_callback callback) {}
+
+static inline int pm_send_all(pm_request_t rqst, void *data)
+{
+	return 0;
+}
+
+#endif /* CONFIG_PM_LEGACY */
+
+#endif /* __LINUX_PM_LEGACY_H__ */
+
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 46a5e5acff97..5ec248cb7f4a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -19,6 +19,15 @@ config PM
 	  will issue the hlt instruction if nothing is to be done, thereby
 	  sending the processor to sleep and saving power.
 
+config PM_LEGACY
+	bool "Legacy Power Management API"
+	depends on PM
+	default y
+	---help---
+	   Support for pm_register() and friends.
+
+	   If unsure, say Y.
+
 config PM_DEBUG
 	bool "Power Management Debug Support"
 	depends on PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c71eb4579c07..04be7d0d96a7 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,7 +3,8 @@ ifeq ($(CONFIG_PM_DEBUG),y)
 EXTRA_CFLAGS	+=	-DDEBUG
 endif
 
-obj-y				:= main.o process.o console.o pm.o
+obj-y				:= main.o process.o console.o
+obj-$(CONFIG_PM_LEGACY)		+= pm.o
 obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o
 
 obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 159149321b3c..33c508e857dd 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/interrupt.h>
 
 int pm_active;
diff --git a/sound/oss/ad1848.c b/sound/oss/ad1848.c
index 7c835abd99bc..3f30c57676c1 100644
--- a/sound/oss/ad1848.c
+++ b/sound/oss/ad1848.c
@@ -47,6 +47,7 @@
 #include <linux/module.h>
 #include <linux/stddef.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/isapnp.h>
 #include <linux/pnp.h>
 #include <linux/spinlock.h>
diff --git a/sound/oss/cs4281/cs4281m.c b/sound/oss/cs4281/cs4281m.c
index d0d3963e1b83..adc689649fe1 100644
--- a/sound/oss/cs4281/cs4281m.c
+++ b/sound/oss/cs4281/cs4281m.c
@@ -298,6 +298,7 @@ struct cs4281_state {
 	struct cs4281_pipeline pl[CS4281_NUMBER_OF_PIPELINES];
 };
 
+#include <linux/pm_legacy.h>
 #include "cs4281pm-24.c"
 
 #if CSDEBUG
diff --git a/sound/oss/maestro.c b/sound/oss/maestro.c
index 3dce504e6d6d..3abd3541cbc7 100644
--- a/sound/oss/maestro.c
+++ b/sound/oss/maestro.c
@@ -231,6 +231,7 @@
 #include <asm/uaccess.h>
 
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 static int maestro_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *d);
 
 #include "maestro.h"
diff --git a/sound/oss/nm256_audio.c b/sound/oss/nm256_audio.c
index 66970062eb36..0ce2c404a730 100644
--- a/sound/oss/nm256_audio.c
+++ b/sound/oss/nm256_audio.c
@@ -25,6 +25,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include <linux/delay.h>
 #include <linux/spinlock.h>
 #include "sound_config.h"
diff --git a/sound/oss/opl3sa2.c b/sound/oss/opl3sa2.c
index 2efbd865109b..cd41d0e4706a 100644
--- a/sound/oss/opl3sa2.c
+++ b/sound/oss/opl3sa2.c
@@ -70,6 +70,7 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/pm.h>
+#include <linux/pm_legacy.h>
 #include "sound_config.h"
 
 #include "ad1848.h"
@@ -138,7 +139,7 @@ typedef struct {
 	struct pnp_dev* pdev;
 	int activated;			/* Whether said devices have been activated */
 #endif
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 	unsigned int	in_suspend;
 	struct pm_dev	*pmdev;
 #endif
@@ -341,7 +342,7 @@ static void opl3sa2_mixer_reset(opl3sa2_state_t* devc)
 }
 
 /* Currently only used for power management */
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 static void opl3sa2_mixer_restore(opl3sa2_state_t* devc)
 {
 	if (devc) {
@@ -354,7 +355,7 @@ static void opl3sa2_mixer_restore(opl3sa2_state_t* devc)
 		}
 	}
 }
-#endif
+#endif /* CONFIG_PM_LEGACY */
 
 static inline void arg_to_vol_mono(unsigned int vol, int* value)
 {
@@ -831,7 +832,8 @@ static struct pnp_driver opl3sa2_driver = {
 
 /* End of component functions */
 
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
+
 static DEFINE_SPINLOCK(opl3sa2_lock);
 
 /* Power Management support functions */
@@ -906,7 +908,7 @@ static int opl3sa2_pm_callback(struct pm_dev *pdev, pm_request_t rqst, void *dat
 	}
 	return 0;
 }
-#endif /* CONFIG_PM */
+#endif /* CONFIG_PM_LEGACY */
 
 /*
  * Install OPL3-SA2 based card(s).
@@ -1019,12 +1021,12 @@ static int __init init_opl3sa2(void)
 
 		/* ewww =) */
 		opl3sa2_state[card].card = card;
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 		/* register our power management capabilities */
 		opl3sa2_state[card].pmdev = pm_register(PM_ISA_DEV, card, opl3sa2_pm_callback);
 		if (opl3sa2_state[card].pmdev)
 			opl3sa2_state[card].pmdev->data = &opl3sa2_state[card];
-#endif /* CONFIG_PM */
+#endif /* CONFIG_PM_LEGACY */
 
 		/*
 		 * Set the Yamaha 3D enhancement mode (aka Ymersion) if asked to and
@@ -1081,7 +1083,7 @@ static void __exit cleanup_opl3sa2(void)
 	int card;
 
 	for(card = 0; card < opl3sa2_cards_num; card++) {
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_LEGACY
 		if (opl3sa2_state[card].pmdev)
 			pm_unregister(opl3sa2_state[card].pmdev);
 #endif
-- 
cgit v1.2.3


From 005f18dfd0ed86c39716277b61dfb4bd2af91059 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Sun, 13 Nov 2005 16:06:33 -0800
Subject: [PATCH] fix task_struct leak in ptrace

When ptrace_attach fails we need to drop the task_struct reference.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/ptrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index b88d4186cd7a..17ee7e5a3451 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -470,7 +470,7 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
 
 	if (request == PTRACE_ATTACH) {
 		ret = ptrace_attach(child);
-		goto out;
+		goto out_put_task_struct;
 	}
 
 	ret = ptrace_check_attach(child, request == PTRACE_KILL);
-- 
cgit v1.2.3


From 5563e77078d85c4f107a0a673500c43ce57cf702 Mon Sep 17 00:00:00 2001
From: Bob Picco <bob.picco@hp.com>
Date: Sun, 13 Nov 2005 16:06:35 -0800
Subject: [PATCH] cpuset: fix return without releasing semaphore

It is wrong to acquire the semaphore and then return from
cpuset_zone_allowed without releasing it.

Signed-off-by: Bob Picco <bob.picco@hp.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/cpuset.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 5a737ed9dac7..7430640f9816 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1809,11 +1809,12 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
 		return 0;
 
+	if (current->flags & PF_EXITING) /* Let dying task have memory */
+		return 1;
+
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
 	down(&callback_sem);
 
-	if (current->flags & PF_EXITING) /* Let dying task have memory */
-		return 1;
 	task_lock(current);
 	cs = nearest_exclusive_ancestor(current->cpuset);
 	task_unlock(current);
-- 
cgit v1.2.3


From a1261f54611ec4ad6a7ab7080f86747e3ac3685b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@parcelfarce.linux.theplanet.co.uk>
Date: Sun, 13 Nov 2005 16:06:55 -0800
Subject: [PATCH] m68k: introduce task_thread_info

new helper - task_thread_info(task).  On platforms that have thread_info
allocated separately (i.e.  in default case) it simply returns
task->thread_info.  m68k wants (and for good reasons) to embed its thread_info
into task_struct.  So it will (in later patch) have task_thread_info() of its
own.  For now we just add a macro for generic case and convert existing
instances of its body in core kernel to uses of new macro.  Obviously safe -
all normal architectures get the same preprocessor output they used to get.

Signed-off-by: Al Viro <viro@parcelfarce.linux.theplanet.co.uk>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 16 +++++++++-------
 kernel/exit.c         |  2 +-
 kernel/fork.c         |  4 ++--
 kernel/sched.c        |  6 +++---
 4 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2bbf968b23d9..f8650314ba2f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1233,32 +1233,34 @@ static inline void task_unlock(struct task_struct *p)
 	spin_unlock(&p->alloc_lock);
 }
 
+#define task_thread_info(task) (task)->thread_info
+
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
 static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
 {
-	set_ti_thread_flag(tsk->thread_info,flag);
+	set_ti_thread_flag(task_thread_info(tsk), flag);
 }
 
 static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
 {
-	clear_ti_thread_flag(tsk->thread_info,flag);
+	clear_ti_thread_flag(task_thread_info(tsk), flag);
 }
 
 static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
 {
-	return test_and_set_ti_thread_flag(tsk->thread_info,flag);
+	return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
 }
 
 static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
 {
-	return test_and_clear_ti_thread_flag(tsk->thread_info,flag);
+	return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
 }
 
 static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
 {
-	return test_ti_thread_flag(tsk->thread_info,flag);
+	return test_ti_thread_flag(task_thread_info(tsk), flag);
 }
 
 static inline void set_tsk_need_resched(struct task_struct *tsk)
@@ -1329,12 +1331,12 @@ extern void signal_wake_up(struct task_struct *t, int resume_stopped);
 
 static inline unsigned int task_cpu(const struct task_struct *p)
 {
-	return p->thread_info->cpu;
+	return task_thread_info(p)->cpu;
 }
 
 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
-	p->thread_info->cpu = cpu;
+	task_thread_info(p)->cpu = cpu;
 }
 
 #else
diff --git a/kernel/exit.c b/kernel/exit.c
index 452a1d116178..ee515683b92d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -859,7 +859,7 @@ fastcall NORET_TYPE void do_exit(long code)
 	if (group_dead && tsk->signal->leader)
 		disassociate_ctty(1);
 
-	module_put(tsk->thread_info->exec_domain->module);
+	module_put(task_thread_info(tsk)->exec_domain->module);
 	if (tsk->binfmt)
 		module_put(tsk->binfmt->module);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 158710d22566..7ef352ce347b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -919,7 +919,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	if (nr_threads >= max_threads)
 		goto bad_fork_cleanup_count;
 
-	if (!try_module_get(p->thread_info->exec_domain->module))
+	if (!try_module_get(task_thread_info(p)->exec_domain->module))
 		goto bad_fork_cleanup_count;
 
 	if (p->binfmt && !try_module_get(p->binfmt->module))
@@ -1180,7 +1180,7 @@ bad_fork_cleanup:
 	if (p->binfmt)
 		module_put(p->binfmt->module);
 bad_fork_cleanup_put_domain:
-	module_put(p->thread_info->exec_domain->module);
+	module_put(task_thread_info(p)->exec_domain->module);
 bad_fork_cleanup_count:
 	put_group_info(p->group_info);
 	atomic_dec(&p->user->processes);
diff --git a/kernel/sched.c b/kernel/sched.c
index b6506671b2be..831f7e9d8f1c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1437,7 +1437,7 @@ void fastcall sched_fork(task_t *p, int clone_flags)
 #endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
-	p->thread_info->preempt_count = 1;
+	task_thread_info(p)->preempt_count = 1;
 #endif
 	/*
 	 * Share the timeslice between parent and child, thus the
@@ -4410,9 +4410,9 @@ void __devinit init_idle(task_t *idle, int cpu)
 
 	/* Set the preempt count _outside_ the spinlocks! */
 #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
-	idle->thread_info->preempt_count = (idle->lock_depth >= 0);
+	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
 #else
-	idle->thread_info->preempt_count = 0;
+	task_thread_info(idle)->preempt_count = 0;
 #endif
 }
 
-- 
cgit v1.2.3


From 10ebffde3d3916026974352b7900e44afe2b243f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@parcelfarce.linux.theplanet.co.uk>
Date: Sun, 13 Nov 2005 16:06:56 -0800
Subject: [PATCH] m68k: introduce setup_thread_stack() and end_of_stack()

encapsulates the rest of arch-dependent operations with thread_info access.
Two new helpers - setup_thread_stack() and end_of_stack().  For normal case
the former consists of copying thread_info of parent to new thread_info and
the latter returns pointer immediately past the end of thread_info.

Signed-off-by: Al Viro <viro@parcelfarce.linux.theplanet.co.uk>
Signed-off-by: Roman Zippel <zippel@linux-m68k.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/sched.h | 11 +++++++++++
 kernel/fork.c         |  3 +--
 kernel/sched.c        |  4 ++--
 3 files changed, 14 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f8650314ba2f..e4681256e43e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1235,6 +1235,17 @@ static inline void task_unlock(struct task_struct *p)
 
 #define task_thread_info(task) (task)->thread_info
 
+static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
+{
+	*task_thread_info(p) = *task_thread_info(org);
+	task_thread_info(p)->task = p;
+}
+
+static inline unsigned long *end_of_stack(struct task_struct *p)
+{
+	return (unsigned long *)(p->thread_info + 1);
+}
+
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ef352ce347b..2c70c9cdf5dc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -171,10 +171,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		return NULL;
 	}
 
-	*ti = *orig->thread_info;
 	*tsk = *orig;
 	tsk->thread_info = ti;
-	ti->task = tsk;
+	setup_thread_stack(tsk, orig);
 
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
diff --git a/kernel/sched.c b/kernel/sched.c
index 831f7e9d8f1c..6f46c94cc29e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4327,10 +4327,10 @@ static void show_task(task_t *p)
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	{
-		unsigned long *n = (unsigned long *) (p->thread_info+1);
+		unsigned long *n = end_of_stack(p);
 		while (!*n)
 			n++;
-		free = (unsigned long) n - (unsigned long)(p->thread_info+1);
+		free = (unsigned long)n - (unsigned long)end_of_stack(p);
 	}
 #endif
 	printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
-- 
cgit v1.2.3


From b17b0421d70f5b85a791afe145a16d5ca5f849aa Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Sun, 13 Nov 2005 16:07:14 -0800
Subject: [PATCH] signal handling: revert sigkill priority fix

This patch reverts commit c33880aaddbbab1ccf36f4457ed1090621f2e39a since
it's not needed anymore. As pointed out by Roland McGrath the real fix
is to deliver all signals before returning to user space.
See http://www.ussg.iu.edu/hypermail/linux/kernel/0509.2/0683.html
A fix for s390 has been merged.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/signal.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 80789a59b4db..d7611f189ef7 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -513,16 +513,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 {
 	int sig = 0;
 
-	/* SIGKILL must have priority, otherwise it is quite easy
-	 * to create an unkillable process, sending sig < SIGKILL
-	 * to self */
-	if (unlikely(sigismember(&pending->signal, SIGKILL))) {
-		if (!sigismember(mask, SIGKILL))
-			sig = SIGKILL;
-	}
-
-	if (likely(!sig))
-		sig = next_signal(pending, mask);
+	sig = next_signal(pending, mask);
 	if (sig) {
 		if (current->notifier) {
 			if (sigismember(current->notifier_mask, sig)) {
-- 
cgit v1.2.3


From dbdf65b1b7f8ec48bda1604cfea7ac09ce583d6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 13 Nov 2005 16:07:22 -0800
Subject: [PATCH] rcutorture: renice to low priority

Make the box usable for interactive work when running the RCU torture test,
by renicing the RCU torture-test threads to +19 by default.  Kthreads run
at nice -5 by default.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 9b58f1eff3ca..eb6719c50b4e 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -195,6 +195,8 @@ rcu_torture_writer(void *arg)
 	static DEFINE_RCU_RANDOM(rand);
 
 	VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
+	set_user_nice(current, 19);
+
 	do {
 		schedule_timeout_uninterruptible(1);
 		if (rcu_batches_completed() == oldbatch)
@@ -238,6 +240,8 @@ rcu_torture_reader(void *arg)
 	int pipe_count;
 
 	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
+	set_user_nice(current, 19);
+
 	do {
 		rcu_read_lock();
 		completed = rcu_batches_completed();
-- 
cgit v1.2.3


From 4557398f8cbaf9f254cff747534b4724c7f75c4f Mon Sep 17 00:00:00 2001
From: Kirill Korotaev <dev@sw.ru>
Date: Sun, 13 Nov 2005 16:07:30 -0800
Subject: [PATCH] stop_machine() vs. synchronous IPI send deadlock

This fixes deadlock of stop_machine() vs.  synchronous IPI send.  The
problem is that stop_machine() disables interrupts before disabling
preemption on other CPUs.  So if another CPU is preempted and then calls
something like flush_tlb_all() it will deadlock with CPU doing
stop_machine() and which can't process IPI due to disabled IRQs.

I changed stop_machine() to do the same things exactly as it does on other
CPUs, i.e.  it should disable preemption first on _all_ CPUs including
itself and only after that disable IRQs.

Signed-off-by: Kirill Korotaev <dev@sw.ru>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: "Andrey Savochkin" <saw@sawoct.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/stop_machine.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 84a9d18aa8da..b3d4dc858e35 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -119,13 +119,12 @@ static int stop_machine(void)
 		return ret;
 	}
 
-	/* Don't schedule us away at this point, please. */
-	local_irq_disable();
-
 	/* Now they are all started, make them hold the CPUs, ready. */
+	preempt_disable();
 	stopmachine_set_state(STOPMACHINE_PREPARE);
 
 	/* Make them disable irqs. */
+	local_irq_disable();
 	stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
 
 	return 0;
@@ -135,6 +134,7 @@ static void restart_machine(void)
 {
 	stopmachine_set_state(STOPMACHINE_EXIT);
 	local_irq_enable();
+	preempt_enable_no_resched();
 }
 
 struct stop_machine_data
-- 
cgit v1.2.3


From 20dcae32439384b6863c626bb3b2a09bed65b33e Mon Sep 17 00:00:00 2001
From: Zach Brown <zach.brown@oracle.com>
Date: Sun, 13 Nov 2005 16:07:33 -0800
Subject: [PATCH] aio: remove kioctx from mm_struct

Sync iocbs have a life cycle that don't need a kioctx.  Their retrying, if
any, is done in the context of their owner who has allocated them on the
stack.

The sole user of a sync iocb's ctx reference was aio_complete() checking for
an elevated iocb ref count that could never happen.  No path which grabs an
iocb ref has access to sync iocbs.

If we were to implement sync iocb cancelation it would be done by the owner of
the iocb using its on-stack reference.

Removing this chunk from aio_complete allows us to remove the entire kioctx
instance from mm_struct, reducing its size by a third.  On a i386 testing box
the slab size went from 768 to 504 bytes and from 5 to 8 per page.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Acked-by: Benjamin LaHaise <bcrl@kvack.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/aio.c                  | 27 +++++++++------------------
 include/linux/aio.h       |  2 +-
 include/linux/init_task.h |  1 -
 include/linux/sched.h     |  1 -
 kernel/fork.c             |  1 -
 5 files changed, 10 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/fs/aio.c b/fs/aio.c
index 20bb919eb195..e7cd40b626b7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -937,28 +937,19 @@ int fastcall aio_complete(struct kiocb *iocb, long res, long res2)
 	unsigned long	tail;
 	int		ret;
 
-	/* Special case handling for sync iocbs: events go directly
-	 * into the iocb for fast handling.  Note that this will not 
-	 * work if we allow sync kiocbs to be cancelled. in which
-	 * case the usage count checks will have to move under ctx_lock
-	 * for all cases.
+	/*
+	 * Special case handling for sync iocbs:
+	 *  - events go directly into the iocb for fast handling
+	 *  - the sync task with the iocb in its stack holds the single iocb
+	 *    ref, no other paths have a way to get another ref
+	 *  - the sync task helpfully left a reference to itself in the iocb
 	 */
 	if (is_sync_kiocb(iocb)) {
-		int ret;
-
+		BUG_ON(iocb->ki_users != 1);
 		iocb->ki_user_data = res;
-		if (iocb->ki_users == 1) {
-			iocb->ki_users = 0;
-			ret = 1;
-		} else {
-			spin_lock_irq(&ctx->ctx_lock);
-			iocb->ki_users--;
-			ret = (0 == iocb->ki_users);
-			spin_unlock_irq(&ctx->ctx_lock);
-		}
-		/* sync iocbs put the task here for us */
+		iocb->ki_users = 0;
 		wake_up_process(iocb->ki_obj.tsk);
-		return ret;
+		return 1;
 	}
 
 	info = &ctx->ring_info;
diff --git a/include/linux/aio.h b/include/linux/aio.h
index 403d71dcb7c8..9e0ae8711276 100644
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -124,7 +124,7 @@ struct kiocb {
 		(x)->ki_users = 1;			\
 		(x)->ki_key = KIOCB_SYNC_KEY;		\
 		(x)->ki_filp = (filp);			\
-		(x)->ki_ctx = &tsk->active_mm->default_kioctx;	\
+		(x)->ki_ctx = NULL;			\
 		(x)->ki_cancel = NULL;			\
 		(x)->ki_dtor = NULL;			\
 		(x)->ki_obj.tsk = tsk;			\
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 68ab5f2ab9cd..dcfd2ecccb5d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -51,7 +51,6 @@
 	.page_table_lock =  SPIN_LOCK_UNLOCKED, 		\
 	.mmlist		= LIST_HEAD_INIT(name.mmlist),		\
 	.cpu_vm_mask	= CPU_MASK_ALL,				\
-	.default_kioctx = INIT_KIOCTX(name.default_kioctx, name),	\
 }
 
 #define INIT_SIGNALS(sig) {	\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 41df81395719..2038bd27b041 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -357,7 +357,6 @@ struct mm_struct {
 	/* aio bits */
 	rwlock_t		ioctx_list_lock;
 	struct kioctx		*ioctx_list;
-	struct kioctx		default_kioctx;
 };
 
 struct sighand_struct {
diff --git a/kernel/fork.c b/kernel/fork.c
index 2c70c9cdf5dc..e0d0b77343f8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -323,7 +323,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	spin_lock_init(&mm->page_table_lock);
 	rwlock_init(&mm->ioctx_list_lock);
 	mm->ioctx_list = NULL;
-	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
 
-- 
cgit v1.2.3


From 3f39894d1b5c253b10fcb8fbbbcf65a330f6cdc7 Mon Sep 17 00:00:00 2001
From: George Anzinger <george@mvista.com>
Date: Sun, 13 Nov 2005 16:07:44 -0800
Subject: [PATCH] timespec: normalize off by one errors

It would appear that the timespec normalize code has an off by one error.
Found in three places.  Thanks to Ben for spotting.

Signed-off-by: George Anzinger<george@mvista.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/time.h  |  2 +-
 kernel/posix-timers.c | 10 +++-------
 2 files changed, 4 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time.h b/include/linux/time.h
index 8e83f4e778bb..bfbe92d0767c 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -101,7 +101,7 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 static inline void
 set_normalized_timespec (struct timespec *ts, time_t sec, long nsec)
 {
-	while (nsec > NSEC_PER_SEC) {
+	while (nsec >= NSEC_PER_SEC) {
 		nsec -= NSEC_PER_SEC;
 		++sec;
 	}
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ea55c7a1cd75..5870efb3e200 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -270,7 +270,7 @@ static void tstojiffie(struct timespec *tp, int res, u64 *jiff)
 	long sec = tp->tv_sec;
 	long nsec = tp->tv_nsec + res - 1;
 
-	if (nsec > NSEC_PER_SEC) {
+	if (nsec >= NSEC_PER_SEC) {
 		sec++;
 		nsec -= NSEC_PER_SEC;
 	}
@@ -1209,13 +1209,9 @@ static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp)
 
 	do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono);
 
-	tp->tv_sec += wall_to_mono.tv_sec;
-	tp->tv_nsec += wall_to_mono.tv_nsec;
+	set_normalized_timespec(tp, tp->tv_sec + wall_to_mono.tv_sec,
+				tp->tv_nsec + wall_to_mono.tv_nsec);
 
-	if ((tp->tv_nsec - NSEC_PER_SEC) > 0) {
-		tp->tv_nsec -= NSEC_PER_SEC;
-		tp->tv_sec++;
-	}
 	return 0;
 }
 
-- 
cgit v1.2.3


From ddad86c2d6f660112c6ce8aabae6ffd346e25b9b Mon Sep 17 00:00:00 2001
From: Martin Waitz <tali@admingilde.org>
Date: Sun, 13 Nov 2005 16:08:14 -0800
Subject: [PATCH] DocBook: include printk documentation

Add printk documentation to kernel-api.

Signed-off-by: Martin Waitz <tali@admingilde.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/DocBook/kernel-api.tmpl |  4 +---
 kernel/printk.c                       | 16 ++++++++++++++--
 2 files changed, 15 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index a8316b1a3e3d..0519c9dc0065 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -68,9 +68,7 @@ X!Iinclude/linux/kobject.h
 
      <sect1><title>Kernel utility functions</title>
 !Iinclude/linux/kernel.h
-<!-- This needs to clean up to make kernel-doc happy
-X!Ekernel/printk.c
- -->
+!Ekernel/printk.c
 !Ekernel/panic.c
 !Ekernel/sys.c
 !Ekernel/rcupdate.c
diff --git a/kernel/printk.c b/kernel/printk.c
index e9be027bc930..ac8a08f36207 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -491,7 +491,10 @@ __attribute__((weak)) unsigned long long printk_clock(void)
 	return sched_clock();
 }
 
-/*
+/**
+ * printk - print a kernel message
+ * @fmt: format string
+ *
  * This is printk.  It can be called from any context.  We want it to work.
  *
  * We try to grab the console_sem.  If we succeed, it's easy - we log the output and
@@ -503,6 +506,9 @@ __attribute__((weak)) unsigned long long printk_clock(void)
  * One effect of this deferred printing is that code which calls printk() and
  * then changes console_loglevel may break. This is because console_loglevel
  * is inspected when the actual printing occurs.
+ *
+ * See also:
+ * printf(3)
  */
 
 asmlinkage int printk(const char *fmt, ...)
@@ -655,6 +661,9 @@ static void call_console_drivers(unsigned long start, unsigned long end)
 
 /**
  * add_preferred_console - add a device to the list of preferred consoles.
+ * @name: device name
+ * @idx: device index
+ * @options: options for this console
  *
  * The last preferred console added will be used for kernel messages
  * and stdin/out/err for init.  Normally this is used by console_setup
@@ -764,7 +773,8 @@ void release_console_sem(void)
 }
 EXPORT_SYMBOL(release_console_sem);
 
-/** console_conditional_schedule - yield the CPU if required
+/**
+ * console_conditional_schedule - yield the CPU if required
  *
  * If the console code is currently allowed to sleep, and
  * if this CPU should yield the CPU to another task, do
@@ -976,6 +986,8 @@ EXPORT_SYMBOL(unregister_console);
 
 /**
  * tty_write_message - write a message to a certain tty, not just the console.
+ * @tty: the destination tty_struct
+ * @msg: the message to write
  *
  * This is used for messages that need to be redirected to a specific tty.
  * We don't put it into the syslog queue right now maybe in the future if
-- 
cgit v1.2.3


From 996417d2c4eb583e94553e4ede58974e0da1c38e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@us.ibm.com>
Date: Fri, 18 Nov 2005 01:10:50 -0800
Subject: [PATCH] add success/failure indication to RCU torture test

One issue with the RCU torture test is that the current error flagging can
be lost in dmesg.  This patch adds a "SUCCESS"/"FAILURE" string to the line
that flags the end of the test, where it can easily be seen with "dmesg |
tail" at the end of the test.  Also adds tests of architecture-specific
memory barriers -- or, more likely, of the RCU torture test itself.

Cc: <vatsa@in.ibm.com>
Signed-off-by: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/rcutorture.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index eb6719c50b4e..88c28d476550 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -80,6 +80,7 @@ struct rcu_torture {
 	struct rcu_head rtort_rcu;
 	int rtort_pipe_count;
 	struct list_head rtort_free;
+	int rtort_mbtest;
 };
 
 static int fullstop = 0;	/* stop generating callbacks at test end. */
@@ -96,6 +97,8 @@ static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
 atomic_t n_rcu_torture_alloc;
 atomic_t n_rcu_torture_alloc_fail;
 atomic_t n_rcu_torture_free;
+atomic_t n_rcu_torture_mberror;
+atomic_t n_rcu_torture_error;
 
 /*
  * Allocate an element from the rcu_tortures pool.
@@ -145,9 +148,10 @@ rcu_torture_cb(struct rcu_head *p)
 	if (i > RCU_TORTURE_PIPE_LEN)
 		i = RCU_TORTURE_PIPE_LEN;
 	atomic_inc(&rcu_torture_wcount[i]);
-	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN)
+	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+		rp->rtort_mbtest = 0;
 		rcu_torture_free(rp);
-	else
+	} else
 		call_rcu(p, rcu_torture_cb);
 }
 
@@ -206,6 +210,7 @@ rcu_torture_writer(void *arg)
 		rp->rtort_pipe_count = 0;
 		udelay(rcu_random(&rand) & 0x3ff);
 		old_rp = rcu_torture_current;
+		rp->rtort_mbtest = 1;
 		rcu_assign_pointer(rcu_torture_current, rp);
 		smp_wmb();
 		if (old_rp != NULL) {
@@ -252,6 +257,8 @@ rcu_torture_reader(void *arg)
 			schedule_timeout_interruptible(HZ);
 			continue;
 		}
+		if (p->rtort_mbtest == 0)
+			atomic_inc(&n_rcu_torture_mberror);
 		udelay(rcu_random(&rand) & 0x7f);
 		preempt_disable();
 		pipe_count = p->rtort_pipe_count;
@@ -300,16 +307,22 @@ rcu_torture_printk(char *page)
 	}
 	cnt += sprintf(&page[cnt], "rcutorture: ");
 	cnt += sprintf(&page[cnt],
-		       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d",
+		       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
+		       "rtmbe: %d",
 		       rcu_torture_current,
 		       rcu_torture_current_version,
 		       list_empty(&rcu_torture_freelist),
 		       atomic_read(&n_rcu_torture_alloc),
 		       atomic_read(&n_rcu_torture_alloc_fail),
-		       atomic_read(&n_rcu_torture_free));
+		       atomic_read(&n_rcu_torture_free),
+		       atomic_read(&n_rcu_torture_mberror));
+	if (atomic_read(&n_rcu_torture_mberror) != 0)
+		cnt += sprintf(&page[cnt], " !!!");
 	cnt += sprintf(&page[cnt], "\nrcutorture: ");
-	if (i > 1)
+	if (i > 1) {
 		cnt += sprintf(&page[cnt], "!!! ");
+		atomic_inc(&n_rcu_torture_error);
+	}
 	cnt += sprintf(&page[cnt], "Reader Pipe: ");
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
 		cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
@@ -400,7 +413,9 @@ rcu_torture_cleanup(void)
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
 		synchronize_rcu();
 	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
-	PRINTK_STRING("--- End of test");
+	printk(KERN_ALERT TORTURE_FLAG
+	       "--- End of test: %s\n",
+	       atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE");
 }
 
 static int
@@ -425,6 +440,7 @@ rcu_torture_init(void)
 
 	INIT_LIST_HEAD(&rcu_torture_freelist);
 	for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
+		rcu_tortures[i].rtort_mbtest = 0;
 		list_add_tail(&rcu_tortures[i].rtort_free,
 			      &rcu_torture_freelist);
 	}
@@ -436,6 +452,8 @@ rcu_torture_init(void)
 	atomic_set(&n_rcu_torture_alloc, 0);
 	atomic_set(&n_rcu_torture_alloc_fail, 0);
 	atomic_set(&n_rcu_torture_free, 0);
+	atomic_set(&n_rcu_torture_mberror, 0);
+	atomic_set(&n_rcu_torture_error, 0);
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
 		atomic_set(&rcu_torture_wcount[i], 0);
 	for_each_cpu(cpu) {
-- 
cgit v1.2.3


From 0b0db14c536debd92328819fe6c51a49717e8440 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 21 Nov 2005 21:32:20 -0800
Subject: [PATCH] unpaged: copy_page_range vma

For copy_one_pte's print_bad_pte to show the task correctly (instead of
"???"), dup_mmap must pass down parent vma rather than child vma.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index e0d0b77343f8..1c1cf8dc396b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -263,7 +263,7 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		rb_parent = &tmp->vm_rb;
 
 		mm->map_count++;
-		retval = copy_page_range(mm, oldmm, tmp);
+		retval = copy_page_range(mm, oldmm, mpnt);
 
 		if (tmp->vm_ops && tmp->vm_ops->open)
 			tmp->vm_ops->open(tmp);
-- 
cgit v1.2.3


From c2b5a251b9feca727661f1a3278cafb1de4c80f3 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 3 Nov 2005 07:51:18 -0700
Subject: [PATCH] Check the irq number is within bounds

Most of the functions already check. Do the ones that didn't.

Signed-off-by: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/irq/manage.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3bd7226d15fa..81c49a4d679e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -36,6 +36,9 @@ void synchronize_irq(unsigned int irq)
 {
 	struct irq_desc *desc = irq_desc + irq;
 
+	if (irq >= NR_IRQS)
+		return;
+
 	while (desc->status & IRQ_INPROGRESS)
 		cpu_relax();
 }
@@ -60,6 +63,9 @@ void disable_irq_nosync(unsigned int irq)
 	irq_desc_t *desc = irq_desc + irq;
 	unsigned long flags;
 
+	if (irq >= NR_IRQS)
+		return;
+
 	spin_lock_irqsave(&desc->lock, flags);
 	if (!desc->depth++) {
 		desc->status |= IRQ_DISABLED;
@@ -86,6 +92,9 @@ void disable_irq(unsigned int irq)
 {
 	irq_desc_t *desc = irq_desc + irq;
 
+	if (irq >= NR_IRQS)
+		return;
+
 	disable_irq_nosync(irq);
 	if (desc->action)
 		synchronize_irq(irq);
@@ -108,6 +117,9 @@ void enable_irq(unsigned int irq)
 	irq_desc_t *desc = irq_desc + irq;
 	unsigned long flags;
 
+	if (irq >= NR_IRQS)
+		return;
+
 	spin_lock_irqsave(&desc->lock, flags);
 	switch (desc->depth) {
 	case 0:
@@ -163,6 +175,9 @@ int setup_irq(unsigned int irq, struct irqaction * new)
 	unsigned long flags;
 	int shared = 0;
 
+	if (irq >= NR_IRQS)
+		return -EINVAL;
+
 	if (desc->handler == &no_irq_type)
 		return -ENOSYS;
 	/*
-- 
cgit v1.2.3


From cc3327e7dfc16a9a3e164075234c869867a59e45 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 23 Nov 2005 13:37:38 -0800
Subject: [PATCH] mm: unbloat get_futex_key

The follow_page changes in get_futex_key have left it with two almost
identical blocks, when handling the rare case of a futex in a nonlinear vma.
get_user_pages will itself do that follow_page, and its additional
find_extend_vma is hardly any overhead since the vma is already cached.  Let's
just delete the follow_page block and let get_user_pages do it.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/futex.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index aca8d10704f6..5872e3507f35 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -201,21 +201,6 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
 	 * from swap.  But that's a lot of code to duplicate here
 	 * for a rare case, so we simply fetch the page.
 	 */
-
-	/*
-	 * Do a quick atomic lookup first - this is the fastpath.
-	 */
-	page = follow_page(mm, uaddr, FOLL_TOUCH|FOLL_GET);
-	if (likely(page != NULL)) {
-		key->shared.pgoff =
-			page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-		put_page(page);
-		return 0;
-	}
-
-	/*
-	 * Do it the general way.
-	 */
 	err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
 	if (err >= 0) {
 		key->shared.pgoff =
-- 
cgit v1.2.3


From e9b15b54d3646108bbd3e054158b402025d3e704 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 23 Nov 2005 13:37:44 -0800
Subject: [PATCH] Fix crash in unregister_console()

If unregister_console() is inadvertently called while no consoles are
registered, it will crash trying to dereference NULL pointer.  It is
necessary to fix that because register_console() provides no indication
that it actually registered the console passed in.  In fact, it may well
decide not to register it based on various things...

(akpm: It'd be better to make register_console() return something and fix the
callers.  All 106 of them...)

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/printk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index ac8a08f36207..5287be83e3e7 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -956,7 +956,7 @@ int unregister_console(struct console *console)
 	if (console_drivers == console) {
 		console_drivers=console->next;
 		res = 0;
-	} else {
+	} else if (console_drivers) {
 		for (a=console_drivers->next, b=console_drivers ;
 		     a; b=a, a=b->next) {
 			if (a == console) {
-- 
cgit v1.2.3


From a9d9baa1e819b2f92f9cfa5240f766c535e636a6 Mon Sep 17 00:00:00 2001
From: Ashok Raj <ashok.raj@intel.com>
Date: Mon, 28 Nov 2005 13:43:46 -0800
Subject: [PATCH] clean up lock_cpu_hotplug() in cpufreq

There are some callers in cpufreq hotplug notify path that the lowest
function calls lock_cpu_hotplug().  The lock is already held during
cpu_up() and cpu_down() calls when the notify calls are broadcast to
registered clients.

Ideally if possible, we could disable_preempt() at the highest caller and
make sure we dont sleep in the path down in cpufreq->driver_target() calls
but the calls are so intertwined and cumbersome to cleanup.

Hence we consistently use lock_cpu_hotplug() and unlock_cpu_hotplug() in
all places.

 - Removed export of cpucontrol semaphore and made it static.
 - removed explicit uses of up/down with lock_cpu_hotplug()
   so we can keep track of the the callers in same thread context and
   just keep refcounts without calling a down() that causes a deadlock.
 - Removed current_in_hotplug() uses
 - Removed PF_HOTPLUG_CPU in sched.h introduced for the current_in_hotplug()
   temporary workaround.

Tested with insmod of cpufreq_stat.ko, and logical online/offline
to make sure we dont have any hang situations.

Signed-off-by: Ashok Raj <ashok.raj@intel.com>
Cc: Zwane Mwaikambo <zwane@linuxpower.ca>
Cc: Shaohua Li <shaohua.li@intel.com>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/cpufreq/cpufreq.c | 12 ++-----
 include/linux/cpu.h       |  7 ++--
 include/linux/sched.h     |  1 -
 kernel/cpu.c              | 83 ++++++++++++++++++++++++++++-------------------
 4 files changed, 54 insertions(+), 49 deletions(-)

(limited to 'kernel')

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 1c0f62d0f938..815902c2c856 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1113,21 +1113,13 @@ int __cpufreq_driver_target(struct cpufreq_policy *policy,
 {
 	int retval = -EINVAL;
 
-	/*
-	 * If we are already in context of hotplug thread, we dont need to
-	 * acquire the hotplug lock. Otherwise acquire cpucontrol to prevent
-	 * hotplug from removing this cpu that we are working on.
-	 */
-	if (!current_in_cpu_hotplug())
-		lock_cpu_hotplug();
-
+	lock_cpu_hotplug();
 	dprintk("target for CPU %u: %u kHz, relation %u\n", policy->cpu,
 		target_freq, relation);
 	if (cpu_online(policy->cpu) && cpufreq_driver->target)
 		retval = cpufreq_driver->target(policy, target_freq, relation);
 
-	if (!current_in_cpu_hotplug())
-		unlock_cpu_hotplug();
+	unlock_cpu_hotplug();
 
 	return retval;
 }
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 43c44530ef9d..0ed1d4853c69 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -65,10 +65,9 @@ extern struct sysdev_class cpu_sysdev_class;
 
 #ifdef CONFIG_HOTPLUG_CPU
 /* Stop CPUs going up and down. */
-extern struct semaphore cpucontrol;
-#define lock_cpu_hotplug()	down(&cpucontrol)
-#define unlock_cpu_hotplug()	up(&cpucontrol)
-#define lock_cpu_hotplug_interruptible() down_interruptible(&cpucontrol)
+extern void lock_cpu_hotplug(void);
+extern void unlock_cpu_hotplug(void);
+extern int lock_cpu_hotplug_interruptible(void);
 #define hotcpu_notifier(fn, pri) {				\
 	static struct notifier_block fn##_nb =			\
 		{ .notifier_call = fn, .priority = pri };	\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2038bd27b041..b0ad6f30679e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -908,7 +908,6 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
-#define PF_HOTPLUG_CPU	0x01000000	/* Currently performing CPU hotplug */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d61ba88f34e5..e882c6babf41 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,47 +16,76 @@
 #include <asm/semaphore.h>
 
 /* This protects CPUs going up and down... */
-DECLARE_MUTEX(cpucontrol);
-EXPORT_SYMBOL_GPL(cpucontrol);
+static DECLARE_MUTEX(cpucontrol);
 
 static struct notifier_block *cpu_chain;
 
-/*
- * Used to check by callers if they need to acquire the cpucontrol
- * or not to protect a cpu from being removed. Its sometimes required to
- * call these functions both for normal operations, and in response to
- * a cpu being added/removed. If the context of the call is in the same
- * thread context as a CPU hotplug thread, we dont need to take the lock
- * since its already protected
- * check drivers/cpufreq/cpufreq.c for its usage - Ashok Raj
- */
+#ifdef CONFIG_HOTPLUG_CPU
+static struct task_struct *lock_cpu_hotplug_owner;
+static int lock_cpu_hotplug_depth;
 
-int current_in_cpu_hotplug(void)
+static int __lock_cpu_hotplug(int interruptible)
 {
-	return (current->flags & PF_HOTPLUG_CPU);
+	int ret = 0;
+
+	if (lock_cpu_hotplug_owner != current) {
+		if (interruptible)
+			ret = down_interruptible(&cpucontrol);
+		else
+			down(&cpucontrol);
+	}
+
+	/*
+	 * Set only if we succeed in locking
+	 */
+	if (!ret) {
+		lock_cpu_hotplug_depth++;
+		lock_cpu_hotplug_owner = current;
+	}
+
+	return ret;
 }
 
-EXPORT_SYMBOL_GPL(current_in_cpu_hotplug);
+void lock_cpu_hotplug(void)
+{
+	__lock_cpu_hotplug(0);
+}
+EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
 
+void unlock_cpu_hotplug(void)
+{
+	if (--lock_cpu_hotplug_depth == 0) {
+		lock_cpu_hotplug_owner = NULL;
+		up(&cpucontrol);
+	}
+}
+EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
+
+int lock_cpu_hotplug_interruptible(void)
+{
+	return __lock_cpu_hotplug(1);
+}
+EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
+#endif	/* CONFIG_HOTPLUG_CPU */
 
 /* Need to know about CPUs going up/down? */
 int register_cpu_notifier(struct notifier_block *nb)
 {
 	int ret;
 
-	if ((ret = down_interruptible(&cpucontrol)) != 0)
+	if ((ret = lock_cpu_hotplug_interruptible()) != 0)
 		return ret;
 	ret = notifier_chain_register(&cpu_chain, nb);
-	up(&cpucontrol);
+	unlock_cpu_hotplug();
 	return ret;
 }
 EXPORT_SYMBOL(register_cpu_notifier);
 
 void unregister_cpu_notifier(struct notifier_block *nb)
 {
-	down(&cpucontrol);
+	lock_cpu_hotplug();
 	notifier_chain_unregister(&cpu_chain, nb);
-	up(&cpucontrol);
+	unlock_cpu_hotplug();
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
@@ -112,13 +141,6 @@ int cpu_down(unsigned int cpu)
 		goto out;
 	}
 
-	/*
-	 * Leave a trace in current->flags indicating we are already in
-	 * process of performing CPU hotplug. Callers can check if cpucontrol
-	 * is already acquired by current thread, and if so not cause
-	 * a dead lock by not acquiring the lock
-	 */
-	current->flags |= PF_HOTPLUG_CPU;
 	err = notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
 						(void *)(long)cpu);
 	if (err == NOTIFY_BAD) {
@@ -171,7 +193,6 @@ out_thread:
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
 out:
-	current->flags &= ~PF_HOTPLUG_CPU;
 	unlock_cpu_hotplug();
 	return err;
 }
@@ -182,7 +203,7 @@ int __devinit cpu_up(unsigned int cpu)
 	int ret;
 	void *hcpu = (void *)(long)cpu;
 
-	if ((ret = down_interruptible(&cpucontrol)) != 0)
+	if ((ret = lock_cpu_hotplug_interruptible()) != 0)
 		return ret;
 
 	if (cpu_online(cpu) || !cpu_present(cpu)) {
@@ -190,11 +211,6 @@ int __devinit cpu_up(unsigned int cpu)
 		goto out;
 	}
 
-	/*
-	 * Leave a trace in current->flags indicating we are already in
-	 * process of performing CPU hotplug.
-	 */
-	current->flags |= PF_HOTPLUG_CPU;
 	ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
 	if (ret == NOTIFY_BAD) {
 		printk("%s: attempt to bring up CPU %u failed\n",
@@ -217,7 +233,6 @@ out_notify:
 	if (ret != 0)
 		notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu);
 out:
-	current->flags &= ~PF_HOTPLUG_CPU;
-	up(&cpucontrol);
+	unlock_cpu_hotplug();
 	return ret;
 }
-- 
cgit v1.2.3


From c13cf856cbe16aec3007604dc013cbf3a16c6686 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 28 Nov 2005 13:43:48 -0800
Subject: [PATCH] fork.c: proc_fork_connector() called under write_lock()

Don't do that - it does GFP_KERNEL allocations, for a start.

(Reported by Guillaume Thouvenin <guillaume.thouvenin@bull.net>)

Acked-by: Matt Helsley <matthltc@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 1c1cf8dc396b..d0d49879ab7c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1135,13 +1135,13 @@ static task_t *copy_process(unsigned long clone_flags,
 			__get_cpu_var(process_counts)++;
 	}
 
-	proc_fork_connector(p);
 	if (!current->signal->tty && p->signal->tty)
 		p->signal->tty = NULL;
 
 	nr_threads++;
 	total_forks++;
 	write_unlock_irq(&tasklist_lock);
+	proc_fork_connector(p);
 	retval = 0;
 
 fork_out:
-- 
cgit v1.2.3


From ee500f274914653a7d3dfca7d0140a3d21658e32 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Mon, 28 Nov 2005 13:43:55 -0800
Subject: [PATCH] fix 32bit overflow in timespec_to_sample()

fix 32bit overflow in timespec_to_sample()

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/posix-cpu-timers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 84af54c39e1b..cae4f5728997 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -36,7 +36,7 @@ timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
 	union cpu_time_count ret;
 	ret.sched = 0;		/* high half always zero when .cpu used */
 	if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-		ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+		ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
 	} else {
 		ret.cpu = timespec_to_cputime(tp);
 	}
-- 
cgit v1.2.3


From bce61dd49d6ba7799be2de17c772e4c701558f14 Mon Sep 17 00:00:00 2001
From: Ben Collins <bcollins@debian.org>
Date: Mon, 28 Nov 2005 13:43:56 -0800
Subject: [PATCH] Fix hardcoded cpu=0 in workqueue for per_cpu_ptr() calls

Tracked this down on an Ultra Enterprise 3000.  It's a 6-way machine.  Odd
thing about this machine (and it's good for finding bugs like this) is that
the CPU id's are not 0 based.  For instance, on my machine the CPU's are
6/7/10/11/14/15.

This caused some NULL pointer dereference in kernel/workqueue.c because for
single_threaded workqueue's, it hardcoded the cpu to 0.

I changed the 0's to any_online_cpu(cpu_online_mask), which cpumask.h
claims is "First cpu in mask".  So this fits the same usage.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/workqueue.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 42df83d7fad2..2bd5aee1c736 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -102,7 +102,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
 
 	if (!test_and_set_bit(0, &work->pending)) {
 		if (unlikely(is_single_threaded(wq)))
-			cpu = 0;
+			cpu = any_online_cpu(cpu_online_map);
 		BUG_ON(!list_empty(&work->entry));
 		__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 		ret = 1;
@@ -118,7 +118,7 @@ static void delayed_work_timer_fn(unsigned long __data)
 	int cpu = smp_processor_id();
 
 	if (unlikely(is_single_threaded(wq)))
-		cpu = 0;
+		cpu = any_online_cpu(cpu_online_map);
 
 	__queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
@@ -266,8 +266,8 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
 	might_sleep();
 
 	if (is_single_threaded(wq)) {
-		/* Always use cpu 0's area. */
-		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, 0));
+		/* Always use first cpu's area. */
+		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, any_online_cpu(cpu_online_map)));
 	} else {
 		int cpu;
 
@@ -320,7 +320,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
 	lock_cpu_hotplug();
 	if (singlethread) {
 		INIT_LIST_HEAD(&wq->list);
-		p = create_workqueue_thread(wq, 0);
+		p = create_workqueue_thread(wq, any_online_cpu(cpu_online_map));
 		if (!p)
 			destroy = 1;
 		else
@@ -374,7 +374,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	/* We don't need the distraction of CPUs appearing and vanishing. */
 	lock_cpu_hotplug();
 	if (is_single_threaded(wq))
-		cleanup_workqueue_thread(wq, 0);
+		cleanup_workqueue_thread(wq, any_online_cpu(cpu_online_map));
 	else {
 		for_each_online_cpu(cpu)
 			cleanup_workqueue_thread(wq, cpu);
-- 
cgit v1.2.3


From 8c4b8add83c93306b07d78469fd351dc462e4b66 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Mon, 28 Nov 2005 13:44:05 -0800
Subject: [PATCH] cpuset fork locking fix

Move the cpuset_fork() call below the write_unlock_irq call in
kernel/fork.c copy_process().

Since the cpuset-dual-semaphore-locking-overhaul.patch, the cpuset_fork()
routine acquires task_lock(), so cannot be called while holding the
tasklist_lock for write.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/fork.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d0d49879ab7c..fb8572a42297 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1124,8 +1124,6 @@ static task_t *copy_process(unsigned long clone_flags,
 	if (unlikely(p->ptrace & PT_PTRACED))
 		__ptrace_link(p, current->parent);
 
-	cpuset_fork(p);
-
 	attach_pid(p, PIDTYPE_PID, p->pid);
 	attach_pid(p, PIDTYPE_TGID, p->tgid);
 	if (thread_group_leader(p)) {
@@ -1142,6 +1140,7 @@ static task_t *copy_process(unsigned long clone_flags,
 	total_forks++;
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
+	cpuset_fork(p);
 	retval = 0;
 
 fork_out:
-- 
cgit v1.2.3


From 5bd0190bf3d7e53043a048e809ffa29d41b9d6ac Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Tue, 29 Nov 2005 19:34:32 -0800
Subject: [PATCH] Fix crash when ptrace poking hugepage areas

set_page_dirty() will not cope with being handed a page * which is part of
a compound page, but not the master page in that compound page.  This case
can occur via access_process_vm() if you attemp to write to another
process's hugepage memory area using ptrace() (causing an oops or hang).

This patch fixes the bug by only calling set_page_dirty() from
access_process_vm() if the page is not a compound page.  We already use a
similar fix in bio_set_pages_dirty() for the case of direct io to
hugepages.

Signed-off-by: David Gibson <david@gibson.dropbear.id.au>
Acked-by: William Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/ptrace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 17ee7e5a3451..656476eedb1b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -241,7 +241,8 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
 		if (write) {
 			copy_to_user_page(vma, page, addr,
 					  maddr + offset, buf, bytes);
-			set_page_dirty_lock(page);
+			if (!PageCompound(page))
+				set_page_dirty_lock(page);
 		} else {
 			copy_from_user_page(vma, page, addr,
 					    buf, maddr + offset, bytes);
-- 
cgit v1.2.3


From 123d3c13e2853a11b4d599d754b356acb12886e2 Mon Sep 17 00:00:00 2001
From: Pavel Machek <pavel@ucw.cz>
Date: Tue, 29 Nov 2005 19:34:37 -0800
Subject: [PATCH] fix swsusp on machines not supporting S4

Fix swsusp on machines not supporting S4.  With recent changes, it is not
possible to trigger it using /sys filesystem.  Swsusp does not really need
any support from low-level code, it is possible to reboot or halt at the
end of suspend.

Signed-off-by: Pavel Machek <pavel@suse.cz>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/power/main.c | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6ee2cad530e8..d253f3ae2fa5 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -24,7 +24,7 @@
 
 DECLARE_MUTEX(pm_sem);
 
-struct pm_ops * pm_ops = NULL;
+struct pm_ops *pm_ops;
 suspend_disk_method_t pm_disk_mode = PM_DISK_SHUTDOWN;
 
 /**
@@ -151,6 +151,18 @@ static char *pm_states[PM_SUSPEND_MAX] = {
 #endif
 };
 
+static inline int valid_state(suspend_state_t state)
+{
+	/* Suspend-to-disk does not really need low-level support.
+	 * It can work with reboot if needed. */
+	if (state == PM_SUSPEND_DISK)
+		return 1;
+
+	if (pm_ops && pm_ops->valid && !pm_ops->valid(state))
+		return 0;
+	return 1;
+}
+
 
 /**
  *	enter_state - Do common work of entering low-power state.
@@ -167,7 +179,7 @@ static int enter_state(suspend_state_t state)
 {
 	int error;
 
-	if (pm_ops && pm_ops->valid && !pm_ops->valid(state))
+	if (!valid_state(state))
 		return -ENODEV;
 	if (down_trylock(&pm_sem))
 		return -EBUSY;
@@ -238,9 +250,8 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
 	char * s = buf;
 
 	for (i = 0; i < PM_SUSPEND_MAX; i++) {
-		if (pm_states[i] && pm_ops && (!pm_ops->valid
-			||(pm_ops->valid && pm_ops->valid(i))))
-			s += sprintf(s,"%s ",pm_states[i]);
+		if (pm_states[i] && valid_state(i))
+			s += sprintf(s,"%s ", pm_states[i]);
 	}
 	s += sprintf(s,"\n");
 	return (s - buf);
-- 
cgit v1.2.3