From 77b92f5b13c7f025ad46dbbf3feccd85b2954feb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:39:40 -0700
Subject: [PATCH] Fix URLs in Kconfig files

From: Rusty Russell <rusty@rustcorp.com.au>

From: "Petri T. Koistinen" <petri.koistinen@iki.fi>

1) Various URLs in the Kconfig files are out of date: update them.

2) URLs should be of form <http://url-goes-here>.

3) References to files in the source should be of form
   <file:path-from-top>

4) Email addresses should be of form <foo@bar.com>
---
 kernel/power/Kconfig | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 033eea403f26..68bc8a8603ea 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -9,9 +9,9 @@ config PM
 
 	  Power Management is most important for battery powered laptop
 	  computers; if you have a laptop, check out the Linux Laptop home
-	  page on the WWW at
-	  <http://www.cs.utexas.edu/users/kharker/linux-laptop/> and the
-	  Battery Powered Linux mini-HOWTO, available from
+	  page on the WWW at <http://www.linux-on-laptops.com/> or
+	  Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
+	  and the Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.
 
 	  Note that, even if you say N here, Linux on the x86 architecture
-- 
cgit v1.2.3


From 0eb217f9b539fccf5aafaba8c9a06e170825f68b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:40:05 -0700
Subject: [PATCH] generalise system_running

From: Olof Johansson <olof@austin.ibm.com>

It's currently a boolean, but that means that system_running goes to zero
again when shutting down.  So we then use code (in the page allocator) which
is only designed to be used during bootup - it is marked __init.

So we need to be able to distinguish early boot state from late shutdown
state.  Rename system_running to system_state and give it the three
appropriate states.
---
 arch/ppc/platforms/pmac_nvram.c | 8 ++++----
 include/linux/kernel.h          | 8 +++++++-
 init/main.c                     | 8 ++------
 kernel/kmod.c                   | 2 +-
 kernel/printk.c                 | 3 ++-
 kernel/sched.c                  | 3 ++-
 kernel/sys.c                    | 8 ++++----
 mm/page_alloc.c                 | 2 +-
 8 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/arch/ppc/platforms/pmac_nvram.c b/arch/ppc/platforms/pmac_nvram.c
index f381f3f745f9..3b3f984fb929 100644
--- a/arch/ppc/platforms/pmac_nvram.c
+++ b/arch/ppc/platforms/pmac_nvram.c
@@ -154,11 +154,11 @@ static unsigned char __pmac pmu_nvram_read_byte(int addr)
 	struct adb_request req;
 	DECLARE_COMPLETION(req_complete); 
 	
-	req.arg = system_running ? &req_complete : NULL;
+	req.arg = system_state == SYSTEM_RUNNING ? &req_complete : NULL;
 	if (pmu_request(&req, pmu_nvram_complete, 3, PMU_READ_NVRAM,
 			(addr >> 8) & 0xff, addr & 0xff))
 		return 0xff;
-	if (system_running)
+	if (system_state == SYSTEM_RUNNING)
 		wait_for_completion(&req_complete);
 	while (!req.complete)
 		pmu_poll();
@@ -170,11 +170,11 @@ static void __pmac pmu_nvram_write_byte(int addr, unsigned char val)
 	struct adb_request req;
 	DECLARE_COMPLETION(req_complete); 
 	
-	req.arg = system_running ? &req_complete : NULL;
+	req.arg = system_state == SYSTEM_RUNNING ? &req_complete : NULL;
 	if (pmu_request(&req, pmu_nvram_complete, 4, PMU_WRITE_NVRAM,
 			(addr >> 8) & 0xff, addr & 0xff, val))
 		return;
-	if (system_running)
+	if (system_state == SYSTEM_RUNNING)
 		wait_for_completion(&req_complete);
 	while (!req.complete)
 		pmu_poll();
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e11e79199357..c1171e77c76b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -109,9 +109,15 @@ static inline void console_verbose(void)
 extern void bust_spinlocks(int yes);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_on_oops;
-extern int system_running;
+extern int system_state;		/* See values below */
 extern int tainted;
 extern const char *print_tainted(void);
+
+/* Values used for system_state */
+#define SYSTEM_BOOTING 0
+#define SYSTEM_RUNNING 1
+#define SYSTEM_SHUTDOWN 2
+
 #define TAINT_PROPRIETARY_MODULE	(1<<0)
 #define TAINT_FORCED_MODULE		(1<<1)
 #define TAINT_UNSAFE_SMP		(1<<2)
diff --git a/init/main.c b/init/main.c
index 9d1ed1de14c5..348ce7db30f3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -94,11 +94,7 @@ extern void driver_init(void);
 extern void tc_init(void);
 #endif
 
-/*
- * Are we up and running (ie do we have all the infrastructure
- * set up)
- */
-int system_running;
+int system_state;	/* SYSTEM_BOOTING/RUNNING/SHUTDOWN */
 
 /*
  * Boot command-line arguments
@@ -613,7 +609,7 @@ static int init(void * unused)
 	 */
 	free_initmem();
 	unlock_kernel();
-	system_running = 1;
+	system_state = SYSTEM_RUNNING;
 
 	if (sys_open("/dev/console", O_RDWR, 0) < 0)
 		printk("Warning: unable to open an initial console.\n");
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5261de82029b..0002fcd4c554 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -249,7 +249,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 	};
 	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
-	if (!system_running)
+	if (system_state != SYSTEM_RUNNING)
 		return -EBUSY;
 
 	if (path[0] == '\0')
diff --git a/kernel/printk.c b/kernel/printk.c
index a7be1f922f34..5f2b3c9bbd6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -522,7 +522,8 @@ asmlinkage int printk(const char *fmt, ...)
 			log_level_unknown = 1;
 	}
 
-	if (!cpu_online(smp_processor_id()) && !system_running) {
+	if (!cpu_online(smp_processor_id()) &&
+	    system_state != SYSTEM_RUNNING) {
 		/*
 		 * Some console drivers may assume that per-cpu resources have
 		 * been allocated.  So don't allow them to be called by this
diff --git a/kernel/sched.c b/kernel/sched.c
index d5f21712ffbb..9e19d4c0d4a9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2982,7 +2982,8 @@ void __might_sleep(char *file, int line)
 #if defined(in_atomic)
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) && system_running) {
+	if ((in_atomic() || irqs_disabled()) &&
+	    system_state == SYSTEM_RUNNING) {
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
diff --git a/kernel/sys.c b/kernel/sys.c
index 33a14e13079e..bc498b12edcc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -436,7 +436,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system.\n");
 		machine_restart(NULL);
@@ -452,7 +452,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 
 	case LINUX_REBOOT_CMD_HALT:
 		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "System halted.\n");
 		machine_halt();
@@ -462,7 +462,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 
 	case LINUX_REBOOT_CMD_POWER_OFF:
 		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Power down.\n");
 		machine_power_off();
@@ -478,7 +478,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		buffer[sizeof(buffer) - 1] = '\0';
 
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
 		machine_restart(buffer);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5d035d836c15..9764a4e78e45 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -734,7 +734,7 @@ fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int orde
 	struct page * page;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(!system_running))
+	if (unlikely(system_state == SYSTEM_BOOTING))
 		return get_boot_pages(gfp_mask, order);
 #endif
 	page = alloc_pages(gfp_mask, order);
-- 
cgit v1.2.3


From b283f09cf8f51c29bf90e42e22099f76d0f33378 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:41:20 -0700
Subject: [PATCH] Fix get_wchan() FIXME wrt. order of functions

From: William Lee Irwin III <wli@holomorphy.com>

This addresses the issue with get_wchan() that the various functions acting
as scheduling-related primitives are not, in fact, contiguous in the text
segment.  It creates an ELF section for scheduling primitives to be placed
in, and places currently-detected (i.e.  skipped during stack decoding)
scheduling primitives and others like io_schedule() and down(), which are
currently missed by get_wchan() code, into this section also.

The net effects are more reliability of get_wchan()'s results and the new
ability, made use of by this code, to arbitrarily place scheduling
primitives in the source code without disturbing get_wchan()'s accuracy.

Suggestions by Arnd Bergmann and Matthew Wilcox regarding reducing the
invasiveness of the patch were incorporated during prior rounds of review.
I've at least tried to sweep all arches in this patch.
---
 arch/alpha/kernel/process.c                |  2 --
 arch/alpha/kernel/semaphore.c              |  9 ++++----
 arch/alpha/kernel/vmlinux.lds.S            |  1 +
 arch/arm/kernel/process.c                  |  2 --
 arch/arm/kernel/semaphore.c                |  8 ++++---
 arch/arm/kernel/vmlinux.lds.S              |  1 +
 arch/arm26/kernel/process.c                |  2 --
 arch/arm26/kernel/semaphore.c              |  8 ++++---
 arch/arm26/kernel/vmlinux-arm26-xip.lds.in |  1 +
 arch/arm26/kernel/vmlinux-arm26.lds.in     |  1 +
 arch/cris/arch-v10/kernel/process.c        |  3 +--
 arch/cris/arch-v10/vmlinux.lds.S           |  1 +
 arch/cris/kernel/semaphore.c               |  5 ++--
 arch/h8300/kernel/process.c                |  3 ---
 arch/h8300/kernel/semaphore.c              |  5 ++--
 arch/h8300/kernel/vmlinux.lds.S            |  1 +
 arch/i386/kernel/process.c                 |  2 --
 arch/i386/kernel/semaphore.c               | 17 +++++++-------
 arch/i386/kernel/vmlinux.lds.S             |  1 +
 arch/ia64/kernel/process.c                 |  2 --
 arch/ia64/kernel/semaphore.c               |  7 +++---
 arch/ia64/kernel/vmlinux.lds.S             |  1 +
 arch/m68k/kernel/process.c                 |  5 ----
 arch/m68k/kernel/semaphore.c               |  5 ++--
 arch/m68k/kernel/vmlinux-std.lds           |  1 +
 arch/m68k/kernel/vmlinux-sun3.lds          |  1 +
 arch/m68knommu/kernel/process.c            |  5 ----
 arch/m68knommu/kernel/semaphore.c          |  5 ++--
 arch/m68knommu/kernel/vmlinux.lds.S        |  1 +
 arch/mips/kernel/process.c                 |  2 --
 arch/mips/kernel/semaphore.c               |  5 ++--
 arch/mips/kernel/vmlinux.lds.S             |  1 +
 arch/parisc/kernel/semaphore.c             |  5 ++--
 arch/parisc/kernel/vmlinux.lds.S           |  1 +
 arch/ppc/kernel/process.c                  |  2 --
 arch/ppc/kernel/semaphore.c                |  5 ++--
 arch/ppc/kernel/vmlinux.lds.S              |  1 +
 arch/ppc64/kernel/process.c                |  2 --
 arch/ppc64/kernel/semaphore.c              |  5 ++--
 arch/ppc64/kernel/vmlinux.lds.S            |  1 +
 arch/s390/kernel/process.c                 |  2 --
 arch/s390/kernel/semaphore.c               |  5 ++--
 arch/s390/kernel/vmlinux.lds.S             |  1 +
 arch/sh/kernel/process.c                   |  4 +---
 arch/sh/kernel/semaphore.c                 |  5 ++--
 arch/sh/kernel/vmlinux.lds.S               |  1 +
 arch/sparc/kernel/process.c                |  4 +---
 arch/sparc/kernel/semaphore.c              |  5 ++--
 arch/sparc/kernel/vmlinux.lds.S            |  1 +
 arch/sparc/lib/rwsem.S                     |  3 ++-
 arch/sparc64/kernel/process.c              |  4 +---
 arch/sparc64/kernel/semaphore.c            |  9 ++++----
 arch/sparc64/kernel/vmlinux.lds.S          |  1 +
 arch/sparc64/lib/rwsem.c                   |  5 ++--
 arch/v850/kernel/process.c                 |  3 ---
 arch/v850/kernel/semaphore.c               |  5 ++--
 arch/v850/kernel/vmlinux.lds.S             |  1 +
 arch/x86_64/kernel/process.c               |  2 --
 arch/x86_64/kernel/semaphore.c             |  5 ++--
 arch/x86_64/kernel/vmlinux.lds.S           |  1 +
 arch/x86_64/lib/thunk.S                    |  3 ++-
 include/asm-generic/vmlinux.lds.h          |  5 ++++
 include/linux/init.h                       |  2 ++
 include/linux/sched.h                      |  2 ++
 kernel/sched.c                             | 37 ++++++++++++++++--------------
 kernel/timer.c                             |  4 ++--
 lib/rwsem.c                                |  5 ++--
 67 files changed, 137 insertions(+), 124 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index e427bae12ffe..297e4b48bfe2 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -513,8 +513,6 @@ thread_saved_pc(task_t *t)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/alpha/kernel/semaphore.c b/arch/alpha/kernel/semaphore.c
index b52a0df303fe..4d60a0ccd6f7 100644
--- a/arch/alpha/kernel/semaphore.c
+++ b/arch/alpha/kernel/semaphore.c
@@ -7,6 +7,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 
 /*
  * This is basically the PPC semaphore scheme ported to use
@@ -60,7 +61,7 @@ static inline int __sem_update_count(struct semaphore *sem, int incr)
  * Either form may be used in conjunction with "up()".
  */
 
-void
+void __sched
 __down_failed(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
@@ -101,7 +102,7 @@ __down_failed(struct semaphore *sem)
 #endif
 }
 
-int
+int __sched
 __down_failed_interruptible(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
@@ -159,7 +160,7 @@ __up_wakeup(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-void
+void __sched
 down(struct semaphore *sem)
 {
 #if WAITQUEUE_DEBUG
@@ -173,7 +174,7 @@ down(struct semaphore *sem)
 	__down(sem);
 }
 
-int
+int __sched
 down_interruptible(struct semaphore *sem)
 {
 #if WAITQUEUE_DEBUG
diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index 7afd00d5d46b..d159b8f0d022 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -17,6 +17,7 @@ SECTIONS
   _text = .;					/* Text and read-only data */
   .text : { 
 	*(.text) 
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
   } :kernel
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 863c4076daad..8423921e821a 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -414,8 +414,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/arm/kernel/semaphore.c b/arch/arm/kernel/semaphore.c
index a50902e8bec7..da39eb3dca31 100644
--- a/arch/arm/kernel/semaphore.c
+++ b/arch/arm/kernel/semaphore.c
@@ -13,6 +13,7 @@
  */
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -54,7 +55,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -87,7 +88,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -176,7 +177,8 @@ int __down_trylock(struct semaphore * sem)
  * registers (r0 to r3 and lr), but not ip, as we use it as a return
  * value in some cases..
  */
-asm("	.align	5				\n\
+asm("	.section .sched.text			\n\
+	.align	5				\n\
 	.globl	__down_failed			\n\
 __down_failed:					\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 56af3401b34d..a5db0ddca6a4 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -73,6 +73,7 @@ SECTIONS
 	.text : {			/* Real text segment		*/
 		_text = .;		/* Text and read-only data	*/
 			*(.text)
+			SCHED_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 			*(.rodata)
diff --git a/arch/arm26/kernel/process.c b/arch/arm26/kernel/process.c
index 09a2f52ad8a8..ce23571617a1 100644
--- a/arch/arm26/kernel/process.c
+++ b/arch/arm26/kernel/process.c
@@ -400,8 +400,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/arm26/kernel/semaphore.c b/arch/arm26/kernel/semaphore.c
index e7964ce1d0d9..60591a738592 100644
--- a/arch/arm26/kernel/semaphore.c
+++ b/arch/arm26/kernel/semaphore.c
@@ -15,6 +15,7 @@
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -56,7 +57,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -89,7 +90,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -178,7 +179,8 @@ int __down_trylock(struct semaphore * sem)
  * registers (r0 to r3 and lr), but not ip, as we use it as a return
  * value in some cases..
  */
-asm("	.align	5				\n\
+asm("	.section .sched.text			\n\
+	.align	5				\n\
 	.globl	__down_failed			\n\
 __down_failed:					\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
diff --git a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in
index 602a77c022d7..61eedf0bc42f 100644
--- a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in
+++ b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in
@@ -66,6 +66,7 @@ SECTIONS
 	.text : {			/* Real text segment		*/
 		_text = .;		/* Text and read-only data	*/
 			*(.text)
+			SCHED_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 			*(.rodata)
diff --git a/arch/arm26/kernel/vmlinux-arm26.lds.in b/arch/arm26/kernel/vmlinux-arm26.lds.in
index 8782fe36f0a8..2393f3805a49 100644
--- a/arch/arm26/kernel/vmlinux-arm26.lds.in
+++ b/arch/arm26/kernel/vmlinux-arm26.lds.in
@@ -67,6 +67,7 @@ SECTIONS
 	.text : {			/* Real text segment		*/
 		_text = .;		/* Text and read-only data	*/
 			*(.text)
+			SCHED_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 			*(.rodata)
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index 62e3a4fbf33a..c785b54e6cbd 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -16,6 +16,7 @@
 #include <linux/err.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/init.h>
 
 #ifdef CONFIG_ETRAX_GPIO
 void etrax_gpio_wake_up_check(void); /* drivers/gpio.c */
@@ -216,8 +217,6 @@ asmlinkage int sys_execve(const char *fname, char **argv, char **envp,
  * These bracket the sleeping functions..
  */
 
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched     ((unsigned long) scheduling_functions_start_here)
 #define last_sched      ((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/cris/arch-v10/vmlinux.lds.S b/arch/cris/arch-v10/vmlinux.lds.S
index b2c27e147f29..6b73a2c0dad8 100644
--- a/arch/cris/arch-v10/vmlinux.lds.S
+++ b/arch/cris/arch-v10/vmlinux.lds.S
@@ -25,6 +25,7 @@ SECTIONS
 	__stext = .;
 	.text : {
 		*(.text)
+		SCHED_TEXT
 		*(.fixup)
 		*(.text.__*)
 	}
diff --git a/arch/cris/kernel/semaphore.c b/arch/cris/kernel/semaphore.c
index d62b355e1706..b884263d3cd4 100644
--- a/arch/cris/kernel/semaphore.c
+++ b/arch/cris/kernel/semaphore.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 /*
@@ -94,7 +95,7 @@ void __up(struct semaphore *sem)
 	tsk->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DOWN_VAR
 	DOWN_HEAD(TASK_UNINTERRUPTIBLE)
@@ -104,7 +105,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int ret = 0;
 	DOWN_VAR
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index bd6ccd542399..8640ea20dba0 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -264,8 +264,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -289,7 +287,6 @@ unsigned long get_wchan(struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
diff --git a/arch/h8300/kernel/semaphore.c b/arch/h8300/kernel/semaphore.c
index 690efce1e437..1ebb79baaa8c 100644
--- a/arch/h8300/kernel/semaphore.c
+++ b/arch/h8300/kernel/semaphore.c
@@ -5,6 +5,7 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 #ifndef CONFIG_RMW_INSNS
@@ -95,7 +96,7 @@ void __up(struct semaphore *sem)
 	current->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -106,7 +107,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 60787f07eb2b..3a643954a8fe 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -82,6 +82,7 @@ SECTIONS
 #endif
 	__stext = . ;
         	*(.text)
+	SCHED_TEXT
 	. = ALIGN(0x4) ;
 		*(.exit.text)
 		*(.text.*)
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 3495f1aedf67..7fed9d3823ed 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -632,8 +632,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 #define top_esp                (THREAD_SIZE - sizeof(unsigned long))
diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c
index 5acd544f0cbd..073912cfcf44 100644
--- a/arch/i386/kernel/semaphore.c
+++ b/arch/i386/kernel/semaphore.c
@@ -15,6 +15,7 @@
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <linux/init.h>
 #include <asm/semaphore.h>
 
 /*
@@ -53,7 +54,7 @@ asmlinkage void __up(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-asmlinkage void __down(struct semaphore * sem)
+asmlinkage void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -90,7 +91,7 @@ asmlinkage void __down(struct semaphore * sem)
 	tsk->state = TASK_RUNNING;
 }
 
-asmlinkage int __down_interruptible(struct semaphore * sem)
+asmlinkage int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -187,7 +188,7 @@ asmlinkage int __down_trylock(struct semaphore * sem)
  * value..
  */
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __down_failed\n"
 "__down_failed:\n\t"
@@ -210,7 +211,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __down_failed_interruptible\n"
 "__down_failed_interruptible:\n\t"
@@ -231,7 +232,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __down_failed_trylock\n"
 "__down_failed_trylock:\n\t"
@@ -252,7 +253,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __up_wakeup\n"
 "__up_wakeup:\n\t"
@@ -271,7 +272,7 @@ asm(
  */
 #if defined(CONFIG_SMP)
 asm(
-".text\n"
+".section .sched.text\n"
 ".align	4\n"
 ".globl	__write_lock_failed\n"
 "__write_lock_failed:\n\t"
@@ -285,7 +286,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align	4\n"
 ".globl	__read_lock_failed\n"
 "__read_lock_failed:\n\t"
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 3623d7e2934a..0253c586547b 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -16,6 +16,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x9090
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index a1d09d5c91c4..0d245cbcd1f6 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -660,8 +660,6 @@ get_wchan (struct task_struct *p)
 	/*
 	 * These bracket the sleeping functions..
 	 */
-	extern void scheduling_functions_start_here(void);
-	extern void scheduling_functions_end_here(void);
 #	define first_sched	((unsigned long) scheduling_functions_start_here)
 #	define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c
index f3926a3c4d73..2724ef3fbae2 100644
--- a/arch/ia64/kernel/semaphore.c
+++ b/arch/ia64/kernel/semaphore.c
@@ -24,6 +24,7 @@
  * <asm/semaphore.h> where we want to avoid any extra jumps and calls.
  */
 #include <linux/sched.h>
+#include <linux/init.h>
 
 #include <asm/errno.h>
 #include <asm/semaphore.h>
@@ -44,8 +45,7 @@ __up (struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-void
-__down (struct semaphore *sem)
+void __sched __down (struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -82,8 +82,7 @@ __down (struct semaphore *sem)
 	tsk->state = TASK_RUNNING;
 }
 
-int
-__down_interruptible (struct semaphore * sem)
+int __sched __down_interruptible (struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index e5589e49d9da..5c45718a9c82 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -41,6 +41,7 @@ SECTIONS
     {
 	*(.text.ivt)
 	*(.text)
+	SCHED_TEXT
 	*(.gnu.linkonce.t*)
     }
   .text2 : AT(ADDR(.text2) - LOAD_OFFSET)
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 8d72a5c5b0c7..fc2c753c332b 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -65,8 +65,6 @@ asmlinkage void ret_from_fork(void);
  */
 unsigned long thread_saved_pc(struct task_struct *tsk)
 {
-	extern void scheduling_functions_start_here(void);
-	extern void scheduling_functions_end_here(void);
 	struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp;
 	/* Check whether the thread is blocked in resume() */
 	if (sw->retpc > (unsigned long)scheduling_functions_start_here &&
@@ -387,8 +385,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -407,7 +403,6 @@ unsigned long get_wchan(struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
diff --git a/arch/m68k/kernel/semaphore.c b/arch/m68k/kernel/semaphore.c
index 690efce1e437..1ebb79baaa8c 100644
--- a/arch/m68k/kernel/semaphore.c
+++ b/arch/m68k/kernel/semaphore.c
@@ -5,6 +5,7 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 #ifndef CONFIG_RMW_INSNS
@@ -95,7 +96,7 @@ void __up(struct semaphore *sem)
 	current->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -106,7 +107,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds
index bd41fc992169..6dc62684c7b9 100644
--- a/arch/m68k/kernel/vmlinux-std.lds
+++ b/arch/m68k/kernel/vmlinux-std.lds
@@ -12,6 +12,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x4e75
diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds
index 2e81cde14987..f293e567192c 100644
--- a/arch/m68k/kernel/vmlinux-sun3.lds
+++ b/arch/m68k/kernel/vmlinux-sun3.lds
@@ -13,6 +13,7 @@ SECTIONS
   .text : {
 	*(.head)
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x4e75
diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c
index c8b87371641a..896d596a1bd8 100644
--- a/arch/m68knommu/kernel/process.c
+++ b/arch/m68knommu/kernel/process.c
@@ -406,8 +406,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -426,7 +424,6 @@ unsigned long get_wchan(struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
@@ -439,8 +436,6 @@ unsigned long get_wchan(struct task_struct *p)
  */
 unsigned long thread_saved_pc(struct task_struct *tsk)
 {
-	extern void scheduling_functions_start_here(void);
-	extern void scheduling_functions_end_here(void);
 	struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp;
 
 	/* Check whether the thread is blocked in resume() */
diff --git a/arch/m68knommu/kernel/semaphore.c b/arch/m68knommu/kernel/semaphore.c
index 33d704fcf883..c083f4772add 100644
--- a/arch/m68knommu/kernel/semaphore.c
+++ b/arch/m68knommu/kernel/semaphore.c
@@ -6,6 +6,7 @@
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 #ifndef CONFIG_RMW_INSNS
@@ -96,7 +97,7 @@ void __up(struct semaphore *sem)
 	current->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -107,7 +108,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index 1ab8a31ef964..a362870b6e4e 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -191,6 +191,7 @@ SECTIONS {
 	.text : {
 		_stext = . ;
         	*(.text)
+		SCHED_TEXT
         	*(.text.lock)
 
 		. = ALIGN(16);          /* Exception table              */
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index f8ba26770bf4..f4ab9c66b27f 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -283,8 +283,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/mips/kernel/semaphore.c b/arch/mips/kernel/semaphore.c
index 11b937f20604..51c3e772c029 100644
--- a/arch/mips/kernel/semaphore.c
+++ b/arch/mips/kernel/semaphore.c
@@ -6,6 +6,7 @@
 #include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/module.h>
+#include <linux/init.h>
 #include <linux/sched.h>
 
 #ifdef CONFIG_CPU_HAS_LLDSCD
@@ -104,7 +105,7 @@ static inline int waking_non_zero(struct semaphore *sem)
  * Either form may be used in conjunction with "up()".
  */
 
-void __down_failed(struct semaphore * sem)
+void __sched __down_failed(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	wait_queue_t wait;
@@ -227,7 +228,7 @@ static inline int waking_non_zero_interruptible(struct semaphore *sem,
 
 #endif /* !CONFIG_CPU_HAS_LLDSCD */
 
-int __down_failed_interruptible(struct semaphore * sem)
+int __sched __down_failed_interruptible(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	wait_queue_t wait;
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index b72639f8db65..098cfaa23c0e 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -28,6 +28,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
     *(.text)
+    SCHED_TEXT
     *(.fixup)
     *(.gnu.warning)
   } =0
diff --git a/arch/parisc/kernel/semaphore.c b/arch/parisc/kernel/semaphore.c
index ffb4851451fc..ee806bcc3726 100644
--- a/arch/parisc/kernel/semaphore.c
+++ b/arch/parisc/kernel/semaphore.c
@@ -5,6 +5,7 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 /*
  * Semaphores are complex as we wish to avoid using two variables.
@@ -58,7 +59,7 @@ void __up(struct semaphore *sem)
 	sem->count += (sem->count < 0) ? 1 : - 1;
 	
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DOWN_HEAD
 
@@ -74,7 +75,7 @@ void __down(struct semaphore * sem)
 	UPDATE_COUNT
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DOWN_HEAD
 
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 14d0882a19d2..e5d5aeef96e5 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -50,6 +50,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text ALIGN(16) : {
 	*(.text*)
+	SCHED_TEXT
 	*(.PARISC.unwind)
 	*(.fixup)
 	*(.lock.text)		/* out-of-line lock text */
diff --git a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c
index ada32baeda19..3363a030e00f 100644
--- a/arch/ppc/kernel/process.c
+++ b/arch/ppc/kernel/process.c
@@ -661,8 +661,6 @@ void __init ll_puts(const char *s)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched    ((unsigned long) scheduling_functions_start_here)
 #define last_sched     ((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/ppc/kernel/semaphore.c b/arch/ppc/kernel/semaphore.c
index 7bf51fba5c14..2fe429b27c14 100644
--- a/arch/ppc/kernel/semaphore.c
+++ b/arch/ppc/kernel/semaphore.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/errno.h>
@@ -69,7 +70,7 @@ void __up(struct semaphore *sem)
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __down(struct semaphore *sem)
+void __sched __down(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -99,7 +100,7 @@ void __down(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S
index 81b95d449a22..b710d55c5b08 100644
--- a/arch/ppc/kernel/vmlinux.lds.S
+++ b/arch/ppc/kernel/vmlinux.lds.S
@@ -31,6 +31,7 @@ SECTIONS
   .text      :
   {
     *(.text)
+    SCHED_TEXT
     *(.fixup)
     *(.got1)
     __got2_start = .;
diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c
index cec7225a6ac1..f74b14d7e58e 100644
--- a/arch/ppc64/kernel/process.c
+++ b/arch/ppc64/kernel/process.c
@@ -475,8 +475,6 @@ static inline int validate_sp(unsigned long sp, struct task_struct *p)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched    (*(unsigned long *)scheduling_functions_start_here)
 #define last_sched     (*(unsigned long *)scheduling_functions_end_here)
 
diff --git a/arch/ppc64/kernel/semaphore.c b/arch/ppc64/kernel/semaphore.c
index c977029e2465..d723632d59f3 100644
--- a/arch/ppc64/kernel/semaphore.c
+++ b/arch/ppc64/kernel/semaphore.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/errno.h>
@@ -70,7 +71,7 @@ void __up(struct semaphore *sem)
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __down(struct semaphore *sem)
+void __sched __down(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -99,7 +100,7 @@ void __down(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/ppc64/kernel/vmlinux.lds.S b/arch/ppc64/kernel/vmlinux.lds.S
index a8531b1f9ef2..1d9b61143aaa 100644
--- a/arch/ppc64/kernel/vmlinux.lds.S
+++ b/arch/ppc64/kernel/vmlinux.lds.S
@@ -13,6 +13,7 @@ SECTIONS
   /* Read-only sections, merged into text segment: */
   .text : {
 	*(.text .text.*)
+	SCHED_TEXT
 	*(.fixup)
 	. = ALIGN(4096);
 	_etext = .;
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 3676307d1d8a..050585ab5d2a 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -384,8 +384,6 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/s390/kernel/semaphore.c b/arch/s390/kernel/semaphore.c
index 8203f5e0228d..8dfb690c159f 100644
--- a/arch/s390/kernel/semaphore.c
+++ b/arch/s390/kernel/semaphore.c
@@ -11,6 +11,7 @@
  */
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -60,7 +61,7 @@ void __up(struct semaphore *sem)
  *   count > 0: decrement count, wake up queue and exit.
  *   count <= 0: set count to -1, go to sleep.
  */
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -82,7 +83,7 @@ void __down(struct semaphore * sem)
  *   count > 0: wake up queue and exit.
  *   count <= 0: set count to 0, wake up queue and exit.
  */
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index c9ca7a8e93b3..b4534b2867c3 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -23,6 +23,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x0700
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index 773006661b50..7d45ea0acd09 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -464,8 +464,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -481,7 +479,7 @@ unsigned long get_wchan(struct task_struct *p)
 	 * The same comment as on the Alpha applies here, too ...
 	 */
 	pc = thread_saved_pc(p);
-	if (pc >= (unsigned long) interruptible_sleep_on && pc < (unsigned long) add_timer) {
+	if (pc >= first_sched && pc < last_sched) {
 		schedule_frame = ((unsigned long *)(long)p->thread.sp)[1];
 		return (unsigned long)((unsigned long *)schedule_frame)[1];
 	}
diff --git a/arch/sh/kernel/semaphore.c b/arch/sh/kernel/semaphore.c
index 0943ad666a67..a3c24dcbf01d 100644
--- a/arch/sh/kernel/semaphore.c
+++ b/arch/sh/kernel/semaphore.c
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/init.h>
 #include <asm/semaphore.h>
 #include <asm/semaphore-helper.h>
 
@@ -103,7 +104,7 @@ void __up(struct semaphore *sem)
 	tsk->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DOWN_VAR
 	DOWN_HEAD(TASK_UNINTERRUPTIBLE)
@@ -113,7 +114,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int ret = 0;
 	DOWN_VAR
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 2cc86534c130..da0f5d728b3e 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -22,6 +22,7 @@ SECTIONS
 	} = 0
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x0009
diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c
index beae70a970e4..70261b211997 100644
--- a/arch/sparc/kernel/process.c
+++ b/arch/sparc/kernel/process.c
@@ -28,6 +28,7 @@
 #include <linux/reboot.h>
 #include <linux/delay.h>
 #include <linux/pm.h>
+#include <linux/init.h>
 
 #include <asm/auxio.h>
 #include <asm/oplib.h>
@@ -694,9 +695,6 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 	return retval;
 }
 
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-
 unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc, fp, bias = 0;
diff --git a/arch/sparc/kernel/semaphore.c b/arch/sparc/kernel/semaphore.c
index 5a8f3d176a8f..77e63b92ca30 100644
--- a/arch/sparc/kernel/semaphore.c
+++ b/arch/sparc/kernel/semaphore.c
@@ -4,6 +4,7 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -45,7 +46,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -78,7 +79,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 0862360d865d..8d4bbfaf304c 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -12,6 +12,7 @@ SECTIONS
   .text 0xf0004000 :
   {
     *(.text)
+    SCHED_TEXT
     *(.gnu.warning)
   } =0
   _etext = .;
diff --git a/arch/sparc/lib/rwsem.S b/arch/sparc/lib/rwsem.S
index 98b757cb67c6..e7578dc600b8 100644
--- a/arch/sparc/lib/rwsem.S
+++ b/arch/sparc/lib/rwsem.S
@@ -8,7 +8,7 @@
 #include <asm/ptrace.h>
 #include <asm/psr.h>
 
-	.text
+	.section .sched.text
 	.align	4
 
 	.globl		___down_read
@@ -113,6 +113,7 @@ ___down_write:
 	ba		2b
 	 restore	%l5, %g0, %g5
 
+	.text
 	.globl		___up_read
 ___up_read:
 	rd		%psr, %g3
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 1be2b97e4672..0caf962e8155 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -28,6 +28,7 @@
 #include <linux/config.h>
 #include <linux/reboot.h>
 #include <linux/delay.h>
+#include <linux/init.h>
 
 #include <asm/oplib.h>
 #include <asm/uaccess.h>
@@ -823,9 +824,6 @@ out:
 	return error;
 }
 
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-
 unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc, fp, bias = 0;
diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c
index a9e66d666ceb..9ddfcb9a1900 100644
--- a/arch/sparc64/kernel/semaphore.c
+++ b/arch/sparc64/kernel/semaphore.c
@@ -8,6 +8,7 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 /*
  * Atomically update sem->count.
@@ -90,7 +91,7 @@ void up(struct semaphore *sem)
 	: "g5", "g7", "memory", "cc");
 }
 
-static void __down(struct semaphore * sem)
+static void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -108,7 +109,7 @@ static void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
 {
 	might_sleep();
 	/* This atomically does:
@@ -192,7 +193,7 @@ int down_trylock(struct semaphore *sem)
 	return ret;
 }
 
-static int __down_interruptible(struct semaphore * sem)
+static int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -216,7 +217,7 @@ static int __down_interruptible(struct semaphore * sem)
 	return retval;
 }
 
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
 {
 	int ret = 0;
 	
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index ad95e88a3cbc..8faeee09fab2 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -15,6 +15,7 @@ SECTIONS
   .text 0x0000000000404000 :
   {
     *(.text)
+    SCHED_TEXT
     *(.gnu.warning)
   } =0
   _etext = .;
diff --git a/arch/sparc64/lib/rwsem.c b/arch/sparc64/lib/rwsem.c
index 8e1dfdda91fa..e19968dbc2d1 100644
--- a/arch/sparc64/lib/rwsem.c
+++ b/arch/sparc64/lib/rwsem.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/rwsem.h>
+#include <linux/init.h>
 #include <linux/module.h>
 
 extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem));
@@ -13,7 +14,7 @@ extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore
 extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *));
 extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *));
 
-void __down_read(struct rw_semaphore *sem)
+void __sched __down_read(struct rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"! beginning __down_read\n"
@@ -72,7 +73,7 @@ int __down_read_trylock(struct rw_semaphore *sem)
 }
 EXPORT_SYMBOL(__down_read_trylock);
 
-void __down_write(struct rw_semaphore *sem)
+void __sched __down_write(struct rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"! beginning __down_write\n\t"
diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c
index 5c29ae51a303..977d75772d81 100644
--- a/arch/v850/kernel/process.c
+++ b/arch/v850/kernel/process.c
@@ -203,8 +203,6 @@ int sys_execve (char *name, char **argv, char **envp, struct pt_regs *regs)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here (void);
-extern void scheduling_functions_end_here (void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -228,7 +226,6 @@ unsigned long get_wchan (struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
diff --git a/arch/v850/kernel/semaphore.c b/arch/v850/kernel/semaphore.c
index b78d714384db..2d20886863d8 100644
--- a/arch/v850/kernel/semaphore.c
+++ b/arch/v850/kernel/semaphore.c
@@ -15,6 +15,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -56,7 +57,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -89,7 +90,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S
index 028c224fa66a..07ab0f292d1c 100644
--- a/arch/v850/kernel/vmlinux.lds.S
+++ b/arch/v850/kernel/vmlinux.lds.S
@@ -64,6 +64,7 @@
 #define TEXT_CONTENTS							      \
 		__stext = . ;						      \
         	*(.text)						      \
+		SCHED_TEXT
 			*(.exit.text)	/* 2.5 convention */		      \
 			*(.text.exit)	/* 2.4 convention */		      \
 			*(.text.lock)					      \
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 7b2414765ca3..d1d9471581a8 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -576,8 +576,6 @@ asmlinkage long sys_vfork(struct pt_regs regs)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c
index 5e517814dd07..2bcd4a7ec38d 100644
--- a/arch/x86_64/kernel/semaphore.c
+++ b/arch/x86_64/kernel/semaphore.c
@@ -14,6 +14,7 @@
  */
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/errno.h>
 
 #include <asm/semaphore.h>
@@ -54,7 +55,7 @@ void __up(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -91,7 +92,7 @@ void __down(struct semaphore * sem)
 	tsk->state = TASK_RUNNING;
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 7b9e1beb360e..c612e4d213a1 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -15,6 +15,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x9090
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
index 876cb937f9f1..acc1e2ca7ed7 100644
--- a/arch/x86_64/lib/thunk.S
+++ b/arch/x86_64/lib/thunk.S
@@ -35,6 +35,7 @@
 	.endm
 	
 
+	.section .sched.text
 #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
 	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
 	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
@@ -65,7 +66,7 @@ restore_norax:
 
 #ifdef CONFIG_SMP
 /* Support for read/write spinlocks. */
-	
+	.text
 /* rax:	pointer to rwlock_t */	
 ENTRY(__write_lock_failed)
 	lock
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 59c2b950e8b8..a4b6c768cf49 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -51,3 +51,8 @@
 		*(.security_initcall.init) 				\
 		__security_initcall_end = .;				\
 	}
+
+#define SCHED_TEXT							\
+		__scheduling_functions_start_here = .;			\
+		*(.sched.text)						\
+		__scheduling_functions_end_here = .;
diff --git a/include/linux/init.h b/include/linux/init.h
index 45069e275b3d..c6842477243c 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -46,6 +46,8 @@
 #define __exitdata	__attribute__ ((__section__(".exit.data")))
 #define __exit_call	__attribute_used__ __attribute__ ((__section__ (".exitcall.exit")))
 
+#define __sched		__attribute__((__section__(".sched.text")))
+
 #ifdef MODULE
 #define __exit		__attribute__ ((__section__(".exit.text")))
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f5fa0c07a7f8..054b3c0d5962 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -170,6 +170,8 @@ extern void update_one_process(struct task_struct *p, unsigned long user,
 			       unsigned long system, int cpu);
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
+extern const unsigned long scheduling_functions_start_here;
+extern const unsigned long scheduling_functions_end_here;
 
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
diff --git a/kernel/sched.c b/kernel/sched.c
index 9e19d4c0d4a9..b42029abe679 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,6 +225,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+extern unsigned long __scheduling_functions_start_here;
+extern unsigned long __scheduling_functions_end_here;
+const unsigned long scheduling_functions_start_here =
+			(unsigned long)&__scheduling_functions_start_here;
+const unsigned long scheduling_functions_end_here =
+			(unsigned long)&__scheduling_functions_end_here;
+
 /*
  * Default context-switch locking:
  */
@@ -1587,12 +1594,10 @@ out:
 	rebalance_tick(rq, 0);
 }
 
-void scheduling_functions_start_here(void) { }
-
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	long *switch_count;
 	task_t *prev, *next;
@@ -1731,7 +1736,7 @@ EXPORT_SYMBOL(schedule);
  * off of preempt_enable.  Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
-asmlinkage void preempt_schedule(void)
+asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();
 
@@ -1869,7 +1874,7 @@ void fastcall complete_all(struct completion *x)
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
 
-void fastcall wait_for_completion(struct completion *x)
+void fastcall __sched wait_for_completion(struct completion *x)
 {
 	might_sleep();
 	spin_lock_irq(&x->wait.lock);
@@ -1907,7 +1912,7 @@ EXPORT_SYMBOL(wait_for_completion);
 	__remove_wait_queue(q, &wait);			\
 	spin_unlock_irqrestore(&q->lock, flags);
 
-void fastcall interruptible_sleep_on(wait_queue_head_t *q)
+void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR
 
@@ -1920,7 +1925,7 @@ void fastcall interruptible_sleep_on(wait_queue_head_t *q)
 
 EXPORT_SYMBOL(interruptible_sleep_on);
 
-long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR
 
@@ -1935,7 +1940,7 @@ long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
 
-void fastcall sleep_on(wait_queue_head_t *q)
+void fastcall __sched sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR
 
@@ -1948,7 +1953,7 @@ void fastcall sleep_on(wait_queue_head_t *q)
 
 EXPORT_SYMBOL(sleep_on);
 
-long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR
 
@@ -1963,8 +1968,6 @@ long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
-void scheduling_functions_end_here(void) { }
-
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
@@ -2424,7 +2427,7 @@ asmlinkage long sys_sched_yield(void)
 	return 0;
 }
 
-void __cond_resched(void)
+void __sched __cond_resched(void)
 {
 	set_current_state(TASK_RUNNING);
 	schedule();
@@ -2438,7 +2441,7 @@ EXPORT_SYMBOL(__cond_resched);
  * this is a shortcut for kernel-space yielding - it marks the
  * thread runnable and calls sys_sched_yield().
  */
-void yield(void)
+void __sched yield(void)
 {
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
@@ -2453,7 +2456,7 @@ EXPORT_SYMBOL(yield);
  * But don't do that if it is a deliberate, throttling IO wait (this task
  * has set its backing_dev_info: the queue against which it should throttle)
  */
-void io_schedule(void)
+void __sched io_schedule(void)
 {
 	struct runqueue *rq = this_rq();
 
@@ -2464,7 +2467,7 @@ void io_schedule(void)
 
 EXPORT_SYMBOL(io_schedule);
 
-long io_schedule_timeout(long timeout)
+long __sched io_schedule_timeout(long timeout)
 {
 	struct runqueue *rq = this_rq();
 	long ret;
@@ -3010,7 +3013,7 @@ EXPORT_SYMBOL(__might_sleep);
  *
  * Called inside preempt_disable().
  */
-void __preempt_spin_lock(spinlock_t *lock)
+void __sched __preempt_spin_lock(spinlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_spin_lock(lock);
@@ -3026,7 +3029,7 @@ void __preempt_spin_lock(spinlock_t *lock)
 
 EXPORT_SYMBOL(__preempt_spin_lock);
 
-void __preempt_write_lock(rwlock_t *lock)
+void __sched __preempt_write_lock(rwlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_write_lock(lock);
diff --git a/kernel/timer.c b/kernel/timer.c
index f53e0749b0d2..cbcb5522866d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -996,7 +996,7 @@ static void process_timeout(unsigned long __data)
  *
  * In all cases the return value is guaranteed to be non-negative.
  */
-fastcall signed long schedule_timeout(signed long timeout)
+fastcall signed long __sched schedule_timeout(signed long timeout)
 {
 	struct timer_list timer;
 	unsigned long expire;
@@ -1056,7 +1056,7 @@ asmlinkage long sys_gettid(void)
 	return current->pid;
 }
 
-static long nanosleep_restart(struct restart_block *restart)
+static long __sched nanosleep_restart(struct restart_block *restart)
 {
 	unsigned long expire = restart->arg0, now = jiffies;
 	struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
diff --git a/lib/rwsem.c b/lib/rwsem.c
index 95469d7fb796..85dcae7e9337 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -5,6 +5,7 @@
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <linux/module.h>
 
 struct rwsem_waiter {
@@ -162,7 +163,7 @@ static inline struct rw_semaphore *rwsem_down_failed_common(struct rw_semaphore
 /*
  * wait for the read lock to be granted
  */
-struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem)
+struct rw_semaphore fastcall __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 
@@ -178,7 +179,7 @@ struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem)
 /*
  * wait for the write lock to be granted
  */
-struct rw_semaphore fastcall *rwsem_down_write_failed(struct rw_semaphore *sem)
+struct rw_semaphore fastcall __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 
-- 
cgit v1.2.3


From c334f752d8e9d3847d4459d06f7544dea9a49923 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:53:50 -0700
Subject: [PATCH] posix message queues: code move

From: Manfred Spraul <manfred@colorfullife.com>

cleanup of sysv ipc as a preparation for posix message queues:

- replace !CONFIG_SYSVIPC wrappers for copy_semundo and exit_sem with
  static inline wrappers.  Now the whole ipc/util.c file is only used if
  CONFIG_SYSVIPC is set, use makefile magic instead of #ifdef.

- remove the prototypes for copy_semundo and exit_sem from kernel/fork.c

- they belong into a header file.

- create a new msgutil.c with the helper functions for message queues.

- cleanup the helper functions: run Lindent, add __user tags.
---
 include/linux/msg.h |   3 --
 include/linux/sem.h |  17 ++++++-
 ipc/Makefile        |   4 +-
 ipc/msg.c           | 105 -------------------------------------------
 ipc/msgutil.c       | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 ipc/util.c          |  19 --------
 ipc/util.h          |  10 +++++
 kernel/fork.c       |   4 +-
 8 files changed, 155 insertions(+), 134 deletions(-)
 create mode 100644 ipc/msgutil.c

(limited to 'kernel')

diff --git a/include/linux/msg.h b/include/linux/msg.h
index b235e862a3dd..2c4c6aa643ff 100644
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -74,9 +74,6 @@ struct msg_msg {
 	/* the actual message follows immediately */
 };
 
-#define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
-#define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
-
 /* one msq_queue structure for each present queue on the system */
 struct msg_queue {
 	struct kern_ipc_perm q_perm;
diff --git a/include/linux/sem.h b/include/linux/sem.h
index b337c509ac29..aaf45764a56e 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -134,7 +134,22 @@ struct sysv_sem {
 	struct sem_undo_list *undo_list;
 };
 
-void exit_sem(struct task_struct *p);
+#ifdef CONFIG_SYSVIPC
+
+extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
+extern void exit_sem(struct task_struct *tsk);
+
+#else
+static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+{
+	return 0;
+}
+
+static inline void exit_sem(struct task_struct *tsk)
+{
+	return;
+}
+#endif
 
 #endif /* __KERNEL__ */
 
diff --git a/ipc/Makefile b/ipc/Makefile
index ccc6c64c2493..6cd32a30f03f 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -2,7 +2,5 @@
 # Makefile for the linux ipc.
 #
 
-obj-y   := util.o
-
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
-obj-$(CONFIG_SYSVIPC) += msg.o sem.o shm.o
+obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
diff --git a/ipc/msg.c b/ipc/msg.c
index 709ff71bf5c1..37e2d3bb17cb 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -51,11 +51,6 @@ struct msg_sender {
 	struct task_struct* tsk;
 };
 
-struct msg_msgseg {
-	struct msg_msgseg* next;
-	/* the next part of the message follows immediately */
-};
-
 #define SEARCH_ANY		1
 #define SEARCH_EQUAL		2
 #define SEARCH_NOTEQUAL		3
@@ -129,106 +124,6 @@ static int newque (key_t key, int msgflg)
 	return msg_buildid(id,msq->q_perm.seq);
 }
 
-static void free_msg(struct msg_msg* msg)
-{
-	struct msg_msgseg* seg;
-
-	security_msg_msg_free(msg);
-
-	seg = msg->next;
-	kfree(msg);
-	while(seg != NULL) {
-		struct msg_msgseg* tmp = seg->next;
-		kfree(seg);
-		seg = tmp;
-	}
-}
-
-static struct msg_msg* load_msg(void* src, int len)
-{
-	struct msg_msg* msg;
-	struct msg_msgseg** pseg;
-	int err;
-	int alen;
-
-	alen = len;
-	if(alen > DATALEN_MSG)
-		alen = DATALEN_MSG;
-
-	msg = (struct msg_msg *) kmalloc (sizeof(*msg) + alen, GFP_KERNEL);
-	if(msg==NULL)
-		return ERR_PTR(-ENOMEM);
-
-	msg->next = NULL;
-	msg->security = NULL;
-
-	if (copy_from_user(msg+1, src, alen)) {
-		err = -EFAULT;
-		goto out_err;
-	}
-
-	len -= alen;
-	src = ((char*)src)+alen;
-	pseg = &msg->next;
-	while(len > 0) {
-		struct msg_msgseg* seg;
-		alen = len;
-		if(alen > DATALEN_SEG)
-			alen = DATALEN_SEG;
-		seg = (struct msg_msgseg *) kmalloc (sizeof(*seg) + alen, GFP_KERNEL);
-		if(seg==NULL) {
-			err=-ENOMEM;
-			goto out_err;
-		}
-		*pseg = seg;
-		seg->next = NULL;
-		if(copy_from_user (seg+1, src, alen)) {
-			err = -EFAULT;
-			goto out_err;
-		}
-		pseg = &seg->next;
-		len -= alen;
-		src = ((char*)src)+alen;
-	}
-	
-	err = security_msg_msg_alloc(msg);
-	if (err)
-		goto out_err;
-
-	return msg;
-
-out_err:
-	free_msg(msg);
-	return ERR_PTR(err);
-}
-
-static int store_msg(void* dest, struct msg_msg* msg, int len)
-{
-	int alen;
-	struct msg_msgseg *seg;
-
-	alen = len;
-	if(alen > DATALEN_MSG)
-		alen = DATALEN_MSG;
-	if(copy_to_user (dest, msg+1, alen))
-		return -1;
-
-	len -= alen;
-	dest = ((char*)dest)+alen;
-	seg = msg->next;
-	while(len > 0) {
-		alen = len;
-		if(alen > DATALEN_SEG)
-			alen = DATALEN_SEG;
-		if(copy_to_user (dest, seg+1, alen))
-			return -1;
-		len -= alen;
-		dest = ((char*)dest)+alen;
-		seg=seg->next;
-	}
-	return 0;
-}
-
 static inline void ss_add(struct msg_queue* msq, struct msg_sender* mss)
 {
 	mss->tsk=current;
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
new file mode 100644
index 000000000000..e48d777de2a3
--- /dev/null
+++ b/ipc/msgutil.c
@@ -0,0 +1,127 @@
+/*
+ * linux/ipc/util.c
+ * Copyright (C) 1999, 2004 Manfred Spraul
+ *
+ * This file is released under GNU General Public Licence version 2 or
+ * (at your option) any later version.
+ *
+ * See the file COPYING for more details.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/ipc.h>
+#include <asm/uaccess.h>
+
+#include "util.h"
+
+struct msg_msgseg {
+	struct msg_msgseg* next;
+	/* the next part of the message follows immediately */
+};
+
+#define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
+#define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
+
+struct msg_msg *load_msg(void __user *src, int len)
+{
+	struct msg_msg *msg;
+	struct msg_msgseg **pseg;
+	int err;
+	int alen;
+
+	alen = len;
+	if (alen > DATALEN_MSG)
+		alen = DATALEN_MSG;
+
+	msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
+	if (msg == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	msg->next = NULL;
+	msg->security = NULL;
+
+	if (copy_from_user(msg + 1, src, alen)) {
+		err = -EFAULT;
+		goto out_err;
+	}
+
+	len -= alen;
+	src = ((char *)src) + alen;
+	pseg = &msg->next;
+	while (len > 0) {
+		struct msg_msgseg *seg;
+		alen = len;
+		if (alen > DATALEN_SEG)
+			alen = DATALEN_SEG;
+		seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen,
+						 GFP_KERNEL);
+		if (seg == NULL) {
+			err = -ENOMEM;
+			goto out_err;
+		}
+		*pseg = seg;
+		seg->next = NULL;
+		if (copy_from_user(seg + 1, src, alen)) {
+			err = -EFAULT;
+			goto out_err;
+		}
+		pseg = &seg->next;
+		len -= alen;
+		src = ((char *)src) + alen;
+	}
+
+	err = security_msg_msg_alloc(msg);
+	if (err)
+		goto out_err;
+
+	return msg;
+
+out_err:
+	free_msg(msg);
+	return ERR_PTR(err);
+}
+
+int store_msg(void __user *dest, struct msg_msg *msg, int len)
+{
+	int alen;
+	struct msg_msgseg *seg;
+
+	alen = len;
+	if (alen > DATALEN_MSG)
+		alen = DATALEN_MSG;
+	if (copy_to_user(dest, msg + 1, alen))
+		return -1;
+
+	len -= alen;
+	dest = ((char *)dest) + alen;
+	seg = msg->next;
+	while (len > 0) {
+		alen = len;
+		if (alen > DATALEN_SEG)
+			alen = DATALEN_SEG;
+		if (copy_to_user(dest, seg + 1, alen))
+			return -1;
+		len -= alen;
+		dest = ((char *)dest) + alen;
+		seg = seg->next;
+	}
+	return 0;
+}
+
+void free_msg(struct msg_msg *msg)
+{
+	struct msg_msgseg *seg;
+
+	security_msg_msg_free(msg);
+
+	seg = msg->next;
+	kfree(msg);
+	while (seg != NULL) {
+		struct msg_msgseg *tmp = seg->next;
+		kfree(seg);
+		seg = tmp;
+	}
+}
diff --git a/ipc/util.c b/ipc/util.c
index 6d94883edae0..f74c5eef57d0 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -25,8 +25,6 @@
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
 
-#if defined(CONFIG_SYSVIPC)
-
 #include "util.h"
 
 /**
@@ -531,20 +529,3 @@ int ipc_parse_version (int *cmd)
 }
 
 #endif /* __ia64__ */
-
-#else
-/*
- * Dummy functions when SYSV IPC isn't configured
- */
-
-int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
-{
-	return 0;
-}
-
-void exit_sem(struct task_struct *tsk)
-{
-	return;
-}
-
-#endif /* CONFIG_SYSVIPC */
diff --git a/ipc/util.h b/ipc/util.h
index 79c8fc901317..e6434942c097 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -4,6 +4,10 @@
  *
  * ipc helper functions (c) 1999 Manfred Spraul <manfreds@colorfullife.com>
  */
+
+#ifndef _IPC_UTIL_H
+#define _IPC_UTIL_H
+
 #define USHRT_MAX 0xffff
 #define SEQ_MULTIPLIER	(IPCMNI)
 
@@ -62,3 +66,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
 #else
 int ipc_parse_version (int *cmd);
 #endif
+
+extern void free_msg(struct msg_msg *msg);
+extern struct msg_msg *load_msg(void __user *src, int len);
+extern int store_msg(void __user *dest, struct msg_msg *msg, int len);
+
+#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b17a249c50d..a1f20cabbdd3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -21,6 +21,7 @@
 #include <linux/completion.h>
 #include <linux/namespace.h>
 #include <linux/personality.h>
+#include <linux/sem.h>
 #include <linux/file.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
@@ -39,9 +40,6 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
-extern void exit_sem(struct task_struct *tsk);
-
 /* The idle threads do not count..
  * Protected by write_lock_irq(&tasklist_lock)
  */
-- 
cgit v1.2.3


From c50142a5433ed504fff2b1af152f8f7628830dfb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:54:03 -0700
Subject: [PATCH] posix message queues: syscall stubs

From: Manfred Spraul <manfred@colorfullife.com>

Add -ENOSYS stubs for the posix message queue syscalls.  The API is a direct
mapping of the api from the unix spec, with two exceptions:

- mq_close() doesn't exist.  Message queue file descriptors can be closed
  with close().

- mq_notify(SIGEV_THREAD) cannot be implemented in the kernel.  The kernel
  returns a pollable file descriptor .  User space must poll (or read) this
  descriptor and call the notifier function if the file descriptor is
  signaled.
---
 arch/i386/kernel/entry.S  |  9 +++++++++
 include/asm-i386/unistd.h | 11 ++++++++++-
 include/linux/mqueue.h    | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/syscalls.h  |  9 +++++++++
 kernel/sys.c              |  6 ++++++
 5 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/mqueue.h

(limited to 'kernel')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3024740ba84c..14e64d3ea25c 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -882,5 +882,14 @@ ENTRY(sys_call_table)
 	.long sys_utimes
  	.long sys_fadvise64_64
 	.long sys_ni_syscall	/* sys_vserver */
+	.long sys_ni_syscall	/* sys_mbind */
+	.long sys_ni_syscall	/* 275 sys_get_mempolicy */
+	.long sys_ni_syscall	/* sys_set_mempolicy */
+	.long sys_mq_open
+	.long sys_mq_unlink
+	.long sys_mq_timedsend
+	.long sys_mq_timedreceive	/* 280 */
+	.long sys_mq_notify
+	.long sys_mq_getsetattr
 
 syscall_table_size=(.-sys_call_table)
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index a2d58a99491e..620a232084f3 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -279,8 +279,17 @@
 #define __NR_utimes		271
 #define __NR_fadvise64_64	272
 #define __NR_vserver		273
+#define __NR_mbind		274
+#define __NR_get_mempolicy	275
+#define __NR_set_mempolicy	276
+#define __NR_mq_open 		277
+#define __NR_mq_unlink		(__NR_mq_open+1)
+#define __NR_mq_timedsend	(__NR_mq_open+2)
+#define __NR_mq_timedreceive	(__NR_mq_open+3)
+#define __NR_mq_notify		(__NR_mq_open+4)
+#define __NR_mq_getsetattr	(__NR_mq_open+5)
 
-#define NR_syscalls 274
+#define NR_syscalls 283
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
new file mode 100644
index 000000000000..c0c5fcc89f0e
--- /dev/null
+++ b/include/linux/mqueue.h
@@ -0,0 +1,36 @@
+/* Copyright (C) 2003 Krzysztof Benedyczak & Michal Wronski
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   It is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this software; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef _LINUX_MQUEUE_H
+#define _LINUX_MQUEUE_H
+
+#define MQ_PRIO_MAX 	32768
+
+typedef int mqd_t;
+
+struct mq_attr {
+	long	mq_flags;	/* message queue flags			*/
+	long	mq_maxmsg;	/* maximum number of messages		*/
+	long	mq_msgsize;	/* maximum message size			*/
+	long	mq_curmsgs;	/* number of messages currently queued	*/
+};
+
+#define NOTIFY_NONE	0
+#define NOTIFY_WOKENUP	1
+#define NOTIFY_REMOVED	2
+
+#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index aaf87aeacafb..7ee5f67abb5f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -48,6 +48,8 @@ struct timex;
 struct timezone;
 struct tms;
 struct utimbuf;
+typedef int mqd_t;
+struct mq_attr;
 
 #include <linux/config.h>
 #include <linux/types.h>
@@ -450,6 +452,13 @@ asmlinkage long sys_shmget(key_t key, size_t size, int flag);
 asmlinkage long sys_shmdt(char __user *shmaddr);
 asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
 
+asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr);
+asmlinkage long sys_mq_unlink(const char __user *name);
+asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
+asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
+asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
+asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
+
 asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn);
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
 				unsigned long off, unsigned long len,
diff --git a/kernel/sys.c b/kernel/sys.c
index bc498b12edcc..7d1bf5c57aca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -260,6 +260,12 @@ cond_syscall(sys_msgctl)
 cond_syscall(sys_shmget)
 cond_syscall(sys_shmdt)
 cond_syscall(sys_shmctl)
+cond_syscall(sys_mq_open)
+cond_syscall(sys_mq_unlink)
+cond_syscall(sys_mq_timedsend)
+cond_syscall(sys_mq_timedreceive)
+cond_syscall(sys_mq_notify)
+cond_syscall(sys_mq_getsetattr)
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read)
-- 
cgit v1.2.3


From be94d44e818a56406016111fc48a1084b9f8e435 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:54:16 -0700
Subject: [PATCH] posix message queues: implementation

From: Manfred Spraul <manfred@colorfullife.com>

Actual implementation of the posix message queues, written by Krzysztof
Benedyczak and Michal Wronski.  The complete implementation is dependant on
CONFIG_POSIX_MQUEUE.

It passed the openposix test suite with two exceptions: one mq_unlink test
was bad and tested undefined behavior.  And Linux succeeds
mq_close(open(,,,)).  The spec mandates EBADF, but we have decided to ignore
that: we would have to add a new syscall just for the right error code.

The patch intentionally doesn't use all helpers from fs/libfs for kernel-only
filesystems: step 5 allows user space mounts of the file system.


Signal changes:

The patch redefines SI_MESGQ using __SI_CODE: The generic Linux ABI uses
a negative value (i.e.  from user) for SI_MESGQ, but the kernel internal
value must be posive to pass check_kill_value.  Additionally, the patch
adds support into copy_siginfo_to_user to copy the "new" signal type to
user space.


Changes in signal code caused by POSIX message queues patch:

General & rationale:

  mqueues generated signals (only upon notification) must have si_code
  == SI_MESGQ.  In fact such a signal is send from one process which
  caused notification (== sent message to empty message queue) to
  another which requested it.  Both processes can be of course unrelated
  in terms of uids/euids.  So SI_MESGQ signals must be classified as
  SI_FROMKERNEL to pass check_kill_permissions (not need to say that
  this signals ARE from kernel).

  Signals generated by message queues notification need the same
  fields in siginfo struct's union _sifields as POSIX.1b signals and we
  can reuse its union entry.

  SI_MESGQ was previously defined to -3 in kernel and also in glibc.
  So in userspace SI_MESGQ must be still visible as -3.

Solution:

  SI_MESGQ is defined in the same style as SI_TIMER using __SI_CODE macro.

  Details:

    Fortunately copy_siginfo_to_user copies si_code as short.  So we
    can use remaining part of int value freely.  __SI_CODE does the
    work.  SI_MESGQ is in kernel:

 		6<<16 | (-3 & 0xffff) what is > 0

    but to userspace is copied

 		(short) SI_MESGQ == -3

Actual changes:

  Changes in include/asm-generic/siginfo.h

  __SI_MESGQ added in signal.h to represent inside-kernel prefix of
  SI_MESGQ.  SI_MESGQ is redefined from -3 to __SI_CODE(__SI_MESGQ, -3)

  Except mips architecture those changes should be arch independent
  (asm-generic/siginfo.h is included in arch versions).  On mips
  SI_MESGQ is redefined to -4 in order to be compatible with IRIX.  But
  the same schema can be used.

  Change in copy_siginfo_to_user: We only add one line to order the
  same copy semantics as for _SI_RT.

  This change isn't very portable - some arch have its own
  copy_siginfo_to_user.  All those should have similar change (but
  possibly not one-line as _SI_RT case was sometimes ignored because i
  wasn't used yet, e.g.  see ia64 signal.c).

Update:
mq: only fail with invalid timespec if mq_timed{send,receive} needs to block
From: Jakub Jelinek <jakub@redhat.com>

POSIX requires EINVAL to be set if:
"The process or thread would have blocked, and the abs_timeout parameter
specified a nanoseconds field value less than zero or greater than or equal
to 1000 million."
but 2.6.5-mm3 returns -EINVAL even if the process or thread would not block
(if the queue is not empty for timedreceive or not full for timedsend).
---
 CREDITS                            |   17 +
 Documentation/filesystems/proc.txt |   25 +
 include/asm-generic/siginfo.h      |    4 +-
 init/Kconfig                       |   18 +
 ipc/Makefile                       |    2 +
 ipc/mqueue.c                       | 1165 ++++++++++++++++++++++++++++++++++++
 kernel/signal.c                    |    1 +
 7 files changed, 1231 insertions(+), 1 deletion(-)
 create mode 100644 ipc/mqueue.c

(limited to 'kernel')

diff --git a/CREDITS b/CREDITS
index dc9b943d10f1..52128c120f63 100644
--- a/CREDITS
+++ b/CREDITS
@@ -289,6 +289,15 @@ S: Via Delle Palme, 9
 S: Terni 05100
 S: Italy
 
+N: Krzysztof Benedyczak
+E: golbi@mat.uni.torun.pl
+W: http://www.mat.uni.torun.pl/~golbi
+D: POSIX message queues fs (with M. Wronski)
+S: ul. Podmiejska 52
+S: Radunica
+S: 83-000 Pruszcz Gdanski
+S: Poland
+
 N: Randolph Bentson
 E: bentson@grieg.seaslug.org
 W: http://www.aa.net/~bentson/
@@ -3485,6 +3494,14 @@ S: 12725 SW Millikan Way, Suite 400
 S: Beaverton, OR 97005
 S: USA
 
+N: Michal Wronski
+E: wrona@mat.uni.torun.pl
+W: http://www.mat.uni.torun.pl/~wrona
+D: POSIX message queues fs (with K. Benedyczak)
+S: ul. Teczowa 23/12
+S: 80-680 Gdansk-Sobieszewo
+S: Poland
+
 N: Frank Xia
 E: qx@math.columbia.edu
 D: Xiafs filesystem [defunct]
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 22fd3adcc96e..378722d5bb70 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -38,6 +38,7 @@ Table of Contents
   2.8	/proc/sys/net/ipv4 - IPV4 settings
   2.9	Appletalk
   2.10	IPX
+  2.11	/proc/sys/fs/mqueue - POSIX message queues filesystem
 
 ------------------------------------------------------------------------------
 Preface
@@ -1814,6 +1815,30 @@ The /proc/net/ipx_route  table  holds  a list of IPX routes. For each route it
 gives the  destination  network, the router node (or Directly) and the network
 address of the router (or Connected) for internal networks.
 
+2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
+----------------------------------------------------------
+
+The "mqueue"  filesystem provides  the necessary kernel features to enable the
+creation of a  user space  library that  implements  the  POSIX message queues
+API (as noted by the  MSG tag in the  POSIX 1003.1-2001 version  of the System
+Interfaces specification.)
+
+The "mqueue" filesystem contains values for determining/setting  the amount of
+resources used by the file system.
+
+/proc/sys/fs/mqueue/queues_max is a read/write  file for  setting/getting  the
+maximum number of message queues allowed on the system.
+
+/proc/sys/fs/mqueue/msg_max  is  a  read/write file  for  setting/getting  the
+maximum number of messages in a queue value.  In fact it is the limiting value
+for another (user) limit which is set in mq_open invocation. This attribute of
+a queue must be less or equal then msg_max.
+
+/proc/sys/fs/mqueue/msgsize_max is  a read/write  file for setting/getting the
+maximum  message size value (it is every  message queue's attribute set during
+its creation).
+
+
 ------------------------------------------------------------------------------
 Summary
 ------------------------------------------------------------------------------
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index e95efd9e00c6..fe02b1a4d286 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -123,6 +123,7 @@ typedef struct siginfo {
 #define __SI_FAULT	(3 << 16)
 #define __SI_CHLD	(4 << 16)
 #define __SI_RT		(5 << 16)
+#define __SI_MESGQ	(6 << 16)
 #define __SI_CODE(T,N)	((T) | ((N) & 0xffff))
 #else
 #define __SI_KILL	0
@@ -131,6 +132,7 @@ typedef struct siginfo {
 #define __SI_FAULT	0
 #define __SI_CHLD	0
 #define __SI_RT		0
+#define __SI_MESGQ	0
 #define __SI_CODE(T,N)	(N)
 #endif
 
@@ -142,7 +144,7 @@ typedef struct siginfo {
 #define SI_KERNEL	0x80		/* sent by the kernel from somewhere */
 #define SI_QUEUE	-1		/* sent by sigqueue */
 #define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */
-#define SI_MESGQ	-3		/* sent by real time mesq state change */
+#define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */
 #define SI_ASYNCIO	-4		/* sent by AIO completion */
 #define SI_SIGIO	-5		/* sent by queued SIGIO */
 #define SI_TKILL	-6		/* sent by tkill system call */
diff --git a/init/Kconfig b/init/Kconfig
index c10fec8ebe9e..9eff25e8f6ed 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -90,6 +90,24 @@ config SYSVIPC
 	  section 6.4 of the Linux Programmer's Guide, available from
 	  <http://www.tldp.org/guides.html>.
 
+config POSIX_MQUEUE
+	bool "POSIX Message Queues"
+	depends on EXPERIMENTAL
+	---help---
+	  POSIX variant of message queues is a part of IPC. In POSIX message
+	  queues every message has a priority which decides about succession
+	  of receiving it by a process. If you want to compile and run
+	  programs written e.g. for Solaris with use of its POSIX message
+	  queues (functions mq_*) say Y here. To use this feature you will
+	  also need mqueue library, available from
+	  <http://www.mat.uni.torun.pl/~wrona/posix_ipc/>
+
+	  POSIX message queues are visible as a filesystem called 'mqueue'
+	  and can be mounted somewhere if you want to do filesystem
+	  operations on message queues.
+
+	  If unsure, say Y.
+
 config BSD_PROCESS_ACCT
 	bool "BSD Process Accounting"
 	help
diff --git a/ipc/Makefile b/ipc/Makefile
index 6cd32a30f03f..913790207d85 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -4,3 +4,5 @@
 
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
 obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
+obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o
+
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
new file mode 100644
index 000000000000..4de249718675
--- /dev/null
+++ b/ipc/mqueue.c
@@ -0,0 +1,1165 @@
+/*
+ * POSIX message queues filesystem for Linux.
+ *
+ * Copyright (C) 2003,2004  Krzysztof Benedyczak    (golbi@mat.uni.torun.pl)
+ *                          Michal Wronski          (wrona@mat.uni.torun.pl)
+ *
+ * Spinlocks:               Mohamed Abbas           (abbas.mohamed@intel.com)
+ * Lockless receive & send, fd based notify:
+ * 			    Manfred Spraul	    (manfred@colorfullife.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sysctl.h>
+#include <linux/poll.h>
+#include <linux/mqueue.h>
+#include <linux/msg.h>
+#include "util.h"
+
+#define MQUEUE_MAGIC	0x19800202
+#define DIRENT_SIZE	20
+#define FILENT_SIZE	80
+
+#define SEND		0
+#define RECV		1
+
+#define STATE_NONE	0
+#define STATE_PENDING	1
+#define STATE_READY	2
+
+#define NP_NONE		((void*)NOTIFY_NONE)
+#define NP_WOKENUP	((void*)NOTIFY_WOKENUP)
+#define NP_REMOVED	((void*)NOTIFY_REMOVED)
+/* used by sysctl */
+#define FS_MQUEUE 	1
+#define CTL_QUEUESMAX 	2
+#define CTL_MSGMAX 	3
+#define CTL_MSGSIZEMAX 	4
+
+/* default values */
+#define DFLT_QUEUESMAX	64	/* max number of message queues */
+#define DFLT_MSGMAX 	40	/* max number of messages in each queue */
+#define HARD_MSGMAX 	(131072/sizeof(void*))
+#define DFLT_MSGSIZEMAX 16384	/* max message size */
+
+struct ext_wait_queue {		/* queue of sleeping tasks */
+	struct task_struct *task;
+	struct list_head list;
+	struct msg_msg *msg;	/* ptr of loaded message */
+	int state;		/* one of STATE_* values */
+};
+
+struct mqueue_inode_info {
+	struct mq_attr attr;
+	struct msg_msg **messages;
+
+	pid_t notify_owner;	/* != 0 means notification registered */
+	struct sigevent notify;
+	struct file *notify_filp;
+
+	/* for tasks waiting for free space and messages, respectively */
+	struct ext_wait_queue e_wait_q[2];
+	wait_queue_head_t wait_q;
+
+	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
+	spinlock_t lock;
+	struct inode vfs_inode;
+};
+
+static struct inode_operations mqueue_dir_inode_operations;
+static struct file_operations mqueue_file_operations;
+static struct file_operations mqueue_notify_fops;
+static struct super_operations mqueue_super_ops;
+static void remove_notification(struct mqueue_inode_info *info);
+
+static spinlock_t mq_lock;
+static kmem_cache_t *mqueue_inode_cachep;
+static struct vfsmount *mqueue_mnt;
+
+static unsigned int queues_count;
+static unsigned int queues_max 	= DFLT_QUEUESMAX;
+static unsigned int msg_max 	= DFLT_MSGMAX;
+static unsigned int msgsize_max = DFLT_MSGSIZEMAX;
+
+static struct ctl_table_header * mq_sysctl_table;
+
+static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
+{
+	return container_of(inode, struct mqueue_inode_info, vfs_inode);
+}
+
+static struct inode *mqueue_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_mtime = inode->i_ctime = inode->i_atime =
+				CURRENT_TIME;
+
+		if (S_ISREG(mode)) {
+			struct mqueue_inode_info *info;
+
+			inode->i_fop = &mqueue_file_operations;
+			inode->i_size = FILENT_SIZE;
+			/* mqueue specific info */
+			info = MQUEUE_I(inode);
+			spin_lock_init(&info->lock);
+			init_waitqueue_head(&info->wait_q);
+			INIT_LIST_HEAD(&info->e_wait_q[0].list);
+			INIT_LIST_HEAD(&info->e_wait_q[1].list);
+			info->notify_owner = 0;
+			info->qsize = 0;
+			info->attr.mq_curmsgs = 0;
+			info->messages = NULL;
+		} else if (S_ISDIR(mode)) {
+			inode->i_nlink++;
+			inode->i_op = &mqueue_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+		}
+	}
+	return inode;
+}
+
+static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+
+	sb->s_flags = MS_NOUSER;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = MQUEUE_MAGIC;
+	sb->s_op = &mqueue_super_ops;
+
+	inode = mqueue_get_inode(sb, S_IFDIR | S_IRWXUGO);
+	if (!inode)
+		return -ENOMEM;
+
+	sb->s_root = d_alloc_root(inode);
+	if (!sb->s_root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static struct super_block *mqueue_get_sb(struct file_system_type *fs_type,
+					 int flags, const char *dev_name,
+					 void *data)
+{
+	return get_sb_single(fs_type, flags, data, mqueue_fill_super);
+}
+
+static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
+{
+	struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+		SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&p->vfs_inode);
+}
+
+static struct inode *mqueue_alloc_inode(struct super_block *sb)
+{
+	struct mqueue_inode_info *ei;
+
+	ei = kmem_cache_alloc(mqueue_inode_cachep, SLAB_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void mqueue_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
+}
+
+static void mqueue_delete_inode(struct inode *inode)
+{
+	struct mqueue_inode_info *info;
+	int i;
+
+	if (S_ISDIR(inode->i_mode)) {
+		clear_inode(inode);
+		return;
+	}
+	info = MQUEUE_I(inode);
+	spin_lock(&info->lock);
+	for (i = 0; i < info->attr.mq_curmsgs; i++)
+		free_msg(info->messages[i]);
+	kfree(info->messages);
+	spin_unlock(&info->lock);
+
+	clear_inode(inode);
+
+	spin_lock(&mq_lock);
+	queues_count--;
+	spin_unlock(&mq_lock);
+}
+
+static int mqueue_create(struct inode *dir, struct dentry *dentry,
+				int mode, struct nameidata *nd)
+{
+	struct inode *inode;
+	int error;
+
+	spin_lock(&mq_lock);
+	if (queues_count >= queues_max && !capable(CAP_SYS_RESOURCE)) {
+		error = -ENOSPC;
+		goto out_lock;
+	}
+	queues_count++;
+	spin_unlock(&mq_lock);
+
+	inode = mqueue_get_inode(dir->i_sb, mode);
+	if (!inode) {
+		error = -ENOMEM;
+		spin_lock(&mq_lock);
+		queues_count--;
+		goto out_lock;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);
+	return 0;
+out_lock:
+	spin_unlock(&mq_lock);
+	return error;
+}
+
+static int mqueue_flush_file(struct file *filp)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+
+	spin_lock(&info->lock);
+	if (current->tgid == info->notify_owner)
+		remove_notification(info);
+
+	spin_unlock(&info->lock);
+	return 0;
+}
+
+/* Adds current to info->e_wait_q[sr] before element with smaller prio */
+static void wq_add(struct mqueue_inode_info *info, int sr,
+			struct ext_wait_queue *ewp)
+{
+	struct ext_wait_queue *walk;
+
+	ewp->task = current;
+
+	list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
+		if (walk->task->static_prio <= current->static_prio) {
+			list_add_tail(&ewp->list, &walk->list);
+			return;
+		}
+	}
+	list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
+}
+
+/*
+ * Puts current task to sleep. Caller must hold queue lock. After return
+ * lock isn't held.
+ * sr: SEND or RECV
+ */
+static int wq_sleep(struct mqueue_inode_info *info, int sr,
+			long timeout, struct ext_wait_queue *ewp)
+{
+	int retval;
+	signed long time;
+
+	wq_add(info, sr, ewp);
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_unlock(&info->lock);
+		time = schedule_timeout(timeout);
+
+		while (ewp->state == STATE_PENDING)
+			cpu_relax();
+
+		if (ewp->state == STATE_READY) {
+			retval = 0;
+			goto out;
+		}
+		spin_lock(&info->lock);
+		if (ewp->state == STATE_READY) {
+			retval = 0;
+			goto out_unlock;
+		}
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		if (time == 0) {
+			retval = -ETIMEDOUT;
+			break;
+		}
+	}
+	list_del(&ewp->list);
+out_unlock:
+	spin_unlock(&info->lock);
+out:
+	return retval;
+}
+
+/*
+ * Returns waiting task that should be serviced first or NULL if none exists
+ */
+static struct ext_wait_queue *wq_get_first_waiter(
+		struct mqueue_inode_info *info, int sr)
+{
+	struct list_head *ptr;
+
+	ptr = info->e_wait_q[sr].list.prev;
+	if (ptr == &info->e_wait_q[sr].list)
+		return NULL;
+	return list_entry(ptr, struct ext_wait_queue, list);
+}
+
+/* Auxiliary functions to manipulate messages' list */
+static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info)
+{
+	int k;
+
+	k = info->attr.mq_curmsgs - 1;
+	while (k >= 0 && info->messages[k]->m_type >= ptr->m_type) {
+		info->messages[k + 1] = info->messages[k];
+		k--;
+	}
+	info->attr.mq_curmsgs++;
+	info->qsize += ptr->m_ts;
+	info->messages[k + 1] = ptr;
+}
+
+static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
+{
+	info->qsize -= info->messages[--info->attr.mq_curmsgs]->m_ts;
+	return info->messages[info->attr.mq_curmsgs];
+}
+
+/*
+ * The next function is only to split too long sys_mq_timedsend
+ */
+static void __do_notify(struct mqueue_inode_info *info)
+{
+	/* notification
+	 * invoked when there is registered process and there isn't process
+	 * waiting synchronously for message AND state of queue changed from
+	 * empty to not empty. Here we are sure that no one is waiting
+	 * synchronously. */
+	if (info->notify_owner && info->attr.mq_curmsgs == 1) {
+		/* sends signal */
+		if (info->notify.sigev_notify == SIGEV_SIGNAL) {
+			struct siginfo sig_i;
+
+			sig_i.si_signo = info->notify.sigev_signo;
+			sig_i.si_errno = 0;
+			sig_i.si_code = SI_MESGQ;
+			sig_i.si_value = info->notify.sigev_value;
+			sig_i.si_pid = current->tgid;
+			sig_i.si_uid = current->uid;
+
+			kill_proc_info(info->notify.sigev_signo,
+				       &sig_i, info->notify_owner);
+		} else if (info->notify.sigev_notify == SIGEV_THREAD) {
+			info->notify_filp->private_data = (void*)NP_WOKENUP;
+			wake_up(&info->wait_q);
+		}
+		/* after notification unregisters process */
+		info->notify_owner = 0;
+	}
+}
+
+static long prepare_timeout(const struct timespec __user *u_arg)
+{
+	struct timespec ts, nowts;
+	long timeout;
+
+	if (u_arg) {
+		if (unlikely(copy_from_user(&ts, u_arg,
+					sizeof(struct timespec))))
+			return -EFAULT;
+
+		if (unlikely(ts.tv_nsec < 0 || ts.tv_sec < 0
+			|| ts.tv_nsec >= NSEC_PER_SEC))
+			return -EINVAL;
+		nowts = CURRENT_TIME;
+		/* first subtract as jiffies can't be too big */
+		ts.tv_sec -= nowts.tv_sec;
+		if (ts.tv_nsec < nowts.tv_nsec) {
+			ts.tv_nsec += NSEC_PER_SEC;
+			ts.tv_sec--;
+		}
+		ts.tv_nsec -= nowts.tv_nsec;
+		if (ts.tv_sec < 0)
+			return 0;
+
+		timeout = timespec_to_jiffies(&ts) + 1;
+	} else
+		return MAX_SCHEDULE_TIMEOUT;
+
+	return timeout;
+}
+
+/*
+ * File descriptor based notification, intended to be used to implement
+ * SIGEV_THREAD:
+ * SIGEV_THREAD means that a notification function should be called in the
+ * context of a new thread. The kernel can't do that. Therefore mq_notify
+ * calls with SIGEV_THREAD return a new file descriptor. A user space helper
+ * must create a new thread and then read from the given file descriptor.
+ * The read always returns one byte. If it's NOTIFY_WOKENUP, then it must
+ * call the notification function. If it's NOTIFY_REMOVED, then the
+ * notification was removed. The file descriptor supports poll, thus one
+ * supervisor thread can manage multiple message queue notifications.
+ *
+ * The implementation must support multiple outstanding notifications:
+ * It's possible that a new notification is added and signaled before user
+ * space calls mqueue_notify_read for the previous notification.
+ * Therefore the notification state is stored in the private_data field of
+ * the file descriptor.
+ */
+static unsigned int mqueue_notify_poll(struct file *filp,
+					struct poll_table_struct *poll_tab)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+	int retval;
+
+	poll_wait(filp, &info->wait_q, poll_tab);
+
+	if (filp->private_data == NP_NONE)
+		retval = 0;
+	else
+		retval = POLLIN | POLLRDNORM;
+	return retval;
+}
+
+static ssize_t mqueue_notify_read(struct file *filp, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+	char result;
+
+	if (!count)
+		return 0;
+	if (*ppos != 0)
+		return 0;
+	spin_lock(&info->lock);
+	while (filp->private_data == NP_NONE) {
+		DEFINE_WAIT(wait);
+		if (filp->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			return -EAGAIN;
+		}
+		prepare_to_wait(&info->wait_q, &wait, TASK_INTERRUPTIBLE);
+		spin_unlock(&info->lock);
+		schedule();
+		finish_wait(&info->wait_q, &wait);
+		spin_lock(&info->lock);
+	}
+	spin_unlock(&info->lock);
+	result = (char)(unsigned long)filp->private_data;
+	if (put_user(result, buf))
+		return -EFAULT;
+	*ppos = 1;
+	return 1;
+}
+
+static int mqueue_notify_release(struct inode *inode, struct file *filp)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+
+	spin_lock(&info->lock);
+	if (info->notify_owner && info->notify_filp == filp)
+		info->notify_owner = 0;
+	filp->private_data = NP_REMOVED;
+	spin_unlock(&info->lock);
+
+	return 0;
+}
+
+static void remove_notification(struct mqueue_inode_info *info)
+{
+	if (info->notify.sigev_notify == SIGEV_THREAD) {
+		info->notify_filp->private_data = NP_REMOVED;
+		wake_up(&info->wait_q);
+	}
+	info->notify_owner = 0;
+}
+
+/*
+ * Invoked when creating a new queue via sys_mq_open
+ */
+static struct file *do_create(struct dentry *dir, struct dentry *dentry,
+			int oflag, mode_t mode, struct mq_attr __user *u_attr)
+{
+	struct file *filp;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+	struct msg_msg **msgs = NULL;
+	struct mq_attr attr;
+	int ret;
+
+	if (u_attr != NULL) {
+		if (copy_from_user(&attr, u_attr, sizeof(attr)))
+			return ERR_PTR(-EFAULT);
+
+		if (attr.mq_maxmsg <= 0 || attr.mq_msgsize <= 0)
+			return ERR_PTR(-EINVAL);
+		if (capable(CAP_SYS_RESOURCE)) {
+			if (attr.mq_maxmsg > HARD_MSGMAX)
+				return ERR_PTR(-EINVAL);
+		} else {
+			if (attr.mq_maxmsg > msg_max ||
+					attr.mq_msgsize > msgsize_max)
+				return ERR_PTR(-EINVAL);
+		}
+	} else {
+		attr.mq_maxmsg = DFLT_MSGMAX;
+		attr.mq_msgsize = DFLT_MSGSIZEMAX;
+	}
+	msgs = kmalloc(attr.mq_maxmsg * sizeof(*msgs), GFP_KERNEL);
+	if (!msgs)
+		return ERR_PTR(-ENOMEM);
+
+	ret = vfs_create(dir->d_inode, dentry, mode, NULL);
+	if (ret) {
+		kfree(msgs);
+		return ERR_PTR(ret);
+	}
+
+	inode = dentry->d_inode;
+	info = MQUEUE_I(inode);
+
+	info->attr.mq_maxmsg = attr.mq_maxmsg;
+	info->attr.mq_msgsize = attr.mq_msgsize;
+	info->messages = msgs;
+
+	filp = dentry_open(dentry, mqueue_mnt, oflag);
+	if (!IS_ERR(filp))
+		dget(dentry);
+
+	return filp;
+}
+
+/* Opens existing queue */
+static struct file *do_open(struct dentry *dentry, int oflag)
+{
+static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
+					MAY_READ | MAY_WRITE };
+	struct file *filp;
+
+	if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL))
+		return ERR_PTR(-EACCES);
+
+	filp = dentry_open(dentry, mqueue_mnt, oflag);
+
+	if (!IS_ERR(filp))
+		dget(dentry);
+
+	return filp;
+}
+
+asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
+				struct mq_attr __user *u_attr)
+{
+	struct dentry *dentry;
+	struct file *filp;
+	char *name;
+	int fd, error;
+
+	if (IS_ERR(name = getname(u_name)))
+		return PTR_ERR(name);
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		goto out_putname;
+
+	down(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
+	if (IS_ERR(dentry)) {
+		error = PTR_ERR(dentry);
+		goto out_err;
+	}
+	mntget(mqueue_mnt);
+
+	if (oflag & O_CREAT) {
+		if (dentry->d_inode) {	/* entry already exists */
+			filp = (oflag & O_EXCL) ? ERR_PTR(-EEXIST) :
+					do_open(dentry, oflag);
+		} else {
+			filp = do_create(mqueue_mnt->mnt_root, dentry,
+						oflag, mode, u_attr);
+		}
+	} else
+		filp = (dentry->d_inode) ? do_open(dentry, oflag) :
+					ERR_PTR(-ENOENT);
+
+	dput(dentry);
+
+	if (IS_ERR(filp)) {
+		error = PTR_ERR(filp);
+		goto out_putfd;
+	}
+
+	fd_install(fd, filp);
+	goto out_upsem;
+
+out_putfd:
+	mntput(mqueue_mnt);
+	put_unused_fd(fd);
+out_err:
+	fd = error;
+out_upsem:
+	up(&mqueue_mnt->mnt_root->d_inode->i_sem);
+out_putname:
+	putname(name);
+	return fd;
+}
+
+asmlinkage long sys_mq_unlink(const char __user *u_name)
+{
+	int err;
+	char *name;
+	struct dentry *dentry;
+	struct inode *inode = NULL;
+
+	name = getname(u_name);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	down(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+
+	if (!dentry->d_inode) {
+		err = -ENOENT;
+		goto out_err;
+	}
+
+	if (permission(dentry->d_inode, MAY_WRITE, NULL)) {
+		err = -EACCES;
+		goto out_err;
+	}
+	inode = dentry->d_inode;
+	if (inode)
+		atomic_inc(&inode->i_count);
+
+	err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+out_err:
+	dput(dentry);
+
+out_unlock:
+	up(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	putname(name);
+	if (inode)
+		iput(inode);
+
+	return err;
+}
+
+/* Pipelined send and receive functions.
+ *
+ * If a receiver finds no waiting message, then it registers itself in the
+ * list of waiting receivers. A sender checks that list before adding the new
+ * message into the message array. If there is a waiting receiver, then it
+ * bypasses the message array and directly hands the message over to the
+ * receiver.
+ * The receiver accepts the message and returns without grabbing the queue
+ * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
+ * are necessary. The same algorithm is used for sysv semaphores, see
+ * ipc/sem.c fore more details.
+ *
+ * The same algorithm is used for senders.
+ */
+
+/* pipelined_send() - send a message directly to the task waiting in
+ * sys_mq_timedreceive() (without inserting message into a queue). */
+static inline void pipelined_send(struct mqueue_inode_info *info,
+				  struct msg_msg *message,
+				  struct ext_wait_queue *receiver)
+{
+	receiver->msg = message;
+	list_del(&receiver->list);
+	receiver->state = STATE_PENDING;
+	wake_up_process(receiver->task);
+	wmb();
+	receiver->state = STATE_READY;
+}
+
+/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
+ * gets its message and put to the queue (we have one free place for sure). */
+static inline void pipelined_receive(struct mqueue_inode_info *info)
+{
+	struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
+
+	if (!sender)
+		return;
+
+	msg_insert(sender->msg, info);
+	list_del(&sender->list);
+	sender->state = STATE_PENDING;
+	wake_up_process(sender->task);
+	wmb();
+	sender->state = STATE_READY;
+}
+
+asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
+	size_t msg_len, unsigned int msg_prio,
+	const struct timespec __user *u_abs_timeout)
+{
+	struct file *filp;
+	struct inode *inode;
+	struct ext_wait_queue wait;
+	struct ext_wait_queue *receiver;
+	struct msg_msg *msg_ptr;
+	struct mqueue_inode_info *info;
+	long timeout;
+	int ret;
+
+	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
+		return -EINVAL;
+
+	timeout = prepare_timeout(u_abs_timeout);
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (unlikely(!filp))
+		goto out;
+
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	if (unlikely((filp->f_flags & O_ACCMODE) == O_RDONLY))
+		goto out_fput;
+
+	if (unlikely(msg_len > info->attr.mq_msgsize)) {
+		ret = -EMSGSIZE;
+		goto out_fput;
+	}
+
+	/* First try to allocate memory, before doing anything with
+	 * existing queues. */
+	msg_ptr = load_msg((void *)u_msg_ptr, msg_len);
+	if (unlikely(IS_ERR(msg_ptr))) {
+		ret = PTR_ERR(msg_ptr);
+		goto out_fput;
+	}
+	msg_ptr->m_ts = msg_len;
+	msg_ptr->m_type = msg_prio;
+
+	spin_lock(&info->lock);
+
+	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
+		if (filp->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			ret = -EAGAIN;
+		} else if (unlikely(timeout < 0)) {
+			spin_unlock(&info->lock);
+			ret = timeout;
+		} else {
+			wait.task = current;
+			wait.msg = (void *) msg_ptr;
+			wait.state = STATE_NONE;
+			ret = wq_sleep(info, SEND, timeout, &wait);
+			if (ret < 0)
+				free_msg(msg_ptr);
+		}
+	} else {
+		receiver = wq_get_first_waiter(info, RECV);
+		if (receiver) {
+			pipelined_send(info, msg_ptr, receiver);
+		} else {
+			/* adds message to the queue */
+			msg_insert(msg_ptr, info);
+			__do_notify(info);
+		}
+		inode->i_atime = inode->i_mtime = inode->i_ctime =
+				CURRENT_TIME;
+		spin_unlock(&info->lock);
+		ret = 0;
+	}
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
+	size_t msg_len, unsigned int __user *u_msg_prio,
+	const struct timespec __user *u_abs_timeout)
+{
+	long timeout;
+	ssize_t ret;
+	struct msg_msg *msg_ptr;
+	struct file *filp;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+	struct ext_wait_queue wait;
+
+	timeout = prepare_timeout(u_abs_timeout);
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (unlikely(!filp))
+		goto out;
+
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	if (unlikely((filp->f_flags & O_ACCMODE) == O_WRONLY))
+		goto out_fput;
+
+	/* checks if buffer is big enough */
+	if (unlikely(msg_len < info->attr.mq_msgsize)) {
+		ret = -EMSGSIZE;
+		goto out_fput;
+	}
+
+	spin_lock(&info->lock);
+	if (info->attr.mq_curmsgs == 0) {
+		if (filp->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			ret = -EAGAIN;
+			msg_ptr = NULL;
+		} else if (unlikely(timeout < 0)) {
+			spin_unlock(&info->lock);
+			ret = timeout;
+			msg_ptr = NULL;
+		} else {
+			wait.task = current;
+			wait.state = STATE_NONE;
+			ret = wq_sleep(info, RECV, timeout, &wait);
+			msg_ptr = wait.msg;
+		}
+	} else {
+		msg_ptr = msg_get(info);
+
+		inode->i_atime = inode->i_mtime = inode->i_ctime =
+				CURRENT_TIME;
+
+		/* There is now free space in queue. */
+		pipelined_receive(info);
+		spin_unlock(&info->lock);
+		ret = 0;
+	}
+	if (ret == 0) {
+		ret = msg_ptr->m_ts;
+
+		if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
+			store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
+			ret = -EFAULT;
+		}
+		free_msg(msg_ptr);
+	}
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+/*
+ * Notes: the case when user wants us to deregister (with NULL as pointer
+ * or SIGEV_NONE) and he isn't currently owner of notification will be
+ * silently discarded. It isn't explicitly defined in the POSIX.
+ */
+asmlinkage long sys_mq_notify(mqd_t mqdes,
+				const struct sigevent __user *u_notification)
+{
+	int ret, fd;
+	struct file *filp, *nfilp;
+	struct inode *inode;
+	struct sigevent notification;
+	struct mqueue_inode_info *info;
+
+	if (u_notification == NULL) {
+		notification.sigev_notify = SIGEV_NONE;
+	} else {
+		if (copy_from_user(&notification, u_notification,
+					sizeof(struct sigevent)))
+			return -EFAULT;
+
+		if (unlikely(notification.sigev_notify != SIGEV_NONE &&
+			     notification.sigev_notify != SIGEV_SIGNAL &&
+			     notification.sigev_notify != SIGEV_THREAD))
+			return -EINVAL;
+		if (notification.sigev_notify == SIGEV_SIGNAL &&
+			(notification.sigev_signo < 0 ||
+			 notification.sigev_signo > _NSIG)) {
+			return -EINVAL;
+		}
+	}
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (!filp)
+		goto out;
+
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	ret = 0;
+	if (notification.sigev_notify == SIGEV_THREAD) {
+		ret = get_unused_fd();
+		if (ret < 0)
+			goto out_fput;
+		fd = ret;
+		nfilp = get_empty_filp();
+		if (!nfilp) {
+			ret = -ENFILE;
+			goto out_dropfd;
+		}
+		nfilp->private_data = NP_NONE;
+		nfilp->f_op = &mqueue_notify_fops;
+		nfilp->f_vfsmnt = mntget(mqueue_mnt);
+		nfilp->f_dentry = dget(filp->f_dentry);
+		nfilp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+		nfilp->f_mode = FMODE_READ;
+	} else {
+		nfilp = NULL;
+		fd = -1;
+	}
+
+	spin_lock(&info->lock);
+
+	if (notification.sigev_notify == SIGEV_NONE) {
+		if (info->notify_owner == current->tgid) {
+			remove_notification(info);
+			inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		}
+	} else if (info->notify_owner) {
+		ret = -EBUSY;
+	} else if (notification.sigev_notify == SIGEV_THREAD) {
+		info->notify_filp = nfilp;
+		fd_install(fd, nfilp);
+		ret = fd;
+		fd = -1;
+		nfilp = NULL;
+		info->notify.sigev_notify = SIGEV_THREAD;
+		info->notify_owner = current->tgid;
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}  else {
+		info->notify.sigev_signo = notification.sigev_signo;
+		info->notify.sigev_value = notification.sigev_value;
+		info->notify.sigev_notify = SIGEV_SIGNAL;
+		info->notify_owner = current->tgid;
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}
+	spin_unlock(&info->lock);
+out_dropfd:
+	if (fd != -1)
+		put_unused_fd(fd);
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+asmlinkage long sys_mq_getsetattr(mqd_t mqdes,
+			const struct mq_attr __user *u_mqstat,
+			struct mq_attr __user *u_omqstat)
+{
+	int ret;
+	struct mq_attr mqstat, omqstat;
+	struct file *filp;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+
+	if (u_mqstat != NULL) {
+		if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
+			return -EFAULT;
+	}
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (!filp)
+		goto out;
+
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	spin_lock(&info->lock);
+
+	omqstat = info->attr;
+	omqstat.mq_flags = filp->f_flags;
+	if (u_mqstat) {
+		if (mqstat.mq_flags & O_NONBLOCK)
+			filp->f_flags |= O_NONBLOCK;
+		else
+			filp->f_flags &= ~O_NONBLOCK;
+
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}
+
+	spin_unlock(&info->lock);
+
+	ret = 0;
+	if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat,
+						sizeof(struct mq_attr)))
+		ret = -EFAULT;
+
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+static struct inode_operations mqueue_dir_inode_operations = {
+	.lookup = simple_lookup,
+	.create = mqueue_create,
+	.unlink = simple_unlink,
+};
+
+static struct file_operations mqueue_file_operations = {
+	.flush = mqueue_flush_file,
+};
+
+static struct file_operations mqueue_notify_fops = {
+	.poll = mqueue_notify_poll,
+	.read = mqueue_notify_read,
+	.release = mqueue_notify_release,
+};
+
+
+static struct super_operations mqueue_super_ops = {
+	.alloc_inode = mqueue_alloc_inode,
+	.destroy_inode = mqueue_destroy_inode,
+	.delete_inode = mqueue_delete_inode,
+	.drop_inode = generic_delete_inode,
+};
+
+static struct file_system_type mqueue_fs_type = {
+	.name = "mqueue",
+	.get_sb = mqueue_get_sb,
+	.kill_sb = kill_anon_super,
+};
+
+static int msg_max_limit_min = DFLT_MSGMAX;
+static int msg_max_limit_max = HARD_MSGMAX;
+
+static int msg_maxsize_limit_min = DFLT_MSGSIZEMAX;
+static int msg_maxsize_limit_max = INT_MAX;
+
+static ctl_table mq_sysctls[] = {
+	{
+		.ctl_name	= CTL_QUEUESMAX,
+		.procname	= "queues_max",
+		.data		= &queues_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_MSGMAX,
+		.procname	= "msg_max",
+		.data		= &msg_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &msg_max_limit_min,
+		.extra2		= &msg_max_limit_max,
+	},
+	{
+		.ctl_name	= CTL_MSGSIZEMAX,
+		.procname	= "msgsize_max",
+		.data		= &msgsize_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &msg_maxsize_limit_min,
+		.extra2		= &msg_maxsize_limit_max,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table mq_sysctl_dir[] = {
+	{
+		.ctl_name	= FS_MQUEUE,
+		.procname	= "mqueue",
+		.mode		= 0555,
+		.child		= mq_sysctls,
+	},
+	{ .ctl_name = 0 }
+};
+
+static ctl_table mq_sysctl_root[] = {
+	{
+		.ctl_name	= CTL_FS,
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= mq_sysctl_dir,
+	},
+	{ .ctl_name = 0 }
+};
+
+static int __init init_mqueue_fs(void)
+{
+	int error;
+
+	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
+				sizeof(struct mqueue_inode_info), 0,
+				SLAB_HWCACHE_ALIGN, init_once, NULL);
+	if (mqueue_inode_cachep == NULL)
+		return -ENOMEM;
+
+	mq_sysctl_table = register_sysctl_table(mq_sysctl_root, 0);
+	if (!mq_sysctl_table) {
+		error = -ENOMEM;
+		goto out_cache;
+	}
+
+	error = register_filesystem(&mqueue_fs_type);
+	if (error)
+		goto out_sysctl;
+
+	if (IS_ERR(mqueue_mnt = kern_mount(&mqueue_fs_type))) {
+		error = PTR_ERR(mqueue_mnt);
+		goto out_filesystem;
+	}
+
+	/* internal initialization - not common for vfs */
+	queues_count = 0;
+	spin_lock_init(&mq_lock);
+
+	return 0;
+
+out_filesystem:
+	unregister_filesystem(&mqueue_fs_type);
+out_sysctl:
+	unregister_sysctl_table(mq_sysctl_table);
+out_cache:
+	if (kmem_cache_destroy(mqueue_inode_cachep)) {
+		printk(KERN_INFO
+			"mqueue_inode_cache: not all structures were freed\n");
+	}
+	return error;
+}
+
+__initcall(init_mqueue_fs);
diff --git a/kernel/signal.c b/kernel/signal.c
index 32992a71683b..e6b7904df68f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2047,6 +2047,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
 		err |= __put_user(from->si_stime, &to->si_stime);
 		break;
 	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
 		err |= __put_user(from->si_pid, &to->si_pid);
 		err |= __put_user(from->si_uid, &to->si_uid);
 		err |= __put_user(from->si_int, &to->si_int);
-- 
cgit v1.2.3


From 87c22e8470366e81aa82bcbadaf147c4ecdfb182 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:55:45 -0700
Subject: [PATCH] compat emulation for posix message queues

From: Arnd Bergmann <arnd@arndb.de>

I have tested the code with the open posix test suite and found the same
four failures for both 64-bit and compat mode, most tests pass.  The patch
is against -mc1, but I guess it also applies to the other trees around.

What worries me more than mq_attr compatibility is the conversion of struct
sigevent, which might turn out really hard when more fields in there are
used.  AFAICS, the only other part in the kernel ABI is sys_timer_create(),
so maybe it's not too late to deprecate the current structure and create a
structure that can be used properly for compat syscalls.
---
 arch/ia64/ia32/ia32_signal.c     |   7 +-
 arch/mips/kernel/signal32.c      |   7 +-
 arch/s390/kernel/compat_signal.c |   5 +-
 arch/sparc64/kernel/signal32.c   |   7 +-
 arch/x86_64/ia32/ia32_signal.c   |   6 +-
 include/asm-ppc64/ppc32.h        |  14 ---
 include/linux/compat.h           |  17 ++++
 include/linux/mqueue.h           |   4 +-
 include/linux/posix_types.h      |   1 +
 include/linux/syscalls.h         |   1 -
 include/linux/types.h            |   1 +
 ipc/Makefile                     |   3 +-
 ipc/compat_mq.c                  | 196 +++++++++++++++++++++++++++++++++++++++
 kernel/sys.c                     |   5 +
 14 files changed, 251 insertions(+), 23 deletions(-)
 create mode 100644 ipc/compat_mq.c

(limited to 'kernel')

diff --git a/arch/ia64/ia32/ia32_signal.c b/arch/ia64/ia32/ia32_signal.c
index 8b1374c172b6..bb1e836fb227 100644
--- a/arch/ia64/ia32/ia32_signal.c
+++ b/arch/ia64/ia32/ia32_signal.c
@@ -114,7 +114,12 @@ copy_siginfo_from_user32 (siginfo_t *to, siginfo_t32 *from)
 			err |= __get_user(to->si_band, &from->si_band);
 			err |= __get_user(to->si_fd, &from->si_fd);
 			break;
-			/* case __SI_RT: This is not generated by the kernel as of now.  */
+		      case __SI_RT: /* This is not generated by the kernel as of now.  */
+		      case __SI_MESGQ:
+			err |= __get_user(to->si_pid, &from->si_pid);
+			err |= __get_user(to->si_uid, &from->si_uid);
+			err |= __get_user(to->si_int, &from->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index 5c1489f4fdc2..c52074f84300 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -358,7 +358,12 @@ static int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from)
 			err |= __put_user(from->si_band, &to->si_band);
 			err |= __put_user(from->si_fd, &to->si_fd);
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
+		case __SI_RT: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ:
+			err |= __put_user(from->si_pid, &to->si_pid);
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 44fe6e477e92..373040404a5a 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -74,6 +74,10 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from)
 		err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE);
 	else {
 		switch (from->si_code >> 16) {
+		case __SI_RT: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ:
+			err |= __put_user(from->si_int, &to->si_int);
+			/* fallthrough */
 		case __SI_KILL >> 16:
 			err |= __put_user(from->si_pid, &to->si_pid);
 			err |= __put_user(from->si_uid, &to->si_uid);
@@ -96,7 +100,6 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from)
 			break;
 		default:
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
 		}
 	}
 	return err;
diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c
index cc3019d6dd65..e2f62a666d8c 100644
--- a/arch/sparc64/kernel/signal32.c
+++ b/arch/sparc64/kernel/signal32.c
@@ -129,7 +129,12 @@ int copy_siginfo_to_user32(siginfo_t32 __user *to, siginfo_t *from)
 			err |= __put_user(from->si_trapno, &to->si_trapno);
 			err |= __put_user((long)from->si_addr, &to->si_addr);
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
+		case __SI_RT: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ:
+			err |= __put_user(from->si_pid, &to->si_pid);
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index bce5fbc5be2c..1a828de6a55d 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -85,7 +85,11 @@ int ia32_copy_siginfo_to_user(siginfo_t32 __user *to, siginfo_t *from)
 			err |= __put_user(from->si_overrun, &to->si_overrun); 
 			err |= __put_user((u32)(u64)from->si_ptr, &to->si_ptr);
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
+		case __SI_RT: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ:
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h
index 53865a8c4f8d..7338ea298a19 100644
--- a/include/asm-ppc64/ppc32.h
+++ b/include/asm-ppc64/ppc32.h
@@ -141,20 +141,6 @@ struct ucontext32 {
 	struct mcontext32	uc_mcontext;
 };
 
-typedef struct compat_sigevent {
-	compat_sigval_t sigev_value;
-	int sigev_signo;
-	int sigev_notify;
-	union {
-		int _pad[SIGEV_PAD_SIZE];
-		int _tid;
-		struct {
-			compat_uptr_t _function;
-			compat_uptr_t _attribute;
-		} _sigev_thread;
-	} _sigev_un;
-} compat_sigevent_t;
-
 struct ipc_kludge_32 {
 	unsigned int msgp;
 	int msgtyp;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 7b82209ab4ab..796204f59bd9 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -13,6 +13,7 @@
 #include <linux/sem.h>
 
 #include <asm/compat.h>
+#include <asm/siginfo.h>
 
 #define compat_jiffies_to_clock_t(x)	\
 		(((unsigned long)(x) * COMPAT_USER_HZ) / HZ)
@@ -90,6 +91,22 @@ typedef union compat_sigval {
 	compat_uptr_t	sival_ptr;
 } compat_sigval_t;
 
+typedef struct compat_sigevent {
+	compat_sigval_t sigev_value;
+	compat_int_t sigev_signo;
+	compat_int_t sigev_notify;
+	union {
+		compat_int_t _pad[SIGEV_PAD_SIZE];
+		compat_int_t _tid;
+
+		struct {
+			compat_uptr_t _function;
+			compat_uptr_t _attribute;
+		} _sigev_thread;
+	} _sigev_un;
+} compat_sigevent_t;
+
+
 long compat_sys_semctl(int first, int second, int third, void __user *uptr);
 long compat_sys_msgsnd(int first, int second, int third, void __user *uptr);
 long compat_sys_msgrcv(int first, int second, int msgtyp, int third,
diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
index fdab3b8ee242..fc40b774b913 100644
--- a/include/linux/mqueue.h
+++ b/include/linux/mqueue.h
@@ -18,9 +18,9 @@
 #ifndef _LINUX_MQUEUE_H
 #define _LINUX_MQUEUE_H
 
-#define MQ_PRIO_MAX 	32768
+#include <linux/types.h>
 
-typedef int mqd_t;
+#define MQ_PRIO_MAX 	32768
 
 struct mq_attr {
 	long	mq_flags;	/* message queue flags			*/
diff --git a/include/linux/posix_types.h b/include/linux/posix_types.h
index 3ee2ed9de1db..f04c98cf44f3 100644
--- a/include/linux/posix_types.h
+++ b/include/linux/posix_types.h
@@ -42,6 +42,7 @@ typedef void (*__kernel_sighandler_t)(int);
 
 /* Type of a SYSV IPC key.  */
 typedef int __kernel_key_t;
+typedef int __kernel_mqd_t;
 
 #include <asm/posix_types.h>
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7ee5f67abb5f..89ffe55898f2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -48,7 +48,6 @@ struct timex;
 struct timezone;
 struct tms;
 struct utimbuf;
-typedef int mqd_t;
 struct mq_attr;
 
 #include <linux/config.h>
diff --git a/include/linux/types.h b/include/linux/types.h
index 3b407b06b48f..93f5f3653561 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -31,6 +31,7 @@ typedef __kernel_key_t		key_t;
 typedef __kernel_suseconds_t	suseconds_t;
 typedef __kernel_timer_t	timer_t;
 typedef __kernel_clockid_t	clockid_t;
+typedef __kernel_mqd_t		mqd_t;
 
 #ifdef __KERNEL__
 typedef __kernel_uid32_t	uid_t;
diff --git a/ipc/Makefile b/ipc/Makefile
index 913790207d85..0a6d626cd794 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -4,5 +4,6 @@
 
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
 obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
-obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o
+obj_mq-$(CONFIG_COMPAT) += compat_mq.o
+obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
 
diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c
new file mode 100644
index 000000000000..1520df89c424
--- /dev/null
+++ b/ipc/compat_mq.c
@@ -0,0 +1,196 @@
+/*
+ *  ipc/compat_mq.c
+ *    32 bit emulation for POSIX message queue system calls
+ *
+ *    Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *    Author: Arnd Bergmann <arnd@arndb.de>
+ */
+
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mqueue.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+struct compat_mq_attr {
+	compat_long_t mq_flags;      /* message queue flags		     */
+	compat_long_t mq_maxmsg;     /* maximum number of messages	     */
+	compat_long_t mq_msgsize;    /* maximum message size		     */
+	compat_long_t mq_curmsgs;    /* number of messages currently queued  */
+	compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
+};
+
+static inline int get_compat_mq_attr(struct mq_attr *attr,
+			const struct compat_mq_attr __user *uattr)
+{
+	if (verify_area(VERIFY_READ, uattr, sizeof *uattr))
+		return -EFAULT;
+
+	return __get_user(attr->mq_flags, &uattr->mq_flags)
+		| __get_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+		| __get_user(attr->mq_msgsize, &uattr->mq_msgsize)
+		| __get_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+static inline int put_compat_mq_attr(const struct mq_attr *attr,
+			struct compat_mq_attr __user *uattr)
+{
+	if (clear_user(uattr, sizeof *uattr))
+		return -EFAULT;
+
+	return __put_user(attr->mq_flags, &uattr->mq_flags)
+		| __put_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+		| __put_user(attr->mq_msgsize, &uattr->mq_msgsize)
+		| __put_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+asmlinkage long compat_sys_mq_open(const char __user *u_name,
+			int oflag, compat_mode_t mode,
+			struct compat_mq_attr __user *u_attr)
+{
+	struct mq_attr attr;
+	mm_segment_t oldfs;
+	char *name;
+	long ret;
+
+	if ((oflag & O_CREAT) == 0 || !u_attr)
+		return sys_mq_open(u_name, oflag, mode, 0);
+
+	if (get_compat_mq_attr(&attr, u_attr))
+		return -EFAULT;
+
+	name = getname(u_name);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_mq_open(name, oflag, mode, &attr);
+	set_fs(oldfs);
+
+	putname(name);
+	return ret;
+}
+
+static struct timespec __user *compat_prepare_timeout(
+			const struct compat_timespec __user *u_abs_timeout)
+{
+	struct timespec ts, __user *u_ts;
+
+	if (!u_abs_timeout)
+		return 0;
+
+	u_ts = compat_alloc_user_space(sizeof(*u_ts));
+	if (get_compat_timespec(&ts, u_abs_timeout)
+		|| copy_to_user(u_ts, &ts, sizeof(*u_ts)))
+		return ERR_PTR(-EFAULT);
+
+	return u_ts;
+}
+
+asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes,
+			const char __user *u_msg_ptr,
+			size_t msg_len, unsigned int msg_prio,
+			const struct compat_timespec __user *u_abs_timeout)
+{
+	struct timespec __user *u_ts;
+
+	u_ts = compat_prepare_timeout(u_abs_timeout);
+	if (IS_ERR(u_ts))
+		return -EFAULT;
+
+	return sys_mq_timedsend(mqdes, u_msg_ptr, msg_len,
+			msg_prio, u_ts);
+}
+
+asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes,
+			char __user *u_msg_ptr,
+			size_t msg_len, unsigned int __user *u_msg_prio,
+			const struct compat_timespec __user *u_abs_timeout)
+{
+	struct timespec *u_ts;
+
+	u_ts = compat_prepare_timeout(u_abs_timeout);
+	if (IS_ERR(u_ts))
+		return -EFAULT;
+
+	return sys_mq_timedreceive(mqdes, u_msg_ptr, msg_len,
+			u_msg_prio, u_ts);
+}
+
+static int get_compat_sigevent(struct sigevent *event,
+		const struct compat_sigevent __user *u_event)
+{
+	if (verify_area(VERIFY_READ, u_event, sizeof(*u_event)))
+		return -EFAULT;
+
+	return __get_user(event->sigev_value.sival_int,
+			  &u_event->sigev_value.sival_int)
+	     | __get_user(event->sigev_signo, &u_event->sigev_signo)
+	     | __get_user(event->sigev_notify, &u_event->sigev_notify)
+	     | __get_user(event->sigev_notify_thread_id,
+			  &u_event->sigev_notify_thread_id);
+}
+
+asmlinkage long compat_sys_mq_notify(mqd_t mqdes,
+			const struct compat_sigevent __user *u_notification)
+{
+	mm_segment_t oldfs;
+	struct sigevent notification;
+	char cookie[NOTIFY_COOKIE_LEN];
+	compat_uptr_t u_cookie;
+	long ret;
+
+	if (!u_notification)
+		return sys_mq_notify(mqdes, 0);
+
+	if (get_compat_sigevent(&notification, u_notification))
+		return -EFAULT;
+
+	if (notification.sigev_notify == SIGEV_THREAD) {
+		u_cookie = (compat_uptr_t)notification.sigev_value.sival_int;
+		if (copy_from_user(cookie, compat_ptr(u_cookie),
+						NOTIFY_COOKIE_LEN)) {
+			return -EFAULT;
+		}
+		notification.sigev_value.sival_ptr = cookie;
+	}
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_mq_notify(mqdes, &notification);
+	set_fs(oldfs);
+
+	return ret;
+}
+
+asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes,
+			const struct compat_mq_attr __user *u_mqstat,
+			struct compat_mq_attr __user *u_omqstat)
+{
+	struct mq_attr mqstat, omqstat;
+	struct mq_attr *p_mqstat = 0, *p_omqstat = 0;
+	mm_segment_t oldfs;
+	long ret;
+
+	if (u_mqstat) {
+		p_mqstat = &mqstat;
+		if (get_compat_mq_attr(p_mqstat, u_mqstat))
+			return -EFAULT;
+	}
+
+	if (u_omqstat)
+		p_omqstat = &omqstat;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_mq_getsetattr(mqdes, p_mqstat, p_omqstat);
+	set_fs(oldfs);
+
+	if (ret)
+		return ret;
+
+	return (u_omqstat) ? put_compat_mq_attr(&omqstat, u_omqstat) : 0;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 7d1bf5c57aca..81f9e02f2071 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -266,6 +266,11 @@ cond_syscall(sys_mq_timedsend)
 cond_syscall(sys_mq_timedreceive)
 cond_syscall(sys_mq_notify)
 cond_syscall(sys_mq_getsetattr)
+cond_syscall(compat_sys_mq_open)
+cond_syscall(compat_sys_mq_timedsend)
+cond_syscall(compat_sys_mq_timedreceive)
+cond_syscall(compat_sys_mq_notify)
+cond_syscall(compat_sys_mq_getsetattr)
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read)
-- 
cgit v1.2.3


From 7860b37198b0650f51bfafebac820386b552a071 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:56:46 -0700
Subject: [PATCH] move job control fields from task_struct to signal_struct

From: Roland McGrath <roland@redhat.com>

This patch moves all the fields relating to job control from task_struct to
signal_struct, so that all this info is properly per-process rather than
being per-thread.
---
 arch/ia64/kernel/unaligned.c    |   2 +-
 arch/sparc64/solaris/misc.c     |   2 +-
 drivers/char/n_tty.c            |   3 +-
 drivers/char/rocket.c           |   2 +-
 drivers/char/sx.c               |   2 +-
 drivers/char/tty_io.c           | 116 +++++++++++++++++-----------------------
 drivers/char/vt.c               |   2 +-
 drivers/char/vt_ioctl.c         |   3 +-
 drivers/net/slip.c              |   2 +-
 drivers/s390/char/keyboard.c    |   2 +-
 fs/binfmt_elf.c                 |   4 +-
 fs/compat_ioctl.c               |   2 +-
 fs/dquot.c                      |  10 ++--
 fs/exec.c                       |   5 ++
 fs/open.c                       |   2 +-
 fs/proc/array.c                 |  22 ++++----
 include/linux/sched.h           |  17 +++---
 kernel/acct.c                   |   2 +-
 kernel/exit.c                   |  22 ++++----
 kernel/fork.c                   |  10 ++--
 kernel/pid.c                    |   8 +--
 kernel/signal.c                 |   5 +-
 kernel/sys.c                    |  18 +++----
 net/bridge/netfilter/ebtables.c |   2 +-
 net/ipv4/netfilter/ipt_owner.c  |   2 +-
 net/ipv6/netfilter/ip6t_owner.c |   2 +-
 26 files changed, 133 insertions(+), 136 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index 2247254be7ac..b1a68e4367bc 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -1337,7 +1337,7 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 			 * be holding locks...
 			 */
 			if (user_mode(regs))
-				tty_write_message(current->tty, buf);
+				tty_write_message(current->signal->tty, buf);
 			buf[len-1] = '\0';	/* drop '\r' */
 			printk(KERN_WARNING "%s", buf);	/* watch for command names containing %s */
 		}
diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c
index ea7b2c439653..cea38c0cbb5c 100644
--- a/arch/sparc64/solaris/misc.c
+++ b/arch/sparc64/solaris/misc.c
@@ -402,7 +402,7 @@ asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid)
 			   Solaris setpgrp and setsid? */
 			ret = sys_setpgid(0, 0);
 			if (ret) return ret;
-			current->tty = NULL;
+			current->signal->tty = NULL;
 			return process_group(current);
 		}
 	case 2: /* getsid */
diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index 0c02e2debbb1..08f46259e183 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -999,7 +999,8 @@ do_it_again:
 	/* NOTE: not yet done after every sleep pending a thorough
 	   check of the logic of this change. -- jlc */
 	/* don't stop on /dev/console */
-	if (file->f_op->write != redirected_tty_write && current->tty == tty) {
+	if (file->f_op->write != redirected_tty_write &&
+	    current->signal->tty == tty) {
 		if (tty->pgrp <= 0)
 			printk("read_chan: tty->pgrp <= 0!\n");
 		else if (process_group(current) != tty->pgrp) {
diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index 38544de9fbd9..b0da37eab8e7 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -953,7 +953,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Info->count is now 1; so it's safe to sleep now.
 	 */
-	info->session = current->session;
+	info->session = current->signal->session;
 	info->pgrp = process_group(current);
 
 	if ((info->flags & ROCKET_INITIALIZED) == 0) {
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index 25c95fbc65d3..643163b08a8f 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -1420,7 +1420,7 @@ static int sx_open  (struct tty_struct * tty, struct file * filp)
 
 	line = tty->index;
 	sx_dprintk (SX_DEBUG_OPEN, "%d: opening line %d. tty=%p ctty=%p, np=%d)\n", 
-	            current->pid, line, tty, current->tty, sx_nports);
+	            current->pid, line, tty, current->signal->tty, sx_nports);
 
 	if ((line < 0) || (line >= SX_NPORTS) || (line >= sx_nports))
 		return -ENODEV;
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 0ba52078f637..e4607d86a755 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -321,7 +321,7 @@ struct tty_driver *get_tty_driver(dev_t device, int *index)
  */
 int tty_check_change(struct tty_struct * tty)
 {
-	if (current->tty != tty)
+	if (current->signal->tty != tty)
 		return 0;
 	if (tty->pgrp <= 0) {
 		printk(KERN_WARNING "tty_check_change: tty->pgrp <= 0!\n");
@@ -486,17 +486,14 @@ void do_tty_hangup(void *data)
 	if (tty->session > 0) {
 		struct list_head *l;
 		for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) {
-			task_t *task = p;
-			do {
-				if (task->tty == tty)
-					task->tty = NULL;
-				if (task->leader) {
-					send_group_sig_info(SIGHUP, SEND_SIG_PRIV, task);
-					send_group_sig_info(SIGCONT, SEND_SIG_PRIV, task);
-				}
-			} while_each_thread(p, task);
+			if (p->signal->tty == tty)
+				p->signal->tty = NULL;
+			if (!p->signal->leader)
+				continue;
+			send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p);
+			send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p);
 			if (tty->pgrp > 0)
-				p->tty_old_pgrp = tty->pgrp;
+				p->signal->tty_old_pgrp = tty->pgrp;
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -575,15 +572,15 @@ void disassociate_ctty(int on_exit)
 
 	lock_kernel();
 
-	tty = current->tty;
+	tty = current->signal->tty;
 	if (tty) {
 		tty_pgrp = tty->pgrp;
 		if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY)
 			tty_vhangup(tty);
 	} else {
-		if (current->tty_old_pgrp) {
-			kill_pg(current->tty_old_pgrp, SIGHUP, on_exit);
-			kill_pg(current->tty_old_pgrp, SIGCONT, on_exit);
+		if (current->signal->tty_old_pgrp) {
+			kill_pg(current->signal->tty_old_pgrp, SIGHUP, on_exit);
+			kill_pg(current->signal->tty_old_pgrp, SIGCONT, on_exit);
 		}
 		unlock_kernel();	
 		return;
@@ -594,17 +591,13 @@ void disassociate_ctty(int on_exit)
 			kill_pg(tty_pgrp, SIGCONT, on_exit);
 	}
 
-	current->tty_old_pgrp = 0;
+	current->signal->tty_old_pgrp = 0;
 	tty->session = 0;
 	tty->pgrp = -1;
 
 	read_lock(&tasklist_lock);
-	for_each_task_pid(current->session, PIDTYPE_SID, p, l, pid) {
-		task_t *task = p;
-		do {
-			task->tty = NULL;
-		} while_each_thread(p, task);
-	}
+	for_each_task_pid(current->signal->session, PIDTYPE_SID, p, l, pid)
+		p->signal->tty = NULL;
 	read_unlock(&tasklist_lock);
 	unlock_kernel();
 }
@@ -1257,20 +1250,11 @@ static void release_dev(struct file * filp)
 		struct pid *pid;
 
 		read_lock(&tasklist_lock);
-		for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) {
-			task_t *task = p;
-			do {
-				task->tty = NULL;
-			} while_each_thread(p, task);
-		}
-		if (o_tty) {
-			for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid) {
-				task_t *task = p;
-				do {
-					task->tty = NULL;
-				} while_each_thread(p, task);
-			}
-		}
+		for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid)
+			p->signal->tty = NULL;
+		if (o_tty)
+			for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid)
+				p->signal->tty = NULL;
 		read_unlock(&tasklist_lock);
 	}
 
@@ -1341,10 +1325,10 @@ static int tty_open(struct inode * inode, struct file * filp)
 retry_open:
 	noctty = filp->f_flags & O_NOCTTY;
 	if (device == MKDEV(TTYAUX_MAJOR,0)) {
-		if (!current->tty)
+		if (!current->signal->tty)
 			return -ENXIO;
-		driver = current->tty->driver;
-		index = current->tty->index;
+		driver = current->signal->tty->driver;
+		index = current->signal->tty->index;
 		filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
 		/* noctty = 1; */
 		goto got_driver;
@@ -1445,14 +1429,14 @@ got_driver:
 		goto retry_open;
 	}
 	if (!noctty &&
-	    current->leader &&
-	    !current->tty &&
+	    current->signal->leader &&
+	    !current->signal->tty &&
 	    tty->session == 0) {
 	    	task_lock(current);
-		current->tty = tty;
+		current->signal->tty = tty;
 		task_unlock(current);
-		current->tty_old_pgrp = 0;
-		tty->session = current->session;
+		current->signal->tty_old_pgrp = 0;
+		tty->session = current->signal->session;
 		tty->pgrp = process_group(current);
 	}
 	return 0;
@@ -1510,7 +1494,7 @@ static int tiocsti(struct tty_struct *tty, char * arg)
 {
 	char ch, mbz = 0;
 
-	if ((current->tty != tty) && !capable(CAP_SYS_ADMIN))
+	if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (get_user(ch, arg))
 		return -EFAULT;
@@ -1601,14 +1585,14 @@ static int tiocsctty(struct tty_struct *tty, int arg)
 	struct pid *pid;
 	task_t *p;
 
-	if (current->leader &&
-	    (current->session == tty->session))
+	if (current->signal->leader &&
+	    (current->signal->session == tty->session))
 		return 0;
 	/*
 	 * The process must be a session leader and
 	 * not have a controlling tty already.
 	 */
-	if (!current->leader || current->tty)
+	if (!current->signal->leader || current->signal->tty)
 		return -EPERM;
 	if (tty->session > 0) {
 		/*
@@ -1621,21 +1605,17 @@ static int tiocsctty(struct tty_struct *tty, int arg)
 			 */
 
 			read_lock(&tasklist_lock);
-			for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) {
-				task_t *task = p;
-				do {
-					task->tty = NULL;
-				} while_each_thread(p, task);
-			}
+			for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid)
+				p->signal->tty = NULL;
 			read_unlock(&tasklist_lock);
 		} else
 			return -EPERM;
 	}
 	task_lock(current);
-	current->tty = tty;
+	current->signal->tty = tty;
 	task_unlock(current);
-	current->tty_old_pgrp = 0;
-	tty->session = current->session;
+	current->signal->tty_old_pgrp = 0;
+	tty->session = current->signal->session;
 	tty->pgrp = process_group(current);
 	return 0;
 }
@@ -1646,7 +1626,7 @@ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 	 * (tty == real_tty) is a cheap way of
 	 * testing if the tty is NOT a master pty.
 	 */
-	if (tty == real_tty && current->tty != real_tty)
+	if (tty == real_tty && current->signal->tty != real_tty)
 		return -ENOTTY;
 	return put_user(real_tty->pgrp, arg);
 }
@@ -1660,15 +1640,15 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 		return -ENOTTY;
 	if (retval)
 		return retval;
-	if (!current->tty ||
-	    (current->tty != real_tty) ||
-	    (real_tty->session != current->session))
+	if (!current->signal->tty ||
+	    (current->signal->tty != real_tty) ||
+	    (real_tty->session != current->signal->session))
 		return -ENOTTY;
 	if (get_user(pgrp, (pid_t *) arg))
 		return -EFAULT;
 	if (pgrp < 0)
 		return -EINVAL;
-	if (session_of_pgrp(pgrp) != current->session)
+	if (session_of_pgrp(pgrp) != current->signal->session)
 		return -EPERM;
 	real_tty->pgrp = pgrp;
 	return 0;
@@ -1680,7 +1660,7 @@ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *
 	 * (tty == real_tty) is a cheap way of
 	 * testing if the tty is NOT a master pty.
 	*/
-	if (tty == real_tty && current->tty != real_tty)
+	if (tty == real_tty && current->signal->tty != real_tty)
 		return -ENOTTY;
 	if (real_tty->session <= 0)
 		return -ENOTTY;
@@ -1838,12 +1818,12 @@ int tty_ioctl(struct inode * inode, struct file * file,
 			clear_bit(TTY_EXCLUSIVE, &tty->flags);
 			return 0;
 		case TIOCNOTTY:
-			if (current->tty != tty)
+			if (current->signal->tty != tty)
 				return -ENOTTY;
-			if (current->leader)
+			if (current->signal->leader)
 				disassociate_ctty(0);
 			task_lock(current);
-			current->tty = NULL;
+			current->signal->tty = NULL;
 			task_unlock(current);
 			return 0;
 		case TIOCSCTTY:
@@ -1947,9 +1927,9 @@ static void __do_SAK(void *arg)
 		tty->driver->flush_buffer(tty);
 	read_lock(&tasklist_lock);
 	for_each_task_pid(session, PIDTYPE_SID, p, l, pid) {
-		if (p->tty == tty || session > 0) {
+		if (p->signal->tty == tty || session > 0) {
 			printk(KERN_NOTICE "SAK: killed process %d"
-			    " (%s): p->session==tty->session\n",
+			    " (%s): p->signal->session==tty->session\n",
 			    p->pid, p->comm);
 			send_sig(SIGKILL, p, 1);
 			continue;
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 2febed52e19f..a1a59abc915c 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2278,7 +2278,7 @@ int tioclinux(struct tty_struct *tty, unsigned long arg)
 
 	if (tty->driver->type != TTY_DRIVER_TYPE_CONSOLE)
 		return -EINVAL;
-	if (current->tty != tty && !capable(CAP_SYS_ADMIN))
+	if (current->signal->tty != tty && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (get_user(type, (char *)arg))
 		return -EFAULT;
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index d8c6acc8e62c..0685fe7be2d1 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -382,7 +382,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 	 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
 	 */
 	perm = 0;
-	if (current->tty == tty || capable(CAP_SYS_TTY_CONFIG))
+	if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG))
 		perm = 1;
  
 	kbd = kbd_table + console;
@@ -1221,4 +1221,3 @@ void change_console(unsigned int new_console)
 
 	complete_change_console(new_console);
 }
-
diff --git a/drivers/net/slip.c b/drivers/net/slip.c
index 601df52ebb29..e783ac0fa71e 100644
--- a/drivers/net/slip.c
+++ b/drivers/net/slip.c
@@ -1307,7 +1307,7 @@ static int sl_ioctl(struct net_device *dev,struct ifreq *rq,int cmd)
 		/* Resolve race condition, when ioctl'ing hanged up 
 		   and opened by another process device.
 		 */
-		if (sl->tty != current->tty && sl->pid != current->pid) {
+		if (sl->tty != current->signal->tty && sl->pid != current->pid) {
 			spin_unlock_bh(&sl->lock);
 			return -EPERM;
 		}
diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c
index 892ebc7739b0..b124ebb7fc9b 100644
--- a/drivers/s390/char/keyboard.c
+++ b/drivers/s390/char/keyboard.c
@@ -471,7 +471,7 @@ kbd_ioctl(struct kbd_data *kbd, struct file *file,
 	 * To have permissions to do most of the vt ioctls, we either have
 	 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
 	 */
-	perm = current->tty == kbd->tty || capable(CAP_SYS_TTY_CONFIG);
+	perm = current->signal->tty == kbd->tty || capable(CAP_SYS_TTY_CONFIG);
 	switch (cmd) {
 	case KDGKBTYPE:
 		return put_user(KB_101, (char*) arg);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9cc7cc648b42..e5b79a294c80 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1129,7 +1129,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pid = p->pid;
 	prstatus->pr_ppid = p->parent->pid;
 	prstatus->pr_pgrp = process_group(p);
-	prstatus->pr_sid = p->session;
+	prstatus->pr_sid = p->signal->session;
 	jiffies_to_timeval(p->utime, &prstatus->pr_utime);
 	jiffies_to_timeval(p->stime, &prstatus->pr_stime);
 	jiffies_to_timeval(p->cutime, &prstatus->pr_cutime);
@@ -1157,7 +1157,7 @@ static void fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_pid = p->pid;
 	psinfo->pr_ppid = p->parent->pid;
 	psinfo->pr_pgrp = process_group(p);
-	psinfo->pr_sid = p->session;
+	psinfo->pr_sid = p->signal->session;
 
 	i = p->state ? ffz(~p->state) + 1 : 0;
 	psinfo->pr_state = i;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 23baed6180ff..de45d833d0f4 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1604,7 +1604,7 @@ static int vt_check(struct file *file)
 	 * To have permissions to do most of the vt ioctls, we either have
 	 * to be the owner of the tty, or super-user.
 	 */
-	if (current->tty == tty || capable(CAP_SYS_ADMIN))
+	if (current->signal->tty == tty || capable(CAP_SYS_ADMIN))
 		return 1;
 	return 0;                                                    
 }
diff --git a/fs/dquot.c b/fs/dquot.c
index e6b39e66207a..5749044d028e 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -689,12 +689,12 @@ static void print_warning(struct dquot *dquot, const char warntype)
 
 	if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags)))
 		return;
-	tty_write_message(current->tty, dquot->dq_sb->s_id);
+	tty_write_message(current->signal->tty, dquot->dq_sb->s_id);
 	if (warntype == ISOFTWARN || warntype == BSOFTWARN)
-		tty_write_message(current->tty, ": warning, ");
+		tty_write_message(current->signal->tty, ": warning, ");
 	else
-		tty_write_message(current->tty, ": write failed, ");
-	tty_write_message(current->tty, quotatypes[dquot->dq_type]);
+		tty_write_message(current->signal->tty, ": write failed, ");
+	tty_write_message(current->signal->tty, quotatypes[dquot->dq_type]);
 	switch (warntype) {
 		case IHARDWARN:
 			msg = " file limit reached.\n";
@@ -715,7 +715,7 @@ static void print_warning(struct dquot *dquot, const char warntype)
 			msg = " block quota exceeded.\n";
 			break;
 	}
-	tty_write_message(current->tty, msg);
+	tty_write_message(current->signal->tty, msg);
 }
 
 static inline void flush_warnings(struct dquot **dquots, char *warntype)
diff --git a/fs/exec.c b/fs/exec.c
index 225afb0d94e5..62bf2c537abd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -601,6 +601,11 @@ static inline int de_thread(struct task_struct *tsk)
 		newsig->group_stop_count = 0;
 		newsig->curr_target = NULL;
 		init_sigpending(&newsig->shared_pending);
+
+		newsig->pgrp = oldsig->pgrp;
+		newsig->session = oldsig->session;
+		newsig->leader = oldsig->leader;
+		newsig->tty_old_pgrp = oldsig->tty_old_pgrp;
 	}
 
 	if (thread_group_empty(current))
diff --git a/fs/open.c b/fs/open.c
index 9a9ce5be4dbc..ce11096afcad 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1037,7 +1037,7 @@ EXPORT_SYMBOL(sys_close);
 asmlinkage long sys_vhangup(void)
 {
 	if (capable(CAP_SYS_TTY_CONFIG)) {
-		tty_vhangup(current->tty);
+		tty_vhangup(current->signal->tty);
 		return 0;
 	}
 	return -EPERM;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7af62577287e..ac9ccac5d1ee 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -168,7 +168,7 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 		p->pid && p->ptrace ? p->parent->pid : 0,
 		p->uid, p->euid, p->suid, p->fsuid,
 		p->gid, p->egid, p->sgid, p->fsgid);
-	read_unlock(&tasklist_lock);	
+	read_unlock(&tasklist_lock);
 	task_lock(p);
 	buffer += sprintf(buffer,
 		"FDSize:\t%d\n"
@@ -301,7 +301,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 	sigset_t sigign, sigcatch;
 	char state;
 	int res;
-	pid_t ppid;
+ 	pid_t ppid, pgid = -1, sid = -1;
 	int num_threads = 0;
 	struct mm_struct *mm;
 
@@ -311,10 +311,6 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 	mm = task->mm;
 	if(mm)
 		mm = mmgrab(mm);
-	if (task->tty) {
-		tty_pgrp = task->tty->pgrp;
-		tty_nr = new_encode_dev(tty_devnum(task->tty));
-	}
 	task_unlock(task);
 	if (mm) {
 		down_read(&mm->mmap_sem);
@@ -335,7 +331,15 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 		spin_unlock_irq(&task->sighand->siglock);
 	}
-	read_unlock(&tasklist_lock);		
+	if (task->signal) {
+		if (task->signal->tty) {
+			tty_pgrp = task->signal->tty->pgrp;
+			tty_nr = new_encode_dev(tty_devnum(task->signal->tty));
+		}
+		pgid = process_group(task);
+		sid = task->signal->session;
+	}
+	read_unlock(&tasklist_lock);
 
 	/* scale priority and nice values from timeslices to -20..20 */
 	/* to make it look like a "normal" Unix priority/nice value  */
@@ -352,8 +356,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 		task->comm,
 		state,
 		ppid,
-		process_group(task),
-		task->session,
+		pgid,
+		sid,
 		tty_nr,
 		tty_pgrp,
 		task->flags,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 054b3c0d5962..5a1229121123 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -269,6 +269,15 @@ struct signal_struct {
 
 	/* thread group stop support, overloads group_exit_code too */
 	int			group_stop_count;
+
+	/* job control IDs */
+	pid_t pgrp;
+	pid_t tty_old_pgrp;
+	pid_t session;
+	/* boolean value for session group leader */
+	int leader;
+
+	struct tty_struct *tty; /* NULL if no tty */
 };
 
 /*
@@ -398,12 +407,7 @@ struct task_struct {
 	unsigned long personality;
 	int did_exec:1;
 	pid_t pid;
-	pid_t __pgrp;		/* Accessed via process_group() */
-	pid_t tty_old_pgrp;
-	pid_t session;
 	pid_t tgid;
-	/* boolean value for session group leader */
-	int leader;
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, respectively.  (p->father can be replaced with 
@@ -446,7 +450,6 @@ struct task_struct {
 	char comm[16];
 /* file system info */
 	int link_count, total_link_count;
-	struct tty_struct *tty; /* NULL if no tty */
 /* ipc stuff */
 	struct sysv_sem sysvsem;
 /* CPU-specific state of this task */
@@ -499,7 +502,7 @@ struct task_struct {
 
 static inline pid_t process_group(struct task_struct *tsk)
 {
-	return tsk->group_leader->__pgrp;
+	return tsk->signal->pgrp;
 }
 
 extern void __put_task_struct(struct task_struct *tsk);
diff --git a/kernel/acct.c b/kernel/acct.c
index 9dbab88b2d31..b417066778a7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -347,7 +347,7 @@ static void do_acct_process(long exitcode, struct file *file)
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
-	ac.ac_tty = current->tty ? old_encode_dev(tty_devnum(current->tty)) : 0;
+	ac.ac_tty = current->signal->tty ? old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 
 	ac.ac_flag = 0;
 	if (current->flags & PF_FORKNOEXEC)
diff --git a/kernel/exit.c b/kernel/exit.c
index 308f6959add6..810eebd77559 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -136,13 +136,13 @@ int session_of_pgrp(int pgrp)
 
 	read_lock(&tasklist_lock);
 	for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid)
-		if (p->session > 0) {
-			sid = p->session;
+		if (p->signal->session > 0) {
+			sid = p->signal->session;
 			goto out;
 		}
 	p = find_task_by_pid(pgrp);
 	if (p)
-		sid = p->session;
+		sid = p->signal->session;
 out:
 	read_unlock(&tasklist_lock);
 	
@@ -170,7 +170,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
 				|| p->real_parent->pid == 1)
 			continue;
 		if (process_group(p->real_parent) != pgrp
-			    && p->real_parent->session == p->session) {
+			    && p->real_parent->signal->session == p->signal->session) {
 			ret = 0;
 			break;
 		}
@@ -259,14 +259,14 @@ void __set_special_pids(pid_t session, pid_t pgrp)
 {
 	struct task_struct *curr = current;
 
-	if (curr->session != session) {
+	if (curr->signal->session != session) {
 		detach_pid(curr, PIDTYPE_SID);
-		curr->session = session;
+		curr->signal->session = session;
 		attach_pid(curr, PIDTYPE_SID, session);
 	}
 	if (process_group(curr) != pgrp) {
 		detach_pid(curr, PIDTYPE_PGID);
-		curr->group_leader->__pgrp = pgrp;
+		curr->signal->pgrp = pgrp;
 		attach_pid(curr, PIDTYPE_PGID, pgrp);
 	}
 }
@@ -341,7 +341,7 @@ void daemonize(const char *name, ...)
 	exit_mm(current);
 
 	set_special_pids(1, 1);
-	current->tty = NULL;
+	current->signal->tty = NULL;
 
 	/* Block and flush all signals */
 	sigfillset(&blocked);
@@ -564,7 +564,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 	 * outside, so the child pgrp is now orphaned.
 	 */
 	if ((process_group(p) != process_group(father)) &&
-	    (p->session == father->session)) {
+	    (p->signal->session == father->signal->session)) {
 		int pgrp = process_group(p);
 
 		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
@@ -675,7 +675,7 @@ static void exit_notify(struct task_struct *tsk)
 	t = tsk->real_parent;
 	
 	if ((process_group(t) != process_group(tsk)) &&
-	    (t->session == tsk->session) &&
+	    (t->signal->session == tsk->signal->session) &&
 	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
 	    has_stopped_jobs(process_group(tsk))) {
 		__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
@@ -780,7 +780,7 @@ asmlinkage NORET_TYPE void do_exit(long code)
 	exit_itimers(tsk);
 	exit_thread();
 
-	if (tsk->leader)
+	if (tsk->signal->leader)
 		disassociate_ctty(1);
 
 	module_put(tsk->thread_info->exec_domain->module);
diff --git a/kernel/fork.c b/kernel/fork.c
index a1f20cabbdd3..d2dd97e866bb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -811,6 +811,12 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	sig->curr_target = NULL;
 	init_sigpending(&sig->shared_pending);
 
+	sig->tty = current->signal->tty;
+	sig->pgrp = process_group(current);
+	sig->session = current->signal->session;
+	sig->leader = 0;	/* session leadership doesn't inherit */
+	sig->tty_old_pgrp = 0;
+
 	return 0;
 }
 
@@ -935,8 +941,6 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	init_timer(&p->real_timer);
 	p->real_timer.data = (unsigned long) p;
 
-	p->leader = 0;		/* session leadership doesn't inherit */
-	p->tty_old_pgrp = 0;
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
@@ -1055,7 +1059,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	if (thread_group_leader(p)) {
 		attach_pid(p, PIDTYPE_TGID, p->tgid);
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->session);
+		attach_pid(p, PIDTYPE_SID, p->signal->session);
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	} else
diff --git a/kernel/pid.c b/kernel/pid.c
index 4c85144759c5..6ed44f56ca45 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -253,14 +253,14 @@ void switch_exec_pids(task_t *leader, task_t *thread)
 
 	attach_pid(thread, PIDTYPE_PID, thread->pid);
 	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-	attach_pid(thread, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(thread, PIDTYPE_SID, thread->session);
+	attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
+	attach_pid(thread, PIDTYPE_SID, thread->signal->session);
 	list_add_tail(&thread->tasks, &init_task.tasks);
 
 	attach_pid(leader, PIDTYPE_PID, leader->pid);
 	attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-	attach_pid(leader, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(leader, PIDTYPE_SID, leader->session);
+	attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
+	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
 }
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index e6b7904df68f..7a4b479a6f45 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -588,7 +588,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	error = -EPERM;
 	if ((!info || ((unsigned long)info != 1 &&
 			(unsigned long)info != 2 && SI_FROMUSER(info)))
-	    && ((sig != SIGCONT) || (current->session != t->session))
+	    && ((sig != SIGCONT) ||
+		(current->signal->session != t->signal->session))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
 	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
 	    && !capable(CAP_KILL))
@@ -1103,7 +1104,7 @@ kill_sl_info(int sig, struct siginfo *info, pid_t sid)
 	retval = -ESRCH;
 	read_lock(&tasklist_lock);
 	for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) {
-		if (!p->leader)
+		if (!p->signal->leader)
 			continue;
 		err = group_send_sig_info(sig, info, p);
 		if (retval)
diff --git a/kernel/sys.c b/kernel/sys.c
index 81f9e02f2071..9d57482758f3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -990,7 +990,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 
 	if (p->parent == current || p->real_parent == current) {
 		err = -EPERM;
-		if (p->session != current->session)
+		if (p->signal->session != current->signal->session)
 			goto out;
 		err = -EACCES;
 		if (p->did_exec)
@@ -1002,7 +1002,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 	}
 
 	err = -EPERM;
-	if (p->leader)
+	if (p->signal->leader)
 		goto out;
 
 	if (pgid != pid) {
@@ -1011,7 +1011,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 		struct list_head *l;
 
 		for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid)
-			if (p->session == current->session)
+			if (p->signal->session == current->signal->session)
 				goto ok_pgid;
 		goto out;
 	}
@@ -1023,7 +1023,7 @@ ok_pgid:
 
 	if (process_group(p) != pgid) {
 		detach_pid(p, PIDTYPE_PGID);
-		p->group_leader->__pgrp = pgid;
+		p->signal->pgrp = pgid;
 		attach_pid(p, PIDTYPE_PGID, pgid);
 	}
 
@@ -1065,7 +1065,7 @@ asmlinkage long sys_getpgrp(void)
 asmlinkage long sys_getsid(pid_t pid)
 {
 	if (!pid) {
-		return current->session;
+		return current->signal->session;
 	} else {
 		int retval;
 		struct task_struct *p;
@@ -1077,7 +1077,7 @@ asmlinkage long sys_getsid(pid_t pid)
 		if(p) {
 			retval = security_task_getsid(p);
 			if (!retval)
-				retval = p->session;
+				retval = p->signal->session;
 		}
 		read_unlock(&tasklist_lock);
 		return retval;
@@ -1098,10 +1098,10 @@ asmlinkage long sys_setsid(void)
 	if (pid)
 		goto out;
 
-	current->leader = 1;
+	current->signal->leader = 1;
 	__set_special_pids(current->pid, current->pid);
-	current->tty = NULL;
-	current->tty_old_pgrp = 0;
+	current->signal->tty = NULL;
+	current->signal->tty_old_pgrp = 0;
 	err = process_group(current);
 out:
 	write_unlock_irq(&tasklist_lock);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 33b687d60efe..f76563312ee4 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -46,7 +46,7 @@ static void print_string(char *str)
 	struct tty_struct *my_tty;
 
 	/* The tty for the current task */
-	my_tty = current->tty;
+	my_tty = current->signal->tty;
 	if (my_tty != NULL) {
 		my_tty->driver->write(my_tty, 0, str, strlen(str));
 		my_tty->driver->write(my_tty, 0, "\015\012", 2);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index a1529896ec1b..91c3fd3f1f8f 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -95,7 +95,7 @@ match_sid(const struct sk_buff *skb, pid_t sid)
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		struct files_struct *files;
-		if (p->session != sid)
+		if (p->signal->session != sid)
 			continue;
 
 		task_lock(p);
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index 02e5ee4e7418..0bb9c661b73c 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -61,7 +61,7 @@ match_sid(const struct sk_buff *skb, pid_t sid)
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		struct files_struct *files;
-		if (p->session != sid)
+		if (p->signal->session != sid)
 			continue;
 
 		task_lock(p);
-- 
cgit v1.2.3


From af70f7673155616ffd004d551e1b612002a58bf0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:56:59 -0700
Subject: [PATCH] Fix page allocator lower zone protection for NUMA

From: Martin Hicks <mort@wildopensource.com>

This changes __alloc_pages() so it uses precalculated values for the "min".
This should prevent the problem of min incrementing from zone to zone across
many nodes on a NUMA machine.  The result of falling back to other nodes with
the old incremental min calculations was that the min value became very
large.
---
 include/linux/mmzone.h |  39 ++++++++++---
 kernel/sysctl.c        |   2 +-
 mm/page_alloc.c        | 150 +++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 159 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b5398fa7be88..51b8f3f67741 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -54,6 +54,15 @@ struct per_cpu_pageset {
 	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
 } ____cacheline_aligned_in_smp;
 
+#define ZONE_DMA		0
+#define ZONE_NORMAL		1
+#define ZONE_HIGHMEM		2
+
+#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
+#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
+
+#define GFP_ZONEMASK	0x03
+
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
  * into multiple physical zones. On a PC we have 3 zones:
@@ -70,6 +79,19 @@ struct zone {
 	spinlock_t		lock;
 	unsigned long		free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
+	/*
+	 * protection[] is a pre-calculated number of extra pages that must be
+	 * available in a zone in order for __alloc_pages() to allocate memory
+	 * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
+	 * be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
+	 * for us to choose to allocate the page from that zone.
+	 *
+	 * It uses both min_free_kbytes and sysctl_lower_zone_protection.
+	 * The protection values are recalculated if either of these values
+	 * change.  The array elements are in zonelist order:
+	 *	[0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
+	 */
+	unsigned long		protection[MAX_NR_ZONES];
 
 	ZONE_PADDING(_pad1_)
 
@@ -157,14 +179,6 @@ struct zone {
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
 } ____cacheline_maxaligned_in_smp;
 
-#define ZONE_DMA		0
-#define ZONE_NORMAL		1
-#define ZONE_HIGHMEM		2
-
-#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
-#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
-
-#define GFP_ZONEMASK	0x03
 
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
@@ -228,6 +242,11 @@ void get_zone_counts(unsigned long *active, unsigned long *inactive,
 void build_all_zonelists(void);
 void wakeup_kswapd(struct zone *zone);
 
+/*
+ * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
+ */
+#define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
+
 /**
  * for_each_pgdat - helper macro to iterate over all nodes
  * @pgdat - pointer to a pg_data_t variable
@@ -299,7 +318,9 @@ static inline int is_normal(struct zone *zone)
 struct ctl_table;
 struct file;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
-					  void __user *, size_t *);
+					void __user *, size_t *);
+int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5f3123b0522..f2c8c8ce4926 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -722,7 +722,7 @@ static ctl_table vm_table[] = {
 		.data		= &sysctl_lower_zone_protection,
 		.maxlen		= sizeof(sysctl_lower_zone_protection),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &lower_zone_protection_sysctl_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9764a4e78e45..c87ca3dd2f11 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -552,6 +552,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	struct task_struct *p = current;
 	int i;
 	int cold;
+	int alloc_type;
 	int do_retry;
 
 	might_sleep_if(wait);
@@ -564,28 +565,27 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	if (zones[0] == NULL)     /* no zones in the zonelist */
 		return NULL;
 
+	alloc_type = zone_idx(zones[0]);
+
 	/* Go through the zonelist once, looking for a zone with enough free */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
-		unsigned long local_low;
+
+		min = (1<<order) + z->protection[alloc_type];
 
 		/*
-		 * This is the fabled 'incremental min'. We let real-time tasks
-		 * dip their real-time paws a little deeper into reserves.
+		 * We let real-time tasks dip their real-time paws a little
+		 * deeper into reserves.
 		 */
-		local_low = z->pages_low;
 		if (rt_task(p))
-			local_low >>= 1;
-		min += local_low;
+			min -= z->pages_low >> 1;
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-		       		goto got_pg;
+				goto got_pg;
 		}
-		min += z->pages_low * sysctl_lower_zone_protection;
 	}
 
 	/* we're somewhat low on memory, failed to find what we needed */
@@ -593,24 +593,22 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		wakeup_kswapd(zones[i]);
 
 	/* Go through the zonelist again, taking __GFP_HIGH into account */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
-		unsigned long local_min;
 		struct zone *z = zones[i];
 
-		local_min = z->pages_min;
+		min = (1<<order) + z->protection[alloc_type];
+
 		if (gfp_mask & __GFP_HIGH)
-			local_min >>= 2;
+			min -= z->pages_low >> 2;
 		if (rt_task(p))
-			local_min >>= 1;
-		min += local_min;
+			min -= z->pages_low >> 1;
+
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
 		}
-		min += local_min * sysctl_lower_zone_protection;
 	}
 
 	/* here we're in the low on memory slow path */
@@ -642,18 +640,17 @@ rebalance:
 	p->flags &= ~PF_MEMALLOC;
 
 	/* go through the zonelist yet one more time */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
-		min += z->pages_min;
+		min = (1UL << order) + z->protection[alloc_type];
+
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
 		}
-		min += z->pages_low * sysctl_lower_zone_protection;
 	}
 
 	/*
@@ -1056,6 +1053,8 @@ void show_free_areas(void)
 		ps.nr_page_table_pages);
 
 	for_each_zone(zone) {
+		int i;
+
 		show_node(zone);
 		printk("%s"
 			" free:%lukB"
@@ -1075,6 +1074,10 @@ void show_free_areas(void)
 			K(zone->nr_inactive),
 			K(zone->present_pages)
 			);
+		printk("protections[]:");
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			printk(" %lu", zone->protection[i]);
+		printk("\n");
 	}
 
 	for_each_zone(zone) {
@@ -1272,7 +1275,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
  
 		zonelist->zones[j++] = NULL;
-	} 
+	}
 }
 
 #endif	/* CONFIG_NUMA */
@@ -1744,6 +1747,93 @@ void __init page_alloc_init(void)
 	hotcpu_notifier(page_alloc_cpu_notify, 0);
 }
 
+static unsigned long higherzone_val(struct zone *z, int max_zone,
+					int alloc_type)
+{
+	int z_idx = zone_idx(z);
+	struct zone *higherzone;
+	unsigned long pages;
+
+	/* there is no higher zone to get a contribution from */
+	if (z_idx == MAX_NR_ZONES-1)
+		return 0;
+
+	higherzone = &z->zone_pgdat->node_zones[z_idx+1];
+
+	/* We always start with the higher zone's protection value */
+	pages = higherzone->protection[alloc_type];
+
+	/*
+	 * We get a lower-zone-protection contribution only if there are
+	 * pages in the higher zone and if we're not the highest zone
+	 * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
+	 * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
+	 * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
+	 */
+	if (higherzone->present_pages && z_idx < alloc_type)
+		pages += higherzone->pages_low * sysctl_lower_zone_protection;
+
+	return pages;
+}
+
+/*
+ * setup_per_zone_protection - called whenver min_free_kbytes or
+ *	sysctl_lower_zone_protection changes.  Ensures that each zone
+ *	has a correct pages_protected value, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
+ *
+ *	This algorithm is way confusing.  I tries to keep the same behavior
+ *	as we had with the incremental min iterative algorithm.
+ */
+static void setup_per_zone_protection(void)
+{
+	struct pglist_data *pgdat;
+	struct zone *zones, *zone;
+	int max_zone;
+	int i, j;
+
+	for_each_pgdat(pgdat) {
+		zones = pgdat->node_zones;
+
+		for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
+			if (zones[i].present_pages)
+				max_zone = i;
+
+		/*
+		 * For each of the different allocation types:
+		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
+		 */
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			/*
+			 * For each of the zones:
+			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
+			 */
+			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
+				zone = &zones[j];
+
+				/*
+				 * We never protect zones that don't have memory
+				 * in them (j>max_zone) or zones that aren't in
+				 * the zonelists for a certain type of
+				 * allocation (j>i).  We have to assign these to
+				 * zero because the lower zones take
+				 * contributions from the higher zones.
+				 */
+				if (j > max_zone || j > i) {
+					zone->protection[i] = 0;
+					continue;
+				}
+				/*
+				 * The contribution of the next higher zone
+				 */
+				zone->protection[i] = higherzone_val(zone,
+								max_zone, i);
+				zone->protection[i] += zone->pages_low;
+			}
+		}
+	}
+}
+
 /*
  * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures 
  *	that the pages_{min,low,high} values for each zone are set correctly 
@@ -1757,9 +1847,10 @@ static void setup_per_zone_pages_min(void)
 	unsigned long flags;
 
 	/* Calculate total number of !ZONE_HIGHMEM pages */
-	for_each_zone(zone)
+	for_each_zone(zone) {
 		if (!is_highmem(zone))
 			lowmem_pages += zone->present_pages;
+	}
 
 	for_each_zone(zone) {
 		spin_lock_irqsave(&zone->lru_lock, flags);
@@ -1827,13 +1918,14 @@ static int __init init_per_zone_pages_min(void)
 	if (min_free_kbytes > 16384)
 		min_free_kbytes = 16384;
 	setup_per_zone_pages_min();
+	setup_per_zone_protection();
 	return 0;
 }
 module_init(init_per_zone_pages_min)
 
 /*
  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
- *	that we can call setup_per_zone_pages_min() whenever min_free_kbytes 
+ *	that we can call two helper functions whenever min_free_kbytes
  *	changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
@@ -1841,5 +1933,19 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 {
 	proc_dointvec(table, write, file, buffer, length);
 	setup_per_zone_pages_min();
+	setup_per_zone_protection();
+	return 0;
+}
+
+/*
+ * lower_zone_protection_sysctl_handler - just a wrapper around
+ *	proc_dointvec() so that we can call setup_per_zone_protection()
+ *	whenever sysctl_lower_zone_protection changes.
+ */
+int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+		 struct file *file, void __user *buffer, size_t *length)
+{
+	proc_dointvec_minmax(table, write, file, buffer, length);
+	setup_per_zone_protection();
 	return 0;
 }
-- 
cgit v1.2.3


From b9e55f3d300af426885d7b0a13e45cd2841118a2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:58:40 -0700
Subject: [PATCH] slab: updates for per-arch alignments

From: Manfred Spraul <manfred@colorfullife.com>

Description:

Right now kmem_cache_create automatically decides about the alignment of
allocated objects. The automatic decisions are sometimes wrong:

- for some objects, it's better to keep them as small as possible to
  reduce the memory usage.  Ingo already added a parameter to
  kmem_cache_create for the sigqueue cache, but it wasn't implemented.

- for s390, normal kmalloc must be 8-byte aligned.  With debugging
  enabled, the default allocation was 4-bytes.  This means that s390 cannot
  enable slab debugging.

- arm26 needs 1 kB aligned objects.  Previously this was impossible to
  generate, therefore arm has its own allocator in
  arm26/machine/small_page.c

- most objects should be cache line aligned, to avoid false sharing.  But
  the cache line size was set at compile time, often to 128 bytes for
  generic kernels.  This wastes memory.  The new code uses the runtime
  determined cache line size instead.

- some caches want an explicit alignment.  One example are the pte_chain
  objects: they must find the start of the object with addr&mask.  Right
  now pte_chain objects are scaled to the cache line size, because that was
  the only alignment that could be generated reliably.

The implementation reuses the "offset" parameter of kmem_cache_create and
now uses it to pass in the requested alignment.  offset was ignored by the
current implementation, and the only user I found is sigqueue, which
intended to set the alignment.

In the long run, it might be interesting for the main tree: due to the 128
byte alignment, only 7 inodes fit into one page, with 64-byte alignment, 9
inodes - 20% memory recovered for Athlon systems.


For generic kernels  running on P6 cpus (i.e. 32 byte cachelines), it means

Number of objects per page:

 ext2_inode_cache: 8 instead of 7
 ext3_inode_cache: 8 instead of 7
 fat_inode_cache: 9 instead of 7
 rpc_tasks: 24 instead of 15
 tcp_tw_bucket: 40 instead of 30
 arp_cache: 40 instead of 30
 nfs_write_data: 9 instead of 7
---
 arch/i386/mm/init.c          |   4 +-
 include/asm-i386/processor.h |   2 +
 kernel/fork.c                |   7 ++-
 mm/rmap.c                    |   2 +-
 mm/slab.c                    | 135 +++++++++++++++++++++++++------------------
 5 files changed, 89 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index a9923661f317..040862e6c6a0 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -530,18 +530,18 @@ void __init pgtable_cache_init(void)
 {
 	if (PTRS_PER_PMD > 1) {
 		pmd_cache = kmem_cache_create("pmd",
+					PTRS_PER_PMD*sizeof(pmd_t),
 					PTRS_PER_PMD*sizeof(pmd_t),
 					0,
-					SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 					pmd_ctor,
 					NULL);
 		if (!pmd_cache)
 			panic("pgtable_cache_init(): cannot create pmd cache");
 	}
 	pgd_cache = kmem_cache_create("pgd",
+				PTRS_PER_PGD*sizeof(pgd_t),
 				PTRS_PER_PGD*sizeof(pgd_t),
 				0,
-				SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 				pgd_ctor,
 				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
 	if (!pgd_cache)
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 3a5e0ff2a20c..0ebe1aa1afb0 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -403,6 +403,8 @@ struct tss_struct {
 	unsigned long stack[64];
 } __attribute__((packed));
 
+#define ARCH_MIN_TASKALIGN	16
+
 struct thread_struct {
 /* cached TLS descriptors. */
 	struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
diff --git a/kernel/fork.c b/kernel/fork.c
index d2dd97e866bb..315a06125e65 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -207,11 +207,14 @@ EXPORT_SYMBOL(autoremove_wake_function);
 void __init fork_init(unsigned long mempages)
 {
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN	0
+#endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
 		kmem_cache_create("task_struct",
-				  sizeof(struct task_struct),0,
-				  SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+				  sizeof(struct task_struct),ARCH_MIN_TASKALIGN,
+				  0, NULL, NULL);
 	if (!task_struct_cachep)
 		panic("fork_init(): cannot create task_struct SLAB cache");
 #endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 7af41a9b9a4e..c1c7325996a3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -522,9 +522,9 @@ struct pte_chain *pte_chain_alloc(int gfp_flags)
 void __init pte_chain_init(void)
 {
 	pte_chain_cache = kmem_cache_create(	"pte_chain",
+						sizeof(struct pte_chain),
 						sizeof(struct pte_chain),
 						0,
-						SLAB_MUST_HWCACHE_ALIGN,
 						pte_chain_ctor,
 						NULL);
 
diff --git a/mm/slab.c b/mm/slab.c
index d54728b6af32..b1c015cb0a02 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -121,6 +121,14 @@
 /* Shouldn't this be in a header file somewhere? */
 #define	BYTES_PER_WORD		sizeof(void *)
 
+#ifndef cache_line_size
+#define cache_line_size()	L1_CACHE_BYTES
+#endif
+
+#ifndef ARCH_KMALLOC_MINALIGN
+#define ARCH_KMALLOC_MINALIGN 0
+#endif
+
 /* Legal flag mask for kmem_cache_create(). */
 #if DEBUG
 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
@@ -268,6 +276,7 @@ struct kmem_cache_s {
 	unsigned int		colour_off;	/* colour offset */
 	unsigned int		colour_next;	/* cache colouring */
 	kmem_cache_t		*slabp_cache;
+	unsigned int		slab_size;
 	unsigned int		dflags;		/* dynamic flags */
 
 	/* constructor func */
@@ -490,8 +499,10 @@ static kmem_cache_t cache_cache = {
 	.objsize	= sizeof(kmem_cache_t),
 	.flags		= SLAB_NO_REAP,
 	.spinlock	= SPIN_LOCK_UNLOCKED,
-	.colour_off	= L1_CACHE_BYTES,
 	.name		= "kmem_cache",
+#if DEBUG
+	.reallen	= sizeof(kmem_cache_t),
+#endif
 };
 
 /* Guard access to the cache-chain. */
@@ -535,7 +546,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 }
 
 /* Cal the num objs, wastage, and bytes left over for a given slab size. */
-static void cache_estimate (unsigned long gfporder, size_t size,
+static void cache_estimate (unsigned long gfporder, size_t size, size_t align,
 		 int flags, size_t *left_over, unsigned int *num)
 {
 	int i;
@@ -548,7 +559,7 @@ static void cache_estimate (unsigned long gfporder, size_t size,
 		extra = sizeof(kmem_bufctl_t);
 	}
 	i = 0;
-	while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
+	while (i*size + ALIGN(base+i*extra, align) <= wastage)
 		i++;
 	if (i > 0)
 		i--;
@@ -558,7 +569,7 @@ static void cache_estimate (unsigned long gfporder, size_t size,
 
 	*num = i;
 	wastage -= i*size;
-	wastage -= L1_CACHE_ALIGN(base+i*extra);
+	wastage -= ALIGN(base+i*extra, align);
 	*left_over = wastage;
 }
 
@@ -705,16 +716,20 @@ void __init kmem_cache_init(void)
 	init_MUTEX(&cache_chain_sem);
 	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
+	cache_cache.colour_off = cache_line_size();
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
 
-	cache_estimate(0, cache_cache.objsize, 0,
-			&left_over, &cache_cache.num);
+	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
+
+	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
+				&left_over, &cache_cache.num);
 	if (!cache_cache.num)
 		BUG();
 
 	cache_cache.colour = left_over/cache_cache.colour_off;
 	cache_cache.colour_next = 0;
-
+	cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
+				sizeof(struct slab), cache_line_size());
 
 	/* 2+3) create the kmalloc caches */
 	sizes = malloc_sizes;
@@ -728,7 +743,7 @@ void __init kmem_cache_init(void)
 		 * allow tighter packing of the smaller caches. */
 		sizes->cs_cachep = kmem_cache_create(
 			names->name, sizes->cs_size,
-			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+			ARCH_KMALLOC_MINALIGN, 0, NULL, NULL);
 		if (!sizes->cs_cachep)
 			BUG();
 
@@ -740,7 +755,7 @@ void __init kmem_cache_init(void)
 
 		sizes->cs_dmacachep = kmem_cache_create(
 			names->name_dma, sizes->cs_size,
-			0, SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL);
+			ARCH_KMALLOC_MINALIGN, SLAB_CACHE_DMA, NULL, NULL);
 		if (!sizes->cs_dmacachep)
 			BUG();
 
@@ -1056,7 +1071,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
  * kmem_cache_create - Create a cache.
  * @name: A string which is used in /proc/slabinfo to identify this cache.
  * @size: The size of objects to be created in this cache.
- * @offset: The offset to use within the page.
+ * @align: The required alignment for the objects.
  * @flags: SLAB flags
  * @ctor: A constructor for the objects.
  * @dtor: A destructor for the objects.
@@ -1081,16 +1096,15 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
  * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
  * memory pressure.
  *
- * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
- * cacheline.  This can be beneficial if you're counting cycles as closely
- * as davem.
+ * %SLAB_HWCACHE_ALIGN - This flag has no effect and will be removed soon.
+ *
  */
 kmem_cache_t *
-kmem_cache_create (const char *name, size_t size, size_t offset,
+kmem_cache_create (const char *name, size_t size, size_t align,
 	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
 	void (*dtor)(void*, kmem_cache_t *, unsigned long))
 {
-	size_t left_over, align, slab_size;
+	size_t left_over, slab_size;
 	kmem_cache_t *cachep = NULL;
 
 	/*
@@ -1101,7 +1115,7 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 		(size < BYTES_PER_WORD) ||
 		(size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
 		(dtor && !ctor) ||
-		(offset < 0 || offset > size)) {
+		(align < 0)) {
 			printk(KERN_ERR "%s: Early error in slab %s\n",
 					__FUNCTION__, name);
 			BUG();
@@ -1118,22 +1132,16 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 
 #if FORCED_DEBUG
 	/*
-	 * Enable redzoning and last user accounting, except
-	 * - for caches with forced alignment: redzoning would violate the
-	 *   alignment
-	 * - for caches with large objects, if the increased size would
-	 *   increase the object size above the next power of two: caches
-	 *   with object sizes just above a power of two have a significant
-	 *   amount of internal fragmentation
+	 * Enable redzoning and last user accounting, except for caches with
+	 * large objects, if the increased size would increase the object size
+	 * above the next power of two: caches with object sizes just above a
+	 * power of two have a significant amount of internal fragmentation.
 	 */
-	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))
-			&& !(flags & SLAB_MUST_HWCACHE_ALIGN)) {
+	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
 		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
-	}
 	flags |= SLAB_POISON;
 #endif
 #endif
-
 	/*
 	 * Always checks flags, a caller might be expecting debug
 	 * support which isn't available.
@@ -1141,15 +1149,23 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 	if (flags & ~CREATE_MASK)
 		BUG();
 
+	if (align) {
+		/* minimum supported alignment: */
+		if (align < BYTES_PER_WORD)
+			align = BYTES_PER_WORD;
+
+		/* combinations of forced alignment and advanced debugging is
+		 * not yet implemented.
+		 */
+		flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
+	}
+
 	/* Get cache's description obj. */
 	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
 	if (!cachep)
 		goto opps;
 	memset(cachep, 0, sizeof(kmem_cache_t));
 
-#if DEBUG
-	cachep->reallen = size;
-#endif
 	/* Check that size is in terms of words.  This is needed to avoid
 	 * unaligned accesses for some archs when redzoning is used, and makes
 	 * sure any on-slab bufctl's are also correctly aligned.
@@ -1160,30 +1176,31 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 	}
 	
 #if DEBUG
+	cachep->reallen = size;
+
 	if (flags & SLAB_RED_ZONE) {
-		/*
-		 * There is no point trying to honour cache alignment
-		 * when redzoning.
-		 */
-		flags &= ~SLAB_HWCACHE_ALIGN;
+		/* redzoning only works with word aligned caches */
+		align = BYTES_PER_WORD;
+
 		/* add space for red zone words */
 		cachep->dbghead += BYTES_PER_WORD;
 		size += 2*BYTES_PER_WORD;
 	}
 	if (flags & SLAB_STORE_USER) {
-		flags &= ~SLAB_HWCACHE_ALIGN;
-		size += BYTES_PER_WORD; /* add space */
+		/* user store requires word alignment and
+		 * one word storage behind the end of the real
+		 * object.
+		 */
+		align = BYTES_PER_WORD;
+		size += BYTES_PER_WORD;
 	}
 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
-	if (size > 128 && cachep->reallen > L1_CACHE_BYTES && size < PAGE_SIZE) {
+	if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
 		cachep->dbghead += PAGE_SIZE - size;
 		size = PAGE_SIZE;
 	}
 #endif
 #endif
-	align = BYTES_PER_WORD;
-	if (flags & SLAB_HWCACHE_ALIGN)
-		align = L1_CACHE_BYTES;
 
 	/* Determine if the slab management is 'on' or 'off' slab. */
 	if (size >= (PAGE_SIZE>>3))
@@ -1193,13 +1210,16 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 		 */
 		flags |= CFLGS_OFF_SLAB;
 
-	if (flags & SLAB_HWCACHE_ALIGN) {
-		/* Need to adjust size so that objs are cache aligned. */
-		/* Small obj size, can get at least two per cache line. */
+	if (!align) {
+		/* Default alignment: compile time specified l1 cache size.
+		 * Except if an object is really small, then squeeze multiple
+		 * into one cacheline.
+		 */
+		align = cache_line_size();
 		while (size <= align/2)
 			align /= 2;
-		size = (size+align-1)&(~(align-1));
 	}
+	size = ALIGN(size, align);
 
 	/* Cal size (in pages) of slabs, and the num of objs per slab.
 	 * This could be made much more intelligent.  For now, try to avoid
@@ -1209,7 +1229,7 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 	do {
 		unsigned int break_flag = 0;
 cal_wastage:
-		cache_estimate(cachep->gfporder, size, flags,
+		cache_estimate(cachep->gfporder, size, align, flags,
 						&left_over, &cachep->num);
 		if (break_flag)
 			break;
@@ -1243,7 +1263,8 @@ next:
 		cachep = NULL;
 		goto opps;
 	}
-	slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab));
+	slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
+				+ sizeof(struct slab), align);
 
 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
@@ -1254,14 +1275,17 @@ next:
 		left_over -= slab_size;
 	}
 
-	/* Offset must be a multiple of the alignment. */
-	offset += (align-1);
-	offset &= ~(align-1);
-	if (!offset)
-		offset = L1_CACHE_BYTES;
-	cachep->colour_off = offset;
-	cachep->colour = left_over/offset;
+	if (flags & CFLGS_OFF_SLAB) {
+		/* really off slab. No need for manual alignment */
+		slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
+	}
 
+	cachep->colour_off = cache_line_size();
+	/* Offset must be a multiple of the alignment. */
+	if (cachep->colour_off < align)
+		cachep->colour_off = align;
+	cachep->colour = left_over/cachep->colour_off;
+	cachep->slab_size = slab_size;
 	cachep->flags = flags;
 	cachep->gfpflags = 0;
 	if (flags & SLAB_CACHE_DMA)
@@ -1543,8 +1567,7 @@ static inline struct slab* alloc_slabmgmt (kmem_cache_t *cachep,
 			return NULL;
 	} else {
 		slabp = objp+colour_off;
-		colour_off += L1_CACHE_ALIGN(cachep->num *
-				sizeof(kmem_bufctl_t) + sizeof(struct slab));
+		colour_off += cachep->slab_size;
 	}
 	slabp->inuse = 0;
 	slabp->colouroff = colour_off;
-- 
cgit v1.2.3


From 07ebe427f0289a322c96c38906bb3bb7aacf15b6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:58:52 -0700
Subject: [PATCH] set mod->waiter before calling stop_machine

From: Rusty Russell <rusty@rustcorp.com.au>

mod->waiter needs to be set before we try to stop the module: setting it in
__try_stop_module means it gets set to the kthread, not rmmod.
---
 kernel/module.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 16587e133b1b..a472deef9bdf 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -493,7 +493,6 @@ static inline int __try_stop_module(void *_sref)
 	}
 
 	/* Mark it as dying. */
-	sref->mod->waiter = current;
 	sref->mod->state = MODULE_STATE_GOING;
 	return 0;
 }
@@ -588,6 +587,9 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		}
 	}
 
+	/* Set this up before setting mod->state */
+	mod->waiter = current;
+
 	/* Stop the machine so refcounts can't move and disable module. */
 	ret = try_stop_module(mod, flags, &forced);
 
-- 
cgit v1.2.3


From 3f66b056e1b56427eec0b26e0a20ac08fb8a6dc9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:59:32 -0700
Subject: [PATCH] pmdisk: fix strcmp in sysfs store

From: Herbert Xu <herbert@gondor.apana.org.au>

This patch fixes the sysfs store functions for pmdisk when the input
contains a trailing newline.
---
 kernel/power/disk.c | 7 ++++++-
 kernel/power/main.c | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 2d4cf319b8e1..7e035a9b42d1 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -285,11 +285,16 @@ static ssize_t disk_store(struct subsystem * s, const char * buf, size_t n)
 {
 	int error = 0;
 	int i;
+	int len;
+	char *p;
 	u32 mode = 0;
 
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
+
 	down(&pm_sem);
 	for (i = PM_DISK_FIRMWARE; i < PM_DISK_MAX; i++) {
-		if (!strcmp(buf,pm_disk_modes[i])) {
+		if (!strncmp(buf, pm_disk_modes[i], len)) {
 			mode = i;
 			break;
 		}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index fd212e7ecd9f..d582906fecc6 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -218,10 +218,15 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
 {
 	u32 state = PM_SUSPEND_STANDBY;
 	char ** s;
+	char *p;
 	int error;
+	int len;
+
+	p = memchr(buf, '\n', n);
+	len = p ? p - buf : n;
 
 	for (s = &pm_states[state]; *s; s++, state++) {
-		if (!strcmp(buf,*s))
+		if (!strncmp(buf, *s, len))
 			break;
 	}
 	if (*s)
-- 
cgit v1.2.3


From 5362a3548872e1d7c7b3926f1519999b315e9825 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:59:58 -0700
Subject: [PATCH] Fix sys_time() to get subtick correction from the new xtime

From: "La Monte H.P. Yarroll" <piggy@timesys.com>

This is a Scott Wood patch against 2.6.3.


Use gettimeofday() rather than xtime.tv_sec in sys_time(), since
sys_stime() uses settimeofday() and thus subtracts the subtick correction
from the new xtime.

stime() used settimeofday(), but time() did not use gettimeofday().  Since
settimeofday() subtracts out the current intra-tick correction, and nsec
was 0 (since stime() only allows seconds), this resulted in xtime being
slightly earlier than the time that was set.

If time() had used gettimeofday(), the correction would have been applied,
and everything would be fine.  However, instead time just reads the current
xtime.tv_sec, so if time() is called immediately after stime(), you'll
usually get a value one second earlier.
---
 arch/ia64/ia32/sys_ia32.c         |  7 ++++---
 arch/parisc/kernel/sys_parisc32.c | 14 ++++++++------
 arch/x86_64/ia32/sys_ia32.c       |  7 ++++---
 kernel/time.c                     |  7 ++++---
 4 files changed, 20 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/ia32/sys_ia32.c b/arch/ia64/ia32/sys_ia32.c
index 38aaf7b08bbd..a1d008067121 100644
--- a/arch/ia64/ia32/sys_ia32.c
+++ b/arch/ia64/ia32/sys_ia32.c
@@ -1090,10 +1090,11 @@ asmlinkage long
 sys32_time (int *tloc)
 {
 	int i;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;
 
-	/* SMP: This is fairly trivial. We grab CURRENT_TIME and
-	   stuff it to user space. No side effects */
-	i = get_seconds();
 	if (tloc) {
 		if (put_user(i, tloc))
 			i = -EFAULT;
diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c
index d59c93baf0c2..7159953b2c44 100644
--- a/arch/parisc/kernel/sys_parisc32.c
+++ b/arch/parisc/kernel/sys_parisc32.c
@@ -388,14 +388,16 @@ static inline long get_ts32(struct timespec *o, struct compat_timeval *i)
 
 asmlinkage long sys32_time(compat_time_t *tloc)
 {
-    time_t now = get_seconds();
-    compat_time_t now32 = now;
+    struct timeval tv;
 
-    if (tloc)
-    	if (put_user(now32, tloc))
-		now32 = -EFAULT;
+	do_gettimeofday(&tv);
+	compat_time_t now32 = tv.tv_sec;
 
-    return now32;
+	if (tloc)
+		if (put_user(now32, tloc))
+			now32 = -EFAULT;
+
+	return now32;
 }
 
 asmlinkage int
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 4d19333ead18..47c23419f55c 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -832,10 +832,11 @@ sys32_writev(int fd, struct compat_iovec *vector, u32 count)
 asmlinkage long sys32_time(int * tloc)
 {
 	int i;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;
 
-	/* SMP: This is fairly trivial. We grab CURRENT_TIME and 
-	   stuff it to user space. No side effects */
-	i = get_seconds();
 	if (tloc) {
 		if (put_user(i,tloc))
 			i = -EFAULT;
diff --git a/kernel/time.c b/kernel/time.c
index 33a6fe086304..142a4bd5771e 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -51,10 +51,11 @@ EXPORT_SYMBOL(sys_tz);
 asmlinkage long sys_time(int * tloc)
 {
 	int i;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;
 
-	/* SMP: This is fairly trivial. We grab CURRENT_TIME and 
-	   stuff it to user space. No side effects */
-	i = get_seconds();
 	if (tloc) {
 		if (put_user(i,tloc))
 			i = -EFAULT;
-- 
cgit v1.2.3


From ce334bb8f0f084112dcfe96214cacfa0afba7e10 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:04:10 -0700
Subject: [PATCH] export complete_all()

From: Mike Waychison <Michael.Waychison@Sun.COM>

Export complete_all for module use.
---
 kernel/sched.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index b42029abe679..c2d1c0317130 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1847,7 +1847,6 @@ void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exc
 		__wake_up_common(q, mode, nr_exclusive, 0);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
-
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
 void fastcall complete(struct completion *x)
@@ -1860,7 +1859,6 @@ void fastcall complete(struct completion *x)
 			 1, 0);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
-
 EXPORT_SYMBOL(complete);
 
 void fastcall complete_all(struct completion *x)
@@ -1873,6 +1871,7 @@ void fastcall complete_all(struct completion *x)
 			 0, 0);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
+EXPORT_SYMBOL(complete_all);
 
 void fastcall __sched wait_for_completion(struct completion *x)
 {
@@ -1894,7 +1893,6 @@ void fastcall __sched wait_for_completion(struct completion *x)
 	x->done--;
 	spin_unlock_irq(&x->wait.lock);
 }
-
 EXPORT_SYMBOL(wait_for_completion);
 
 #define	SLEEP_ON_VAR					\
-- 
cgit v1.2.3


From 8e1aabbc236128b9e696ae61235b17165bb73ada Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:06:08 -0700
Subject: [PATCH] Strip quotes from kernel parameters

From: Rusty Russell <rusty@rustcorp.com.au>

Agustin Martin <agmartin@debian.org> pointed out that this doesn't work:

	options ide-mod options="ide=nodma hdc=cdrom"

The quotes are understood by kernel/params.c (ie.  it skips over spaces
inside them), but are not stripped before handing to the underlying
function.  They should be.
---
 kernel/params.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 4d9a71b743c5..59667bce9ce0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -96,6 +96,13 @@ static char *next_arg(char *args, char **param, char **val)
 	else {
 		args[equals] = '\0';
 		*val = args + equals + 1;
+
+		/* Don't include quotes in value. */
+		if (**val == '"') {
+			(*val)++;
+			if (args[i-1] == '"')
+				args[i-1] = '\0';
+		}
 	}
 
 	if (args[i]) {
-- 
cgit v1.2.3


From 8398bcc6b3eb950a1242f6dc4cfb151b6b9238c3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:08:32 -0700
Subject: [PATCH] eliminate nswap and cnswap

From: Matt Mackall <mpm@selenic.com>

The nswap and cnswap variables counters have never been incremented as
Linux doesn't do task swapping.
---
 arch/alpha/kernel/osf_sys.c | 3 ---
 fs/proc/array.c             | 4 ++--
 include/linux/sched.h       | 2 +-
 kernel/acct.c               | 2 +-
 kernel/exit.c               | 1 -
 kernel/fork.c               | 1 -
 kernel/sys.c                | 3 ---
 7 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 34adfc76dd92..f725059fe47f 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1095,14 +1095,12 @@ osf_getrusage(int who, struct rusage32 *ru)
 		jiffies_to_timeval32(current->stime, &r.ru_stime);
 		r.ru_minflt = current->min_flt;
 		r.ru_majflt = current->maj_flt;
-		r.ru_nswap = current->nswap;
 		break;
 	case RUSAGE_CHILDREN:
 		jiffies_to_timeval32(current->cutime, &r.ru_utime);
 		jiffies_to_timeval32(current->cstime, &r.ru_stime);
 		r.ru_minflt = current->cmin_flt;
 		r.ru_majflt = current->cmaj_flt;
-		r.ru_nswap = current->cnswap;
 		break;
 	default:
 		jiffies_to_timeval32(current->utime + current->cutime,
@@ -1111,7 +1109,6 @@ osf_getrusage(int who, struct rusage32 *ru)
 				   &r.ru_stime);
 		r.ru_minflt = current->min_flt + current->cmin_flt;
 		r.ru_majflt = current->maj_flt + current->cmaj_flt;
-		r.ru_nswap = current->nswap + current->cnswap;
 		break;
 	}
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ac9ccac5d1ee..ae90151e45ae 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -392,8 +392,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 		sigign      .sig[0] & 0x7fffffffUL,
 		sigcatch    .sig[0] & 0x7fffffffUL,
 		wchan,
-		task->nswap,
-		task->cnswap,
+		0UL,
+		0UL,
 		task->exit_signal,
 		task_cpu(task),
 		task->rt_priority,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a1229121123..22080f919266 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -436,7 +436,7 @@ struct task_struct {
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
 	u64 start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
-	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
diff --git a/kernel/acct.c b/kernel/acct.c
index b417066778a7..8e32413c41f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -376,7 +376,7 @@ static void do_acct_process(long exitcode, struct file *file)
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 	ac.ac_minflt = encode_comp_t(current->min_flt);
 	ac.ac_majflt = encode_comp_t(current->maj_flt);
-	ac.ac_swaps = encode_comp_t(current->nswap);
+	ac.ac_swaps = encode_comp_t(0);
 	ac.ac_exitcode = exitcode;
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 810eebd77559..8157dbc037d6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -92,7 +92,6 @@ repeat:
 	p->parent->cstime += p->stime + p->cstime;
 	p->parent->cmin_flt += p->min_flt + p->cmin_flt;
 	p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt;
-	p->parent->cnswap += p->nswap + p->cnswap;
 	p->parent->cnvcsw += p->nvcsw + p->cnvcsw;
 	p->parent->cnivcsw += p->nivcsw + p->cnivcsw;
 	sched_exit(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 315a06125e65..da5213611496 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -513,7 +513,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 
 	tsk->min_flt = tsk->maj_flt = 0;
 	tsk->cmin_flt = tsk->cmaj_flt = 0;
-	tsk->nswap = tsk->cnswap = 0;
 	tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0;
 
 	tsk->mm = NULL;
diff --git a/kernel/sys.c b/kernel/sys.c
index 9d57482758f3..4d414d925889 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1532,7 +1532,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw;
 			r.ru_minflt = p->min_flt;
 			r.ru_majflt = p->maj_flt;
-			r.ru_nswap = p->nswap;
 			break;
 		case RUSAGE_CHILDREN:
 			jiffies_to_timeval(p->cutime, &r.ru_utime);
@@ -1541,7 +1540,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->cnivcsw;
 			r.ru_minflt = p->cmin_flt;
 			r.ru_majflt = p->cmaj_flt;
-			r.ru_nswap = p->cnswap;
 			break;
 		default:
 			jiffies_to_timeval(p->utime + p->cutime, &r.ru_utime);
@@ -1550,7 +1548,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw + p->cnivcsw;
 			r.ru_minflt = p->min_flt + p->cmin_flt;
 			r.ru_majflt = p->maj_flt + p->cmaj_flt;
-			r.ru_nswap = p->nswap + p->cnswap;
 			break;
 	}
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
-- 
cgit v1.2.3


From 424e44d11e16e6a64c307f9a29e6b2716319b202 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:15:59 -0700
Subject: [PATCH] fork vma ordering during fork

From: Hugh Dickins <hugh@veritas.com>

First of six patches against 2.6.5-rc3, cleaning up mremap's move_vma, and
fixing truncation orphan issues raised by Rajesh Venkatasubramanian.
Originally done as part of the anonymous objrmap work on mremap move, but
useful fixes now extracted for mainline.  The mremap changes need some
exposure in the -mm tree first, but the first (fork one-liner) is safe enough
to go straight into 2.6.5.


From: Rajesh Venkatasubramanian.  Despite the comment that child vma should
be inserted just after parent vma, 2.5.6 did exactly the reverse: thus a
racing vmtruncate may free the child's ptes, then advance to the parent, and
meanwhile copy_page_range has propagated more ptes from the parent to the
child, leaving file pages still mapped after truncation.
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index da5213611496..fc25a3a15d0e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -323,7 +323,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
       
 			/* insert tmp into the share list, just after mpnt */
 			down(&file->f_mapping->i_shared_sem);
-			list_add_tail(&tmp->shared, &mpnt->shared);
+			list_add(&tmp->shared, &mpnt->shared);
 			up(&file->f_mapping->i_shared_sem);
 		}
 
-- 
cgit v1.2.3


From 93d33a4885a483c708ccb7d24b56e0d5fef7bcab Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:17:38 -0700
Subject: [PATCH] laptop mode

From: Bart Samwel <bart@samwel.tk>

Adds /proc/sys/vm/laptop-mode: a special knob which says "this is a laptop".
In this mode the kernel will attempt to avoid spinning disks up.

Algorithm: the idea is to hold dirty data in memory for a long time, but to
flush everything which has been accumulated if the disk happens to spin up
for other reasons.

- Whenever a disk request completes (read or write), schedule a timer a few
  seconds hence.  If the timer was already pending, reset it to a few seconds
  hence.

- When the timer expires, write back the whole world.  We use
  sync_filesystems() for this because it will force ext3 journal commits as
  well.

- In balance_dirty_pages(), kick off background writeback when we hit the
  high threshold (dirty_ratio), not when we hit the low threshold.  This has
  the effect of causing "lumpy" writeback which is something I spent a year
  fixing, but in laptop mode, it is desirable.

- In try_to_free_pages(), only kick pdflush if the VM is getting into
  distress: we want to keep scanning for clean pages, deferring writeback.

- In page reclaim, avoid writing back the odd random dirty page off the
  LRU: only start I/O if the scanning is working harder.

The effect is to perform a sync() a few seconds after all I/O has ceased.

The value which was written into /proc/sys/vm/laptop-mode determines, in
seconds, the delay between the final I/O and the flush.

Additionally, the patch adds tools which help answer the question "why the
heck does my disk spin up all the time?".  The user may set
/proc/sys/vm/block_dump to a non-zero value and the kernel will print out
information which will identify the process which is performing disk reads or
which is dirtying pagecache.

The user should probably disable syslogd before setting block-dump.
---
 Documentation/laptop-mode.txt | 665 ++++++++++++++++++++++++++++++++++++++++++
 drivers/block/ll_rw_blk.c     |  14 +
 fs/buffer.c                   |   2 +
 fs/fs-writeback.c             |   3 +
 include/linux/sysctl.h        |   2 +
 include/linux/writeback.h     |   6 +-
 kernel/sysctl.c               |  20 ++
 mm/page-writeback.c           |  69 ++++-
 mm/vmscan.c                   |  61 ++--
 9 files changed, 815 insertions(+), 27 deletions(-)
 create mode 100644 Documentation/laptop-mode.txt

(limited to 'kernel')

diff --git a/Documentation/laptop-mode.txt b/Documentation/laptop-mode.txt
new file mode 100644
index 000000000000..9df8d2677bef
--- /dev/null
+++ b/Documentation/laptop-mode.txt
@@ -0,0 +1,665 @@
+How to conserve battery power using laptop-mode
+-----------------------------------------------
+
+Document Author: Bart Samwel (bart@samwel.tk)
+Date created: January 2, 2004
+Last modified: April 3, 2004
+
+Introduction
+------------
+
+Laptopmode is used to minimize the time that the hard disk needs to be spun up,
+to conserve battery power on laptops. It has been reported to cause significant
+power savings.
+
+Contents
+--------
+
+* Introduction
+* The short story
+* Caveats
+* The details
+* Tips & Tricks
+* Control script
+* ACPI integration
+* Monitoring tool
+
+
+The short story
+---------------
+
+If you just want to use it, run the laptop_mode control script (which is included
+at the end of this document) as follows:
+
+# laptop_mode start
+
+Then set your harddisk spindown time to a relatively low value with hdparm:
+
+hdparm -S 4 /dev/hda
+
+The value -S 4 means 20 seconds idle time before spindown. Your harddisk will
+now only spin up when a disk cache miss occurs, or at least once every 10
+minutes to write back any pending changes.
+
+To stop laptop_mode, run "laptop_mode stop".
+
+
+Caveats
+-------
+
+* The downside of laptop mode is that you have a chance of losing up
+  to 10 minutes of work. If you cannot afford this, don't use it! It's
+  wise to turn OFF laptop mode when you're almost out of battery --
+  although this will make the battery run out faster, at least you'll
+  lose less work when it actually runs out. I'm still looking for someone
+  to submit instructions on how to turn off laptop mode when battery is low,
+  e.g., using ACPI events. I don't have a laptop myself, so if you do and
+  you care to contribute such instructions, please do.
+
+* Most desktop hard drives have a very limited lifetime measured in spindown
+  cycles, typically about 50.000 times (it's usually listed on the spec sheet).
+  Check your drive's rating, and don't wear down your drive's lifetime if you
+  don't need to.
+
+* If you mount some of your ext3/reiserfs filesystems with the -n option, then
+  the control script will not be able to remount them correctly. You must set
+  DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
+  wrong options -- or it will fail because it cannot write to /etc/mtab.
+
+* If you have your filesystems listed as type "auto" in fstab, like I did, then
+  the control script will not recognize them as filesystems that need remounting.
+
+* If you have XFS, make SURE that you set the XFS_HZ value in the control script
+  correctly, to the value of HZ of your running kernel. Laptop mode will not
+  work correctly if it is set too low, and you may lose data if it is set too
+  high. The reason for this problem is that XFS does not export its sysctl
+  variables in centisecs (like most other subsystems do) but in "jiffies",
+  which is an internal kernel measure. Once this is fixed things will get better.
+
+
+The details
+-----------
+
+Laptop-mode is controlled by the flag /proc/sys/vm/laptop_mode. When this
+flag is set, any physical disk read operation (that might have caused the
+hard disk to spin up) causes Linux to flush all dirty blocks. The result
+of this is that after a disk has spun down, it will not be spun up anymore
+to write dirty blocks, because those blocks had already been written
+immediately after the most recent read operation
+
+To increase the effectiveness of the laptop_mode strategy, the laptop_mode
+control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
+/proc/sys/vm to about 10 minutes (by default), which means that pages that are
+dirtied are not forced to be written to disk as often. The control script also
+changes the dirty background ratio, so that background writeback of dirty pages
+is not done anymore. Combined with a higher commit value (also 10 minutes) for
+ext3 or ReiserFS filesystems (also done automatically by the control script),
+this results in concentration of disk activity in a small time interval which
+occurs only once every 10 minutes, or whenever the disk is forced to spin up by
+a cache miss. The disk can then be spun down in the periods of inactivity.
+
+If you want to find out which process caused the disk to spin up, you can
+gather information by setting the flag /proc/sys/vm/block_dump. When this flag
+is set, Linux reports all disk read and write operations that take place, and
+all block dirtyings done to files. This makes it possible to debug why a disk
+needs to spin up, and to increase battery life even more. The output of
+block_dump is written to the kernel output, and it can be retrieved using
+"dmesg". When you use block_dump, you may want to turn off klogd, otherwise
+the output of block_dump will be logged, causing disk activity that is not
+normally there.
+
+If 10 minutes is too much or too little downtime for you, you can configure
+this downtime as follows. In the control script, set the MAX_AGE value to the
+maximum number of seconds of disk downtime that you would like. You should
+then set your filesystem's commit interval to the same value. The dirty ratio
+is also configurable from the control script.
+
+If you don't like the idea of the control script remounting your filesystems
+for you, you can change DO_REMOUNTS to 0 in the script.
+
+Thanks to Kiko Piris, the control script can be used to enable laptop mode on
+both the Linux 2.4 and 2.6 series.
+
+
+Tips & Tricks
+-------------
+
+* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
+  of his regular 3 to 3.5 hours) using very aggressive power management (hdparm
+  -B1) and a spindown time of 5 seconds (hdparm -S1).
+
+* You can spin down the disk while playing MP3, by setting the disk readahead
+  to 8MB (hdparm -a 16384). Effectively, the disk will read a complete MP3 at
+  once, and will then spin down while the MP3 is playing. (Thanks to Bartek
+  Kania.)
+
+* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
+  of colours that my display uses it consumes less battery power. I've seen
+  this on powerbooks too. I hope that this is a piece of information that
+  might be useful to the Laptop Mode patch or it's users."
+
+* One thing which will cause disks to spin up is not-present application
+  and dynamic library text pages.  The kernel will load program text off disk
+  on-demand, so each time you invoke an application feature for the first
+  time, the kernel needs to spin the disk up to go and fetch that part of the
+  application.
+
+  So it is useful to increase the disk readahead parameter greatly, so that
+  the kernel will pull all of the executable's pages into memory on the first
+  pagefault.
+
+  The supplied script does this.
+
+* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the
+  file after every logging. When you're using laptop-mode and your disk doesn't
+  spin down, this is a likely culprit.
+
+* Richard Atterer observed that laptop mode does not work well with noflushd
+  (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode
+  from doing its thing.
+
+
+Control script
+--------------
+
+Please note that this control script works for the Linux 2.4 and 2.6 series.
+
+--------------------CONTROL SCRIPT BEGIN------------------------------------------
+#! /bin/sh
+
+# start or stop laptop_mode, best run by a power management daemon when
+# ac gets connected/disconnected from a laptop
+#
+# install as /sbin/laptop_mode
+#
+# Contributors to this script:   Kiko Piris
+#				 Bart Samwel
+#				 Micha Feigin
+#				 Andrew Morton
+#				 Dax Kelson
+#
+# Original Linux 2.4 version by: Jens Axboe
+
+# Remove an option (the first parameter) of the form option=<number> from
+# a mount options string (the rest of the parameters).
+parse_mount_opts () {
+	OPT="$1"
+	shift
+	echo "$*"			| \
+	sed 's/.*/,&,/'			| \
+	sed 's/,'"$OPT"'=[0-9]*,/,/g'	| \
+	sed 's/,,*/,/g'			| \
+	sed 's/^,//'			| \
+	sed 's/,$//'			| \
+	cat -
+}
+
+# Remove an option (the first parameter) without any arguments from
+# a mount option string (the rest of the parameters).
+parse_nonumber_mount_opts () {
+	OPT="$1"
+	shift
+	echo "$*" 			| \
+	sed 's/.*/,&,/'			| \
+	sed 's/,'"$OPT"',/,/g'		| \
+	sed 's/,,*/,/g'			| \
+	sed 's/^,//'			| \
+	sed 's/,$//'			| \
+	cat -
+}
+
+# Find out the state of a yes/no option (e.g. "atime"/"noatime") in
+# fstab for a given filesystem, and use this state to replace the
+# value of the option in another mount options string. The device
+# is the first argument, the option name the second, and the default
+# value the third. The remainder is the mount options string.
+#
+# Example:
+# parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
+#
+# If fstab contains, say, "rw" for this filesystem, then the result
+# will be "defaults,atime".
+parse_yesno_opts_wfstab () {
+	L_DEV=$1
+	shift
+	OPT=$1
+	shift
+	DEF_OPT=$1
+	shift
+	L_OPTS="$*"
+	PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
+	PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
+	# Watch for a default atime in fstab
+	FSTAB_OPTS="$(cat /etc/fstab | sed 's/  / /g' | grep ^\ *"$L_DEV " | awk '{ print $4 }')"
+	if [ -z "$(echo "$FSTAB_OPTS" | grep "$OPT")" ] ; then
+		# option not specified in fstab -- choose the default.
+		echo "$PARSEDOPTS1,$DEF_OPT"
+	else
+		# option specified in fstab: extract the value and use it
+		if [ -z "$(echo "$FSTAB_OPTS" | grep "no$OPT")" ] ; then
+			# no$OPT not found -- so we must have $OPT.
+			echo "$PARSEDOPTS1,$OPT"
+		else
+			echo "$PARSEDOPTS1,no$OPT"
+		fi
+	fi
+}
+
+# Find out the state of a numbered option (e.g. "commit=NNN") in
+# fstab for a given filesystem, and use this state to replace the
+# value of the option in another mount options string. The device
+# is the first argument, and the option name the second. The
+# remainder is the mount options string in which the replacement
+# must be done.
+#
+# Example:
+# parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
+#
+# If fstab contains, say, "commit=3,rw" for this filesystem, then the
+# result will be "rw,commit=3".
+parse_mount_opts_wfstab () {
+	L_DEV=$1
+	shift
+	OPT=$1
+	shift
+	L_OPTS="$*"
+
+	PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
+	# Watch for a default commit in fstab
+	FSTAB_OPTS="$(cat /etc/fstab | sed 's/	/ /g' | grep ^\ *"$L_DEV " | awk '{ print $4 }')"
+	if [ -z "$(echo "$FSTAB_OPTS" | grep "$OPT=")" ] ; then
+		# option not specified in fstab: set it to 0
+		echo "$PARSEDOPTS1,$OPT=0"
+	else
+		# option specified in fstab: extract the value, and use it
+		echo -n "$PARSEDOPTS1,$OPT="
+		echo "$FSTAB_OPTS"	| \
+		sed 's/.*/,&,/'		| \
+		sed 's/.*,'"$OPT"'=//'	| \
+		sed 's/,.*//'		| \
+		cat -
+	fi
+}
+
+KLEVEL="$(uname -r | cut -c1-3)"
+case "$KLEVEL" in
+	"2.4"|"2.6")
+		true
+		;;
+	*)
+		echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')"
+		exit 1
+		;;
+esac
+
+# Shall we remount journaled fs. with appropiate commit interval? (1=yes)
+DO_REMOUNTS=1
+
+# age time, in seconds. should be put into a sysconfig file
+MAX_AGE=600
+
+# Dirty synchronous ratio.  At this percentage of dirty pages the process which
+# calls write() does its own writeback
+DIRTY_RATIO=40
+
+#
+# Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
+# exceeded, the kernel will wake pdflush which will then reduce the amount
+# of dirty memory to dirty_background_ratio.  Set this nice and low, so once
+# some writeout has commenced, we do a lot of it.
+#
+DIRTY_BACKGROUND_RATIO=5
+
+READAHEAD=4096		# kilobytes
+
+# kernel default dirty buffer age
+DEF_AGE=30
+DEF_UPDATE=5
+DEF_DIRTY_BACKGROUND_RATIO=10
+DEF_DIRTY_RATIO=40
+DEF_XFS_AGE_BUFFER=15
+DEF_XFS_SYNC_INTERVAL=30
+
+# This must be adjusted manually to the value of HZ in the running kernel,
+# until the XFS people change their external interfaces to work in centisecs
+# like the rest of the external world. Unfortunately this cannot be automated. :(
+XFS_HZ=1000
+
+if [ ! -e /proc/sys/vm/laptop_mode ]; then
+	echo "Kernel is not patched with laptop_mode patch."
+	exit 1
+fi
+
+if [ ! -w /proc/sys/vm/laptop_mode ]; then
+	echo "You do not have enough privileges to enable laptop_mode."
+	exit 1
+fi
+
+case "$1" in
+	start)
+		AGE=$((100*$MAX_AGE))
+		XFS_AGE=$(($XFS_HZ*$MAX_AGE))
+		echo -n "Starting laptop_mode"
+
+		if [ -d /proc/sys/vm/pagebuf ] ; then
+			# This only needs to be set, not reset -- it is only used when
+			# laptop mode is enabled.
+			echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+		elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+			# The same goes for these.
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+		elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
+			# But not for these -- they are also used in normal
+			# operation.
+			echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
+			echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
+		fi
+
+		case "$KLEVEL" in
+			"2.4")
+				echo "1"				> /proc/sys/vm/laptop_mode
+				echo "30 500 0 0 $AGE $AGE 60 20 0"	> /proc/sys/vm/bdflush
+				;;
+			"2.6")
+				echo "5"				> /proc/sys/vm/laptop_mode
+				echo "$AGE"				> /proc/sys/vm/dirty_writeback_centisecs
+				echo "$AGE"				> /proc/sys/vm/dirty_expire_centisecs
+				echo "$DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
+				echo "$DIRTY_BACKGROUND_RATIO"		> /proc/sys/vm/dirty_background_ratio
+				;;
+		esac
+		if [ $DO_REMOUNTS -eq 1 ]; then
+			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+				PARSEDOPTS="$(parse_mount_opts "$OPTS")"
+				case "$FST" in
+					"ext3"|"reiserfs")
+						PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE,noatime
+						;;
+					"xfs")
+						mount $DEV -t $FST $MP -o remount,$OPTS,noatime
+						;;
+				esac
+				if [ -b $DEV ] ; then
+					blockdev --setra $(($READAHEAD * 2)) $DEV
+				fi
+			done
+		fi
+		echo "."
+		;;
+	stop)
+		U_AGE=$((100*$DEF_UPDATE))
+		B_AGE=$((100*$DEF_AGE))
+		echo -n "Stopping laptop_mode"
+		echo "0" > /proc/sys/vm/laptop_mode
+		if [ -f /proc/sys/fs/xfs/age_buffer ] && [ ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+			# These need to be restored though, if there are no lm_*.
+			echo "$(($XFS_HZ*$DEF_XFS_AGE_BUFFER))" 	> /proc/sys/fs/xfs/age_buffer
+			echo "$(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL))" 	> /proc/sys/fs/xfs/sync_interval
+		fi
+		case "$KLEVEL" in
+			"2.4")
+				echo "30 500 0 0 $U_AGE $B_AGE 60 20 0"	> /proc/sys/vm/bdflush
+				;;
+			"2.6")
+				echo "$U_AGE"				> /proc/sys/vm/dirty_writeback_centisecs
+				echo "$B_AGE"				> /proc/sys/vm/dirty_expire_centisecs
+				echo "$DEF_DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
+				echo "$DEF_DIRTY_BACKGROUND_RATIO"	> /proc/sys/vm/dirty_background_ratio
+				;;
+		esac
+		if [ $DO_REMOUNTS -eq 1 ]; then
+			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+				# Reset commit and atime options to defaults.
+				case "$FST" in
+					"ext3"|"reiserfs")
+						PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
+						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+						;;
+					"xfs")
+						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+						;;
+				esac
+				if [ -b $DEV ] ; then
+					blockdev --setra 256 $DEV
+				fi
+			done
+		fi
+		echo "."
+		;;
+	*)
+		echo "Usage: $0 {start|stop}"
+		;;
+
+esac
+
+exit 0
+
+--------------------CONTROL SCRIPT END--------------------------------------------
+
+
+ACPI integration
+----------------
+
+Dax Kelson submitted this so that the ACPI acpid daemon will
+kick off the laptop_mode script and run hdparm.
+
+---------------------------/etc/acpi/events/ac_adapter BEGIN-------------------------------------------
+event=ac_adapter
+action=/etc/acpi/actions/battery.sh
+---------------------------/etc/acpi/events/ac_adapter END-------------------------------------------
+
+---------------------------/etc/acpi/actions/battery.sh BEGIN-------------------------------------------
+#!/bin/sh
+
+# cpu throttling
+# cat /proc/acpi/processor/CPU0/throttling for more info
+ACAD_THR=0
+BATT_THR=2
+
+# spindown time for HD (man hdparm for valid values)
+# I prefer 2 hours for acad and 20 seconds for batt
+ACAD_HD=244
+BATT_HD=4
+
+# ac/battery event handler
+
+status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/AC/state`
+
+case $status in
+        "on-line")
+                echo "Setting HD spindown to 2 hours"
+                /sbin/laptop-mode stop
+                /sbin/hdparm -S $ACAD_HD /dev/hda > /dev/null 2>&1
+                /sbin/hdparm -B 255 /dev/hda > /dev/null 2>&1
+                #echo -n $ACAD_CPU:$ACAD_THR > /proc/acpi/processor/CPU0/limit
+                exit 0
+        ;;
+        "off-line")
+                echo "Setting HD spindown to 20 seconds"
+                /sbin/laptop-mode start
+                /sbin/hdparm -S $BATT_HD /dev/hda > /dev/null 2>&1
+                /sbin/hdparm -B 1 /dev/hda > /dev/null 2>&1
+                #echo -n $BATT_CPU:$BATT_THR > /proc/acpi/processor/CPU0/limit
+                exit 0
+        ;;
+esac
+---------------------------/etc/acpi/actions/battery.sh END-------------------------------------------
+
+Monitoring tool
+---------------
+
+Bartek Kania submitted this, it can be used to measure how much time your disk
+spends spun up/down.
+
+---------------------------dslm.c BEGIN-------------------------------------------
+/*
+ * Simple Disk Sleep Monitor
+ *  by Bartek Kania
+ * Licenced under the GPL
+ */
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <linux/hdreg.h>
+
+#ifdef DEBUG
+#define D(x) x
+#else
+#define D(x)
+#endif
+
+int endit = 0;
+
+/* Check if the disk is in powersave-mode
+ * Most of the code is stolen from hdparm.
+ * 1 = active, 0 = standby/sleep, -1 = unknown */
+int check_powermode(int fd)
+{
+    unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0};
+    int state;
+
+    if (ioctl(fd, HDIO_DRIVE_CMD, &args)
+	&& (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */
+	&& ioctl(fd, HDIO_DRIVE_CMD, &args)) {
+	if (errno != EIO || args[0] != 0 || args[1] != 0) {
+	    state = -1; /* "unknown"; */
+	} else
+	    state = 0; /* "sleeping"; */
+    } else {
+	state = (args[2] == 255) ? 1 : 0;
+    }
+    D(printf(" drive state is:  %d\n", state));
+
+    return state;
+}
+
+char *state_name(int i)
+{
+    if (i == -1) return "unknown";
+    if (i == 0) return "sleeping";
+    if (i == 1) return "active";
+
+    return "internal error";
+}
+
+char *myctime(time_t time)
+{
+    char *ts = ctime(&time);
+    ts[strlen(ts) - 1] = 0;
+
+    return ts;
+}
+
+void measure(int fd)
+{
+    time_t start_time;
+    int last_state;
+    time_t last_time;
+    int curr_state;
+    time_t curr_time = 0;
+    time_t time_diff;
+    time_t active_time = 0;
+    time_t sleep_time = 0;
+    time_t unknown_time = 0;
+    time_t total_time = 0;
+    int changes = 0;
+    float tmp;
+
+    printf("Starting measurements\n");
+
+    last_state = check_powermode(fd);
+    start_time = last_time = time(0);
+    printf("  System is in state %s\n\n", state_name(last_state));
+
+    while(!endit) {
+	sleep(1);
+	curr_state = check_powermode(fd);
+
+	if (curr_state != last_state || endit) {
+	    changes++;
+	    curr_time = time(0);
+	    time_diff = curr_time - last_time;
+
+	    if (last_state == 1) active_time += time_diff;
+	    else if (last_state == 0) sleep_time += time_diff;
+	    else unknown_time += time_diff;
+
+	    last_state = curr_state;
+	    last_time = curr_time;
+
+	    printf("%s: State-change to %s\n", myctime(curr_time),
+		   state_name(curr_state));
+	}
+    }
+    changes--; /* Compensate for SIGINT */
+
+    total_time = time(0) - start_time;
+    printf("\nTotal running time:  %lus\n", curr_time - start_time);
+    printf(" State changed %d times\n", changes);
+
+    tmp = (float)sleep_time / (float)total_time * 100;
+    printf(" Time in sleep state:   %lus (%.2f%%)\n", sleep_time, tmp);
+    tmp = (float)active_time / (float)total_time * 100;
+    printf(" Time in active state:  %lus (%.2f%%)\n", active_time, tmp);
+    tmp = (float)unknown_time / (float)total_time * 100;
+    printf(" Time in unknown state: %lus (%.2f%%)\n", unknown_time, tmp);
+}
+
+void ender(int s)
+{
+    endit = 1;
+}
+
+void usage()
+{
+    puts("usage: dslm [-w <time>] <disk>");
+    exit(0);
+}
+
+int main(int ac, char **av)
+{
+    int fd;
+    char *disk = 0;
+    int settle_time = 60;
+
+    /* Parse the simple command-line */
+    if (ac == 2)
+	disk = av[1];
+    else if (ac == 4) {
+	settle_time = atoi(av[2]);
+	disk = av[3];
+    } else
+	usage();
+
+    if (!(fd = open(disk, O_RDONLY|O_NONBLOCK))) {
+	printf("Can't open %s, because: %s\n", disk, strerror(errno));
+	exit(-1);
+    }
+
+    if (settle_time) {
+	printf("Waiting %d seconds for the system to settle down to "
+	       "'normal'\n", settle_time);
+	sleep(settle_time);
+    } else
+	puts("Not waiting for system to settle down");
+
+    signal(SIGINT, ender);
+
+    measure(fd);
+
+    close(fd);
+
+    return 0;
+}
+---------------------------dslm.c END---------------------------------------------
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 7b9f0c75bffd..fc4b6c698fcf 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -27,6 +27,7 @@
 #include <linux/completion.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
+#include <linux/writeback.h>
 
 /*
  * for max sense size
@@ -2471,6 +2472,16 @@ int submit_bio(int rw, struct bio *bio)
 		mod_page_state(pgpgout, count);
 	else
 		mod_page_state(pgpgin, count);
+
+	if (unlikely(block_dump)) {
+		char b[BDEVNAME_SIZE];
+		printk("%s(%d): %s block %Lu on %s\n",
+			current->comm, current->pid,
+			(rw & WRITE) ? "WRITE" : "READ",
+			(unsigned long long)bio->bi_sector,
+			bdevname(bio->bi_bdev,b));
+	}
+
 	generic_make_request(bio);
 	return 1;
 }
@@ -2754,6 +2765,9 @@ void end_that_request_last(struct request *req)
 	struct gendisk *disk = req->rq_disk;
 	struct completion *waiting = req->waiting;
 
+	if (unlikely(laptop_mode))
+		laptop_io_completion();
+
 	if (disk && blk_fs_request(req)) {
 		unsigned long duration = jiffies - req->start_time;
 		switch (rq_data_dir(req)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 42b61de10bf3..605ce2099aa5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -274,6 +274,8 @@ static void do_sync(unsigned long wait)
 	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
 	if (!wait)
 		printk("Emergency Sync complete\n");
+	if (unlikely(laptop_mode))
+		laptop_sync_completion();
 }
 
 asmlinkage long sys_sync(void)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 591c5eb79ba3..23e367ed22f7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -75,6 +75,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) == flags)
 		return;
 
+	if (unlikely(block_dump))
+		printk("%s(%d): dirtied file\n", current->comm, current->pid);
+
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 3767428df94d..d2224f6617f9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -159,6 +159,8 @@ enum
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
+	VM_LAPTOP_MODE=23,	/* vm laptop mode */
+	VM_BLOCK_DUMP=24,	/* block dump mode */
 };
 
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 7380d2cefb16..f557b55e8b0a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -72,12 +72,16 @@ static inline void wait_on_inode(struct inode *inode)
  * mm/page-writeback.c
  */
 int wakeup_bdflush(long nr_pages);
+void laptop_io_completion(void);
+void laptop_sync_completion(void);
 
-/* These 5 are exported to sysctl. */
+/* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
 extern int dirty_writeback_centisecs;
 extern int dirty_expire_centisecs;
+extern int block_dump;
+extern int laptop_mode;
 
 struct ctl_table;
 struct file;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f2c8c8ce4926..05ea59ae4276 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -744,6 +744,26 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= VM_LAPTOP_MODE,
+		.procname	= "laptop_mode",
+		.data		= &laptop_mode,
+		.maxlen		= sizeof(laptop_mode),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= VM_BLOCK_DUMP,
+		.procname	= "block_dump",
+		.data		= &block_dump,
+		.maxlen		= sizeof(block_dump),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1981309fa9c5..9cf47af10ccc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -28,6 +28,7 @@
 #include <linux/smp.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
+#include <linux/syscalls.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -81,6 +82,16 @@ int dirty_writeback_centisecs = 5 * 100;
  */
 int dirty_expire_centisecs = 30 * 100;
 
+/*
+ * Flag that makes the machine dump writes/reads and block dirtyings.
+ */
+int block_dump;
+
+/*
+ * Flag that puts the machine in "laptop mode".
+ */
+int laptop_mode;
+
 /* End of sysctl-exported parameters */
 
 
@@ -195,7 +206,19 @@ static void balance_dirty_pages(struct address_space *mapping)
 	if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 		dirty_exceeded = 0;
 
-	if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
+	if (writeback_in_progress(bdi))
+		return;		/* pdflush is already working this queue */
+
+	/*
+	 * In laptop mode, we wait until hitting the higher threshold before
+	 * starting background writeout, and then write out all the way down
+	 * to the lower threshold.  So slow writers cause minimal disk activity.
+	 *
+	 * In normal mode, we start background writeout at the lower
+	 * background_thresh, to keep the amount of dirty memory low.
+	 */
+	if ((laptop_mode && pages_written) ||
+	     (!laptop_mode && (nr_reclaimable > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -289,7 +312,13 @@ int wakeup_bdflush(long nr_pages)
 	return pdflush_operation(background_writeout, nr_pages);
 }
 
-static struct timer_list wb_timer;
+static void wb_timer_fn(unsigned long unused);
+static void laptop_timer_fn(unsigned long unused);
+
+static struct timer_list wb_timer =
+			TIMER_INITIALIZER(wb_timer_fn, 0, 0);
+static struct timer_list laptop_mode_wb_timer =
+			TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
 
 /*
  * Periodic writeback of "old" data.
@@ -368,7 +397,36 @@ static void wb_timer_fn(unsigned long unused)
 {
 	if (pdflush_operation(wb_kupdate, 0) < 0)
 		mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
+}
+
+static void laptop_flush(unsigned long unused)
+{
+	sys_sync();
+}
+
+static void laptop_timer_fn(unsigned long unused)
+{
+	pdflush_operation(laptop_flush, 0);
+}
 
+/*
+ * We've spun up the disk and we're in laptop mode: schedule writeback
+ * of all dirty data a few seconds from now.  If the flush is already scheduled
+ * then push it back - the user is still using the disk.
+ */
+void laptop_io_completion(void)
+{
+	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+}
+
+/*
+ * We're in laptop mode and we've just synced. The sync's writes will have
+ * caused another writeback to be scheduled by laptop_io_completion.
+ * Nothing needs to be written back anymore, so we unschedule the writeback.
+ */
+void laptop_sync_completion(void)
+{
+	del_timer(&laptop_mode_wb_timer);
 }
 
 /*
@@ -429,12 +487,7 @@ void __init page_writeback_init(void)
 		vm_dirty_ratio *= correction;
 		vm_dirty_ratio /= 100;
 	}
-
-	init_timer(&wb_timer);
-	wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
-	wb_timer.data = 0;
-	wb_timer.function = wb_timer_fn;
-	add_timer(&wb_timer);
+	mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
 	set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0728eadc0eb7..39e8ed0fcdd6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -246,7 +246,8 @@ static void handle_write_error(struct address_space *mapping,
  * shrink_list returns the number of reclaimed pages
  */
 static int
-shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
+shrink_list(struct list_head *page_list, unsigned int gfp_mask,
+		int *nr_scanned, int do_writepage)
 {
 	struct address_space *mapping;
 	LIST_HEAD(ret_pages);
@@ -354,6 +355,8 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
+			if (laptop_mode && !do_writepage)
+				goto keep_locked;
 			if (clear_page_dirty_for_io(page)) {
 				int res;
 				struct writeback_control wbc = {
@@ -473,7 +476,7 @@ keep:
  */
 static int
 shrink_cache(struct zone *zone, unsigned int gfp_mask,
-		int max_scan, int *total_scanned)
+		int max_scan, int *total_scanned, int do_writepage)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
@@ -521,7 +524,8 @@ shrink_cache(struct zone *zone, unsigned int gfp_mask,
 			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
 		else
 			mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		nr_freed = shrink_list(&page_list, gfp_mask, total_scanned);
+		nr_freed = shrink_list(&page_list, gfp_mask,
+					total_scanned, do_writepage);
 		*total_scanned += nr_taken;
 		if (current_is_kswapd())
 			mod_page_state(kswapd_steal, nr_freed);
@@ -735,7 +739,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
  */
 static int
 shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
-		int *total_scanned, struct page_state *ps)
+		int *total_scanned, struct page_state *ps, int do_writepage)
 {
 	unsigned long ratio;
 	int count;
@@ -764,7 +768,8 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
 	count = atomic_read(&zone->nr_scan_inactive);
 	if (count >= SWAP_CLUSTER_MAX) {
 		atomic_set(&zone->nr_scan_inactive, 0);
-		return shrink_cache(zone, gfp_mask, count, total_scanned);
+		return shrink_cache(zone, gfp_mask, count,
+					total_scanned, do_writepage);
 	}
 	return 0;
 }
@@ -787,7 +792,7 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
  */
 static int
 shrink_caches(struct zone **zones, int priority, int *total_scanned,
-		int gfp_mask, struct page_state *ps)
+		int gfp_mask, struct page_state *ps, int do_writepage)
 {
 	int ret = 0;
 	int i;
@@ -803,7 +808,8 @@ shrink_caches(struct zone **zones, int priority, int *total_scanned,
 			continue;	/* Let kswapd poll it */
 
 		max_scan = zone->nr_inactive >> priority;
-		ret += shrink_zone(zone, max_scan, gfp_mask, total_scanned, ps);
+		ret += shrink_zone(zone, max_scan, gfp_mask,
+					total_scanned, ps, do_writepage);
 	}
 	return ret;
 }
@@ -833,6 +839,8 @@ int try_to_free_pages(struct zone **zones,
 	int nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	int i;
+	unsigned long total_scanned = 0;
+	int do_writepage = 0;
 
 	inc_page_state(allocstall);
 
@@ -840,13 +848,13 @@ int try_to_free_pages(struct zone **zones,
 		zones[i]->temp_priority = DEF_PRIORITY;
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-		int total_scanned = 0;
+		int scanned = 0;
 		struct page_state ps;
 
 		get_page_state(&ps);
-		nr_reclaimed += shrink_caches(zones, priority, &total_scanned,
-						gfp_mask, &ps);
-		shrink_slab(total_scanned, gfp_mask);
+		nr_reclaimed += shrink_caches(zones, priority, &scanned,
+						gfp_mask, &ps, do_writepage);
+		shrink_slab(scanned, gfp_mask);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
@@ -858,14 +866,20 @@ int try_to_free_pages(struct zone **zones,
 		if (!(gfp_mask & __GFP_FS))
 			break;		/* Let the caller handle it */
 		/*
-		 * Try to write back as many pages as we just scanned.  Not
-		 * sure if that makes sense, but it's an attempt to avoid
-		 * creating IO storms unnecessarily
+		 * Try to write back as many pages as we just scanned.  This
+		 * tends to cause slow streaming writers to write data to the
+		 * disk smoothly, at the dirtying rate, which is nice.   But
+		 * that's undesirable in laptop mode, where we *want* lumpy
+		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		wakeup_bdflush(total_scanned);
+		total_scanned += scanned;
+		if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
+			wakeup_bdflush(laptop_mode ? 0 : total_scanned);
+			do_writepage = 1;
+		}
 
 		/* Take a nap, wait for some writeback to complete */
-		if (total_scanned && priority < DEF_PRIORITY - 2)
+		if (scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
 	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
@@ -908,6 +922,8 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 	int i;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long total_scanned = 0;
+	unsigned long total_reclaimed = 0;
+	int do_writepage = 0;
 
 	inc_page_state(pageoutrun);
 
@@ -969,16 +985,25 @@ scan:
 			zone->temp_priority = priority;
 			max_scan = zone->nr_inactive >> priority;
 			reclaimed = shrink_zone(zone, max_scan, GFP_KERNEL,
-					&scanned, ps);
+					&scanned, ps, do_writepage);
 			total_scanned += scanned;
 			reclaim_state->reclaimed_slab = 0;
 			shrink_slab(scanned, GFP_KERNEL);
 			reclaimed += reclaim_state->reclaimed_slab;
+			total_reclaimed += reclaimed;
 			to_free -= reclaimed;
 			if (zone->all_unreclaimable)
 				continue;
 			if (zone->pages_scanned > zone->present_pages * 2)
 				zone->all_unreclaimable = 1;
+			/*
+			 * If we've done a decent amount of scanning and
+			 * the reclaim ratio is low, start doing writepage
+			 * even in laptop mode
+			 */
+			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+			    total_scanned > total_reclaimed+total_reclaimed/2)
+				do_writepage = 1;
 		}
 		if (nr_pages && to_free > 0)
 			continue;	/* swsusp: need to do more work */
@@ -997,7 +1022,7 @@ out:
 
 		zone->prev_priority = zone->temp_priority;
 	}
-	return nr_pages - to_free;
+	return total_reclaimed;
 }
 
 /*
-- 
cgit v1.2.3


From 95f238eac82907c4ccbc301cd5788e67db0715ce Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:18:43 -0700
Subject: [PATCH] ia32: 4Kb stacks (and irqstacks) patch

From: Arjan van de Ven <arjanv@redhat.com>

Below is a patch to enable 4Kb stacks for x86. The goal of this is to

1) Reduce footprint per thread so that systems can run many more threads
   (for the java people)

2) Reduce the pressure on the VM for order > 0 allocations. We see real life
   workloads (granted with 2.4 but the fundamental fragmentation issue isn't
   solved in 2.6 and isn't solvable in theory) where this can be a problem.
   In addition order > 0 allocations can make the VM "stutter" and give more
   latency due to having to do much much more work trying to defragment

The first 2 bits of the patch actually affect compiler options in a generic
way: I propose to disable the -funit-at-a-time feature from gcc.  With this
enabled (and it's default with -O2), gcc will very agressively inline
functions, which is nice and all for userspace, but for the kernel this makes
us suffer a gcc deficiency more: gcc is extremely bad at sharing stackslots,
for example a situation like this:

if (some_condition)
	function_A();
else
	function_B();

with -funit-at-a-time, both function_A() and _B() might get inlined, however
the stack usage of both functions of the parent function grows the stack
usage of both functions COMBINED instead of the maximum of the two.  Even
with the normal 8Kb stacks this is a danger since we see some functions grow
3Kb to 4Kb of stack use this way.  With 4Kb stacks, 4Kb of stack usage growth
obviously is deadly ;-( but even with 8Kb stacks it's pure lottery.
Disabling -funit-at-a-time also exposes another thing in the -mm tree; the
attribute always_inline is considered harmful by gcc folks in that when gcc
makes a decision to NOT inline a function marked this way, it throws an
error.  Disabling -funit-at-a-time disables some of the agressive inlining
(eg of large functions that come later in the .c file) so this would make
your tree not compile.

The 4k stackness of the kernel is included in modversions, so people don't
load 4k-stack modules into 8k-stack kernels.

At present 4k stacks are selectable in config.  When the feature has settled
in we should remove the 8k option.  This will break the nvidia modules.  But
Fedora uses 4k stacks so a new nvidia driver is expected soon.
---
 arch/i386/Kconfig              |   9 +++
 arch/i386/Makefile             |   6 +-
 arch/i386/kernel/i8259.c       |   3 +
 arch/i386/kernel/irq.c         | 146 ++++++++++++++++++++++++++++++++++++++++-
 arch/i386/kernel/smpboot.c     |   2 +
 arch/i386/kernel/traps.c       |  18 +++--
 include/asm-alpha/irq.h        |   3 +
 include/asm-arm/irq.h          |   4 ++
 include/asm-arm26/irq.h        |   2 +
 include/asm-cris/irq.h         |   4 ++
 include/asm-h8300/irq.h        |   4 ++
 include/asm-i386/irq.h         |  25 +++++++
 include/asm-i386/module.h      |   8 ++-
 include/asm-i386/thread_info.h |  24 ++++++-
 include/asm-ia64/irq.h         |   4 ++
 include/asm-m68k/irq.h         |   4 ++
 include/asm-m68knommu/irq.h    |   4 ++
 include/asm-mips/irq.h         |   3 +
 include/asm-parisc/irq.h       |   3 +
 include/asm-ppc/irq.h          |   4 ++
 include/asm-ppc64/irq.h        |   4 ++
 include/asm-s390/irq.h         |   4 ++
 include/asm-sh/irq.h           |   4 ++
 include/asm-sparc/irq.h        |   4 ++
 include/asm-sparc64/irq.h      |   4 ++
 include/asm-um/irq.h           |   5 ++
 include/asm-v850/irq.h         |   4 ++
 include/asm-x86_64/irq.h       |   4 ++
 include/linux/compiler-gcc3.h  |   2 +-
 include/linux/irq.h            |   1 -
 kernel/softirq.c               |  70 ++++++++++++--------
 31 files changed, 342 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 03620021bd6b..c6439f846171 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1294,6 +1294,15 @@ config FRAME_POINTER
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame pointers.
 
+config 4KSTACKS
+	bool "Use 4Kb for kernel stacks instead of 8Kb"
+	help
+	  If you say Y here the kernel will use a 4Kb stacksize for the
+	  kernel stack attached to each process/thread. This facilitates
+	  running more threads on a system and also reduces the pressure
+	  on the VM subsystem for higher order allocations. This option
+	  will also use IRQ stacks to compensate for the reduced stackspace.
+
 config X86_FIND_SMP_CONFIG
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 4c8f1c06f572..019544e08f1e 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -56,9 +56,9 @@ cflags-$(CONFIG_X86_ELAN)	+= -march=i486
 GCC_VERSION			:= $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
 cflags-$(CONFIG_REGPARM) 	+= $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;)
 
-# Enable unit-at-a-time mode when possible. It shrinks the
-# kernel considerably.
-CFLAGS += $(call check_gcc,-funit-at-a-time,)
+# Disable unit-at-a-time mode, it makes gcc use a lot more stack
+# due to the lack of sharing of stacklots.
+CFLAGS += $(call check_gcc,-fno-unit-at-a-time,)
 
 CFLAGS += $(cflags-y)
 
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index f093e29b69a2..48fbaf5cee34 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -444,4 +444,7 @@ void __init init_IRQ(void)
 	 */
 	if (boot_cpu_data.hard_math && !cpu_has_fpu)
 		setup_irq(FPU_IRQ, &fpu_irq);
+
+	current_thread_info()->cpu = 0;
+	irq_ctx_init(0);
 }
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index c1385d668bf4..ea69f21994f6 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -74,6 +74,14 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 
 static void register_irq_proc (unsigned int irq);
 
+/*
+ * per-CPU IRQ handling stacks
+ */
+#ifdef CONFIG_4KSTACKS
+union irq_ctx *hardirq_ctx[NR_CPUS];
+union irq_ctx *softirq_ctx[NR_CPUS];
+#endif
+
 /*
  * Special irq handlers.
  */
@@ -209,7 +217,7 @@ inline void synchronize_irq(unsigned int irq)
  * waste of time and is not what some drivers would
  * prefer.
  */
-int handle_IRQ_event(unsigned int irq,
+asmlinkage int handle_IRQ_event(unsigned int irq,
 		struct pt_regs *regs, struct irqaction *action)
 {
 	int status = 1;	/* Force the "do bottom halves" bit */
@@ -432,7 +440,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 
 		__asm__ __volatile__("andl %%esp,%0" :
 					"=r" (esp) : "0" (THREAD_SIZE - 1));
-		if (unlikely(esp < (sizeof(struct thread_info) + 1024))) {
+		if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
 			printk("do_IRQ: stack overflow: %ld\n",
 				esp - sizeof(struct thread_info));
 			dump_stack();
@@ -480,11 +488,68 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 	 * useful for irq hardware that does not mask cleanly in an
 	 * SMP environment.
 	 */
+#ifdef CONFIG_4KSTACKS
+
 	for (;;) {
 		irqreturn_t action_ret;
+		u32 *isp;
+		union irq_ctx * curctx;
+		union irq_ctx * irqctx;
+
+		curctx = (union irq_ctx *) current_thread_info();
+		irqctx = hardirq_ctx[smp_processor_id()];
 
 		spin_unlock(&desc->lock);
+
+		/*
+		 * this is where we switch to the IRQ stack. However, if we are already using
+		 * the IRQ stack (because we interrupted a hardirq handler) we can't do that
+		 * and just have to keep using the current stack (which is the irq stack already
+		 * after all)
+		 */
+
+		if (curctx == irqctx)
+			action_ret = handle_IRQ_event(irq, &regs, action);
+		else {
+			/* build the stack frame on the IRQ stack */
+			isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+			irqctx->tinfo.task = curctx->tinfo.task;
+			irqctx->tinfo.previous_esp = current_stack_pointer();
+
+			*--isp = (u32) action;
+			*--isp = (u32) &regs;
+			*--isp = (u32) irq;
+
+			asm volatile(
+				"       xchgl   %%ebx,%%esp     \n"
+				"       call    handle_IRQ_event \n"
+				"       xchgl   %%ebx,%%esp     \n"
+				: "=a"(action_ret)
+				: "b"(isp)
+				: "memory", "cc", "edx", "ecx"
+			);
+
+
+		}
+		spin_lock(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
+		if (curctx != irqctx)
+			irqctx->tinfo.task = NULL;
+		if (likely(!(desc->status & IRQ_PENDING)))
+			break;
+		desc->status &= ~IRQ_PENDING;
+	}
+
+#else
+
+	for (;;) {
+		irqreturn_t action_ret;
+
+		spin_unlock(&desc->lock);
+
 		action_ret = handle_IRQ_event(irq, &regs, action);
+
 		spin_lock(&desc->lock);
 		if (!noirqdebug)
 			note_interrupt(irq, desc, action_ret);
@@ -492,6 +557,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 			break;
 		desc->status &= ~IRQ_PENDING;
 	}
+#endif
 	desc->status &= ~IRQ_INPROGRESS;
 
 out:
@@ -1049,3 +1115,79 @@ void init_irq_proc (void)
 		register_irq_proc(i);
 }
 
+
+#ifdef CONFIG_4KSTACKS
+static char softirq_stack[NR_CPUS * THREAD_SIZE]  __attribute__((__aligned__(THREAD_SIZE)));
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]  __attribute__((__aligned__(THREAD_SIZE)));
+
+/*
+ * allocate per-cpu stacks for hardirq and for softirq processing
+ */
+void irq_ctx_init(int cpu)
+{
+	union irq_ctx *irqctx;
+
+	if (hardirq_ctx[cpu])
+		return;
+
+	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+	irqctx->tinfo.task              = NULL;
+	irqctx->tinfo.exec_domain       = NULL;
+	irqctx->tinfo.cpu               = cpu;
+	irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
+	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+
+	hardirq_ctx[cpu] = irqctx;
+
+	irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
+	irqctx->tinfo.task              = NULL;
+	irqctx->tinfo.exec_domain       = NULL;
+	irqctx->tinfo.cpu               = cpu;
+	irqctx->tinfo.preempt_count     = SOFTIRQ_OFFSET;
+	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+
+	softirq_ctx[cpu] = irqctx;
+
+	printk("CPU %u irqstacks, hard=%p soft=%p\n",
+		cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+}
+
+extern asmlinkage void __do_softirq(void);
+
+asmlinkage void do_softirq(void)
+{
+	unsigned long flags;
+	struct thread_info *curctx;
+	union irq_ctx *irqctx;
+	u32 *isp;
+
+	if (in_interrupt())
+		return;
+
+	local_irq_save(flags);
+
+	if (local_softirq_pending()) {
+		curctx = current_thread_info();
+		irqctx = softirq_ctx[smp_processor_id()];
+		irqctx->tinfo.task = curctx->task;
+		irqctx->tinfo.previous_esp = current_stack_pointer();
+
+		/* build the stack frame on the softirq stack */
+		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+
+
+		asm volatile(
+			"       xchgl   %%ebx,%%esp     \n"
+			"       call    __do_softirq    \n"
+			"       movl    %%ebx,%%esp     \n"
+			: "=b"(isp)
+			: "0"(isp)
+			: "memory", "cc", "edx", "ecx", "eax"
+		);
+	}
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(do_softirq);
+#endif
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index a15be84152c4..7baa4d420b73 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -815,6 +815,8 @@ static int __init do_boot_cpu(int apicid)
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	stack_start.esp = (void *) idle->thread.esp;
 
+	irq_ctx_init(cpu);
+
 	/*
 	 * This grunge runs the startup process for
 	 * the targeted processor.
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe976abcdea9..cf8da7ba4cdb 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -105,12 +105,20 @@ void show_trace(struct task_struct *task, unsigned long * stack)
 #ifdef CONFIG_KALLSYMS
 	printk("\n");
 #endif
-	while (!kstack_end(stack)) {
-		addr = *stack++;
-		if (kernel_text_address(addr)) {
-			printk(" [<%08lx>] ", addr);
-			print_symbol("%s\n", addr);
+	while (1) {
+		struct thread_info *context;
+		context = (struct thread_info*) ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+		while (!kstack_end(stack)) {
+			addr = *stack++;
+			if (kernel_text_address(addr)) {
+				printk(" [<%08lx>] ", addr);
+				print_symbol("%s\n", addr);
+			}
 		}
+		stack = (unsigned long*)context->previous_esp;
+		if (!stack)
+			break;
+		printk(" =======================\n");
 	}
 	printk("\n");
 }
diff --git a/include/asm-alpha/irq.h b/include/asm-alpha/irq.h
index 551c7308c642..566db720000a 100644
--- a/include/asm-alpha/irq.h
+++ b/include/asm-alpha/irq.h
@@ -93,5 +93,8 @@ extern void enable_irq(unsigned int);
 struct pt_regs;
 extern void (*perf_irq)(unsigned long, struct pt_regs *);
 
+struct irqaction;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 
 #endif /* _ALPHA_IRQ_H */
diff --git a/include/asm-arm/irq.h b/include/asm-arm/irq.h
index a89f7345ed39..286be7cf7c63 100644
--- a/include/asm-arm/irq.h
+++ b/include/asm-arm/irq.h
@@ -44,5 +44,9 @@ void disable_irq_wake(unsigned int irq);
 void enable_irq_wake(unsigned int irq);
 int setup_irq(unsigned int, struct irqaction *);
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
 
diff --git a/include/asm-arm26/irq.h b/include/asm-arm26/irq.h
index 68712e576c6f..06bd5a543d13 100644
--- a/include/asm-arm26/irq.h
+++ b/include/asm-arm26/irq.h
@@ -45,6 +45,8 @@ extern void enable_irq(unsigned int);
 int set_irq_type(unsigned int irq, unsigned int type);
 
 int setup_irq(unsigned int, struct irqaction *);
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
 
 #endif
 
diff --git a/include/asm-cris/irq.h b/include/asm-cris/irq.h
index caa45facb1b2..87f342517bb1 100644
--- a/include/asm-cris/irq.h
+++ b/include/asm-cris/irq.h
@@ -14,6 +14,10 @@ extern void enable_irq(unsigned int);
 #define disable_irq_nosync      disable_irq
 #define enable_irq_nosync       enable_irq
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif  /* _ASM_IRQ_H */
 
 
diff --git a/include/asm-h8300/irq.h b/include/asm-h8300/irq.h
index fabde1dd34a1..5027181ed067 100644
--- a/include/asm-h8300/irq.h
+++ b/include/asm-h8300/irq.h
@@ -68,4 +68,8 @@ extern void disable_irq(unsigned int);
 #define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _H8300_IRQ_H_ */
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 69cb661b012a..5649b4a79bb2 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -14,6 +14,7 @@
 #include <linux/sched.h>
 /* include comes from machine specific directory */
 #include "irq_vectors.h"
+#include <asm/thread_info.h>
 
 static __inline__ int irq_canonicalize(int irq)
 {
@@ -30,4 +31,28 @@ extern int can_request_irq(unsigned int, unsigned long flags);
 #define ARCH_HAS_NMI_WATCHDOG		/* See include/linux/nmi.h */
 #endif
 
+#ifdef CONFIG_4KSTACKS
+/*
+ * per-CPU IRQ handling contexts (thread information and stack)
+ */
+union irq_ctx {
+	struct thread_info      tinfo;
+	u32                     stack[THREAD_SIZE/sizeof(u32)];
+};
+
+extern union irq_ctx *hardirq_ctx[NR_CPUS];
+extern union irq_ctx *softirq_ctx[NR_CPUS];
+
+extern void irq_ctx_init(int cpu);
+
+#define __ARCH_HAS_DO_SOFTIRQ
+#else
+#define irq_ctx_init(cpu) do { ; } while (0)
+#endif
+
+struct irqaction;
+struct pt_regs;
+asmlinkage int handle_IRQ_event(unsigned int, struct pt_regs *,
+				struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h
index 76fc36f60ebe..8ec1dae638cb 100644
--- a/include/asm-i386/module.h
+++ b/include/asm-i386/module.h
@@ -60,6 +60,12 @@ struct mod_arch_specific
 #define MODULE_REGPARM ""
 #endif
 
-#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM
+#ifdef CONFIG_4KSTACKS
+#define MODULE_STACKSIZE "4KSTACKS "
+#else
+#define MODULE_STACKSIZE ""
+#endif
+
+#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE
 
 #endif /* _ASM_I386_MODULE_H */
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 75f940011daa..da5c780f2c5c 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -9,6 +9,9 @@
 
 #ifdef __KERNEL__
 
+#include <linux/config.h>
+#include <asm/page.h>
+
 #ifndef __ASSEMBLY__
 #include <asm/processor.h>
 #endif
@@ -29,12 +32,16 @@ struct thread_info {
 	__u32			cpu;		/* current CPU */
 	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
 
+
 	mm_segment_t		addr_limit;	/* thread address space:
 					 	   0-0xBFFFFFFF for user-thead
 						   0-0xFFFFFFFF for kernel-thread
 						*/
 	struct restart_block    restart_block;
 
+	unsigned long           previous_esp;   /* ESP of the previous stack in case
+						   of nested (IRQ) stacks
+						*/
 	__u8			supervisor_stack[0];
 };
 
@@ -53,7 +60,13 @@ struct thread_info {
 #endif
 
 #define PREEMPT_ACTIVE		0x4000000
+#ifdef CONFIG_4KSTACKS
+#define THREAD_SIZE            (4096)
+#else
+#define THREAD_SIZE		(8192)
+#endif
 
+#define STACK_WARN             (THREAD_SIZE/8)
 /*
  * macros/functions for gaining access to the thread information structure
  *
@@ -77,7 +90,6 @@ struct thread_info {
 #define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
 
-#define THREAD_SIZE (2*PAGE_SIZE)
 
 /* how to get the thread information struct from C */
 static inline struct thread_info *current_thread_info(void)
@@ -87,6 +99,14 @@ static inline struct thread_info *current_thread_info(void)
 	return ti;
 }
 
+/* how to get the current stack pointer from C */
+static inline unsigned long current_stack_pointer(void)
+{
+	unsigned long ti;
+	__asm__("movl %%esp,%0; ":"=r" (ti) : );
+	return ti;
+}
+
 /* thread information allocation */
 #ifdef CONFIG_DEBUG_STACK_USAGE
 #define alloc_thread_info(tsk)					\
@@ -108,8 +128,6 @@ static inline struct thread_info *current_thread_info(void)
 
 #else /* !__ASSEMBLY__ */
 
-#define THREAD_SIZE	8192
-
 /* how to get the thread information struct from ASM */
 #define GET_THREAD_INFO(reg) \
 	movl $-THREAD_SIZE, reg; \
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index 79479e2c6966..5d930fdc0bea 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -30,4 +30,8 @@ extern void disable_irq_nosync (unsigned int);
 extern void enable_irq (unsigned int);
 extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IA64_IRQ_H */
diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h
index 02855ca536b0..5889bc919e80 100644
--- a/include/asm-m68k/irq.h
+++ b/include/asm-m68k/irq.h
@@ -124,4 +124,8 @@ extern volatile unsigned int num_spurious;
  */
 extern irq_node_t *new_irq_node(void);
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _M68K_IRQ_H_ */
diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h
index 4c66ba93201a..208ccd969e4b 100644
--- a/include/asm-m68knommu/irq.h
+++ b/include/asm-m68knommu/irq.h
@@ -121,4 +121,8 @@ extern irq_node_t *new_irq_node(void);
 #define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _M68K_IRQ_H_ */
diff --git a/include/asm-mips/irq.h b/include/asm-mips/irq.h
index 90b4ae1258a8..d9667a8fbbfb 100644
--- a/include/asm-mips/irq.h
+++ b/include/asm-mips/irq.h
@@ -31,4 +31,7 @@ extern asmlinkage unsigned int do_IRQ(int irq, struct pt_regs *regs);
 
 extern void init_generic_irq(void);
 
+struct irqaction;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/asm-parisc/irq.h b/include/asm-parisc/irq.h
index 39db70230740..b7acca7de670 100644
--- a/include/asm-parisc/irq.h
+++ b/include/asm-parisc/irq.h
@@ -96,4 +96,7 @@ extern unsigned long txn_alloc_addr(int);
 /* soft power switch support (power.c) */
 extern struct tasklet_struct power_tasklet;
 
+struct irqaction;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif	/* _ASM_PARISC_IRQ_H */
diff --git a/include/asm-ppc/irq.h b/include/asm-ppc/irq.h
index bfa3de404d27..df5b76306f7a 100644
--- a/include/asm-ppc/irq.h
+++ b/include/asm-ppc/irq.h
@@ -211,5 +211,9 @@ extern unsigned long ppc_cached_irq_mask[NR_MASK_WORDS];
 extern unsigned long ppc_lost_interrupts[NR_MASK_WORDS];
 extern atomic_t ppc_n_lost_interrupts;
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
 #endif /* __KERNEL__ */
diff --git a/include/asm-ppc64/irq.h b/include/asm-ppc64/irq.h
index 949e19f96be1..2cd77b4935fb 100644
--- a/include/asm-ppc64/irq.h
+++ b/include/asm-ppc64/irq.h
@@ -75,5 +75,9 @@ static __inline__ int irq_canonicalize(int irq)
 	return irq;
 }
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
 #endif /* __KERNEL__ */
diff --git a/include/asm-s390/irq.h b/include/asm-s390/irq.h
index 25f1808531cc..cac6b3080725 100644
--- a/include/asm-s390/irq.h
+++ b/include/asm-s390/irq.h
@@ -21,6 +21,10 @@ enum interruption_class {
 
 #define touch_nmi_watchdog() do { } while(0)
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* __KERNEL__ */
 #endif
 
diff --git a/include/asm-sh/irq.h b/include/asm-sh/irq.h
index f470f758057a..7dd2a5ae10b5 100644
--- a/include/asm-sh/irq.h
+++ b/include/asm-sh/irq.h
@@ -329,4 +329,8 @@ static inline int generic_irq_demux(int irq)
 #define irq_canonicalize(irq)	(irq)
 #define irq_demux(irq)		__irq_demux(sh_mv.mv_irq_demux(irq))
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* __ASM_SH_IRQ_H */
diff --git a/include/asm-sparc/irq.h b/include/asm-sparc/irq.h
index 5423905ffb40..cee356b0dae3 100644
--- a/include/asm-sparc/irq.h
+++ b/include/asm-sparc/irq.h
@@ -184,4 +184,8 @@ extern struct sun4m_intregs *sun4m_interrupts;
 #define SUN4M_INT_SBUS(x)	(1 << (x+7))
 #define SUN4M_INT_VME(x)	(1 << (x))
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
diff --git a/include/asm-sparc64/irq.h b/include/asm-sparc64/irq.h
index e3ba6bc2cc3e..3aef0ca67750 100644
--- a/include/asm-sparc64/irq.h
+++ b/include/asm-sparc64/irq.h
@@ -150,4 +150,8 @@ static __inline__ unsigned long get_softint(void)
 	return retval;
 }
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
diff --git a/include/asm-um/irq.h b/include/asm-um/irq.h
index cd580acadc71..8300c209a1bc 100644
--- a/include/asm-um/irq.h
+++ b/include/asm-um/irq.h
@@ -32,4 +32,9 @@ extern int um_request_irq(unsigned int irq, int fd, int type,
 			  void (*handler)(int, void *, struct pt_regs *),
 			  unsigned long irqflags,  const char * devname,
 			  void *dev_id);
+
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
diff --git a/include/asm-v850/irq.h b/include/asm-v850/irq.h
index 63e682d70de1..90c83aa053c8 100644
--- a/include/asm-v850/irq.h
+++ b/include/asm-v850/irq.h
@@ -65,4 +65,8 @@ extern void disable_irq_nosync (unsigned int irq);
 
 #endif /* !__ASSEMBLY__ */
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* __V850_IRQ_H__ */
diff --git a/include/asm-x86_64/irq.h b/include/asm-x86_64/irq.h
index ad5445ee7460..37c9fd65c97f 100644
--- a/include/asm-x86_64/irq.h
+++ b/include/asm-x86_64/irq.h
@@ -53,4 +53,8 @@ extern int can_request_irq(unsigned int, unsigned long flags);
 #define ARCH_HAS_NMI_WATCHDOG		/* See include/linux/nmi.h */
 #endif
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index c472cac3029d..265dad4c3cb4 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -3,7 +3,7 @@
 /* These definitions are for GCC v3.x.  */
 #include <linux/compiler-gcc.h>
 
-#if __GNUC_MINOR__ >= 1
+#if __GNUC_MINOR__ >= 1  && __GNUC_MINOR__ < 4
 # define inline		__inline__ __attribute__((always_inline))
 # define __inline__	__inline__ __attribute__((always_inline))
 # define __inline	__inline__ __attribute__((always_inline))
diff --git a/include/linux/irq.h b/include/linux/irq.h
index fa03b836c29a..5bc740d9bc47 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,7 +71,6 @@ extern irq_desc_t irq_desc [NR_IRQS];
 
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
-extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
 extern int setup_irq(unsigned int , struct irqaction * );
 
 extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 81c79736ff9e..58c915c202ff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>
 
+#include <asm/irq.h>
 /*
    - No shared variables, all the data are CPU local.
    - If a softirq needs serialization, let it serialize itself
@@ -69,53 +70,66 @@ static inline void wakeup_softirqd(void)
  */
 #define MAX_SOFTIRQ_RESTART 10
 
-asmlinkage void do_softirq(void)
+asmlinkage void __do_softirq(void)
 {
-	int max_restart = MAX_SOFTIRQ_RESTART;
+	struct softirq_action *h;
 	__u32 pending;
-	unsigned long flags;
+	int max_restart = MAX_SOFTIRQ_RESTART;
 
-	if (in_interrupt())
-		return;
+	pending = local_softirq_pending();
 
-	local_irq_save(flags);
+	local_bh_disable();
+restart:
+	/* Reset the pending bitmask before enabling irqs */
+	local_softirq_pending() = 0;
+
+	local_irq_enable();
+
+	h = softirq_vec;
+
+	do {
+		if (pending & 1)
+			h->action(h);
+		h++;
+		pending >>= 1;
+	} while (pending);
+
+	local_irq_disable();
 
 	pending = local_softirq_pending();
+	if (pending && --max_restart)
+		goto restart;
 
-	if (pending) {
-		struct softirq_action *h;
+	if (pending)
+		wakeup_softirqd();
 
-		local_bh_disable();
-restart:
-		/* Reset the pending bitmask before enabling irqs */
-		local_softirq_pending() = 0;
+	__local_bh_enable();
+}
 
-		local_irq_enable();
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+asmlinkage void do_softirq(void)
+{
+	__u32 pending;
+	unsigned long flags;
 
-		h = softirq_vec;
+	if (in_interrupt())
+		return;
 
-		do {
-			if (pending & 1)
-				h->action(h);
-			h++;
-			pending >>= 1;
-		} while (pending);
+	local_irq_save(flags);
 
-		local_irq_disable();
+	pending = local_softirq_pending();
 
-		pending = local_softirq_pending();
-		if (pending && --max_restart)
-			goto restart;
-		if (pending)
-			wakeup_softirqd();
-		__local_bh_enable();
-	}
+	if (pending)
+		__do_softirq();
 
 	local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(do_softirq);
 
+#endif
+
 void local_bh_enable(void)
 {
 	__local_bh_enable();
-- 
cgit v1.2.3


From d1e069e24b6da75b8177ca523916309e27cd08e3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:19:47 -0700
Subject: [PATCH] add stop_machine barriers

From: Andrea Arcangeli <andrea@suse.de>

We need a barrier before checking for kthread_should_stop in do_stop.
---
 kernel/stop_machine.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index a2cae39d322c..9610403ce2cf 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -149,10 +149,10 @@ static int do_stop(void *_smdata)
 	complete(&smdata->done);
 
 	/* Wait for kthread_stop */
-	__set_current_state(TASK_INTERRUPTIBLE);
+	set_current_state(TASK_INTERRUPTIBLE);
 	while (!kthread_should_stop()) {
 		schedule();
-		__set_current_state(TASK_INTERRUPTIBLE);
+		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
 	return ret;
-- 
cgit v1.2.3


From fa736caa043b56b67cf988e9962e941aa26c64f4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:23:53 -0700
Subject: [PATCH] pmdisk needs asmlinkage

From: Pavel Machek <pavel@ucw.cz>

This function will break with -mregparm, so mark it asmlinkage.
---
 kernel/power/pmdisk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/pmdisk.c b/kernel/power/pmdisk.c
index 1dc29b53a25e..d54147214bea 100644
--- a/kernel/power/pmdisk.c
+++ b/kernel/power/pmdisk.c
@@ -35,7 +35,7 @@
 #include "power.h"
 
 
-extern int pmdisk_arch_suspend(int resume);
+extern asmlinkage int pmdisk_arch_suspend(int resume);
 
 #define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
 #define ADDRESS(x) __ADDRESS((x) << PAGE_SHIFT)
-- 
cgit v1.2.3


From f85a96f63f300878dcc785cf2333cab15eef48f0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:29:12 -0700
Subject: [PATCH] Light-weight Auditing Framework

From: Rik Faith <faith@redhat.com>

This patch provides a low-overhead system-call auditing framework for Linux
that is usable by LSM components (e.g., SELinux).  This is an update of the
patch discussed in this thread:

    http://marc.theaimsgroup.com/?t=107815888100001&r=1&w=2

In brief, it provides for netlink-based logging of audit records that have
been generated in other parts of the kernel (e.g., SELinux) as well as the
ability to audit system calls, either independently (using simple
filtering) or as a compliment to the audit record that another part of the
kernel generated.

The main goals were to provide system call auditing with 1) as low overhead
as possible, and 2) without duplicating functionality that is already
provided by SELinux (and/or other security infrastructures).  This
framework will work "stand-alone", but is not designed to provide, e.g.,
CAPP functionality without another security component in place.

This updated patch includes changes from feedback I have received,
including the ability to compile without CONFIG_NET (and better use of
tabs, so use -w if you diff against the older patch).

Please see http://people.redhat.com/faith/audit/ for an early example
user-space client (auditd-0.4.tar.gz) and instructions on how to try it.

My future intentions at the kernel level include improving filtering (e.g.,
syscall personality/exit codes) and syscall support for more architectures.
 First, though, I'm going to work on documentation, a (real) audit daemon,
and patches for other user-space tools so that people can play with the
framework and understand how it can be used with and without SELinux.


Update:

Light-weight Auditing Framework receive filter fixes
From: Rik Faith <faith@redhat.com>

Since audit_receive_filter() is only called with audit_netlink_sem held, it
cannot race with either audit_del_rule() or audit_add_rule(), so the
list_for_each_entry_rcu()s may be replaced by list_for_each_entry()s, and
the rcu_read_{un,}lock()s removed.  A fix for this is part of the attached
patch.

Other features of the attached patch are:

1) generalized the ability to test for inequality

2) added syscall exit status reporting and testing

3) added ability to report and test first 4 syscall arguments (this adds
   a large amount of flexibility for little cost; not implemented or tested
   on ppc64)

4) added ability to report and test personality

User-space demo program enhanced for new fields and inequality testing:
http://people.redhat.com/faith/audit/auditd-0.5.tar.gz
---
 arch/i386/kernel/entry.S         |   6 +-
 arch/i386/kernel/ptrace.c        |  10 +
 arch/ppc64/kernel/entry.S        |  15 +-
 arch/ppc64/kernel/ptrace.c       |  29 +-
 arch/x86_64/ia32/ia32entry.S     |  18 +-
 arch/x86_64/kernel/entry.S       |  21 +-
 arch/x86_64/kernel/ptrace.c      |  30 +-
 fs/namei.c                       |  15 +-
 include/asm-i386/thread_info.h   |   6 +-
 include/asm-ppc64/thread_info.h  |   3 +
 include/asm-x86_64/thread_info.h |   5 +-
 include/linux/audit.h            | 211 +++++++++
 include/linux/fs.h               |  14 +-
 include/linux/netlink.h          |   1 +
 include/linux/sched.h            |   3 +
 init/Kconfig                     |  20 +
 kernel/Makefile                  |   2 +
 kernel/audit.c                   | 825 +++++++++++++++++++++++++++++++++++
 kernel/auditsc.c                 | 922 +++++++++++++++++++++++++++++++++++++++
 kernel/fork.c                    |  10 +-
 security/selinux/avc.c           | 168 ++++---
 security/selinux/include/avc.h   |   7 +-
 security/selinux/ss/services.c   |   2 +-
 23 files changed, 2199 insertions(+), 144 deletions(-)
 create mode 100644 include/linux/audit.h
 create mode 100644 kernel/audit.c
 create mode 100644 kernel/auditsc.c

(limited to 'kernel')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 14e64d3ea25c..afa02ea3592c 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -264,7 +264,7 @@ sysenter_past_esp:
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 
-	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)
@@ -287,7 +287,7 @@ ENTRY(system_call)
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 					# system call tracing in operation
-	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 syscall_call:
 	call *sys_call_table(,%eax,4)
@@ -354,7 +354,7 @@ syscall_trace_entry:
 	# perform syscall exit tracing
 	ALIGN
 syscall_exit_work:
-	testb $_TIF_SYSCALL_TRACE, %cl
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT), %cl
 	jz work_pending
 	sti				# could let do_syscall_trace() call
 					# schedule() instead
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index a77061138b0c..9f9b32a3f228 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -14,6 +14,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -524,6 +525,15 @@ out:
 __attribute__((regparm(3)))
 void do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
+	if (unlikely(current->audit_context)) {
+		if (!entryexit)
+			audit_syscall_entry(current, regs->orig_eax,
+					    regs->ebx, regs->ecx,
+					    regs->edx, regs->esi);
+		else
+			audit_syscall_exit(current, regs->eax);
+	}
+
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return;
 	if (!(current->ptrace & PT_PTRACED))
diff --git a/arch/ppc64/kernel/entry.S b/arch/ppc64/kernel/entry.S
index 027967ba3ae4..4ad95bcc5b3e 100644
--- a/arch/ppc64/kernel/entry.S
+++ b/arch/ppc64/kernel/entry.S
@@ -95,7 +95,7 @@ _GLOBAL(DoSyscall)
 #endif /* SHOW_SYSCALLS */
 	clrrdi	r10,r1,THREAD_SHIFT
 	ld	r10,TI_FLAGS(r10)
-	andi.	r11,r10,_TIF_SYSCALL_TRACE
+	andi.	r11,r10,_TIF_SYSCALL_T_OR_A
 	bne-	50f
 	cmpli	0,r0,NR_syscalls
 	bge-	66f
@@ -151,7 +151,8 @@ _GLOBAL(ret_from_syscall_1)
 	b	22b
         
 /* Traced system call support */
-50:	bl	.do_syscall_trace
+50:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_syscall_trace_enter
 	ld	r0,GPR0(r1)	/* Restore original registers */
 	ld	r3,GPR3(r1)
 	ld	r4,GPR4(r1)
@@ -201,7 +202,7 @@ _GLOBAL(ret_from_syscall_2)
 	oris	r10,r10,0x1000
 	std	r10,_CCR(r1)
 60:	std	r3,GPR3(r1)	/* Update return value */
-	bl	.do_syscall_trace
+	bl	.do_syscall_trace_leave
 	b	.ret_from_except
 66:	li	r3,ENOSYS
 	b	57b
@@ -234,14 +235,14 @@ _GLOBAL(ppc64_rt_sigreturn)
 
 80:	clrrdi	r4,r1,THREAD_SHIFT
 	ld	r4,TI_FLAGS(r4)
-	andi.	r4,r4,_TIF_SYSCALL_TRACE
+	andi.	r4,r4,_TIF_SYSCALL_T_OR_A
 	bne-	81f
 	cmpi	0,r3,0
 	bge	.ret_from_except
 	b	.ret_from_syscall_1
 81:	cmpi	0,r3,0
 	blt	.ret_from_syscall_2
-	bl	.do_syscall_trace
+	bl	.do_syscall_trace_leave
 	b	.ret_from_except
 
 /*
@@ -352,9 +353,9 @@ _GLOBAL(ret_from_fork)
 	bl	.schedule_tail
 	clrrdi	r4,r1,THREAD_SHIFT
 	ld	r4,TI_FLAGS(r4)
-	andi.	r4,r4,_TIF_SYSCALL_TRACE
+	andi.	r4,r4,_TIF_SYSCALL_T_OR_A
 	beq+	.ret_from_except
-	bl	.do_syscall_trace
+	bl	.do_syscall_trace_leave
 	b	.ret_from_except
 
 _GLOBAL(ret_from_except)
diff --git a/arch/ppc64/kernel/ptrace.c b/arch/ppc64/kernel/ptrace.c
index 6bf102811810..6afe71a7d56c 100644
--- a/arch/ppc64/kernel/ptrace.c
+++ b/arch/ppc64/kernel/ptrace.c
@@ -26,6 +26,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -286,12 +287,8 @@ out:
 	return ret;
 }
 
-void do_syscall_trace(void)
+static void do_syscall_trace(void)
 {
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return;
-	if (!(current->ptrace & PT_PTRACED))
-		return;
 	/* the 0x80 provides a way for the tracing parent to distinguish
 	   between a syscall stop and SIGTRAP delivery */
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
@@ -307,3 +304,25 @@ void do_syscall_trace(void)
 		current->exit_code = 0;
 	}
 }
+
+void do_syscall_trace_enter(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(current, regs->gpr[0],
+				    regs->gpr[3], regs->gpr[4],
+				    regs->gpr[5], regs->gpr[6]);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+}
+
+void do_syscall_trace_leave(void)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, 0);	/* FIXME: pass pt_regs */
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+}
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index aea26e4a8405..4e7ab108e8ac 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -78,8 +78,8 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
-	bt  $TIF_SYSCALL_TRACE,threadinfo_flags(%r10)
-	jc  sysenter_tracesys
+	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	jnz  sysenter_tracesys
 sysenter_do_call:	
 	cmpl	$(IA32_NR_syscalls),%eax
 	jae	ia32_badsys
@@ -106,7 +106,7 @@ sysenter_tracesys:
 	CLEAR_RREGS
 	movq	$-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
-	call	syscall_trace
+	call	syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	movl	%ebp, %ebp
@@ -163,8 +163,8 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
-	bt  $TIF_SYSCALL_TRACE,threadinfo_flags(%r10)
-	jc  cstar_tracesys
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	jnz   cstar_tracesys
 cstar_do_call:	
 	cmpl $IA32_NR_syscalls,%eax
 	jae  ia32_badsys
@@ -187,7 +187,7 @@ cstar_tracesys:
 	CLEAR_RREGS
 	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace
+	call syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	movl RSP-ARGOFFSET(%rsp), %r8d
@@ -236,8 +236,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,0,1
 	GET_THREAD_INFO(%r10)
-	bt $TIF_SYSCALL_TRACE,threadinfo_flags(%r10)
-	jc ia32_tracesys
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	jnz ia32_tracesys
 ia32_do_syscall:	
 	cmpl $(IA32_NR_syscalls),%eax
 	jae  ia32_badsys
@@ -251,7 +251,7 @@ ia32_tracesys:
 	SAVE_REST
 	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace
+	call syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	jmp ia32_do_syscall
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index a6309212038d..89f74f738a2a 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -131,8 +131,8 @@ ENTRY(ret_from_fork)
 	CFI_DEFAULT_STACK
 	call schedule_tail
 	GET_THREAD_INFO(%rcx)
-	bt $TIF_SYSCALL_TRACE,threadinfo_flags(%rcx)
-	jc rff_trace
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	jnz rff_trace
 rff_action:	
 	RESTORE_REST
 	testl $3,CS-ARGOFFSET(%rsp)	# from kernel_thread?
@@ -143,7 +143,7 @@ rff_action:
 	jmp ret_from_sys_call
 rff_trace:
 	movq %rsp,%rdi
-	call syscall_trace
+	call syscall_trace_leave
 	GET_THREAD_INFO(%rcx)	
 	jmp rff_action
 	CFI_ENDPROC
@@ -185,8 +185,8 @@ ENTRY(system_call)
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
 	movq  %rcx,RIP-ARGOFFSET(%rsp)  
 	GET_THREAD_INFO(%rcx)
-	bt    $TIF_SYSCALL_TRACE,threadinfo_flags(%rcx) 
-	jc    tracesys
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
 	movq %r10,%rcx
@@ -244,7 +244,7 @@ tracesys:
 	movq $-ENOSYS,RAX(%rsp)
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
-	call syscall_trace
+	call syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	cmpq $__NR_syscall_max,%rax
@@ -254,7 +254,7 @@ tracesys:
 	movq %rax,RAX-ARGOFFSET(%rsp)
 1:	SAVE_REST
 	movq %rsp,%rdi
-	call syscall_trace
+	call syscall_trace_leave
 	RESTORE_TOP_OF_STACK %rbx
 	RESTORE_REST
 	jmp ret_from_sys_call
@@ -297,13 +297,14 @@ int_very_careful:
 	sti
 	SAVE_REST
 	/* Check for syscall exit trace */	
-	bt $TIF_SYSCALL_TRACE,%edx
-	jnc int_signal
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),%edx
+	jz int_signal
 	pushq %rdi
 	leaq 8(%rsp),%rdi	# &ptregs -> arg1	
-	call syscall_trace
+	call syscall_trace_leave
 	popq %rdi
 	btr  $TIF_SYSCALL_TRACE,%edi
+	btr  $TIF_SYSCALL_AUDIT,%edi
 	jmp int_restore_rest
 	
 int_signal:
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 8aad386340dc..a97bee6f29fa 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -16,6 +16,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -486,7 +487,7 @@ out:
 	return ret;
 }
 
-asmlinkage void syscall_trace(struct pt_regs *regs)
+static void syscall_trace(struct pt_regs *regs)
 {
 
 #if 0
@@ -496,11 +497,6 @@ asmlinkage void syscall_trace(struct pt_regs *regs)
 	       current_thread_info()->flags, current->ptrace); 
 #endif
 
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return; 
-	if (!(current->ptrace & PT_PTRACED))
-		return;
-	
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
 				? 0x80 : 0));
 	/*
@@ -513,3 +509,25 @@ asmlinkage void syscall_trace(struct pt_regs *regs)
 		current->exit_code = 0;
 	}
 }
+
+asmlinkage void syscall_trace_enter(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(current, regs->orig_rax,
+				    regs->rdi, regs->rsi,
+				    regs->rdx, regs->r10);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace(regs);
+}
+
+asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, regs->rax);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace(regs);
+}
diff --git a/fs/namei.c b/fs/namei.c
index e6320d133c5f..d2cab643cf64 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -26,6 +26,7 @@
 #include <linux/personality.h>
 #include <linux/security.h>
 #include <linux/mount.h>
+#include <linux/audit.h>
 #include <asm/namei.h>
 #include <asm/uaccess.h>
 
@@ -141,10 +142,12 @@ char * getname(const char __user * filename)
 
 		result = tmp;
 		if (retval < 0) {
-			putname(tmp);
+			__putname(tmp);
 			result = ERR_PTR(retval);
 		}
 	}
+	if (unlikely(current->audit_context) && !IS_ERR(result) && result)
+		audit_getname(result);
 	return result;
 }
 
@@ -860,6 +863,8 @@ walk_init_root(const char *name, struct nameidata *nd)
 
 int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
 {
+	int retval;
+
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
 
@@ -882,7 +887,13 @@ int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata
 	}
 	read_unlock(&current->fs->lock);
 	current->total_link_count = 0;
-	return link_path_walk(name, nd);
+	retval = link_path_walk(name, nd);
+	if (unlikely(current->audit_context
+		     && nd && nd->dentry && nd->dentry->d_inode))
+		audit_inode(name,
+			    nd->dentry->d_inode->i_ino,
+			    nd->dentry->d_inode->i_rdev);
+	return retval;
 }
 
 /*
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index da5c780f2c5c..6f59e1fe345b 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -151,6 +151,7 @@ static inline unsigned long current_stack_pointer(void)
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
+#define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -159,9 +160,12 @@ static inline unsigned long current_stack_pointer(void)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_IRET		(1<<TIF_IRET)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
-#define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
+/* work to do on interrupt/exception return */
+#define _TIF_WORK_MASK \
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
 
 /*
diff --git a/include/asm-ppc64/thread_info.h b/include/asm-ppc64/thread_info.h
index 5b74b149f04f..297c974bf220 100644
--- a/include/asm-ppc64/thread_info.h
+++ b/include/asm-ppc64/thread_info.h
@@ -97,6 +97,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_32BIT		5	/* 32 bit binary */
 #define TIF_RUN_LIGHT		6	/* iSeries run light */
 #define TIF_ABI_PENDING		7	/* 32/64 bit switch needed */
+#define TIF_SYSCALL_AUDIT	8	/* syscall auditing active */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_32BIT		(1<<TIF_32BIT)
 #define _TIF_RUN_LIGHT		(1<<TIF_RUN_LIGHT)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)
 
 #define _TIF_USER_WORK_MASK	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
 				 _TIF_NEED_RESCHED)
diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h
index 0145da994590..73e4fa13ed0c 100644
--- a/include/asm-x86_64/thread_info.h
+++ b/include/asm-x86_64/thread_info.h
@@ -101,6 +101,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
 #define TIF_IRET		5	/* force IRET */
+#define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
@@ -112,13 +113,15 @@ static inline struct thread_info *stack_thread_info(void)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_IRET		(1<<TIF_IRET)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
 
 /* work to do on interrupt/exception return */
-#define _TIF_WORK_MASK    (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SINGLESTEP))
+#define _TIF_WORK_MASK \
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP))
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK 0x0000FFFF	
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
new file mode 100644
index 000000000000..d766482451af
--- /dev/null
+++ b/include/linux/audit.h
@@ -0,0 +1,211 @@
+/* audit.h -- Auditing support -*- linux-c -*-
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ */
+
+#ifndef _LINUX_AUDIT_H_
+#define _LINUX_AUDIT_H_
+
+/* Request and reply types */
+#define AUDIT_GET      1000	/* Get status */
+#define AUDIT_SET      1001	/* Set status (enable/disable/auditd) */
+#define AUDIT_LIST     1002	/* List filtering rules */
+#define AUDIT_ADD      1003	/* Add filtering rule */
+#define AUDIT_DEL      1004	/* Delete filtering rule */
+#define AUDIT_USER     1005	/* Send a message from user-space */
+#define AUDIT_LOGIN    1006     /* Define the login id and informaiton */
+#define AUDIT_KERNEL   2000	/* Asynchronous audit record. NOT A REQUEST. */
+
+/* Rule flags */
+#define AUDIT_PER_TASK 0x01	/* Apply rule at task creation (not syscall) */
+#define AUDIT_AT_ENTRY 0x02	/* Apply rule at syscall entry */
+#define AUDIT_AT_EXIT  0x04	/* Apply rule at syscall exit */
+#define AUDIT_PREPEND  0x10	/* Prepend to front of list */
+
+/* Rule actions */
+#define AUDIT_NEVER    0	/* Do not build context if rule matches */
+#define AUDIT_POSSIBLE 1	/* Build context if rule matches  */
+#define AUDIT_ALWAYS   2	/* Generate audit record if rule matches */
+
+/* Rule structure sizes -- if these change, different AUDIT_ADD and
+ * AUDIT_LIST commands must be implemented. */
+#define AUDIT_MAX_FIELDS   64
+#define AUDIT_BITMASK_SIZE 64
+#define AUDIT_WORD(nr) ((__u32)((nr)/32))
+#define AUDIT_BIT(nr)  (1 << ((nr) - AUDIT_WORD(nr)*32))
+
+/* Rule fields */
+				/* These are useful when checking the
+				 * task structure at task creation time
+				 * (AUDIT_PER_TASK).  */
+#define AUDIT_PID	0
+#define AUDIT_UID	1
+#define AUDIT_EUID	2
+#define AUDIT_SUID	3
+#define AUDIT_FSUID	4
+#define AUDIT_GID	5
+#define AUDIT_EGID	6
+#define AUDIT_SGID	7
+#define AUDIT_FSGID	8
+#define AUDIT_LOGINUID	9
+#define AUDIT_PERS	10
+
+				/* These are ONLY useful when checking
+				 * at syscall exit time (AUDIT_AT_EXIT). */
+#define AUDIT_DEVMAJOR	100
+#define AUDIT_DEVMINOR	101
+#define AUDIT_INODE	102
+#define AUDIT_EXIT	103
+#define AUDIT_SUCCESS   104	/* exit >= 0; value ignored */
+
+#define AUDIT_ARG0      200
+#define AUDIT_ARG1      (AUDIT_ARG0+1)
+#define AUDIT_ARG2      (AUDIT_ARG0+2)
+#define AUDIT_ARG3      (AUDIT_ARG0+3)
+
+#define AUDIT_NEGATE    0x80000000
+
+
+/* Status symbols */
+				/* Mask values */
+#define AUDIT_STATUS_ENABLED		0x0001
+#define AUDIT_STATUS_FAILURE		0x0002
+#define AUDIT_STATUS_PID		0x0004
+#define AUDIT_STATUS_RATE_LIMIT		0x0008
+#define AUDIT_STATUS_BACKLOG_LIMIT	0x0010
+				/* Failure-to-log actions */
+#define AUDIT_FAIL_SILENT	0
+#define AUDIT_FAIL_PRINTK	1
+#define AUDIT_FAIL_PANIC	2
+
+#ifndef __KERNEL__
+struct audit_message {
+	struct nlmsghdr nlh;
+	char		data[1200];
+};
+#endif
+
+struct audit_status {
+	__u32		mask;		/* Bit mask for valid entries */
+	__u32		enabled;	/* 1 = enabled, 0 = disbaled */
+	__u32		failure;	/* Failure-to-log action */
+	__u32		pid;		/* pid of auditd process */
+	__u32		rate_limit;	/* messages rate limit (per second) */
+	__u32		backlog_limit;	/* waiting messages limit */
+	__u32		lost;		/* messages lost */
+	__u32		backlog;	/* messages waiting in queue */
+};
+
+struct audit_login {
+	__u32		loginuid;
+	int		msglen;
+	char		msg[1024];
+};
+
+struct audit_rule {		/* for AUDIT_LIST, AUDIT_ADD, and AUDIT_DEL */
+	__u32		flags;	/* AUDIT_PER_{TASK,CALL}, AUDIT_PREPEND */
+	__u32		action;	/* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */
+	__u32		field_count;
+	__u32		mask[AUDIT_BITMASK_SIZE];
+	__u32		fields[AUDIT_MAX_FIELDS];
+	__u32		values[AUDIT_MAX_FIELDS];
+};
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_AUDIT
+struct audit_buffer;
+struct audit_context;
+#endif
+
+#ifdef CONFIG_AUDITSYSCALL
+/* These are defined in auditsc.c */
+				/* Public API */
+extern int  audit_alloc(struct task_struct *task);
+extern void audit_free(struct task_struct *task);
+extern void audit_syscall_entry(struct task_struct *task,
+				int major, unsigned long a0, unsigned long a1,
+				unsigned long a2, unsigned long a3);
+extern void audit_syscall_exit(struct task_struct *task, int return_code);
+extern void audit_getname(const char *name);
+extern void audit_putname(const char *name);
+extern void audit_inode(const char *name, unsigned long ino, dev_t rdev);
+
+				/* Private API (for audit.c only) */
+extern int  audit_receive_filter(int type, int pid, int uid, int seq,
+				 void *data);
+extern void audit_get_stamp(struct audit_context *ctx,
+			    struct timespec *t, int *serial);
+extern int  audit_set_loginuid(struct audit_context *ctx, uid_t loginuid);
+#else
+#define audit_alloc(t) ({ 0; })
+#define audit_free(t) do { ; } while (0)
+#define audit_syscall_entry(t,a,b,c,d,e) do { ; } while (0)
+#define audit_syscall_exit(t,r) do { ; } while (0)
+#define audit_getname(n) do { ; } while (0)
+#define audit_putname(n) do { ; } while (0)
+#define audit_inode(n,i,d) do { ; } while (0)
+#endif
+
+#ifdef CONFIG_AUDIT
+/* These are defined in audit.c */
+				/* Public API */
+extern void		    audit_log(struct audit_context *ctx,
+				      const char *fmt, ...)
+			    __attribute__((format(printf,2,3)));
+
+extern struct audit_buffer *audit_log_start(struct audit_context *ctx);
+extern void		    audit_log_format(struct audit_buffer *ab,
+					     const char *fmt, ...)
+			    __attribute__((format(printf,2,3)));
+extern void		    audit_log_end(struct audit_buffer *ab);
+extern void		    audit_log_end_fast(struct audit_buffer *ab);
+extern void		    audit_log_end_irq(struct audit_buffer *ab);
+extern void		    audit_log_d_path(struct audit_buffer *ab,
+					     const char *prefix,
+					     struct dentry *dentry,
+					     struct vfsmount *vfsmnt);
+extern int		    audit_set_rate_limit(int limit);
+extern int		    audit_set_backlog_limit(int limit);
+extern int		    audit_set_enabled(int state);
+extern int		    audit_set_failure(int state);
+
+				/* Private API (for auditsc.c only) */
+extern void		    audit_send_reply(int pid, int seq, int type,
+					     int done, int multi,
+					     void *payload, int size);
+extern void		    audit_log_lost(const char *message);
+#else
+#define audit_log(t,f,...) do { ; } while (0)
+#define audit_log_start(t) ({ NULL; })
+#define audit_log_vformat(b,f,a) do { ; } while (0)
+#define audit_log_format(b,f,...) do { ; } while (0)
+#define audit_log_end(b) do { ; } while (0)
+#define audit_log_end_fast(b) do { ; } while (0)
+#define audit_log_end_irq(b) do { ; } while (0)
+#define audit_log_d_path(b,p,d,v) do { ; } while (0)
+#define audit_set_rate_limit(l) do { ; } while (0)
+#define audit_set_backlog_limit(l) do { ; } while (0)
+#define audit_set_enabled(s) do { ; } while (0)
+#define audit_set_failure(s) do { ; } while (0)
+#endif
+#endif
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bacf6bcbc7b7..39c893f8aa28 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -20,6 +20,7 @@
 #include <linux/radix-tree.h>
 #include <linux/kobject.h>
 #include <asm/atomic.h>
+#include <linux/audit.h>
 
 struct iovec;
 struct nameidata;
@@ -1159,7 +1160,18 @@ extern char * getname(const char __user *);
 extern void vfs_caches_init(unsigned long);
 
 #define __getname()	kmem_cache_alloc(names_cachep, SLAB_KERNEL)
-#define putname(name)	kmem_cache_free(names_cachep, (void *)(name))
+#define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
+#ifndef CONFIG_AUDITSYSCALL
+#define putname(name)   __putname(name)
+#else
+#define putname(name)							\
+	do {								\
+		if (unlikely(current->audit_context))			\
+			audit_putname(name);				\
+		else							\
+			__putname(name);				\
+	} while (0)
+#endif
 
 extern int register_blkdev(unsigned int, const char *);
 extern int unregister_blkdev(unsigned int, const char *);
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index e5e15ddadab5..5adca479de6e 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -13,6 +13,7 @@
 #define NETLINK_XFRM		6	/* ipsec */
 #define NETLINK_SELINUX		7	/* SELinux event notifications */
 #define NETLINK_ARPD		8
+#define NETLINK_AUDIT		9	/* auditing */
 #define NETLINK_ROUTE6		11	/* af_inet6 route comm channel */
 #define NETLINK_IP6_FW		13
 #define NETLINK_DNRTMSG		14	/* DECnet routing messages */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 22080f919266..b72c38420d71 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -371,6 +371,8 @@ int set_current_groups(struct group_info *group_info);
     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
 
 
+struct audit_context;		/* See audit.c */
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -474,6 +476,7 @@ struct task_struct {
 	sigset_t *notifier_mask;
 	
 	void *security;
+	struct audit_context *audit_context;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/init/Kconfig b/init/Kconfig
index ddd82dbad5dd..55261afdc3bf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -137,6 +137,26 @@ config SYSCTL
 	  building a kernel for install/rescue disks or your system is very
 	  limited in memory.
 
+config AUDIT
+	bool "Auditing support"
+	default y if SECURITY_SELINUX
+	default n
+	help
+	  Enable auditing infrastructure that can be used with another
+	  kernel subsystem, such as SELinux (which requires this for
+	  logging of avc messages output).  Does not do system-call
+	  auditing without CONFIG_AUDITSYSCALL.
+
+config AUDITSYSCALL
+	bool "Enable system-call auditing support"
+	depends on AUDIT && (X86 || PPC64)
+	default y if SECURITY_SELINUX
+	default n
+	help
+	  Enable low-overhead system-call auditing infrastructure that
+	  can be used independently or with another kernel subsystem,
+	  such as SELinux.
+
 config LOG_BUF_SHIFT
 	int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL
 	range 12 20
diff --git a/kernel/Makefile b/kernel/Makefile
index 3a6484838748..238c65f60d9e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,8 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000000..765822b03b91
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,825 @@
+/* audit.c -- Auditing support -*- linux-c -*-
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Goals: 1) Integrate fully with SELinux.
+ *	  2) Minimal run-time overhead:
+ *	     a) Minimal when syscall auditing is disabled (audit_enable=0).
+ *	     b) Small when syscall auditing is enabled and no audit record
+ *		is generated (defer as much work as possible to record
+ *		generation time):
+ *		i) context is allocated,
+ *		ii) names from getname are stored without a copy, and
+ *		iii) inode information stored from path_lookup.
+ *	  3) Ability to disable syscall auditing at boot time (audit=0).
+ *	  4) Usable by other parts of the kernel (if audit_log* is called,
+ *	     then a syscall record will be generated automatically for the
+ *	     current syscall).
+ *	  5) Netlink interface to user-space.
+ *	  6) Support low-overhead kernel-based filtering to minimize the
+ *	     information that must be passed to user-space.
+ *
+ * Example user-space utilities: http://people.redhat.com/faith/audit/
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+
+/* No auditing will take place until audit_initialized != 0.
+ * (Initialization happens after skb_init is called.) */
+static int	audit_initialized;
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+int		audit_enabled;
+
+/* Default state when kernel boots without any parameters. */
+static int	audit_default;
+
+/* If auditing cannot proceed, audit_failure selects what happens. */
+static int	audit_failure = AUDIT_FAIL_PRINTK;
+
+/* If audit records are to be written to the netlink socket, audit_pid
+ * contains the (non-zero) pid. */
+static int	audit_pid;
+
+/* If audit_limit is non-zero, limit the rate of sending audit records
+ * to that number per second.  This prevents DoS attacks, but results in
+ * audit records being dropped. */
+static int	audit_rate_limit;
+
+/* Number of outstanding audit_buffers allowed. */
+static int	audit_backlog_limit = 64;
+static atomic_t	audit_backlog	    = ATOMIC_INIT(0);
+
+/* Records can be lost in several ways:
+   0) [suppressed in audit_alloc]
+   1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
+   2) out of memory in audit_log_move [alloc_skb]
+   3) suppressed due to audit_rate_limit
+   4) suppressed due to audit_backlog_limit
+*/
+static atomic_t    audit_lost = ATOMIC_INIT(0);
+
+/* The netlink socket. */
+static struct sock *audit_sock;
+
+/* There are two lists of audit buffers.  The txlist contains audit
+ * buffers that cannot be sent immediately to the netlink device because
+ * we are in an irq context (these are sent later in a tasklet).
+ *
+ * The second list is a list of pre-allocated audit buffers (if more
+ * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
+ * being placed on the freelist). */
+static spinlock_t  audit_txlist_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t  audit_freelist_lock = SPIN_LOCK_UNLOCKED;
+static int	   audit_freelist_count = 0;
+static LIST_HEAD(audit_txlist);
+static LIST_HEAD(audit_freelist);
+
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+/* The netlink socket is only to be read by 1 CPU, which lets us assume
+ * that list additions and deletions never happen simultaneiously in
+ * auditsc.c */
+static DECLARE_MUTEX(audit_netlink_sem);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records.  Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist.  Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE  (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record.  The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue.  Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+	struct list_head     list;
+	struct sk_buff_head  sklist;	/* formatted skbs ready to send */
+	struct audit_context *ctx;	/* NULL or associated context */
+	int		     len;	/* used area of tmp */
+	char		     tmp[AUDIT_BUFSIZ];
+
+				/* Pointer to header and contents */
+	struct nlmsghdr      *nlh;
+	int		     total;
+	int		     type;
+	int		     pid;
+	int		     count; /* Times requeued */
+};
+
+struct audit_entry {
+	struct list_head  list;
+	struct audit_rule rule;
+};
+
+static void audit_panic(const char *message)
+{
+	switch (audit_failure)
+	{
+	case AUDIT_FAIL_SILENT:
+		break;
+	case AUDIT_FAIL_PRINTK:
+		printk(KERN_ERR "audit: %s\n", message);
+		break;
+	case AUDIT_FAIL_PANIC:
+		panic(message);
+		break;
+	}
+}
+
+static inline int audit_rate_check(void)
+{
+	static unsigned long	last_check = 0;
+	static int		messages   = 0;
+	static spinlock_t	lock	   = SPIN_LOCK_UNLOCKED;
+	unsigned long		flags;
+	unsigned long		now;
+	unsigned long		elapsed;
+	int			retval	   = 0;
+
+	if (!audit_rate_limit) return 1;
+
+	spin_lock_irqsave(&lock, flags);
+	if (++messages < audit_rate_limit) {
+		retval = 1;
+	} else {
+		now     = jiffies;
+		elapsed = now - last_check;
+		if (elapsed > HZ) {
+			last_check = now;
+			messages   = 0;
+			retval     = 1;
+		}
+	}
+	spin_unlock_irqrestore(&lock, flags);
+
+	return retval;
+}
+
+/* Emit at least 1 message per second, even if audit_rate_check is
+ * throttling. */
+void audit_log_lost(const char *message)
+{
+	static unsigned long	last_msg = 0;
+	static spinlock_t	lock     = SPIN_LOCK_UNLOCKED;
+	unsigned long		flags;
+	unsigned long		now;
+	int			print;
+
+	atomic_inc(&audit_lost);
+
+	print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+	if (!print) {
+		spin_lock_irqsave(&lock, flags);
+		now = jiffies;
+		if (now - last_msg > HZ) {
+			print = 1;
+			last_msg = now;
+		}
+		spin_unlock_irqrestore(&lock, flags);
+	}
+
+	if (print) {
+		printk(KERN_WARNING
+		       "audit: audit_lost=%d audit_backlog=%d"
+		       " audit_rate_limit=%d audit_backlog_limit=%d\n",
+		       atomic_read(&audit_lost),
+		       atomic_read(&audit_backlog),
+		       audit_rate_limit,
+		       audit_backlog_limit);
+		audit_panic(message);
+	}
+
+}
+
+int audit_set_rate_limit(int limit)
+{
+	int old		 = audit_rate_limit;
+	audit_rate_limit = limit;
+	audit_log(current->audit_context, "audit_rate_limit=%d old=%d",
+		  audit_rate_limit, old);
+	return old;
+}
+
+int audit_set_backlog_limit(int limit)
+{
+	int old		 = audit_backlog_limit;
+	audit_backlog_limit = limit;
+	audit_log(current->audit_context, "audit_backlog_limit=%d old=%d",
+		  audit_backlog_limit, old);
+	return old;
+}
+
+int audit_set_enabled(int state)
+{
+	int old		 = audit_enabled;
+	if (state != 0 && state != 1)
+		return -EINVAL;
+	audit_enabled = state;
+	audit_log(current->audit_context, "audit_enabled=%d old=%d",
+		  audit_enabled, old);
+	return old;
+}
+
+int audit_set_failure(int state)
+{
+	int old		 = audit_failure;
+	if (state != AUDIT_FAIL_SILENT
+	    && state != AUDIT_FAIL_PRINTK
+	    && state != AUDIT_FAIL_PANIC)
+		return -EINVAL;
+	audit_failure = state;
+	audit_log(current->audit_context, "audit_failure=%d old=%d",
+		  audit_failure, old);
+	return old;
+}
+
+#ifdef CONFIG_NET
+void audit_send_reply(int pid, int seq, int type, int done, int multi,
+		      void *payload, int size)
+{
+	struct sk_buff	*skb;
+	struct nlmsghdr	*nlh;
+	int		len = NLMSG_SPACE(size);
+	void		*data;
+	int		flags = multi ? NLM_F_MULTI : 0;
+	int		t     = done  ? NLMSG_DONE  : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		goto nlmsg_failure;
+
+	nlh		 = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+	nlh->nlmsg_flags = flags;
+	data		 = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+	return;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	u32			uid, pid, seq;
+	void			*data;
+	struct audit_status	*status_get, status_set;
+	struct audit_login	*login;
+	int			err = 0;
+	struct audit_buffer	*ab;
+
+	pid  = NETLINK_CREDS(skb)->pid;
+	uid  = NETLINK_CREDS(skb)->uid;
+	seq  = nlh->nlmsg_seq;
+	data = NLMSG_DATA(nlh);
+
+	switch (nlh->nlmsg_type) {
+	case AUDIT_GET:
+		status_set.enabled	 = audit_enabled;
+		status_set.failure	 = audit_failure;
+		status_set.pid		 = audit_pid;
+		status_set.rate_limit	 = audit_rate_limit;
+		status_set.backlog_limit = audit_backlog_limit;
+		status_set.lost		 = atomic_read(&audit_lost);
+		status_set.backlog	 = atomic_read(&audit_backlog);
+		audit_send_reply(pid, seq, AUDIT_GET, 0, 0,
+				 &status_set, sizeof(status_set));
+		break;
+	case AUDIT_SET:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		status_get   = (struct audit_status *)data;
+		if (status_get->mask & AUDIT_STATUS_ENABLED) {
+			err = audit_set_enabled(status_get->enabled);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_FAILURE) {
+			err = audit_set_failure(status_get->failure);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_PID) {
+			int old   = audit_pid;
+			audit_pid = status_get->pid;
+			audit_log(current->audit_context,
+				  "audit_pid=%d old=%d", audit_pid, old);
+		}
+		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+			audit_set_rate_limit(status_get->rate_limit);
+		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
+			audit_set_backlog_limit(status_get->backlog_limit);
+		break;
+	case AUDIT_USER:
+		ab = audit_log_start(NULL);
+		if (!ab)
+			break;	/* audit_panic has been called */
+		audit_log_format(ab,
+				 "user pid=%d uid=%d length=%d msg='%.1024s'",
+				 pid, uid,
+				 (int)(nlh->nlmsg_len
+				       - ((char *)data - (char *)nlh)),
+				 (char *)data);
+		ab->type = AUDIT_USER;
+		ab->pid  = pid;
+		audit_log_end(ab);
+		break;
+	case AUDIT_LOGIN:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		login = (struct audit_login *)data;
+		ab = audit_log_start(NULL);
+		if (ab) {
+			audit_log_format(ab, "login pid=%d uid=%d loginuid=%d"
+					 " length=%d msg='%.1024s'",
+					 pid, uid,
+					 login->loginuid,
+					 login->msglen,
+					 login->msg);
+			ab->type = AUDIT_LOGIN;
+			ab->pid  = pid;
+			audit_log_end(ab);
+		}
+#ifdef CONFIG_AUDITSYSCALL
+		err = audit_set_loginuid(current->audit_context,
+					 login->loginuid);
+#endif
+		break;
+	case AUDIT_LIST:
+	case AUDIT_ADD:
+	case AUDIT_DEL:
+#ifdef CONFIG_AUDITSYSCALL
+		err = audit_receive_filter(nlh->nlmsg_type, pid, uid, seq,
+					   data);
+#else
+		err = -EOPNOTSUPP;
+#endif
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err < 0 ? err : 0;
+}
+
+/* Get message from skb (based on rtnetlink_rcv_skb).  Each message is
+ * processed by audit_receive_msg.  Malformed skbs with wrong length are
+ * discarded silently.  */
+static int audit_receive_skb(struct sk_buff *skb)
+{
+	int		err;
+	struct nlmsghdr	*nlh;
+	u32		rlen;
+
+	while (skb->len >= NLMSG_SPACE(0)) {
+		nlh = (struct nlmsghdr *)skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+			return 0;
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		if ((err = audit_receive_msg(skb, nlh))) {
+			netlink_ack(skb, nlh, -err);
+		} else if (nlh->nlmsg_flags & NLM_F_ACK)
+			netlink_ack(skb, nlh, 0);
+		skb_pull(skb, rlen);
+	}
+	return 0;
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sock *sk, int length)
+{
+	struct sk_buff  *skb;
+
+	if (down_trylock(&audit_netlink_sem))
+		return;
+
+				/* FIXME: this must not cause starvation */
+	while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+		if (audit_receive_skb(skb) && skb->len)
+			skb_queue_head(&sk->sk_receive_queue, skb);
+		else
+			kfree_skb(skb);
+	}
+	up(&audit_netlink_sem);
+}
+
+/* Move data from tmp buffer into an skb.  This is an extra copy, and
+ * that is unfortunate.  However, the copy will only occur when a record
+ * is being written to user space, which is already a high-overhead
+ * operation.  (Elimination of the copy is possible, for example, by
+ * writing directly into a pre-allocated skb, at the cost of wasting
+ * memory. */
+static void audit_log_move(struct audit_buffer *ab)
+{
+	struct sk_buff	*skb;
+	char		*start;
+	int		extra = ab->nlh ? 0 : NLMSG_SPACE(0);
+
+	skb = skb_peek(&ab->sklist);
+	if (!skb || skb_tailroom(skb) <= ab->len + extra) {
+		skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
+		if (!skb) {
+			ab->len = 0; /* Lose information in ab->tmp */
+			audit_log_lost("out of memory in audit_log_move");
+			return;
+		}
+		__skb_queue_tail(&ab->sklist, skb);
+		if (!ab->nlh)
+			ab->nlh = (struct nlmsghdr *)skb_put(skb,
+							     NLMSG_SPACE(0));
+	}
+	start = skb_put(skb, ab->len);
+	memcpy(start, ab->tmp, ab->len);
+	ab->len = 0;
+}
+
+/* Iterate over the skbuff in the audit_buffer, sending their contents
+ * to user space. */
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&ab->sklist))) {
+		int retval = 0;
+
+		if (audit_pid) {
+			if (ab->nlh) {
+				ab->nlh->nlmsg_len   = ab->total;
+				ab->nlh->nlmsg_type  = ab->type;
+				ab->nlh->nlmsg_flags = 0;
+				ab->nlh->nlmsg_seq   = 0;
+				ab->nlh->nlmsg_pid   = ab->pid;
+			}
+			skb_get(skb); /* because netlink_* frees */
+			retval = netlink_unicast(audit_sock, skb, audit_pid,
+						 MSG_DONTWAIT);
+		}
+		if (retval == -EAGAIN && ab->count < 5) {
+			++ab->count;
+			audit_log_end_irq(ab);
+			return 1;
+		}
+		if (retval < 0) {
+			if (retval == -ECONNREFUSED) {
+				printk(KERN_ERR
+				       "audit: *NO* daemon at audit_pid=%d\n",
+				       audit_pid);
+				audit_pid = 0;
+			} else
+				audit_log_lost("netlink socket too busy");
+		}
+		if (!audit_pid) { /* No daemon */
+			int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
+			int len    = skb->len - offset;
+			printk(KERN_ERR "%*.*s\n",
+			       len, len, skb->data + offset);
+		}
+		kfree_skb(skb);
+		ab->nlh = NULL;
+	}
+	return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
+	       audit_default ? "enabled" : "disabled");
+	audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+	if (!audit_sock)
+		audit_panic("cannot initialize netlink socket");
+
+	audit_initialized = 1;
+	audit_enabled = audit_default;
+	audit_log(NULL, "initialized");
+	return 0;
+}
+
+#else
+/* Without CONFIG_NET, we have no skbuffs.  For now, print what we have
+ * in the buffer. */
+static void audit_log_move(struct audit_buffer *ab)
+{
+	printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
+	ab->len = 0;
+}
+
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+	return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+	printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
+	audit_sock = NULL;
+	audit_pid  = 0;
+
+	audit_initialized = 1;
+	audit_enabled = audit_default;
+	audit_log(NULL, "initialized");
+	return 0;
+}
+#endif
+
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+	audit_default = !!simple_strtol(str, NULL, 0);
+	printk(KERN_INFO "audit: %s%s\n",
+	       audit_default ? "enabled" : "disabled",
+	       audit_initialized ? "" : " (after initialization)");
+	if (audit_initialized)
+		audit_enabled = audit_default;
+	return 0;
+}
+
+__setup("audit=", audit_enable);
+
+
+/* Obtain an audit buffer.  This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format.  If the tsk is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit.  If there is no associated task, tsk
+ * should be NULL. */
+struct audit_buffer *audit_log_start(struct audit_context *ctx)
+{
+	struct audit_buffer	*ab	= NULL;
+	unsigned long		flags;
+	struct timespec		t;
+	int			serial	= 0;
+
+	if (!audit_initialized)
+		return NULL;
+
+	if (audit_backlog_limit
+	    && atomic_read(&audit_backlog) > audit_backlog_limit) {
+		if (audit_rate_check())
+			printk(KERN_WARNING
+			       "audit: audit_backlog=%d > "
+			       "audit_backlog_limit=%d\n",
+			       atomic_read(&audit_backlog),
+			       audit_backlog_limit);
+		audit_log_lost("backlog limit exceeded");
+		return NULL;
+	}
+
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (!list_empty(&audit_freelist)) {
+		ab = list_entry(audit_freelist.next,
+				struct audit_buffer, list);
+		list_del(&ab->list);
+		--audit_freelist_count;
+	}
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+	if (!ab)
+		ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+	if (!ab)
+		audit_log_lost("audit: out of memory in audit_log_start");
+	if (!ab)
+		return NULL;
+
+	atomic_inc(&audit_backlog);
+	skb_queue_head_init(&ab->sklist);
+
+	ab->ctx   = ctx;
+	ab->len   = 0;
+	ab->nlh   = NULL;
+	ab->total = 0;
+	ab->type  = AUDIT_KERNEL;
+	ab->pid   = 0;
+	ab->count = 0;
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (ab->ctx)
+		audit_get_stamp(ab->ctx, &t, &serial);
+	else
+#endif
+		t = CURRENT_TIME;
+
+	audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+			 t.tv_sec, t.tv_nsec/1000000, serial);
+	return ab;
+}
+
+
+/* Format an audit message into the audit buffer.  If there isn't enough
+ * room in the audit buffer, more room will be allocated and vsnprint
+ * will be called a second time.  Currently, we assume that a printk
+ * can't format message larger than 1024 bytes, so we don't either. */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+			      va_list args)
+{
+	int len, avail;
+
+	if (!ab)
+		return;
+
+	avail = sizeof(ab->tmp) - ab->len;
+	if (avail <= 0) {
+		audit_log_move(ab);
+		avail = sizeof(ab->tmp) - ab->len;
+	}
+	len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+	if (len >= avail) {
+		/* The printk buffer is 1024 bytes long, so if we get
+		 * here and AUDIT_BUFSIZ is at least 1024, then we can
+		 * log everything that printk could have logged. */
+		audit_log_move(ab);
+		avail = sizeof(ab->tmp) - ab->len;
+		len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+	}
+	ab->len   += (len < avail) ? len : avail;
+	ab->total += (len < avail) ? len : avail;
+}
+
+/* Format a message into the audit buffer.  All the work is done in
+ * audit_log_vformat. */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+	va_list args;
+
+	if (!ab)
+		return;
+	va_start(args, fmt);
+	audit_log_vformat(ab, fmt, args);
+	va_end(args);
+}
+
+/* This is a helper-function to print the d_path without using a static
+ * buffer or allocating another buffer in addition to the one in
+ * audit_buffer. */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+		      struct dentry *dentry, struct vfsmount *vfsmnt)
+{
+	char *p;
+	int  len, avail;
+
+	if (prefix) audit_log_format(ab, " %s", prefix);
+
+	if (ab->len > 128)
+		audit_log_move(ab);
+	avail = sizeof(ab->tmp) - ab->len;
+	p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
+	if (p == ERR_PTR(-ENAMETOOLONG)) {
+		/* FIXME: can we save some information here? */
+		audit_log_format(ab, "<toolong>");
+	} else {
+				/* path isn't at start of buffer */
+		len	   = (ab->tmp + sizeof(ab->tmp) - 1) - p;
+		memmove(ab->tmp + ab->len, p, len);
+		ab->len   += len;
+		ab->total += len;
+	}
+}
+
+/* Remove queued messages from the audit_txlist and send them to userspace. */
+static void audit_tasklet_handler(unsigned long arg)
+{
+	LIST_HEAD(list);
+	struct audit_buffer *ab;
+	unsigned long	    flags;
+
+	spin_lock_irqsave(&audit_txlist_lock, flags);
+	list_splice_init(&audit_txlist, &list);
+	spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+	while (!list_empty(&list)) {
+		ab = list_entry(list.next, struct audit_buffer, list);
+		list_del(&ab->list);
+		audit_log_end_fast(ab);
+	}
+}
+
+static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
+
+/* The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is places on a queue and a tasklet is scheduled to
+ * remove them from the queue outside the irq context.  May be called in
+ * any context. */
+void audit_log_end_irq(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	if (!ab)
+		return;
+	spin_lock_irqsave(&audit_txlist_lock, flags);
+	list_add_tail(&ab->list, &audit_txlist);
+	spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+	tasklet_schedule(&audit_tasklet);
+}
+
+/* Send the message in the audit buffer directly to user space.  May not
+ * be called in an irq context. */
+void audit_log_end_fast(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	BUG_ON(in_irq());
+	if (!ab)
+		return;
+	if (!audit_rate_check()) {
+		audit_log_lost("rate limit exceeded");
+	} else {
+		audit_log_move(ab);
+		if (audit_log_drain(ab))
+			return;
+	}
+
+	atomic_dec(&audit_backlog);
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (++audit_freelist_count > AUDIT_MAXFREE)
+		kfree(ab);
+	else
+		list_add(&ab->list, &audit_freelist);
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+/* Send or queue the message in the audit buffer, depending on the
+ * current context.  (A convenience function that may be called in any
+ * context.) */
+void audit_log_end(struct audit_buffer *ab)
+{
+	if (in_irq())
+		audit_log_end_irq(ab);
+	else
+		audit_log_end_fast(ab);
+}
+
+/* Log an audit record.  This is a convenience function that calls
+ * audit_log_start, audit_log_vformat, and audit_log_end.  It may be
+ * called in any context. */
+void audit_log(struct audit_context *ctx, const char *fmt, ...)
+{
+	struct audit_buffer *ab;
+	va_list args;
+
+	ab = audit_log_start(ctx);
+	if (ab) {
+		va_start(args, fmt);
+		audit_log_vformat(ab, fmt, args);
+		va_end(args);
+		audit_log_end(ab);
+	}
+}
+
+EXPORT_SYMBOL_GPL(audit_set_rate_limit);
+EXPORT_SYMBOL_GPL(audit_set_backlog_limit);
+EXPORT_SYMBOL_GPL(audit_set_enabled);
+EXPORT_SYMBOL_GPL(audit_set_failure);
+
+EXPORT_SYMBOL_GPL(audit_log_start);
+EXPORT_SYMBOL_GPL(audit_log_format);
+EXPORT_SYMBOL_GPL(audit_log_end_irq);
+EXPORT_SYMBOL_GPL(audit_log_end_fast);
+EXPORT_SYMBOL_GPL(audit_log_end);
+EXPORT_SYMBOL_GPL(audit_log);
+EXPORT_SYMBOL_GPL(audit_log_d_path);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 000000000000..342b57141fd9
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,922 @@
+/* auditsc.c -- System-call auditing support -*- linux-c -*-
+ * Handles all system-call specific auditing features.
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Many of the ideas implemented here are from Stephen C. Tweedie,
+ * especially the idea of avoiding a copy by using getname.
+ *
+ * The method for actual interception of syscall entry and exit (not in
+ * this file -- see entry.S) is based on a GPL'd patch written by
+ * okir@suse.de and Copyright 2003 SuSE Linux AG.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+#include <linux/personality.h>
+#include <linux/time.h>
+#include <asm/unistd.h>
+
+/* 0 = no checking
+   1 = put_count checking
+   2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+extern int audit_enabled;
+
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). */
+#define AUDIT_NAMES    20
+
+/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the
+ * audit_context from being used for nameless inodes from
+ * path_lookup. */
+#define AUDIT_NAMES_RESERVED 7
+
+/* At task start time, the audit_state is set in the audit_context using
+   a per-task filter.  At syscall entry, the audit_state is augmented by
+   the syscall filter. */
+enum audit_state {
+	AUDIT_DISABLED,		/* Do not create per-task audit_context.
+				 * No syscall-specific audit records can
+				 * be generated. */
+	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
+				 * but don't necessarily fill it in at
+				 * syscall entry time (i.e., filter
+				 * instead). */
+	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
+				 * and always fill it in at syscall
+				 * entry time.  This makes a full
+				 * syscall record available if some
+				 * other part of the kernel decides it
+				 * should be recorded. */
+	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
+				 * always fill it in at syscall entry
+				 * time, and always write out the audit
+				 * record at syscall exit time.  */
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device. */
+struct audit_names {
+	const char	*name;
+	unsigned long	ino;
+	dev_t		rdev;
+};
+
+/* The per-task audit context. */
+struct audit_context {
+	int		    in_syscall;	/* 1 if task is in a syscall */
+	enum audit_state    state;
+	unsigned int	    serial;     /* serial number for record */
+	struct timespec	    ctime;      /* time of syscall entry */
+	uid_t		    loginuid;   /* login uid (identity) */
+	int		    major;      /* syscall number */
+	unsigned long	    argv[4];    /* syscall arguments */
+	int		    return_valid; /* return code is valid */
+	int		    return_code;/* syscall return code */
+	int		    auditable;  /* 1 if record should be written */
+	int		    name_count;
+	struct audit_names  names[AUDIT_NAMES];
+	struct audit_context *previous; /* For nested syscalls */
+
+				/* Save things to print about task_struct */
+	pid_t		    pid;
+	uid_t		    uid, euid, suid, fsuid;
+	gid_t		    gid, egid, sgid, fsgid;
+	unsigned long	    personality;
+
+#if AUDIT_DEBUG
+	int		    put_count;
+	int		    ino_count;
+#endif
+};
+
+				/* Public API */
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+struct audit_entry {
+	struct list_head  list;
+	struct rcu_head   rcu;
+	struct audit_rule rule;
+};
+
+/* Check to see if two rules are identical.  It is called from
+ * audit_del_rule during AUDIT_DEL. */
+static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+{
+	int i;
+
+	if (a->flags != b->flags)
+		return 1;
+
+	if (a->action != b->action)
+		return 1;
+
+	if (a->field_count != b->field_count)
+		return 1;
+
+	for (i = 0; i < a->field_count; i++) {
+		if (a->fields[i] != b->fields[i]
+		    || a->values[i] != b->values[i])
+			return 1;
+	}
+
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+		if (a->mask[i] != b->mask[i])
+			return 1;
+
+	return 0;
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_add_rule(struct audit_entry *entry,
+				 struct list_head *list)
+{
+	if (entry->rule.flags & AUDIT_PREPEND) {
+		entry->rule.flags &= ~AUDIT_PREPEND;
+		list_add_rcu(&entry->list, list);
+	} else {
+		list_add_tail_rcu(&entry->list, list);
+	}
+	return 0;
+}
+
+static void audit_free_rule(void *arg)
+{
+	kfree(arg);
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_del_rule(struct audit_rule *rule,
+				 struct list_head *list)
+{
+	struct audit_entry  *e;
+
+	/* Do not use the _rcu iterator here, since this is the only
+	 * deletion routine. */
+	list_for_each_entry(e, list, list) {
+		if (!audit_compare_rule(rule, &e->rule)) {
+			list_del_rcu(&e->list);
+			call_rcu(&e->rcu, audit_free_rule, e);
+			return 0;
+		}
+	}
+	return -EFAULT;		/* No matching rule */
+}
+
+#ifdef CONFIG_NET
+/* Copy rule from user-space to kernel-space.  Called during
+ * AUDIT_ADD. */
+static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+	int i;
+
+	if (s->action != AUDIT_NEVER
+	    && s->action != AUDIT_POSSIBLE
+	    && s->action != AUDIT_ALWAYS)
+		return -1;
+	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+		return -1;
+
+	d->flags	= s->flags;
+	d->action	= s->action;
+	d->field_count	= s->field_count;
+	for (i = 0; i < d->field_count; i++) {
+		d->fields[i] = s->fields[i];
+		d->values[i] = s->values[i];
+	}
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++) d->mask[i] = s->mask[i];
+	return 0;
+}
+
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
+{
+	u32		   flags;
+	struct audit_entry *entry;
+	int		   err = 0;
+
+	switch (type) {
+	case AUDIT_LIST:
+		/* The *_rcu iterators not needed here because we are
+		   always called with audit_netlink_sem held. */
+		list_for_each_entry(entry, &audit_tsklist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		list_for_each_entry(entry, &audit_entlist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		list_for_each_entry(entry, &audit_extlist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+		break;
+	case AUDIT_ADD:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+			return -ENOMEM;
+		if (audit_copy_rule(&entry->rule, data)) {
+			kfree(entry);
+			return -EINVAL;
+		}
+		flags = entry->rule.flags;
+		if (!err && (flags & AUDIT_PER_TASK))
+			err = audit_add_rule(entry, &audit_tsklist);
+		if (!err && (flags & AUDIT_AT_ENTRY))
+			err = audit_add_rule(entry, &audit_entlist);
+		if (!err && (flags & AUDIT_AT_EXIT))
+			err = audit_add_rule(entry, &audit_extlist);
+		break;
+	case AUDIT_DEL:
+		flags =((struct audit_rule *)data)->flags;
+		if (!err && (flags & AUDIT_PER_TASK))
+			err = audit_del_rule(data, &audit_tsklist);
+		if (!err && (flags & AUDIT_AT_ENTRY))
+			err = audit_del_rule(data, &audit_entlist);
+		if (!err && (flags & AUDIT_AT_EXIT))
+			err = audit_del_rule(data, &audit_extlist);
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return err;
+}
+#endif
+
+/* Compare a task_struct with an audit_rule.  Return 1 on match, 0
+ * otherwise. */
+static int audit_filter_rules(struct task_struct *tsk,
+			      struct audit_rule *rule,
+			      struct audit_context *ctx,
+			      enum audit_state *state)
+{
+	int i, j;
+
+	for (i = 0; i < rule->field_count; i++) {
+		u32 field  = rule->fields[i] & ~AUDIT_NEGATE;
+		u32 value  = rule->values[i];
+		int result = 0;
+
+		switch (field) {
+		case AUDIT_PID:
+			result = (tsk->pid == value);
+			break;
+		case AUDIT_UID:
+			result = (tsk->uid == value);
+			break;
+		case AUDIT_EUID:
+			result = (tsk->euid == value);
+			break;
+		case AUDIT_SUID:
+			result = (tsk->suid == value);
+			break;
+		case AUDIT_FSUID:
+			result = (tsk->fsuid == value);
+			break;
+		case AUDIT_GID:
+			result = (tsk->gid == value);
+			break;
+		case AUDIT_EGID:
+			result = (tsk->egid == value);
+			break;
+		case AUDIT_SGID:
+			result = (tsk->sgid == value);
+			break;
+		case AUDIT_FSGID:
+			result = (tsk->fsgid == value);
+			break;
+		case AUDIT_PERS:
+			result = (tsk->personality == value);
+			break;
+
+		case AUDIT_EXIT:
+			if (ctx && ctx->return_valid)
+				result = (ctx->return_code == value);
+			break;
+		case AUDIT_SUCCESS:
+			if (ctx && ctx->return_valid)
+				result = (ctx->return_code >= 0);
+			break;
+		case AUDIT_DEVMAJOR:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (MAJOR(ctx->names[j].rdev)==value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_DEVMINOR:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (MINOR(ctx->names[j].rdev)==value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_INODE:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (MINOR(ctx->names[j].ino)==value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_LOGINUID:
+			result = 0;
+			if (ctx)
+				result = (ctx->loginuid == value);
+			break;
+		case AUDIT_ARG0:
+		case AUDIT_ARG1:
+		case AUDIT_ARG2:
+		case AUDIT_ARG3:
+			if (ctx)
+				result = (ctx->argv[field-AUDIT_ARG0]==value);
+			break;
+		}
+
+		if (rule->fields[i] & AUDIT_NEGATE)
+			result = !result;
+		if (!result)
+			return 0;
+	}
+	switch (rule->action) {
+	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
+	case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT;  break;
+	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	}
+	return 1;
+}
+
+/* At process creation time, we can determine if system-call auditing is
+ * completely disabled for this task.  Since we only have the task
+ * structure at this point, we can only check uid and gid.
+ */
+static enum audit_state audit_filter_task(struct task_struct *tsk)
+{
+	struct audit_entry *e;
+	enum audit_state   state;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &audit_tsklist, list) {
+		if (audit_filter_rules(tsk, &e->rule, NULL, &state)) {
+			rcu_read_unlock();
+			return state;
+		}
+	}
+	rcu_read_unlock();
+	return AUDIT_BUILD_CONTEXT;
+}
+
+/* At syscall entry and exit time, this filter is called if the
+ * audit_state is not low enough that auditing cannot take place, but is
+ * also not high enough that we already know we have to write and audit
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or  AUDIT_BUILD_CONTEXT).
+ */
+static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+					     struct audit_context *ctx,
+					     struct list_head *list)
+{
+	struct audit_entry *e;
+	enum audit_state   state;
+	int		   word = AUDIT_WORD(ctx->major);
+	int		   bit  = AUDIT_BIT(ctx->major);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, list, list) {
+		if ((e->rule.mask[word] & bit) == bit
+ 		    && audit_filter_rules(tsk, &e->rule, ctx, &state)) {
+			rcu_read_unlock();
+			return state;
+		}
+	}
+	rcu_read_unlock();
+	return AUDIT_BUILD_CONTEXT;
+}
+
+/* This should be called with task_lock() held. */
+static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+						      int return_valid,
+						      int return_code)
+{
+	struct audit_context *context = tsk->audit_context;
+
+	if (likely(!context))
+		return NULL;
+	context->return_valid = return_valid;
+	context->return_code  = return_code;
+
+	if (context->in_syscall && !context->auditable) {
+		enum audit_state state;
+		state = audit_filter_syscall(tsk, context, &audit_extlist);
+		if (state == AUDIT_RECORD_CONTEXT)
+			context->auditable = 1;
+	}
+
+	context->pid = tsk->pid;
+	context->uid = tsk->uid;
+	context->gid = tsk->gid;
+	context->euid = tsk->euid;
+	context->suid = tsk->suid;
+	context->fsuid = tsk->fsuid;
+	context->egid = tsk->egid;
+	context->sgid = tsk->sgid;
+	context->fsgid = tsk->fsgid;
+	context->personality = tsk->personality;
+	tsk->audit_context = NULL;
+	return context;
+}
+
+static inline void audit_free_names(struct audit_context *context)
+{
+	int i;
+
+#if AUDIT_DEBUG == 2
+	if (context->auditable
+	    ||context->put_count + context->ino_count != context->name_count) {
+		printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
+		       " name_count=%d put_count=%d"
+		       " ino_count=%d [NOT freeing]\n",
+		       __LINE__,
+		       context->serial, context->major, context->in_syscall,
+		       context->name_count, context->put_count,
+		       context->ino_count);
+		for (i = 0; i < context->name_count; i++)
+			printk(KERN_ERR "names[%d] = %p = %s\n", i,
+			       context->names[i].name,
+			       context->names[i].name);
+		dump_stack();
+		return;
+	}
+#endif
+#if AUDIT_DEBUG
+	context->put_count  = 0;
+	context->ino_count  = 0;
+#endif
+
+	for (i = 0; i < context->name_count; i++)
+		if (context->names[i].name)
+			__putname(context->names[i].name);
+	context->name_count = 0;
+}
+
+static inline void audit_zero_context(struct audit_context *context,
+				      enum audit_state state)
+{
+	uid_t loginuid = context->loginuid;
+
+	memset(context, 0, sizeof(*context));
+	context->state      = state;
+	context->loginuid   = loginuid;
+}
+
+static inline struct audit_context *audit_alloc_context(enum audit_state state)
+{
+	struct audit_context *context;
+
+	if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+		return NULL;
+	audit_zero_context(context, state);
+	return context;
+}
+
+/* Filter on the task information and allocate a per-task audit context
+ * if necessary.  Doing so turns on system call auditing for the
+ * specified task.  This is called from copy_process, so no lock is
+ * needed. */
+int audit_alloc(struct task_struct *tsk)
+{
+	struct audit_context *context;
+	enum audit_state     state;
+
+	if (likely(!audit_enabled))
+		return 0; /* Return if not auditing. */
+
+	state = audit_filter_task(tsk);
+	if (likely(state == AUDIT_DISABLED))
+		return 0;
+
+	if (!(context = audit_alloc_context(state))) {
+		audit_log_lost("out of memory in audit_alloc");
+		return -ENOMEM;
+	}
+
+				/* Preserve login uid */
+	context->loginuid = -1;
+	if (tsk->audit_context)
+		context->loginuid = tsk->audit_context->loginuid;
+
+	tsk->audit_context  = context;
+	set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+	return 0;
+}
+
+static inline void audit_free_context(struct audit_context *context)
+{
+	struct audit_context *previous;
+	int		     count = 0;
+
+	do {
+		previous = context->previous;
+		if (previous || (count &&  count < 10)) {
+			++count;
+			printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
+			       " freeing multiple contexts (%d)\n",
+			       context->serial, context->major,
+			       context->name_count, count);
+		}
+		audit_free_names(context);
+		kfree(context);
+		context  = previous;
+	} while (context);
+	if (count >= 10)
+		printk(KERN_ERR "audit: freed %d contexts\n", count);
+}
+
+static void audit_log_exit(struct audit_context *context)
+{
+	int i;
+	struct audit_buffer *ab;
+
+	ab = audit_log_start(context);
+	if (!ab)
+		return;		/* audit_panic has been called */
+	audit_log_format(ab, "syscall=%d", context->major);
+	if (context->personality != PER_LINUX)
+		audit_log_format(ab, " per=%lx", context->personality);
+	if (context->return_valid)
+		audit_log_format(ab, " exit=%u", context->return_code);
+	audit_log_format(ab,
+		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
+		  " pid=%d loginuid=%d uid=%d gid=%d"
+		  " euid=%d suid=%d fsuid=%d"
+		  " egid=%d sgid=%d fsgid=%d",
+		  context->argv[0],
+		  context->argv[1],
+		  context->argv[2],
+		  context->argv[3],
+		  context->name_count,
+		  context->pid,
+		  context->loginuid,
+		  context->uid,
+		  context->gid,
+		  context->euid, context->suid, context->fsuid,
+		  context->egid, context->sgid, context->fsgid);
+	audit_log_end(ab);
+	for (i = 0; i < context->name_count; i++) {
+		ab = audit_log_start(context);
+		if (!ab)
+			continue; /* audit_panic has been called */
+		audit_log_format(ab, "item=%d", i);
+		if (context->names[i].name)
+			audit_log_format(ab, " name=%s",
+					 context->names[i].name);
+		if (context->names[i].ino != (unsigned long)-1)
+			audit_log_format(ab, " inode=%lu",
+					 context->names[i].ino);
+		/* FIXME: should use format_dev_t, but ab structure is
+		 * opaque. */
+		if (context->names[i].rdev != -1)
+			audit_log_format(ab, " dev=%02x:%02x",
+					 MAJOR(context->names[i].rdev),
+					 MINOR(context->names[i].rdev));
+		audit_log_end(ab);
+	}
+}
+
+/* Free a per-task audit context.  Called from copy_process and
+ * __put_task_struct. */
+void audit_free(struct task_struct *tsk)
+{
+	struct audit_context *context;
+
+	task_lock(tsk);
+	context = audit_get_context(tsk, 0, 0);
+	task_unlock(tsk);
+
+	if (likely(!context))
+		return;
+
+	/* Check for system calls that do not go through the exit
+	 * function (e.g., exit_group), then free context block. */
+	if (context->in_syscall && context->auditable)
+		audit_log_exit(context);
+
+	audit_free_context(context);
+}
+
+/* Compute a serial number for the audit record.  Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces.  The timestamp of the
+ * record and this serial number are used by the user-space daemon to
+ * determine which pieces belong to the same audit record.  The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * Atomic values are only guaranteed to be 24-bit, so we count down.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit.  However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts). */
+static inline unsigned int audit_serial(void)
+{
+	static atomic_t serial = ATOMIC_INIT(0xffffff);
+	unsigned int a, b;
+
+	do {
+		a = atomic_read(&serial);
+		if (atomic_dec_and_test(&serial))
+			atomic_set(&serial, 0xffffff);
+		b = atomic_read(&serial);
+	} while (b != a - 1);
+
+	return 0xffffff - b;
+}
+
+/* Fill in audit context at syscall entry.  This only happens if the
+ * audit context was created when the task was created and the state or
+ * filters demand the audit context be built.  If the state from the
+ * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * then the record will be written at syscall exit time (otherwise, it
+ * will only be written if another part of the kernel requests that it
+ * be written). */
+void audit_syscall_entry(struct task_struct *tsk, int major,
+			 unsigned long a1, unsigned long a2,
+			 unsigned long a3, unsigned long a4)
+{
+	struct audit_context *context = tsk->audit_context;
+	enum audit_state     state;
+
+	BUG_ON(!context);
+
+	/* This happens only on certain architectures that make system
+	 * calls in kernel_thread via the entry.S interface, instead of
+	 * with direct calls.  (If you are porting to a new
+	 * architecture, hitting this condition can indicate that you
+	 * got the _exit/_leave calls backward in entry.S.)
+	 *
+	 * i386     no
+	 * x86_64   no
+	 * ppc64    yes (see arch/ppc64/kernel/misc.S)
+	 *
+	 * This also happens with vm86 emulation in a non-nested manner
+	 * (entries without exits), so this case must be caught.
+	 */
+	if (context->in_syscall) {
+		struct audit_context *newctx;
+
+#if defined(__NR_vm86) && defined(__NR_vm86old)
+		/* vm86 mode should only be entered once */
+		if (major == __NR_vm86 || major == __NR_vm86old)
+			return;
+#endif
+#if AUDIT_DEBUG
+		printk(KERN_ERR
+		       "audit(:%d) pid=%d in syscall=%d;"
+		       " entering syscall=%d\n",
+		       context->serial, tsk->pid, context->major, major);
+#endif
+		newctx = audit_alloc_context(context->state);
+		if (newctx) {
+			newctx->previous   = context;
+			context		   = newctx;
+			tsk->audit_context = newctx;
+		} else	{
+			/* If we can't alloc a new context, the best we
+			 * can do is to leak memory (any pending putname
+			 * will be lost).  The only other alternative is
+			 * to abandon auditing. */
+			audit_zero_context(context, context->state);
+		}
+	}
+	BUG_ON(context->in_syscall || context->name_count);
+
+	if (!audit_enabled)
+		return;
+
+	context->major      = major;
+	context->argv[0]    = a1;
+	context->argv[1]    = a2;
+	context->argv[2]    = a3;
+	context->argv[3]    = a4;
+
+	state = context->state;
+	if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
+		state = audit_filter_syscall(tsk, context, &audit_entlist);
+	if (likely(state == AUDIT_DISABLED))
+		return;
+
+	context->serial     = audit_serial();
+	context->ctime      = CURRENT_TIME;
+	context->in_syscall = 1;
+	context->auditable  = !!(state == AUDIT_RECORD_CONTEXT);
+}
+
+/* Tear down after system call.  If the audit context has been marked as
+ * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * filtering, or because some other part of the kernel write an audit
+ * message), then write out the syscall information.  In call cases,
+ * free the names stored from getname(). */
+void audit_syscall_exit(struct task_struct *tsk, int return_code)
+{
+	struct audit_context *context;
+
+	get_task_struct(tsk);
+	task_lock(tsk);
+	context = audit_get_context(tsk, 1, return_code);
+	task_unlock(tsk);
+
+	/* Not having a context here is ok, since the parent may have
+	 * called __put_task_struct. */
+	if (likely(!context))
+		return;
+
+	if (context->in_syscall && context->auditable)
+		audit_log_exit(context);
+
+	context->in_syscall = 0;
+	context->auditable  = 0;
+	if (context->previous) {
+		struct audit_context *new_context = context->previous;
+		context->previous  = NULL;
+		audit_free_context(context);
+		tsk->audit_context = new_context;
+	} else {
+		audit_free_names(context);
+		audit_zero_context(context, context->state);
+		tsk->audit_context = context;
+	}
+	put_task_struct(tsk);
+}
+
+/* Add a name to the list.  Called from fs/namei.c:getname(). */
+void audit_getname(const char *name)
+{
+	struct audit_context *context = current->audit_context;
+
+	BUG_ON(!context);
+	if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+		       __FILE__, __LINE__, context->serial, name);
+		dump_stack();
+#endif
+		return;
+	}
+	BUG_ON(context->name_count >= AUDIT_NAMES);
+	context->names[context->name_count].name = name;
+	context->names[context->name_count].ino  = (unsigned long)-1;
+	context->names[context->name_count].rdev = -1;
+	++context->name_count;
+}
+
+/* Intercept a putname request.  Called from
+ * include/linux/fs.h:putname().  If we have stored the name from
+ * getname in the audit context, then we delay the putname until syscall
+ * exit. */
+void audit_putname(const char *name)
+{
+	struct audit_context *context = current->audit_context;
+
+	BUG_ON(!context);
+	if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+		       __FILE__, __LINE__, context->serial, name);
+		if (context->name_count) {
+			int i;
+			for (i = 0; i < context->name_count; i++)
+				printk(KERN_ERR "name[%d] = %p = %s\n", i,
+				       context->names[i].name,
+				       context->names[i].name);
+		}
+#endif
+		__putname(name);
+	}
+#if AUDIT_DEBUG
+	else {
+		++context->put_count;
+		if (context->put_count > context->name_count) {
+			printk(KERN_ERR "%s:%d(:%d): major=%d"
+			       " in_syscall=%d putname(%p) name_count=%d"
+			       " put_count=%d\n",
+			       __FILE__, __LINE__,
+			       context->serial, context->major,
+			       context->in_syscall, name, context->name_count,
+			       context->put_count);
+			dump_stack();
+		}
+	}
+#endif
+}
+
+/* Store the inode and device from a lookup.  Called from
+ * fs/namei.c:path_lookup(). */
+void audit_inode(const char *name, unsigned long ino, dev_t rdev)
+{
+	int idx;
+	struct audit_context *context = current->audit_context;
+
+	if (!context->in_syscall)
+		return;
+	if (context->name_count
+	    && context->names[context->name_count-1].name
+	    && context->names[context->name_count-1].name == name)
+		idx = context->name_count - 1;
+	else if (context->name_count > 1
+		 && context->names[context->name_count-2].name
+		 && context->names[context->name_count-2].name == name)
+		idx = context->name_count - 2;
+	else {
+		/* FIXME: how much do we care about inodes that have no
+		 * associated name? */
+		if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED)
+			return;
+		idx = context->name_count++;
+		context->names[idx].name = NULL;
+#if AUDIT_DEBUG
+		++context->ino_count;
+#endif
+	}
+	context->names[idx].ino  = ino;
+	context->names[idx].rdev = rdev;
+}
+
+void audit_get_stamp(struct audit_context *ctx,
+		     struct timespec *t, int *serial)
+{
+	if (ctx) {
+		t->tv_sec  = ctx->ctime.tv_sec;
+		t->tv_nsec = ctx->ctime.tv_nsec;
+		*serial    = ctx->serial;
+		ctx->auditable = 1;
+	} else {
+		*t      = CURRENT_TIME;
+		*serial = 0;
+	}
+}
+
+int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid)
+{
+	if (ctx) {
+		if (loginuid < 0)
+			return -EINVAL;
+		ctx->loginuid = loginuid;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(audit_alloc);
+EXPORT_SYMBOL_GPL(audit_free);
+EXPORT_SYMBOL_GPL(audit_syscall_entry);
+EXPORT_SYMBOL_GPL(audit_syscall_exit);
+EXPORT_SYMBOL_GPL(audit_getname);
+EXPORT_SYMBOL_GPL(audit_putname);
+EXPORT_SYMBOL_GPL(audit_inode);
diff --git a/kernel/fork.c b/kernel/fork.c
index fc25a3a15d0e..6035db6957f8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
 #include <linux/futex.h>
 #include <linux/ptrace.h>
 #include <linux/mount.h>
+#include <linux/audit.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -83,6 +84,8 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	if (unlikely(tsk->audit_context))
+		audit_free(tsk);
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
@@ -949,13 +952,16 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	p->start_time = get_jiffies_64();
 	p->security = NULL;
 	p->io_context = NULL;
+	p->audit_context = NULL;
 
 	retval = -ENOMEM;
 	if ((retval = security_task_alloc(p)))
 		goto bad_fork_cleanup;
+	if ((retval = audit_alloc(p)))
+		goto bad_fork_cleanup_security;
 	/* copy all the process information */
 	if ((retval = copy_semundo(clone_flags, p)))
-		goto bad_fork_cleanup_security;
+		goto bad_fork_cleanup_audit;
 	if ((retval = copy_files(clone_flags, p)))
 		goto bad_fork_cleanup_semundo;
 	if ((retval = copy_fs(clone_flags, p)))
@@ -1090,6 +1096,8 @@ bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
 bad_fork_cleanup_semundo:
 	exit_sem(p);
+bad_fork_cleanup_audit:
+	audit_free(p);
 bad_fork_cleanup_security:
 	security_task_free(p);
 bad_fork_cleanup:
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index dad7ae38cc91..2e431763dc73 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -22,11 +22,14 @@
 #include <linux/un.h>
 #include <net/af_unix.h>
 #include <linux/ip.h>
+#include <linux/audit.h>
 #include <linux/ipv6.h>
 #include <net/ipv6.h>
 #include "avc.h"
 #include "avc_ss.h"
+#ifdef CONFIG_AUDIT
 #include "class_to_string.h"
+#endif
 #include "common_perm_to_string.h"
 #include "av_inherit.h"
 #include "av_perm_to_string.h"
@@ -68,14 +71,10 @@ struct avc_callback_node {
 };
 
 static spinlock_t avc_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t avc_log_lock = SPIN_LOCK_UNLOCKED;
 static struct avc_node *avc_node_freelist = NULL;
 static struct avc_cache avc_cache;
-static char *avc_audit_buffer = NULL;
 static unsigned avc_cache_stats[AVC_NSTATS];
 static struct avc_callback_node *avc_callbacks = NULL;
-static unsigned int avc_log_level = 4; /* default:  KERN_WARNING */
-static char avc_level_string[4] = "< >";
 
 static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
 {
@@ -87,14 +86,14 @@ static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
  * @tclass: target security class
  * @av: access vector
  */
-void avc_dump_av(u16 tclass, u32 av)
+void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av)
 {
 	char **common_pts = 0;
 	u32 common_base = 0;
 	int i, i2, perm;
 
 	if (av == 0) {
-		printk(" null");
+		audit_log_format(ab, " null");
 		return;
 	}
 
@@ -106,12 +105,12 @@ void avc_dump_av(u16 tclass, u32 av)
 		}
 	}
 
-	printk(" {");
+	audit_log_format(ab, " {");
 	i = 0;
 	perm = 1;
 	while (perm < common_base) {
 		if (perm & av)
-			printk(" %s", common_pts[i]);
+			audit_log_format(ab, " %s", common_pts[i]);
 		i++;
 		perm <<= 1;
 	}
@@ -124,13 +123,14 @@ void avc_dump_av(u16 tclass, u32 av)
 					break;
 			}
 			if (i2 < ARRAY_SIZE(av_perm_to_string))
-				printk(" %s", av_perm_to_string[i2].name);
+				audit_log_format(ab, " %s",
+						 av_perm_to_string[i2].name);
 		}
 		i++;
 		perm <<= 1;
 	}
 
-	printk(" }");
+	audit_log_format(ab, " }");
 }
 
 /**
@@ -139,7 +139,7 @@ void avc_dump_av(u16 tclass, u32 av)
  * @tsid: target security identifier
  * @tclass: target security class
  */
-void avc_dump_query(u32 ssid, u32 tsid, u16 tclass)
+void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tclass)
 {
 	int rc;
 	char *scontext;
@@ -147,20 +147,20 @@ void avc_dump_query(u32 ssid, u32 tsid, u16 tclass)
 
  	rc = security_sid_to_context(ssid, &scontext, &scontext_len);
 	if (rc)
-		printk("ssid=%d", ssid);
+		audit_log_format(ab, "ssid=%d", ssid);
 	else {
-		printk("scontext=%s", scontext);
+		audit_log_format(ab, "scontext=%s", scontext);
 		kfree(scontext);
 	}
 
 	rc = security_sid_to_context(tsid, &scontext, &scontext_len);
 	if (rc)
-		printk(" tsid=%d", tsid);
+		audit_log_format(ab, " tsid=%d", tsid);
 	else {
-		printk(" tcontext=%s", scontext);
+		audit_log_format(ab, " tcontext=%s", scontext);
 		kfree(scontext);
 	}
-	printk(" tclass=%s", class_to_string[tclass]);
+	audit_log_format(ab, " tclass=%s", class_to_string[tclass]);
 }
 
 /**
@@ -194,11 +194,7 @@ void __init avc_init(void)
 		avc_node_freelist = new;
 	}
 
-	avc_audit_buffer = (char *)__get_free_page(GFP_ATOMIC);
-	if (!avc_audit_buffer)
-		panic("AVC:  unable to allocate audit buffer\n");
-
-	avc_level_string[1] = '0' + avc_log_level;
+	audit_log(current->audit_context, "AVC INITIALIZED\n");
 }
 
 #if 0
@@ -430,12 +426,13 @@ static inline void avc_print_ipv6_addr(struct in6_addr *addr, u16 port,
 		printk(" %s=%d", name2, ntohs(port));
 }
 
-static inline void avc_print_ipv4_addr(u32 addr, u16 port, char *name1, char *name2)
+static inline void avc_print_ipv4_addr(struct audit_buffer *ab, u32 addr,
+				       u16 port, char *name1, char *name2)
 {
 	if (addr)
-		printk(" %s=%d.%d.%d.%d", name1, NIPQUAD(addr));
+		audit_log_format(ab, " %s=%d.%d.%d.%d", name1, NIPQUAD(addr));
 	if (port)
-		printk(" %s=%d", name2, ntohs(port));
+		audit_log_format(ab, " %s=%d", name2, ntohs(port));
 }
 
 /*
@@ -515,9 +512,8 @@ void avc_audit(u32 ssid, u32 tsid,
 {
 	struct task_struct *tsk = current;
 	struct inode *inode = NULL;
-	char *p;
 	u32 denied, audited;
-	unsigned long flags;
+	struct audit_buffer *ab;
 
 	denied = requested & ~avd->allowed;
 	if (denied) {
@@ -535,19 +531,18 @@ void avc_audit(u32 ssid, u32 tsid,
 	if (!check_avc_ratelimit())
 		return;
 
-	/* prevent overlapping printks */
-	spin_lock_irqsave(&avc_log_lock,flags);
-
-	printk("%s\n", avc_level_string);
-	printk("%savc:  %s ", avc_level_string, denied ? "denied" : "granted");
-	avc_dump_av(tclass,audited);
-	printk(" for ");
+	ab = audit_log_start(current->audit_context);
+	if (!ab)
+		return;		/* audit_panic has been called */
+	audit_log_format(ab, "avc:  %s ", denied ? "denied" : "granted");
+	avc_dump_av(ab, tclass,audited);
+	audit_log_format(ab, " for ");
 	if (a && a->tsk)
 		tsk = a->tsk;
 	if (tsk && tsk->pid) {
 		struct mm_struct *mm;
 		struct vm_area_struct *vma;
-		printk(" pid=%d", tsk->pid);
+		audit_log_format(ab, " pid=%d", tsk->pid);
 		if (tsk == current)
 			mm = current->mm;
 		else
@@ -558,11 +553,9 @@ void avc_audit(u32 ssid, u32 tsid,
 				while (vma) {
 					if ((vma->vm_flags & VM_EXECUTABLE) &&
 					    vma->vm_file) {
-						p = d_path(vma->vm_file->f_dentry,
-							   vma->vm_file->f_vfsmnt,
-							   avc_audit_buffer,
-							   PAGE_SIZE);
-						printk(" exe=%s", p);
+						audit_log_d_path(ab, "exe=",
+							vma->vm_file->f_dentry,
+							vma->vm_file->f_vfsmnt);
 						break;
 					}
 					vma = vma->vm_next;
@@ -572,29 +565,26 @@ void avc_audit(u32 ssid, u32 tsid,
 			if (tsk != current)
 				mmput(mm);
 		} else {
-			printk(" comm=%s", tsk->comm);
+			audit_log_format(ab, " comm=%s", tsk->comm);
 		}
 	}
 	if (a) {
 		switch (a->type) {
 		case AVC_AUDIT_DATA_IPC:
-			printk(" key=%d", a->u.ipc_id);
+			audit_log_format(ab, " key=%d", a->u.ipc_id);
 			break;
 		case AVC_AUDIT_DATA_CAP:
-			printk(" capability=%d", a->u.cap);
+			audit_log_format(ab, " capability=%d", a->u.cap);
 			break;
 		case AVC_AUDIT_DATA_FS:
 			if (a->u.fs.dentry) {
 				struct dentry *dentry = a->u.fs.dentry;
 				if (a->u.fs.mnt) {
-					p = d_path(dentry,
-						   a->u.fs.mnt,
-						   avc_audit_buffer,
-						   PAGE_SIZE);
-					if (p)
-						printk(" path=%s", p);
+					audit_log_d_path(ab, "path=", dentry,
+							a->u.fs.mnt);
 				} else {
-					printk(" name=%s", dentry->d_name.name);
+					audit_log_format(ab, " name=%s",
+							 dentry->d_name.name);
 				}
 				inode = dentry->d_inode;
 			} else if (a->u.fs.inode) {
@@ -602,29 +592,33 @@ void avc_audit(u32 ssid, u32 tsid,
 				inode = a->u.fs.inode;
 				dentry = d_find_alias(inode);
 				if (dentry) {
-					printk(" name=%s", dentry->d_name.name);
+					audit_log_format(ab, " name=%s",
+							 dentry->d_name.name);
 					dput(dentry);
 				}
 			}
 			if (inode)
-				printk(" dev=%s ino=%ld",
-				       inode->i_sb->s_id, inode->i_ino);
+				audit_log_format(ab, " dev=%s ino=%ld",
+						 inode->i_sb->s_id,
+						 inode->i_ino);
 			break;
 		case AVC_AUDIT_DATA_NET:
 			if (a->u.net.sk) {
 				struct sock *sk = a->u.net.sk;
 				struct unix_sock *u;
+				int len = 0;
+				char *p = NULL;
 
 				switch (sk->sk_family) {
 				case AF_INET: {
 					struct inet_opt *inet = inet_sk(sk);
 
-					avc_print_ipv4_addr(inet->rcv_saddr,
-					                    inet->sport,
-					                    "laddr", "lport");
-					avc_print_ipv4_addr(inet->daddr,
-					                    inet->dport,
-					                    "faddr", "fport");
+					avc_print_ipv4_addr(ab, inet->rcv_saddr,
+							    inet->sport,
+							    "laddr", "lport");
+					avc_print_ipv4_addr(ab, inet->daddr,
+							    inet->dport,
+							    "faddr", "fport");
 					break;
 				}
 				case AF_INET6: {
@@ -642,34 +636,32 @@ void avc_audit(u32 ssid, u32 tsid,
 				case AF_UNIX:
 					u = unix_sk(sk);
 					if (u->dentry) {
-						p = d_path(u->dentry,
-							   u->mnt,
-							   avc_audit_buffer,
-							   PAGE_SIZE);
-						printk(" path=%s", p);
-					} else if (u->addr) {
-						p = avc_audit_buffer;
-						memcpy(p,
-						       u->addr->name->sun_path,
-						       u->addr->len-sizeof(short));
-						if (*p == 0) {
-							*p = '@';
-							p += u->addr->len-sizeof(short);
-							*p = 0;
-						}
-						printk(" path=%s",
-						       avc_audit_buffer);
+						audit_log_d_path(ab, "path=",
+							u->dentry, u->mnt);
+						break;
 					}
+					if (!u->addr)
+						break;
+					len = u->addr->len-sizeof(short);
+					p = &u->addr->name->sun_path[0];
+					if (*p)
+						audit_log_format(ab,
+							"path=%*.*s", len,
+							len, p);
+					else
+						audit_log_format(ab,
+							"path=@%*.*s", len-1,
+							len-1, p+1);
 					break;
 				}
 			}
 			
 			switch (a->u.net.family) {
 			case AF_INET:
-				avc_print_ipv4_addr(a->u.net.v4info.saddr,
+				avc_print_ipv4_addr(ab, a->u.net.v4info.saddr,
 						    a->u.net.sport,
 						    "saddr", "src");
-				avc_print_ipv4_addr(a->u.net.v4info.daddr,
+				avc_print_ipv4_addr(ab, a->u.net.v4info.daddr,
 						    a->u.net.dport,
 						    "daddr", "dest");
 				break;
@@ -683,15 +675,14 @@ void avc_audit(u32 ssid, u32 tsid,
 				break;
 			}
 			if (a->u.net.netif)
-				printk(" netif=%s", a->u.net.netif);
+				audit_log_format(ab, " netif=%s",
+					a->u.net.netif);
 			break;
 		}
 	}
-	printk(" ");
-	avc_dump_query(ssid, tsid, tclass);
-	printk("\n");
-
-	spin_unlock_irqrestore(&avc_log_lock,flags);
+	audit_log_format(ab, " ");
+	avc_dump_query(ab, ssid, tsid, tclass);
+	audit_log_end(ab);
 }
 
 /**
@@ -1120,14 +1111,3 @@ int avc_has_perm(u32 ssid, u32 tsid, u16 tclass,
 	avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata);
 	return rc;
 }
-
-static int __init avc_log_level_setup(char *str)
-{
-	avc_log_level = simple_strtol(str, NULL, 0);
-	if (avc_log_level > 7)
-		avc_log_level = 7;
-	return 1;
-}
-
-__setup("avc_log_level=", avc_log_level_setup);
-
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index c143db4ca685..86bdeef585a4 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -127,9 +127,10 @@ static inline void avc_cache_stats_add(int type, unsigned val)
 /*
  * AVC display support
  */
-void avc_dump_av(u16 tclass, u32 av);
-void avc_dump_query(u32 ssid, u32 tsid, u16 tclass);
-void avc_dump_cache(char *tag);
+struct audit_buffer;
+void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av);
+void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tclass);
+void avc_dump_cache(struct audit_buffer *ab, char *tag);
 
 /*
  * AVC operations
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index d8c18ed0087b..f2a53e22b060 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -399,7 +399,7 @@ int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len)
 			char *scontextp;
 
 			*scontext_len = strlen(initial_sid_to_string[sid]) + 1;
-			scontextp = kmalloc(*scontext_len,GFP_KERNEL);
+			scontextp = kmalloc(*scontext_len,GFP_ATOMIC);
 			strcpy(scontextp, initial_sid_to_string[sid]);
 			*scontext = scontextp;
 			goto out;
-- 
cgit v1.2.3


From 0e568881178ff0e0aceeafdb51f9fecab39e1923 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:33:21 -0700
Subject: [PATCH] fix posix-timers to have proper per-process scope

From: Roland McGrath <roland@redhat.com>

The posix-timers implementation associates timers with the creating thread
and destroys timers when their creator thread dies.  POSIX clearly
specifies that these timers are per-process, and a timer should not be torn
down when the thread that created it exits.  I hope there won't be any
controversy on what the correct semantics are here, since POSIX is clear
and the Linux feature is called "posix-timers".

The attached program built with NPTL -lrt -lpthread demonstrates the bug.
The program is correct by POSIX, but fails on Linux.  Note that a until
just the other day, NPTL had a trivial bug that always disabled its use of
kernel timer syscalls (check strace for lack of timer_create/SYS_259).  So
unless you have built your own NPTL libs very recently, you probably won't
see the kernel calls actually used by this program.

Also attached is my patch to fix this.  It (you guessed it) moves the
posix_timers field from task_struct to signal_struct.  Access is now
governed by the siglock instead of the task lock.  exit_itimers is called
from __exit_signal, i.e.  only on the death of the last thread in the
group, rather than from do_exit for every thread.  Timers' it_process
fields store the group leader's pointer, which won't die.  For the case of
SIGEV_THREAD_ID, I hold a ref on the task_struct for it_process to stay
robust in case the target thread dies; the ref is released and the dangling
pointer cleared when the timer fires and the target thread is dead.  (This
should only come up in a buggy user program, so noone cares exactly how the
kernel handles that case.  But I think what I did is robust and sensical.)

/* Test for bogus per-thread deletion of timers.  */

#include <stdio.h>
#include <error.h>
#include <time.h>
#include <signal.h>
#include <stdint.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#include <pthread.h>

/* Creating timers in another thread should work too.  */
static void *do_timer_create(void *arg)
{
	struct sigevent *const sigev = arg;
	timer_t *const timerId = sigev->sigev_value.sival_ptr;
	if (timer_create(CLOCK_REALTIME, sigev, timerId) < 0) {
		perror("timer_create");
		return NULL;
	}
	return timerId;
}

int main(void)
{
	int i, res;
	timer_t timerId;
	struct itimerspec itval;
	struct sigevent sigev;

	itval.it_interval.tv_sec = 2;
	itval.it_interval.tv_nsec = 0;
	itval.it_value.tv_sec = 2;
	itval.it_value.tv_nsec = 0;

	sigev.sigev_notify = SIGEV_SIGNAL;
	sigev.sigev_signo = SIGALRM;
	sigev.sigev_value.sival_ptr = (void *)&timerId;

	for (i = 0; i < 100; i++) {
		printf("cnt = %d\n", i);

		pthread_t thr;
		res = pthread_create(&thr, NULL, &do_timer_create, &sigev);
		if (res) {
			error(0, res, "pthread_create");
			continue;
		}
		void *val;
		res = pthread_join(thr, &val);
		if (res) {
			error(0, res, "pthread_join");
			continue;
		}
		if (val == NULL)
			continue;

		res = timer_settime(timerId, 0, &itval, NULL);
		if (res < 0)
			perror("timer_settime");

		res = timer_delete(timerId);
		if (res < 0)
			perror("timer_delete");
	}

	return 0;
}
---
 fs/exec.c                 |  1 -
 include/linux/init_task.h |  2 +-
 include/linux/sched.h     |  6 ++-
 kernel/exit.c             |  1 -
 kernel/fork.c             |  2 +-
 kernel/posix-timers.c     | 97 ++++++++++++++++++++++++++++++++++-------------
 kernel/signal.c           |  2 +-
 7 files changed, 78 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 26e3392b6369..5fb9f8f7c38f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -856,7 +856,6 @@ int flush_old_exec(struct linux_binprm * bprm)
 			
 	flush_signal_handlers(current, 0);
 	flush_old_files(current->files);
-	exit_itimers(current);
 
 	return 0;
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5c4843a08917..29189706ea57 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -49,6 +49,7 @@
 	.shared_pending	= { 				\
 		.list = LIST_HEAD_INIT(sig.shared_pending.list),	\
 		.signal =  {{0}}}, \
+	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
 }
 
 #define INIT_SIGHAND(sighand) {	\
@@ -107,7 +108,6 @@ extern struct group_info init_groups;
 		.list = LIST_HEAD_INIT(tsk.pending.list),		\
 		.signal = {{0}}},					\
 	.blocked	= {{0}},					\
-	.posix_timers	 = LIST_HEAD_INIT(tsk.posix_timers),		\
 	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b72c38420d71..17bbedd6bb3d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -270,6 +270,9 @@ struct signal_struct {
 	/* thread group stop support, overloads group_exit_code too */
 	int			group_stop_count;
 
+	/* POSIX.1b Interval Timers */
+	struct list_head posix_timers;
+
 	/* job control IDs */
 	pid_t pgrp;
 	pid_t tty_old_pgrp;
@@ -433,7 +436,6 @@ struct task_struct {
 	unsigned long it_real_value, it_prof_value, it_virt_value;
 	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
 	struct timer_list real_timer;
-	struct list_head posix_timers; /* POSIX.1b Interval Timers */
 	unsigned long utime, stime, cutime, cstime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
 	u64 start_time;
@@ -728,7 +730,7 @@ extern void exit_signal(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
 extern void exit_sighand(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
-extern void exit_itimers(struct task_struct *);
+extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 8157dbc037d6..0ec66729ead8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -776,7 +776,6 @@ asmlinkage NORET_TYPE void do_exit(long code)
 	__exit_files(tsk);
 	__exit_fs(tsk);
 	exit_namespace(tsk);
-	exit_itimers(tsk);
 	exit_thread();
 
 	if (tsk->signal->leader)
diff --git a/kernel/fork.c b/kernel/fork.c
index 6035db6957f8..b4cbfd04847b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -815,6 +815,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	sig->group_stop_count = 0;
 	sig->curr_target = NULL;
 	init_sigpending(&sig->shared_pending);
+	INIT_LIST_HEAD(&sig->posix_timers);
 
 	sig->tty = current->signal->tty;
 	sig->pgrp = process_group(current);
@@ -932,7 +933,6 @@ struct task_struct *copy_process(unsigned long clone_flags,
 
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
-	INIT_LIST_HEAD(&p->posix_timers);
 	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 082693e383cf..3de4d0ae9d26 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -317,12 +317,21 @@ static void timer_notify_task(struct k_itimer *timr)
 	if (timr->it_incr)
 		timr->sigq->info.si_sys_private = ++timr->it_requeue_pending;
 
-	if (timr->it_sigev_notify & SIGEV_THREAD_ID )
+	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+		if (unlikely(timr->it_process->flags & PF_EXITING)) {
+			timr->it_sigev_notify = SIGEV_SIGNAL;
+			put_task_struct(timr->it_process);
+			timr->it_process = timr->it_process->group_leader;
+			goto group;
+		}
 		ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
-	else
+	}
+	else {
+	group:
 		ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
+	}
 	if (ret) {
 		/*
 		 * signal was not sent because of sig_ignor
@@ -352,7 +361,7 @@ static void posix_timer_fn(unsigned long __data)
 
 static inline struct task_struct * good_sigevent(sigevent_t * event)
 {
-	struct task_struct *rtn = current;
+	struct task_struct *rtn = current->group_leader;
 
 	if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
 		(!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
@@ -395,11 +404,15 @@ static struct k_itimer * alloc_posix_timer(void)
 static void release_posix_timer(struct k_itimer *tmr)
 {
 	if (tmr->it_id != -1) {
-		spin_lock_irq(&idr_lock);
+		unsigned long flags;
+		spin_lock_irqsave(&idr_lock, flags);
 		idr_remove(&posix_timers_id, tmr->it_id);
-		spin_unlock_irq(&idr_lock);
+		spin_unlock_irqrestore(&idr_lock, flags);
 	}
 	sigqueue_free(tmr->sigq);
+	if (unlikely(tmr->it_process) &&
+	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+		put_task_struct(tmr->it_process);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
@@ -414,6 +427,7 @@ sys_timer_create(clockid_t which_clock,
 	struct k_itimer *new_timer = NULL;
 	timer_t new_timer_id;
 	struct task_struct *process = 0;
+	unsigned long flags;
 	sigevent_t event;
 
 	if ((unsigned) which_clock >= MAX_CLOCKS ||
@@ -458,7 +472,7 @@ sys_timer_create(clockid_t which_clock,
 			 * We may be setting up this process for another
 			 * thread.  It may be exiting.  To catch this
 			 * case the we check the PF_EXITING flag.  If
-			 * the flag is not set, the task_lock will catch
+			 * the flag is not set, the siglock will catch
 			 * him before it is too late (in exit_itimers).
 			 *
 			 * The exec case is a bit more invloved but easy
@@ -469,13 +483,14 @@ sys_timer_create(clockid_t which_clock,
 			 * for us to die which means we can finish this
 			 * linkage with our last gasp. I.e. no code :)
 			 */
-			task_lock(process);
+			spin_lock_irqsave(&process->sighand->siglock, flags);
 			if (!(process->flags & PF_EXITING)) {
 				list_add(&new_timer->list,
-					 &process->posix_timers);
-				task_unlock(process);
+					 &process->signal->posix_timers);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
+				get_task_struct(process);
 			} else {
-				task_unlock(process);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
 				process = 0;
 			}
 		}
@@ -491,10 +506,10 @@ sys_timer_create(clockid_t which_clock,
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;
 		new_timer->it_sigev_signo = SIGALRM;
 		new_timer->it_sigev_value.sival_int = new_timer->it_id;
-		process = current;
-		task_lock(process);
-		list_add(&new_timer->list, &process->posix_timers);
-		task_unlock(process);
+		process = current->group_leader;
+		spin_lock_irqsave(&process->sighand->siglock, flags);
+		list_add(&new_timer->list, &process->signal->posix_timers);
+		spin_unlock_irqrestore(&process->sighand->siglock, flags);
 	}
 
 	new_timer->it_clock = which_clock;
@@ -925,14 +940,18 @@ retry_delete:
 #else
 	p_timer_del(&posix_clocks[timer->it_clock], timer);
 #endif
-	task_lock(timer->it_process);
+	spin_lock(&current->sighand->siglock);
 	list_del(&timer->list);
-	task_unlock(timer->it_process);
+	spin_unlock(&current->sighand->siglock);
 	/*
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
 	timer->it_process = NULL;
+	}
 	unlock_timer(timer, flags);
 	release_posix_timer(timer);
 	return 0;
@@ -942,24 +961,50 @@ retry_delete:
  */
 static inline void itimer_delete(struct k_itimer *timer)
 {
-	if (sys_timer_delete(timer->it_id))
-		BUG();
+	unsigned long flags;
+
+#ifdef CONFIG_SMP
+	int error;
+retry_delete:
+#endif
+	spin_lock_irqsave(&timer->it_lock, flags);
+
+#ifdef CONFIG_SMP
+	error = p_timer_del(&posix_clocks[timer->it_clock], timer);
+
+	if (error == TIMER_RETRY) {
+		unlock_timer(timer, flags);
+		goto retry_delete;
+	}
+#else
+	p_timer_del(&posix_clocks[timer->it_clock], timer);
+#endif
+	list_del(&timer->list);
+	/*
+	 * This keeps any tasks waiting on the spin lock from thinking
+	 * they got something (see the lock code above).
+	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
+		timer->it_process = NULL;
+	}
+	unlock_timer(timer, flags);
+	release_posix_timer(timer);
 }
+
 /*
- * This is exported to exit and exec
+ * This is called by __exit_signal, only when there are no more
+ * references to the shared signal_struct.
  */
-void exit_itimers(struct task_struct *tsk)
+void exit_itimers(struct signal_struct *sig)
 {
 	struct k_itimer *tmr;
 
-	task_lock(tsk);
-	while (!list_empty(&tsk->posix_timers)) {
-		tmr = list_entry(tsk->posix_timers.next, struct k_itimer, list);
-		task_unlock(tsk);
+	while (!list_empty(&sig->posix_timers)) {
+		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
 		itimer_delete(tmr);
-		task_lock(tsk);
 	}
-	task_unlock(tsk);
 }
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index 7a4b479a6f45..c69671600bef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -352,6 +352,7 @@ void __exit_signal(struct task_struct *tsk)
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 		tsk->signal = NULL;
+		exit_itimers(sig);
 		spin_unlock(&sighand->siglock);
 		flush_sigqueue(&sig->shared_pending);
 		kmem_cache_free(signal_cachep, sig);
@@ -2555,4 +2556,3 @@ void __init signals_init(void)
 	if (!sigqueue_cachep)
 		panic("signals_init(): cannot create sigqueue SLAB cache");
 }
-
-- 
cgit v1.2.3


From c8b976af1af10de3d92968bf7d4bd5415e8a3778 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:41:36 -0700
Subject: [PATCH] hugetlb consolidation

From: William Lee Irwin III <wli@holomorphy.com>

The following patch consolidates redundant code in various hugetlb
implementations.  I took the liberty of renaming a few things, since the
code was all moved anyway, and it has the benefit of helping to catch
missed conversions and/or consolidations.
---
 arch/i386/mm/hugetlbpage.c    | 264 +-----------------------------------------
 arch/ia64/mm/hugetlbpage.c    | 251 +--------------------------------------
 arch/ppc64/mm/hugetlbpage.c   | 258 +----------------------------------------
 arch/sh/mm/hugetlbpage.c      | 258 +----------------------------------------
 arch/sparc64/mm/hugetlbpage.c | 259 +----------------------------------------
 fs/hugetlbfs/inode.c          |   2 +-
 include/linux/hugetlb.h       |   7 +-
 kernel/sysctl.c               |   6 +-
 mm/Makefile                   |   1 +
 mm/hugetlb.c                  | 245 +++++++++++++++++++++++++++++++++++++++
 10 files changed, 263 insertions(+), 1288 deletions(-)
 create mode 100644 mm/hugetlb.c

(limited to 'kernel')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 7224ddcb6a11..a702f96373af 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -20,68 +20,6 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static long    htlbpagemem;
-int     htlbpage_max;
-static long    htlbzone_pages;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next, struct page, lru);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-				HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-static void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	int i;
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
-	return page;
-}
-
 static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -276,26 +214,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 }
 #endif
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end)
 {
@@ -319,16 +237,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-void
-zap_hugepage_range(struct vm_area_struct *vma,
-		unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -360,7 +268,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -380,173 +288,3 @@ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
-
-static void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-static int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	/* all lowmem is on node 0 */
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, lru);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-static int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(ctl_table *table, int write,
-		struct file *file, void *buffer, size_t *length)
-{
-	if (!cpu_has_pse)
-		return -ENODEV;
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	if (!cpu_has_pse)
-		return -ENODEV;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 3dec8e2f4056..8b5b1cac3a1c 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -22,69 +22,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static long	htlbpagemem;
-int		htlbpage_max;
-static long	htlbzone_pages;
-unsigned int	hpage_shift=HPAGE_SHIFT_DEFAULT;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next, struct page, lru);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	int i;
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
-	return page;
-}
+unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;
 
 static pte_t *
 huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
@@ -244,26 +182,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri
 	return NULL;
 }
 
-void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 /*
  * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
  * are hugetlb region specific.
@@ -339,14 +257,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsig
 	flush_tlb_range(vma, start, end);
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -378,7 +288,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -422,106 +332,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, u
 		addr = ALIGN(vmm->vm_end, HPAGE_SIZE);
 	}
 }
-void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, lru);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-int set_hugetlb_mem_size(int count)
-{
-	int  lcount;
-	struct page *page ;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(ctl_table *table, int write, struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
 
 static int __init hugetlb_setup_sz(char *str)
 {
@@ -551,60 +361,3 @@ static int __init hugetlb_setup_sz(char *str)
 	return 1;
 }
 __setup("hugepagesz=", hugetlb_setup_sz);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-__initcall(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	if (size > (htlbpagemem << HPAGE_SHIFT))
-		return 0;
-	return 1;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage =	hugetlb_nopage,
-};
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index a7b2c63c700f..e81eeec9b009 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -29,65 +29,6 @@
 
 #include <linux/sysctl.h>
 
-int htlbpage_max;
-
-/* This lock protects the two counters and list below */
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static int htlbpage_free; /* = 0 */
-static int htlbpage_total; /* = 0 */
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-/* XXX make this a sysctl */
-unsigned long largepage_roundrobin = 1;
-
-static struct page *dequeue_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page = NULL;
-	int i;
-
-	if (!largepage_roundrobin)
-		nid = numa_node_id();
-
-	for (i = 0; i < numnodes; i++) {
-		if (!list_empty(&hugepage_freelists[nid]))
-			break;
-		nid = (nid + 1) % numnodes;
-	}
-
-	if (!list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next, struct page, lru);
-		list_del(&page->lru);
-	}
-
-	if (largepage_roundrobin)
-		nid = (nid + 1) % numnodes;
-
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	if (!page)
-		return NULL;
-
-	nid = page_zone(page)->zone_pgdat->node_id;
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
 /* HugePTE layout:
  *
  * 31 30 ... 15 14 13 12 10 9  8  7   6    5    4    3    2    1    0
@@ -119,7 +60,6 @@ typedef struct {unsigned int val;} hugepte_t;
 #define hugepte_none(x)	(!(hugepte_val(x) & _HUGEPAGE_PFN))
 
 
-static void free_huge_page(struct page *page);
 static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
 				hugepte_t pte, int local);
 
@@ -146,27 +86,6 @@ static inline void set_hugepte(hugepte_t *ptep, hugepte_t pte)
 		       hugepte_val(pte) & ~_HUGEPAGE_HPTEFLAGS);
 }
 
-static struct page *alloc_hugetlb_page(void)
-{
-	int i;
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-
-	htlbpage_free--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
-	return page;
-}
-
 static hugepte_t *hugepte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -448,26 +367,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return page;
 }
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpage_free++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end)
 {
@@ -510,16 +409,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	mm->rss -= (end - start) >> PAGE_SHIFT;
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -554,7 +443,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -876,148 +765,3 @@ static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
 
 	ppc_md.hpte_invalidate(slot, va, 1, local);
 }
-
-static void split_and_free_hugepage(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbpage_total--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (!(cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE))
-		return 0;
-	
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbpage_total;
-
-	if (lcount == 0)
-		return htlbpage_total;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpage_free++;
-			htlbpage_total++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return htlbpage_total;
-	}
-	/* Shrink the memory size. */
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		split_and_free_hugepage(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return htlbpage_total;
-}
-
-int hugetlb_sysctl_handler(ctl_table *table, int write,
-		struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	if (cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE) {
-		for (i = 0; i < MAX_NUMNODES; ++i)
-			INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-		for (i = 0; i < htlbpage_max; ++i) {
-			page = alloc_fresh_huge_page();
-			if (!page)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			spin_unlock(&htlbpage_lock);
-		}
-		htlbpage_max = htlbpage_free = htlbpage_total = i;
-		printk(KERN_INFO "Total HugeTLB memory allocated, %d\n",
-		       htlbpage_free);
-	} else {
-		htlbpage_max = 0;
-		printk(KERN_INFO "CPU does not support HugeTLB\n");
-	}
-
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5d\n"
-			"HugePages_Free:  %5d\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbpage_total,
-			htlbpage_free,
-			HPAGE_SIZE/1024);
-}
-
-/* This is advisory only, so we can get away with accesing
- * htlbpage_free without taking the lock. */
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpage_free;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbpage_total * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 6f72d865e8d2..751a7d1a666d 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -24,68 +24,6 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-static long	htlbpagemem;
-int		htlbpage_max;
-static long	htlbzone_pages;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next,
-				  struct page, list);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-static void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	memset(page_address(page), 0, HPAGE_SIZE);
-	return page;
-}
-
 static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -250,25 +188,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end)
 {
@@ -297,16 +216,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -338,7 +247,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -358,168 +267,3 @@ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
-
-static void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-static int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	/* all lowmem is on node 0 */
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, list);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-static int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				   unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 771ec3757d73..b4e6dfa0833a 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -21,68 +21,6 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-static long	htlbpagemem;
-int		htlbpage_max;
-static long	htlbzone_pages;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next,
-				  struct page, lru);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-static void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	memset(page_address(page), 0, HPAGE_SIZE);
-	return page;
-}
-
 static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -247,26 +185,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end)
 {
@@ -295,16 +213,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -336,7 +244,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -356,168 +264,3 @@ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
-
-static void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-static int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	/* all lowmem is on node 0 */
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, lru);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-static int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				   unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c60d937b202e..5e37a271dd2e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -573,7 +573,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			unsigned long long size = memparse(value, &rest);
 			if (*rest == '%') {
 				size <<= HPAGE_SHIFT;
-				size *= htlbpage_max;
+				size *= max_huge_pages;
 				do_div(size, 100);
 				rest++;
 			}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index da3fc826a0de..b0e98cfe15f9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -28,8 +28,11 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 				pmd_t *pmd, int write);
 int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
 int pmd_huge(pmd_t pmd);
+struct page *alloc_huge_page(void);
+void free_huge_page(struct page *);
 
-extern int htlbpage_max;
+extern unsigned long max_huge_pages;
+extern const unsigned long hugetlb_zero, hugetlb_infinity;
 
 static inline void
 mark_mm_hugetlb(struct mm_struct *mm, struct vm_area_struct *vma)
@@ -78,6 +81,8 @@ static inline unsigned long hugetlb_total_pages(void)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(addr, len)	0
 #define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0)
+#define alloc_huge_page()			({ NULL; })
+#define free_huge_page(p)			({ (void)(p); BUG(); })
 
 #ifndef HPAGE_MASK
 #define HPAGE_MASK	0		/* Keep the compiler happy */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 05ea59ae4276..69e9123cdd0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -710,10 +710,12 @@ static ctl_table vm_table[] = {
 	 {
 		.ctl_name	= VM_HUGETLB_PAGES,
 		.procname	= "nr_hugepages",
-		.data		= &htlbpage_max,
-		.maxlen		= sizeof(int),
+		.data		= &max_huge_pages,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &hugetlb_sysctl_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
 	 },
 #endif
 	{
diff --git a/mm/Makefile b/mm/Makefile
index c66aba5886f8..5f3baecd85a7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,3 +12,4 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   slab.o swap.o truncate.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
new file mode 100644
index 000000000000..cb72a40c38b6
--- /dev/null
+++ b/mm/hugetlb.c
@@ -0,0 +1,245 @@
+/*
+ * Generic hugetlb support.
+ * (C) William Irwin, April 2004
+ */
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/sysctl.h>
+
+const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
+static unsigned long nr_huge_pages, free_huge_pages;
+unsigned long max_huge_pages;
+static struct list_head hugepage_freelists[MAX_NUMNODES];
+static spinlock_t hugetlb_lock = SPIN_LOCK_UNLOCKED;
+
+static void enqueue_huge_page(struct page *page)
+{
+	list_add(&page->lru,
+		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
+}
+
+static struct page *dequeue_huge_page(void)
+{
+	int nid = numa_node_id();
+	struct page *page = NULL;
+
+	if (list_empty(&hugepage_freelists[nid])) {
+		for (nid = 0; nid < MAX_NUMNODES; ++nid)
+			if (!list_empty(&hugepage_freelists[nid]))
+				break;
+	}
+	if (nid >= 0 && nid < MAX_NUMNODES &&
+	    !list_empty(&hugepage_freelists[nid])) {
+		page = list_entry(hugepage_freelists[nid].next,
+				  struct page, lru);
+		list_del(&page->lru);
+	}
+	return page;
+}
+
+static struct page *alloc_fresh_huge_page(void)
+{
+	static int nid = 0;
+	struct page *page;
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
+	nid = (nid + 1) % numnodes;
+	return page;
+}
+
+void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	free_huge_pages++;
+	spin_unlock(&hugetlb_lock);
+}
+
+struct page *alloc_huge_page(void)
+{
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	page = dequeue_huge_page();
+	if (!page) {
+		spin_unlock(&hugetlb_lock);
+		return NULL;
+	}
+	free_huge_pages--;
+	spin_unlock(&hugetlb_lock);
+	set_page_count(page, 1);
+	page->lru.prev = (void *)free_huge_page;
+	memset(page_address(page), 0, HPAGE_SIZE);
+	return page;
+}
+
+void huge_page_release(struct page *page)
+{
+	if (!put_page_testzero(page))
+		return;
+
+	free_huge_page(page);
+}
+
+static int __init hugetlb_init(void)
+{
+	unsigned long i;
+	struct page *page;
+
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		INIT_LIST_HEAD(&hugepage_freelists[i]);
+
+	for (i = 0; i < max_huge_pages; ++i) {
+		page = alloc_fresh_huge_page();
+		if (!page)
+			break;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		spin_unlock(&hugetlb_lock);
+	}
+	max_huge_pages = free_huge_pages = nr_huge_pages = i;
+	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
+	return 0;
+}
+module_init(hugetlb_init);
+
+static int __init hugetlb_setup(char *s)
+{
+	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
+		max_huge_pages = 0;
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static void update_and_free_page(struct page *page)
+{
+	int i;
+	nr_huge_pages--;
+	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
+		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+				1 << PG_private | 1<< PG_writeback);
+		set_page_count(&page[i], 0);
+	}
+	set_page_count(page, 1);
+	__free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
+#ifdef CONFIG_HIGHMEM
+static int try_to_free_low(unsigned long count)
+{
+	int i;
+	for (i = 0; i < MAX_NUMNODES; ++i) {
+		struct page *page;
+		list_for_each_entry(page, &hugepage_freelists[i], lru) {
+			if (PageHighMem(page))
+				continue;
+			list_del(&page->lru);
+			update_and_free_page(page);
+			--free_huge_pages;
+			if (!--count)
+				return 0;
+		}
+	}
+	return count;
+}
+#else
+static inline int try_to_free_low(unsigned long count)
+{
+	return count;
+}
+#endif
+
+static unsigned long set_max_huge_pages(unsigned long count)
+{
+	while (count > nr_huge_pages) {
+		struct page *page = alloc_fresh_huge_page();
+		if (!page)
+			return nr_huge_pages;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		free_huge_pages++;
+		nr_huge_pages++;
+		spin_unlock(&hugetlb_lock);
+	}
+	if (count >= nr_huge_pages)
+		return nr_huge_pages;
+
+	spin_lock(&hugetlb_lock);
+	for (count = try_to_free_low(count); count < nr_huge_pages; --free_huge_pages) {
+		struct page *page = dequeue_huge_page();
+		if (!page)
+			break;
+		update_and_free_page(page);
+	}
+	spin_unlock(&hugetlb_lock);
+	return nr_huge_pages;
+}
+
+#ifdef CONFIG_SYSCTL
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			   struct file *file, void *buffer, size_t *length)
+{
+	proc_doulongvec_minmax(table, write, file, buffer, length);
+	max_huge_pages = set_max_huge_pages(max_huge_pages);
+	return 0;
+}
+#endif /* CONFIG_SYSCTL */
+
+int hugetlb_report_meminfo(char *buf)
+{
+	return sprintf(buf,
+			"HugePages_Total: %5lu\n"
+			"HugePages_Free:  %5lu\n"
+			"Hugepagesize:    %5lu kB\n",
+			nr_huge_pages,
+			free_huge_pages,
+			HPAGE_SIZE/1024);
+}
+
+int is_hugepage_mem_enough(size_t size)
+{
+	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
+}
+
+/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
+}
+EXPORT_SYMBOL(hugetlb_total_pages);
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all.  They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
+ * this far.
+ */
+static struct page *hugetlb_nopage(struct vm_area_struct *vma,
+				unsigned long address, int *unused)
+{
+	BUG();
+	return NULL;
+}
+
+struct vm_operations_struct hugetlb_vm_ops = {
+	.nopage = hugetlb_nopage,
+};
+
+void zap_hugepage_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long length)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	spin_lock(&mm->page_table_lock);
+	unmap_hugepage_range(vma, start, start + length);
+	spin_unlock(&mm->page_table_lock);
+}
-- 
cgit v1.2.3


From 6d27f67bf6ee2b9ad0c8814118264bc273d916a1 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:15:51 -0700
Subject: [PATCH] per-backing dev unplugging

From: Jens Axboe <axboe@suse.de>,
      Chris Mason,
      me, others.

The global unplug list causes horrid spinlock contention on many-disk
many-CPU setups - throughput is worse than halved.

The other problem with the global unplugging is of course that it will cause
the unplugging of queues which are unrelated to the I/O upon which the caller
is about to wait.

So what we do to solve these problems is to remove the global unplug and set
up the infrastructure under which the VFS can tell the block layer to unplug
only those queues which are relevant to the page or buffer_head whcih is
about to be waited upon.

We do this via the very appropriate address_space->backing_dev_info structure.

Most of the complexity is in devicemapper, MD and swapper_space, because for
these backing devices, multiple queues may need to be unplugged to complete a
page/buffer I/O.  In each case we ensure that data structures are in place to
permit us to identify all the lower-level queues which contribute to the
higher-level backing_dev_info.  Each contributing queue is told to unplug in
response to a higher-level unplug.

To simplify things in various places we also introduce the concept of a
"synchronous BIO": it is tagged with BIO_RW_SYNC.  The block layer will
perform an immediate unplug when it sees one of these go past.
---
 drivers/block/ll_rw_blk.c    | 96 +++++++++++++-------------------------------
 drivers/block/loop.c         | 15 ++++++-
 drivers/block/rd.c           |  1 +
 drivers/block/umem.c         |  3 +-
 drivers/md/dm-crypt.c        |  2 +-
 drivers/md/dm-table.c        | 16 ++++++++
 drivers/md/dm.c              | 23 +++++++++--
 drivers/md/dm.h              |  1 +
 drivers/md/md.c              | 32 +++++++++++++--
 drivers/md/raid1.c           |  3 ++
 drivers/md/raid5.c           |  4 +-
 drivers/md/raid6main.c       |  3 +-
 drivers/mtd/devices/blkmtd.c |  6 +--
 fs/buffer.c                  | 12 ++++--
 fs/direct-io.c               |  4 +-
 fs/jfs/jfs_logmgr.c          |  6 +--
 fs/ntfs/compress.c           |  3 +-
 fs/ufs/truncate.c            |  3 +-
 fs/xfs/linux/xfs_buf.c       | 24 ++++-------
 include/linux/backing-dev.h  |  3 ++
 include/linux/bio.h          |  3 ++
 include/linux/blkdev.h       | 23 ++++++++---
 include/linux/fs.h           |  2 +
 include/linux/raid/md.h      |  1 +
 include/linux/raid/md_k.h    | 26 ------------
 include/linux/swap.h         |  3 ++
 kernel/power/disk.c          |  1 -
 kernel/power/pmdisk.c        |  3 +-
 kernel/power/swsusp.c        |  5 ---
 mm/filemap.c                 |  4 +-
 mm/mempool.c                 |  2 -
 mm/nommu.c                   |  5 +++
 mm/readahead.c               |  8 +++-
 mm/shmem.c                   |  1 +
 mm/swap_state.c              |  1 +
 mm/swapfile.c                | 65 +++++++++++++++++++++++++++++-
 36 files changed, 254 insertions(+), 159 deletions(-)

(limited to 'kernel')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index fc4b6c698fcf..209fdef4d986 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -42,12 +42,6 @@ static void blk_unplug_timeout(unsigned long data);
  */
 static kmem_cache_t *request_cachep;
 
-/*
- * plug management
- */
-static LIST_HEAD(blk_plug_list);
-static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-
 static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -251,8 +245,6 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 	 */
 	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
-	INIT_LIST_HEAD(&q->plug_list);
-
 	blk_queue_activity_fn(q, NULL, NULL);
 }
 
@@ -1104,13 +1096,11 @@ void blk_plug_device(request_queue_t *q)
 	 * don't plug a stopped queue, it must be paired with blk_start_queue()
 	 * which will restart the queueing
 	 */
-	if (!blk_queue_plugged(q)
-	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
-		spin_lock(&blk_plug_lock);
-		list_add_tail(&q->plug_list, &blk_plug_list);
+	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
+		return;
+
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		spin_unlock(&blk_plug_lock);
-	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1122,15 +1112,12 @@ EXPORT_SYMBOL(blk_plug_device);
 int blk_remove_plug(request_queue_t *q)
 {
 	WARN_ON(!irqs_disabled());
-	if (blk_queue_plugged(q)) {
-		spin_lock(&blk_plug_lock);
-		list_del_init(&q->plug_list);
-		del_timer(&q->unplug_timer);
-		spin_unlock(&blk_plug_lock);
-		return 1;
-	}
 
-	return 0;
+	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+		return 0;
+
+	del_timer(&q->unplug_timer);
+	return 1;
 }
 
 EXPORT_SYMBOL(blk_remove_plug);
@@ -1161,24 +1148,32 @@ static inline void __generic_unplug_device(request_queue_t *q)
  *   Linux uses plugging to build bigger requests queues before letting
  *   the device have at them. If a queue is plugged, the I/O scheduler
  *   is still adding and merging requests on the queue. Once the queue
- *   gets unplugged (either by manually calling this function, or by
- *   calling blk_run_queues()), the request_fn defined for the
- *   queue is invoked and transfers started.
+ *   gets unplugged, the request_fn defined for the queue is invoked and
+ *   transfers started.
  **/
-void generic_unplug_device(void *data)
+void generic_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
-
 	spin_lock_irq(q->queue_lock);
 	__generic_unplug_device(q);
 	spin_unlock_irq(q->queue_lock);
 }
-
 EXPORT_SYMBOL(generic_unplug_device);
 
+static void blk_backing_dev_unplug(struct backing_dev_info *bdi)
+{
+	request_queue_t *q = bdi->unplug_io_data;
+
+	/*
+	 * devices don't necessarily have an ->unplug_fn defined
+	 */
+	if (q->unplug_fn)
+		q->unplug_fn(q);
+}
+
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
+
 	q->unplug_fn(q);
 }
 
@@ -1255,42 +1250,6 @@ void blk_run_queue(struct request_queue *q)
 
 EXPORT_SYMBOL(blk_run_queue);
 
-/**
- * blk_run_queues - fire all plugged queues
- *
- * Description:
- *   Start I/O on all plugged queues known to the block layer. Queues that
- *   are currently stopped are ignored. This is equivalent to the older
- *   tq_disk task queue run.
- **/
-#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
-void blk_run_queues(void)
-{
-	LIST_HEAD(local_plug_list);
-
-	spin_lock_irq(&blk_plug_lock);
-
-	/*
-	 * this will happen fairly often
-	 */
-	if (list_empty(&blk_plug_list))
-		goto out;
-
-	list_splice_init(&blk_plug_list, &local_plug_list);
-	
-	while (!list_empty(&local_plug_list)) {
-		request_queue_t *q = blk_plug_entry(local_plug_list.next);
-
-		spin_unlock_irq(&blk_plug_lock);
-		q->unplug_fn(q);
-		spin_lock_irq(&blk_plug_lock);
-	}
-out:
-	spin_unlock_irq(&blk_plug_lock);
-}
-
-EXPORT_SYMBOL(blk_run_queues);
-
 /**
  * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
  * @q:    the request queue to be released
@@ -1390,6 +1349,10 @@ request_queue_t *blk_alloc_queue(int gfp_mask)
 	memset(q, 0, sizeof(*q));
 	init_timer(&q->unplug_timer);
 	atomic_set(&q->refcnt, 1);
+
+	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+	q->backing_dev_info.unplug_io_data = q;
+
 	return q;
 }
 
@@ -2050,7 +2013,6 @@ long blk_congestion_wait(int rw, long timeout)
 	DEFINE_WAIT(wait);
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
 
-	blk_run_queues();
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 	ret = io_schedule_timeout(timeout);
 	finish_wait(wqh, &wait);
@@ -2315,7 +2277,7 @@ out:
 	if (blk_queue_plugged(q)) {
 		int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
 
-		if (nr_queued == q->unplug_thresh)
+		if (nr_queued == q->unplug_thresh || bio_sync(bio))
 			__generic_unplug_device(q);
 	}
 	spin_unlock_irq(q->queue_lock);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f29f72ee30d0..a43c545071cb 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -434,6 +434,17 @@ inactive:
 	goto out;
 }
 
+/*
+ * kick off io on the underlying address space
+ */
+static void loop_unplug(request_queue_t *q)
+{
+	struct loop_device *lo = q->queuedata;
+
+	clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+	blk_run_address_space(lo->lo_backing_file->f_mapping);
+}
+
 struct switch_request {
 	struct file *file;
 	struct completion wait;
@@ -614,7 +625,6 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 {
 	struct file	*file;
 	struct inode	*inode;
-	struct block_device *lo_device = NULL;
 	struct address_space *mapping;
 	unsigned lo_blocksize;
 	int		lo_flags = 0;
@@ -671,7 +681,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
 	lo->lo_blocksize = lo_blocksize;
-	lo->lo_device = lo_device;
+	lo->lo_device = bdev;
 	lo->lo_flags = lo_flags;
 	lo->lo_backing_file = file;
 	lo->transfer = NULL;
@@ -688,6 +698,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	 */
 	blk_queue_make_request(lo->lo_queue, loop_make_request);
 	lo->lo_queue->queuedata = lo;
+	lo->lo_queue->unplug_fn = loop_unplug;
 
 	set_capacity(disks[lo->lo_number], size);
 	bd_set_size(bdev, size << 9);
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index e626344c9b58..3dd9163a64e2 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -271,6 +271,7 @@ static int rd_ioctl(struct inode *inode, struct file *file,
 static struct backing_dev_info rd_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn = default_unplug_io_fn,
 };
 
 static int rd_open(struct inode *inode, struct file *filp)
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 31cd010f4d56..5a1e349b131d 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -368,9 +368,8 @@ static inline void reset_page(struct mm_page *page)
 	page->biotail = & page->bio;
 }
 
-static void mm_unplug_device(void *data)
+static void mm_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	struct cardinfo *card = q->queuedata;
 	unsigned long flags;
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8e1798115e2f..a17b25380fce 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -668,7 +668,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 
 		/* out of memory -> run queues */
 		if (remaining)
-			blk_run_queues();
+			blk_congestion_wait(bio_data_dir(clone), HZ/100);
 	}
 
 	/* drop reference, clones could have returned before we reach this */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4aa6c43ffd01..93dc0e6361c0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -885,8 +885,24 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 	return r;
 }
 
+void dm_table_unplug_all(struct dm_table *t)
+{
+	struct list_head *d, *devices = dm_table_get_devices(t);
+
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *q = bdev_get_queue(dd->bdev);
+
+		if (q->unplug_fn)
+			q->unplug_fn(q);
+	}
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
 EXPORT_SYMBOL(dm_table_event);
 EXPORT_SYMBOL(dm_table_get_mode);
+EXPORT_SYMBOL(dm_table_put);
+EXPORT_SYMBOL(dm_table_get);
+EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6dc34c8b4604..542f9cd0acc0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -575,6 +575,17 @@ static int dm_request(request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
+static void dm_unplug_all(request_queue_t *q)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
+}
+
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r;
@@ -672,6 +683,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
+	md->queue->unplug_fn = dm_unplug_all;
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
 				     mempool_free_slab, _io_cache);
@@ -896,11 +908,17 @@ int dm_suspend(struct mapped_device *md)
 	add_wait_queue(&md->wait, &wait);
 	up_write(&md->lock);
 
+	/* unplug */
+	map = dm_get_table(md);
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
+
 	/*
 	 * Then we wait for the already mapped ios to
 	 * complete.
 	 */
-	blk_run_queues();
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -945,10 +963,9 @@ int dm_resume(struct mapped_device *md)
 	def = bio_list_get(&md->deferred);
 	__flush_deferred_io(md, def);
 	up_write(&md->lock);
+	dm_table_unplug_all(map);
 	dm_table_put(map);
 
-	blk_run_queues();
-
 	return 0;
 }
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 780185db38d0..34bf0e7cceb2 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -116,6 +116,7 @@ int dm_table_get_mode(struct dm_table *t);
 void dm_table_suspend_targets(struct dm_table *t);
 void dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+void dm_table_unplug_all(struct dm_table *t);
 
 /*-----------------------------------------------------------------
  * A registry of target types.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index aa6fef11aa4e..72d6a2da5827 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -160,6 +160,30 @@ static int md_fail_request (request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
+void md_unplug_mddev(mddev_t *mddev)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	/*
+	 * this list iteration is done without any locking in md?!
+	 */
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+EXPORT_SYMBOL(md_unplug_mddev);
+
+static void md_unplug_all(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+
+	md_unplug_mddev(mddev);
+}
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -335,6 +359,8 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	struct bio_vec vec;
 	struct completion event;
 
+	rw |= (1 << BIO_RW_SYNC);
+
 	bio_init(&bio);
 	bio.bi_io_vec = &vec;
 	vec.bv_page = page;
@@ -349,7 +375,6 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	bio.bi_private = &event;
 	bio.bi_end_io = bi_complete;
 	submit_bio(rw, &bio);
-	blk_run_queues();
 	wait_for_completion(&event);
 
 	return test_bit(BIO_UPTODATE, &bio.bi_flags);
@@ -1644,6 +1669,7 @@ static int do_md_run(mddev_t * mddev)
 	 */
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
+	mddev->queue->unplug_fn = md_unplug_all;
 
 	mddev->changed = 1;
 	return 0;
@@ -2718,7 +2744,7 @@ int md_thread(void * arg)
 		run = thread->run;
 		if (run) {
 			run(thread->mddev);
-			blk_run_queues();
+			md_unplug_mddev(thread->mddev);
 		}
 		if (signal_pending(current))
 			flush_signals(current);
@@ -3287,7 +3313,7 @@ static void md_do_sync(mddev_t *mddev)
 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
 			break;
 
-		blk_run_queues();
+		md_unplug_mddev(mddev);
 
 	repeat:
 		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f308d5fe946f..6616cd46c50f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -451,6 +451,7 @@ rb_out:
 
 static void device_barrier(conf_t *conf, sector_t sect)
 {
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);
 	
@@ -478,6 +479,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	conf->nr_pending++;
@@ -644,6 +646,7 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	spin_unlock_irq(&conf->resync_lock);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b5cc6c4ba6ba..5c9d3fd66913 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -249,6 +249,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 				break;
 			if (!sh) {
 				conf->inactive_blocked = 1;
+				md_unplug_mddev(conf->mddev);
 				wait_event_lock_irq(conf->wait_for_stripe,
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
@@ -1292,9 +1293,8 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
 		}
 	}
 }
-static void raid5_unplug_device(void *data)
+static void raid5_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 747085a6dac0..131f4a1f34eb 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1454,9 +1454,8 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
 		}
 	}
 }
-static void raid6_unplug_device(void *data)
+static void raid6_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid6_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff --git a/drivers/mtd/devices/blkmtd.c b/drivers/mtd/devices/blkmtd.c
index b4b4178943a1..4bd5d3219458 100644
--- a/drivers/mtd/devices/blkmtd.c
+++ b/drivers/mtd/devices/blkmtd.c
@@ -147,8 +147,7 @@ static int blkmtd_readpage(struct blkmtd_dev *dev, struct page *page)
 		bio->bi_private = &event;
 		bio->bi_end_io = bi_read_complete;
 		if(bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
-			submit_bio(READ, bio);
-			blk_run_queues();
+			submit_bio(READ_SYNC, bio);
 			wait_for_completion(&event);
 			err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
 			bio_put(bio);
@@ -179,8 +178,7 @@ static int blkmtd_write_out(struct bio *bio)
 	init_completion(&event);
 	bio->bi_private = &event;
 	bio->bi_end_io = bi_write_complete;
-	submit_bio(WRITE, bio);
-	blk_run_queues();
+	submit_bio(WRITE_SYNC, bio);
 	wait_for_completion(&event);
 	DEBUG(3, "submit_bio completed, bi_vcnt = %d\n", bio->bi_vcnt);
 	err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
diff --git a/fs/buffer.c b/fs/buffer.c
index be9cc963a178..8ab66d0b7548 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -132,7 +132,11 @@ void __wait_on_buffer(struct buffer_head * bh)
 	do {
 		prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 		if (buffer_locked(bh)) {
-			blk_run_queues();
+			struct block_device *bd;
+			smp_mb();
+			bd = bh->b_bdev;
+			if (bd)
+				blk_run_address_space(bd->bd_inode->i_mapping);
 			io_schedule();
 		}
 	} while (buffer_locked(bh));
@@ -492,7 +496,6 @@ static void free_more_memory(void)
 	pg_data_t *pgdat;
 
 	wakeup_bdflush(1024);
-	blk_run_queues();
 	yield();
 
 	for_each_pgdat(pgdat) {
@@ -2927,7 +2930,10 @@ EXPORT_SYMBOL(try_to_free_buffers);
 
 int block_sync_page(struct page *page)
 {
-	blk_run_queues();
+	struct address_space *mapping;
+	smp_mb();
+	mapping = page->mapping;
+	blk_run_address_space(mapping);
 	return 0;
 }
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d022a233820f..79534d258f37 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -364,7 +364,7 @@ static struct bio *dio_await_one(struct dio *dio)
 		if (dio->bio_list == NULL) {
 			dio->waiter = current;
 			spin_unlock_irqrestore(&dio->bio_lock, flags);
-			blk_run_queues();
+			blk_run_address_space(dio->inode->i_mapping);
 			io_schedule();
 			spin_lock_irqsave(&dio->bio_lock, flags);
 			dio->waiter = NULL;
@@ -1035,7 +1035,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		if (ret == 0)
 			ret = dio->result;
 		finished_one_bio(dio);		/* This can free the dio */
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 		if (should_wait) {
 			unsigned long flags;
 			/*
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index b72fb4a40adc..b90aa961dd5a 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1975,8 +1975,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
-	submit_bio(READ, bio);
-	blk_run_queues();
+	submit_bio(READ_SYNC, bio);
 
 	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
 
@@ -2120,9 +2119,8 @@ static void lbmStartIO(struct lbuf * bp)
 
 	/* check if journaling to disk has been disabled */
 	if (!log->no_integrity) {
-		submit_bio(WRITE, bio);
+		submit_bio(WRITE_SYNC, bio);
 		INCREMENT(lmStat.submitted);
-		blk_run_queues();
 	}
 	else {
 		bio->bi_size = 0;
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index a8618f107ead..68231e909496 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -23,6 +23,7 @@
 
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 
 #include "ntfs.h"
 
@@ -668,7 +669,7 @@ lock_retry_remap:
 					"uptodate! Unplugging the disk queue "
 					"and rescheduling.");
 			get_bh(tbh);
-			blk_run_queues();
+			blk_run_address_space(mapping);
 			schedule();
 			put_bh(tbh);
 			if (unlikely(!buffer_uptodate(tbh)))
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 04e50f696202..b22169e7ba76 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -38,6 +38,7 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 #include <linux/sched.h>
 
 #include "swab.h"
@@ -456,7 +457,7 @@ void ufs_truncate (struct inode * inode)
 			break;
 		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
 			ufs_sync_inode (inode);
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 		yield();
 	}
 	offset = inode->i_size & uspi->s_fshift;
diff --git a/fs/xfs/linux/xfs_buf.c b/fs/xfs/linux/xfs_buf.c
index c5f06aad5234..2d4cf586cf85 100644
--- a/fs/xfs/linux/xfs_buf.c
+++ b/fs/xfs/linux/xfs_buf.c
@@ -1013,7 +1013,7 @@ pagebuf_lock(
 {
 	PB_TRACE(pb, "lock", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_sema);
 	PB_SET_OWNER(pb);
 	PB_TRACE(pb, "locked", 0);
@@ -1109,7 +1109,7 @@ _pagebuf_wait_unpin(
 		if (atomic_read(&pb->pb_pin_count) == 0)
 			break;
 		if (atomic_read(&pb->pb_io_remaining))
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 		schedule();
 	}
 	remove_wait_queue(&pb->pb_waiters, &wait);
@@ -1407,7 +1407,7 @@ submit_io:
 	if (pb->pb_flags & PBF_RUN_QUEUES) {
 		pb->pb_flags &= ~PBF_RUN_QUEUES;
 		if (atomic_read(&pb->pb_io_remaining) > 1)
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 	}
 }
 
@@ -1471,7 +1471,7 @@ pagebuf_iowait(
 {
 	PB_TRACE(pb, "iowait", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_iodonesema);
 	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
 	return pb->pb_error;
@@ -1617,7 +1617,6 @@ STATIC int
 pagebuf_daemon(
 	void			*data)
 {
-	int			count;
 	page_buf_t		*pb;
 	struct list_head	*curr, *next, tmp;
 
@@ -1640,7 +1639,6 @@ pagebuf_daemon(
 
 		spin_lock(&pbd_delwrite_lock);
 
-		count = 0;
 		list_for_each_safe(curr, next, &pbd_delwrite_queue) {
 			pb = list_entry(curr, page_buf_t, pb_list);
 
@@ -1657,7 +1655,6 @@ pagebuf_daemon(
 				pb->pb_flags &= ~PBF_DELWRI;
 				pb->pb_flags |= PBF_WRITE;
 				list_move(&pb->pb_list, &tmp);
-				count++;
 			}
 		}
 
@@ -1667,12 +1664,11 @@ pagebuf_daemon(
 			list_del_init(&pb->pb_list);
 
 			pagebuf_iostrategy(pb);
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 		}
 
 		if (as_list_len > 0)
 			purge_addresses();
-		if (count)
-			blk_run_queues();
 
 		force_flush = 0;
 	} while (pagebuf_daemon_active);
@@ -1689,7 +1685,6 @@ pagebuf_delwri_flush(
 	page_buf_t		*pb;
 	struct list_head	*curr, *next, tmp;
 	int			pincount = 0;
-	int			flush_cnt = 0;
 
 	pagebuf_runall_queues(pagebuf_dataio_workqueue);
 	pagebuf_runall_queues(pagebuf_logio_workqueue);
@@ -1733,14 +1728,8 @@ pagebuf_delwri_flush(
 
 		pagebuf_lock(pb);
 		pagebuf_iostrategy(pb);
-		if (++flush_cnt > 32) {
-			blk_run_queues();
-			flush_cnt = 0;
-		}
 	}
 
-	blk_run_queues();
-
 	while (!list_empty(&tmp)) {
 		pb = list_entry(tmp.next, page_buf_t, pb_list);
 
@@ -1751,6 +1740,9 @@ pagebuf_delwri_flush(
 		pagebuf_rele(pb);
 	}
 
+	if (flags & PBDF_WAIT)
+		blk_run_address_space(target->pbr_mapping);
+
 	if (pinptr)
 		*pinptr = pincount;
 }
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e34916ddd1d7..00371734995c 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -28,9 +28,12 @@ struct backing_dev_info {
 	int memory_backed;	/* Cannot clean pages with writepage */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
+	void (*unplug_io_fn)(struct backing_dev_info *);
+	void *unplug_io_data;
 };
 
 extern struct backing_dev_info default_backing_dev_info;
+void default_unplug_io_fn(struct backing_dev_info *bdi);
 
 int writeback_acquire(struct backing_dev_info *bdi);
 int writeback_in_progress(struct backing_dev_info *bdi);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c421c46bfbb2..c4dd287dd1c8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -119,11 +119,13 @@ struct bio {
  * bit 1 -- rw-ahead when set
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
+ * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
  */
 #define BIO_RW		0
 #define BIO_RW_AHEAD	1
 #define BIO_RW_BARRIER	2
 #define BIO_RW_FAILFAST	3
+#define BIO_RW_SYNC	4
 
 /*
  * various member access, note that bio_data should of course not be used
@@ -138,6 +140,7 @@ struct bio {
 #define bio_cur_sectors(bio)	(bio_iovec(bio)->bv_len >> 9)
 #define bio_data(bio)		(page_address(bio_page((bio))) + bio_offset((bio)))
 #define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
+#define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
 
 /*
  * will die
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1a521e16b398..572f96e6940a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -243,7 +243,7 @@ typedef int (merge_requests_fn) (request_queue_t *, struct request *,
 typedef void (request_fn_proc) (request_queue_t *q);
 typedef int (make_request_fn) (request_queue_t *q, struct bio *bio);
 typedef int (prep_rq_fn) (request_queue_t *, struct request *);
-typedef void (unplug_fn) (void *q);
+typedef void (unplug_fn) (request_queue_t *);
 
 struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
@@ -315,8 +315,6 @@ struct request_queue
 	unsigned long		bounce_pfn;
 	int			bounce_gfp;
 
-	struct list_head	plug_list;
-
 	/*
 	 * various queue flags, see QUEUE_* below
 	 */
@@ -370,8 +368,9 @@ struct request_queue
 #define QUEUE_FLAG_WRITEFULL	4	/* read queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
+#define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
 
-#define blk_queue_plugged(q)	!list_empty(&(q)->plug_list)
+#define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 
@@ -515,7 +514,7 @@ extern int scsi_cmd_ioctl(struct gendisk *, unsigned int, unsigned long);
 extern void blk_start_queue(request_queue_t *q);
 extern void blk_stop_queue(request_queue_t *q);
 extern void __blk_stop_queue(request_queue_t *q);
-extern void blk_run_queue(request_queue_t *q);
+extern void blk_run_queue(request_queue_t *);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
 extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
 extern int blk_rq_unmap_user(struct request *, void __user *, unsigned int);
@@ -526,6 +525,18 @@ static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 	return bdev->bd_disk->queue;
 }
 
+static inline void blk_run_backing_dev(struct backing_dev_info *bdi)
+{
+	if (bdi && bdi->unplug_io_fn)
+		bdi->unplug_io_fn(bdi);
+}
+
+static inline void blk_run_address_space(struct address_space *mapping)
+{
+	if (mapping)
+		blk_run_backing_dev(mapping->backing_dev_info);
+}
+
 /*
  * end_request() and friends. Must be called with the request queue spinlock
  * acquired. All functions called within end_request() _must_be_ atomic.
@@ -572,7 +583,7 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-extern void generic_unplug_device(void *);
+extern void generic_unplug_device(request_queue_t *);
 extern long nr_blockdev_pages(void);
 
 int blk_get_queue(request_queue_t *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 39c893f8aa28..c7f0052b4abd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -83,6 +83,8 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
+#define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
+#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
 
 #define SEL_IN		1
 #define SEL_OUT		2
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 240dc450dcd3..9c06e776cfc2 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -76,6 +76,7 @@ extern void md_handle_safemode(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
 extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
+extern void md_unplug_mddev(mddev_t *mddev);
 
 extern void md_print_devices (void);
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index bea64b0fb6c1..42c973c53d04 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -326,7 +326,6 @@ do {									\
 		if (condition)						\
 			break;						\
 		spin_unlock_irq(&lock);					\
-		blk_run_queues();					\
 		schedule();						\
 		spin_lock_irq(&lock);					\
 	}								\
@@ -341,30 +340,5 @@ do {									\
 	__wait_event_lock_irq(wq, condition, lock);			\
 } while (0)
 
-
-#define __wait_disk_event(wq, condition) 				\
-do {									\
-	wait_queue_t __wait;						\
-	init_waitqueue_entry(&__wait, current);				\
-									\
-	add_wait_queue(&wq, &__wait);					\
-	for (;;) {							\
-		set_current_state(TASK_UNINTERRUPTIBLE);		\
-		if (condition)						\
-			break;						\
-		blk_run_queues();					\
-		schedule();						\
-	}								\
-	current->state = TASK_RUNNING;					\
-	remove_wait_queue(&wq, &__wait);				\
-} while (0)
-
-#define wait_disk_event(wq, condition) 					\
-do {									\
-	if (condition)	 						\
-		break;							\
-	__wait_disk_event(wq, condition);				\
-} while (0)
-
 #endif
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b000c56803b8..d189090cf63a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -197,6 +197,8 @@ extern int shmem_unuse(swp_entry_t entry, struct page *page);
 #define	SWAP_AGAIN	1
 #define	SWAP_FAIL	2
 
+extern void swap_unplug_io_fn(struct backing_dev_info *);
+
 #ifdef CONFIG_SWAP
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct file *, struct page *);
@@ -232,6 +234,7 @@ extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
 extern spinlock_t swaplock;
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 7e035a9b42d1..6abcf99b7ada 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -84,7 +84,6 @@ static void free_some_memory(void)
 	while (shrink_all_memory(10000))
 		printk(".");
 	printk("|\n");
-	blk_run_queues();
 }
 
 
diff --git a/kernel/power/pmdisk.c b/kernel/power/pmdisk.c
index d54147214bea..22855abbdd6e 100644
--- a/kernel/power/pmdisk.c
+++ b/kernel/power/pmdisk.c
@@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsigned int num, int err)
 
 static void wait_io(void)
 {
-	blk_run_queues();
 	while(atomic_read(&io_done))
 		io_schedule();
 }
@@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_off, void * page)
 	if (rw == WRITE)
 		bio_set_pages_dirty(bio);
 	start_io();
-	submit_bio(rw,bio);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 	wait_io();
  Done:
 	bio_put(bio);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 20134ab8e0b2..ae748a467af5 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -707,11 +707,6 @@ int software_suspend(void)
 
 		free_some_memory();
 		
-		/* No need to invalidate any vfsmnt list -- 
-		 * they will be valid after resume, anyway.
-		 */
-		blk_run_queues();
-
 		/* Save state of all device drivers, and stop them. */		   
 		if ((res = device_suspend(4))==0)
 			/* If stopping device drivers worked, we proceed basically into
diff --git a/mm/filemap.c b/mm/filemap.c
index ec1952db8baf..dc2f0992d879 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,8 +119,10 @@ void remove_from_page_cache(struct page *page)
 
 static inline int sync_page(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping;
 
+	smp_mb();
+	mapping = page->mapping;
 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 		return mapping->a_ops->sync_page(page);
 	return 0;
diff --git a/mm/mempool.c b/mm/mempool.c
index 756e60ee18d6..da6ad1e12c97 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -234,8 +234,6 @@ repeat_alloc:
 	if (!(gfp_mask & __GFP_WAIT))
 		return NULL;
 
-	blk_run_queues();
-
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
 	mb();
 	if (!pool->curr_nr)
diff --git a/mm/nommu.c b/mm/nommu.c
index c940756b49e5..1432dbab85eb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -571,3 +572,7 @@ unsigned long get_unmapped_area(struct file *file, unsigned long addr,
 void pte_chain_init(void)
 {
 }
+
+void swap_unplug_io_fn(struct backing_dev_info *)
+{
+}
diff --git a/mm/readahead.c b/mm/readahead.c
index 08a2d9f1051d..71bf2462d097 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -15,11 +15,16 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 
+void default_unplug_io_fn(struct backing_dev_info *bdi)
+{
+}
+EXPORT_SYMBOL(default_unplug_io_fn);
+
 struct backing_dev_info default_backing_dev_info = {
 	.ra_pages	= (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
 	.state		= 0,
+	.unplug_io_fn	= default_unplug_io_fn,
 };
-
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 /*
@@ -32,7 +37,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
 	ra->average = ra->ra_pages / 2;
 }
-
 EXPORT_SYMBOL(file_ra_state_init);
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 4116ea26daf1..345e04cb0f6c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -169,6 +169,7 @@ static struct vm_operations_struct shmem_vm_ops;
 static struct backing_dev_info shmem_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn = default_unplug_io_fn,
 };
 
 LIST_HEAD(shmem_inodes);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 22946f0d9ecf..97f80d20807c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -19,6 +19,7 @@
 static struct backing_dev_info swap_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
 extern struct address_space_operations swap_aops;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e5cebb1800b9..f885e6d17a49 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/rmap-locking.h>
 #include <linux/security.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -44,8 +45,64 @@ struct swap_list_t swap_list = {-1, -1};
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
+/*
+ * Array of backing blockdevs, for swap_unplug_fn.  We need this because the
+ * bdev->unplug_fn can sleep and we cannot hold swap_list_lock while calling
+ * the unplug_fn.  And swap_list_lock cannot be turned into a semaphore.
+ */
+static DECLARE_MUTEX(swap_bdevs_sem);
+static struct block_device *swap_bdevs[MAX_SWAPFILES];
+
 #define SWAPFILE_CLUSTER 256
 
+/*
+ * Caller holds swap_bdevs_sem
+ */
+static void install_swap_bdev(struct block_device *bdev)
+{
+	int i;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (swap_bdevs[i] == NULL) {
+			swap_bdevs[i] = bdev;
+			return;
+		}
+	}
+	BUG();
+}
+
+static void remove_swap_bdev(struct block_device *bdev)
+{
+	int i;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (swap_bdevs[i] == bdev) {
+			memcpy(&swap_bdevs[i], &swap_bdevs[i + 1],
+				(MAX_SWAPFILES - i - 1) * sizeof(*swap_bdevs));
+			swap_bdevs[MAX_SWAPFILES - 1] = NULL;
+			return;
+		}
+	}
+	BUG();
+}
+
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi)
+{
+	int i;
+
+	down(&swap_bdevs_sem);
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		struct block_device *bdev = swap_bdevs[i];
+		struct backing_dev_info *bdi;
+
+		if (bdev == NULL)
+			break;
+		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
+		(*bdi->unplug_io_fn)(bdi);
+	}
+	up(&swap_bdevs_sem);
+}
+
 static inline int scan_swap_map(struct swap_info_struct *si)
 {
 	unsigned long offset;
@@ -1088,6 +1145,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 		swap_list_unlock();
 		goto out_dput;
 	}
+	down(&swap_bdevs_sem);
 	swap_list_lock();
 	swap_device_lock(p);
 	swap_file = p->swap_file;
@@ -1099,6 +1157,8 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	destroy_swap_extents(p);
 	swap_device_unlock(p);
 	swap_list_unlock();
+	remove_swap_bdev(p->bdev);
+	up(&swap_bdevs_sem);
 	vfree(swap_map);
 	if (S_ISBLK(mapping->host->i_mode)) {
 		struct block_device *bdev = I_BDEV(mapping->host);
@@ -1440,6 +1500,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	if (error)
 		goto bad_swap;
 
+	down(&swap_bdevs_sem);
 	swap_list_lock();
 	swap_device_lock(p);
 	p->flags = SWP_ACTIVE;
@@ -1465,6 +1526,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	}
 	swap_device_unlock(p);
 	swap_list_unlock();
+	install_swap_bdev(p->bdev);
+	up(&swap_bdevs_sem);
 	error = 0;
 	goto out;
 bad_swap:
@@ -1484,7 +1547,7 @@ bad_swap_2:
 	destroy_swap_extents(p);
 	if (swap_map)
 		vfree(swap_map);
-	if (swap_file && !IS_ERR(swap_file))
+	if (swap_file)
 		filp_close(swap_file, NULL);
 out:
 	if (page && !IS_ERR(page)) {
-- 
cgit v1.2.3


From 69a03dedc92b7968fc9ca5c701e8d2d6c481750d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:57:22 -0700
Subject: [PATCH] swsusp update: supports discontingmem/highmem

From: Pavel Machek <pavel@ucw.cz>

Bill Irwin did some work on this.  It makes swsusp behave correctly w.r.t.
discontingmem, and adds highmem handling (very simple-minded, but should work
ok with 1GB).  It now should behave correctly w.r.t.  more than one swap
device, and fixes double restoring of console.
---
 include/linux/suspend.h |   2 +-
 kernel/power/swsusp.c   | 244 ++++++++++++++++++++++++++++++++++--------------
 2 files changed, 173 insertions(+), 73 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 810947658d59..7e4409b7c55b 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -24,7 +24,7 @@ typedef struct pbe {
 #define SWAP_FILENAME_MAXLENGTH	32
 
 struct suspend_header {
-	__u32 version_code;
+	u32 version_code;
 	unsigned long num_physpages;
 	char machine[8];
 	char version[20];
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index ae748a467af5..23e577559fd9 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,11 +1,11 @@
 /*
- * linux/kernel/suspend.c
+ * linux/kernel/power/swsusp.c
  *
  * This file is to realize architecture-independent
  * machine suspend feature using pretty near only high-level routines
  *
  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
  *
  * This file is released under the GPLv2.
  *
@@ -61,6 +61,7 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/console.h>
+#include <linux/highmem.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -74,11 +75,6 @@ unsigned char software_suspend_enabled = 0;
 #define NORESUME		1
 #define RESUME_SPECIFIED	2
 
-
-#define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
-#define ADDRESS(x) __ADDRESS((x) << PAGE_SHIFT)
-#define ADDRESS2(x) __ADDRESS(__pa(x))		/* Needed for x86-64 where some pages are in memory twice */
-
 /* References to section boundaries */
 extern char __nosave_begin, __nosave_end;
 
@@ -105,6 +101,10 @@ unsigned int nr_copy_pages __nosavedata = 0;
    time of suspend, that must be freed. Second is "pagedir_nosave", 
    allocated at time of resume, that travels through memory not to
    collide with anything.
+
+   Warning: this is even more evil than it seems. Pagedirs this file
+   talks about are completely different from page directories used by
+   MMU hardware.
  */
 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
 static suspend_pagedir_t *pagedir_save;
@@ -139,15 +139,15 @@ static const char name_resume[] = "Resume Machine: ";
 #define TEST_SWSUSP 0		/* Set to 1 to reboot instead of halt machine after suspension */
 
 #ifdef DEBUG_DEFAULT
-# define PRINTK(f, a...)       printk(f, ## a)
+# define PRINTK(f, a...)	printk(f, ## a)
 #else
-# define PRINTK(f, a...)
+# define PRINTK(f, a...)       	do { } while(0)
 #endif
 
 #ifdef DEBUG_SLOW
 #define MDELAY(a) mdelay(a)
 #else
-#define MDELAY(a)
+#define MDELAY(a) do { } while(0)
 #endif
 
 /*
@@ -225,6 +225,7 @@ static void mark_swapfiles(swp_entry_t prev, int mode)
 static void read_swapfiles(void) /* This is called before saving image */
 {
 	int i, len;
+	char buff[sizeof(resume_file)], *sname;
 	
 	len=strlen(resume_file);
 	root_swap = 0xFFFF;
@@ -243,8 +244,11 @@ static void read_swapfiles(void) /* This is called before saving image */
 					swapfile_used[i] = SWAPFILE_IGNORED;				  
 			} else {
 	  			/* we ignore all swap devices that are not the resume_file */
-				if (1) {
-// FIXME				if(resume_device == swap_info[i].swap_device) {
+				sname = d_path(swap_info[i].swap_file->f_dentry,
+					       swap_info[i].swap_file->f_vfsmnt,
+					       buff,
+					       sizeof(buff));
+				if (!strcmp(sname, resume_file)) {
 					swapfile_used[i] = SWAPFILE_SUSPEND;
 					root_swap = i;
 				} else {
@@ -346,7 +350,7 @@ static int write_suspend_image(void)
 
 	cur = (void *) buffer;
 	if (fill_suspend_header(&cur->sh))
-		panic("\nOut of memory while writing header");
+		BUG();		/* Not a BUG_ON(): we want fill_suspend_header to be called, always */
 		
 	cur->link.next = prev;
 
@@ -362,73 +366,165 @@ static int write_suspend_image(void)
 	return 0;
 }
 
-/* if pagedir_p != NULL it also copies the counted pages */
-static int count_and_copy_data_pages(struct pbe *pagedir_p)
-{
-	int chunk_size;
-	int nr_copy_pages = 0;
-	int pfn;
+struct highmem_page {
+	char *data;
 	struct page *page;
-	
-#ifdef CONFIG_DISCONTIGMEM
-	panic("Discontingmem not supported");
-#else
-	BUG_ON (max_pfn != num_physpages);
-#endif
-	for (pfn = 0; pfn < max_pfn; pfn++) {
+	struct highmem_page *next;
+};
+
+struct highmem_page *highmem_copy = NULL;
+
+static void save_highmem_zone(struct zone *zone)
+{
+	unsigned long zone_pfn;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		struct highmem_page *save;
+		void *kaddr;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		int chunk_size;
+
+		if (!(pfn%200))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
-		if (PageHighMem(page))
-			panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
+		/*
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
+		 */
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
+			BUG();
+		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
+		}
+		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+		if (!save)
+			panic("Not enough memory");
+		save->next = highmem_copy;
+		save->page = page;
+		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+		if (!save->data)
+			panic("Not enough memory");
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(save->data, kaddr, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		highmem_copy = save;
+	}
+}
 
-		if (!PageReserved(page)) {
-			if (PageNosave(page))
-				continue;
+static void save_highmem(void)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			save_highmem_zone(zone);
+	}
+}
 
-			if ((chunk_size=is_head_of_free_region(page))!=0) {
-				pfn += chunk_size - 1;
-				continue;
-			}
-		} else if (PageReserved(page)) {
-			BUG_ON (PageNosave(page));
+static int restore_highmem(void)
+{
+	while (highmem_copy) {
+		struct highmem_page *save = highmem_copy;
+		void *kaddr;
+		highmem_copy = save->next;
+
+		kaddr = kmap_atomic(save->page, KM_USER0);
+		memcpy(kaddr, save->data, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long) save->data);
+		kfree(save);
+	}
+	return 0;
+}
 
-			/*
-			 * Just copy whole code segment. Hopefully it is not that big.
-			 */
-			if ((ADDRESS(pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) && 
-			    (ADDRESS(pfn) <  (unsigned long) ADDRESS2(&__nosave_end))) {
-				PRINTK("[nosave %lx]", ADDRESS(pfn));
-				continue;
-			}
-			/* Hmm, perhaps copying all reserved pages is not too healthy as they may contain 
-			   critical bios data? */
-		} else	BUG();
+static int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
 
-		nr_copy_pages++;
-		if (pagedir_p) {
-			pagedir_p->orig_address = ADDRESS(pfn);
-			copy_page((void *) pagedir_p->address, (void *) pagedir_p->orig_address);
-			pagedir_p++;
+/* if *pagedir_p != NULL it also copies the counted pages */
+static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
+{
+	unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
+	struct pbe *pbe = *pagedir_p;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+		if (!(pfn%200))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		BUG_ON(PageReserved(page) && PageNosave(page));
+		if (PageNosave(page))
+			continue;
+		if (PageReserved(page) && pfn_is_nosave(pfn)) {
+			PRINTK("[nosave pfn 0x%lx]", pfn);
+			continue;
 		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
+		}
+		nr_copy_pages++;
+		if (!pbe)
+			continue;
+		pbe->orig_address = (long) page_address(page);
+		copy_page((void *)pbe->address, (void *)pbe->orig_address);
+		pbe++;
 	}
+	*pagedir_p = pbe;
 	return nr_copy_pages;
 }
 
-static void free_suspend_pagedir(unsigned long this_pagedir)
+static int count_and_copy_data_pages(struct pbe *pagedir_p)
 {
-	struct page *page;
-	int pfn;
-	unsigned long this_pagedir_end = this_pagedir +
-		(PAGE_SIZE << pagedir_order);
+	int nr_copy_pages = 0;
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
+	}
+	return nr_copy_pages;
+}
 
-	for(pfn = 0; pfn < num_physpages; pfn++) {
+static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
+{
+	unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
+	pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
+	pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
+	pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
 		if (!TestClearPageNosave(page))
 			continue;
+		else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
+			continue;
+		__free_page(page);
+	}
+}
 
-		if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
-			continue; /* old pagedir gets freed in one */
-		
-		free_page(ADDRESS(pfn));
+static void free_suspend_pagedir(unsigned long this_pagedir)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			free_suspend_pagedir_zone(zone, this_pagedir);
 	}
 	free_pages(this_pagedir, pagedir_order);
 }
@@ -443,7 +539,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
 	pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
 
 	p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
-	if(!pagedir)
+	if (!pagedir)
 		return NULL;
 
 	page = virt_to_page(pagedir);
@@ -492,10 +588,12 @@ static int suspend_prepare_image(void)
 	struct sysinfo i;
 	unsigned int nr_needed_pages = 0;
 
-	drain_local_pages();
-
 	pagedir_nosave = NULL;
-	printk( "/critical section: Counting pages to copy" );
+	printk( "/critical section: Handling highmem" );
+	save_highmem();
+
+	printk(", counting pages to copy" );
+	drain_local_pages();
 	nr_copy_pages = count_and_copy_data_pages(NULL);
 	nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
 	
@@ -603,21 +701,23 @@ asmlinkage void do_magic_resume_2(void)
 
 	PRINTK( "Freeing prev allocated pagedir\n" );
 	free_suspend_pagedir((unsigned long) pagedir_save);
+
+	printk( "Restoring highmem\n" );
+	restore_highmem();
+	printk("done, devices\n");
+
 	device_power_up();
 	spin_unlock_irq(&suspend_pagedir_lock);
 	device_resume();
 
-	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
-	release_console_sem();
-
+	/* Fixme: this is too late; we should do this ASAP to avoid "infinite reboots" problem */
 	PRINTK( "Fixing swap signatures... " );
 	mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
 	PRINTK( "ok\n" );
 
 #ifdef SUSPEND_CONSOLE
 	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
+	update_screen(fg_console);
 	release_console_sem();
 #endif
 }
-- 
cgit v1.2.3


From 2c22f5c85acb9bc1bc6cde1167d2836b0b3868e1 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:57:34 -0700
Subject: [PATCH] swsusp update: supports discontingmem/highmem fixes

From: Pavel Machek <pavel@ucw.cz>

It makes swsusp behave correctly w.r.t.  discontingmem, and adds highmem
handling.
---
 arch/i386/power/swsusp.S |  9 +++++++-
 kernel/power/swsusp.c    | 59 +++++++++++++++++++++++++++++++-----------------
 2 files changed, 46 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/arch/i386/power/swsusp.S b/arch/i386/power/swsusp.S
index 6fad0ed51f9f..ac2b57d7a045 100644
--- a/arch/i386/power/swsusp.S
+++ b/arch/i386/power/swsusp.S
@@ -1,6 +1,13 @@
 .text
 
-/* Originally gcc generated, modified by hand */
+/* Originally gcc generated, modified by hand
+ *
+ * This may not use any stack, nor any variable that is not "NoSave":
+ *
+ * Its rewriting one kernel image with another. What is stack in "old"
+ * image could very well be data page in "new" image, and overwriting
+ * your own stack under you is bad idea.
+ */
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 23e577559fd9..8f78d6807576 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -225,7 +225,7 @@ static void mark_swapfiles(swp_entry_t prev, int mode)
 static void read_swapfiles(void) /* This is called before saving image */
 {
 	int i, len;
-	char buff[sizeof(resume_file)], *sname;
+	static char buff[sizeof(resume_file)], *sname;
 	
 	len=strlen(resume_file);
 	root_swap = 0xFFFF;
@@ -366,6 +366,7 @@ static int write_suspend_image(void)
 	return 0;
 }
 
+#ifdef CONFIG_HIGHMEM
 struct highmem_page {
 	char *data;
 	struct page *page;
@@ -374,7 +375,7 @@ struct highmem_page {
 
 struct highmem_page *highmem_copy = NULL;
 
-static void save_highmem_zone(struct zone *zone)
+static int save_highmem_zone(struct zone *zone)
 {
 	unsigned long zone_pfn;
 	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
@@ -384,7 +385,7 @@ static void save_highmem_zone(struct zone *zone)
 		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
 		int chunk_size;
 
-		if (!(pfn%200))
+		if (!(pfn%1000))
 			printk(".");
 		if (!pfn_valid(pfn))
 			continue;
@@ -397,7 +398,7 @@ static void save_highmem_zone(struct zone *zone)
 		 */
 		if (PageReserved(page)) {
 			printk("highmem reserved page?!\n");
-			BUG();
+			continue;
 		}
 		if ((chunk_size = is_head_of_free_region(page))) {
 			pfn += chunk_size - 1;
@@ -406,26 +407,33 @@ static void save_highmem_zone(struct zone *zone)
 		}
 		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
 		if (!save)
-			panic("Not enough memory");
+			return -ENOMEM;
 		save->next = highmem_copy;
 		save->page = page;
 		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
-		if (!save->data)
-			panic("Not enough memory");
+		if (!save->data) {
+			kfree(save);
+			return -ENOMEM;
+		}
 		kaddr = kmap_atomic(page, KM_USER0);
 		memcpy(save->data, kaddr, PAGE_SIZE);
 		kunmap_atomic(kaddr, KM_USER0);
 		highmem_copy = save;
 	}
+	return 0;
 }
 
-static void save_highmem(void)
+static int save_highmem(void)
 {
 	struct zone *zone;
+	int res = 0;
 	for_each_zone(zone) {
 		if (is_highmem(zone))
-			save_highmem_zone(zone);
+			res = save_highmem_zone(zone);
+		if (res)
+			return res;
 	}
+	return 0;
 }
 
 static int restore_highmem(void)
@@ -443,6 +451,7 @@ static int restore_highmem(void)
 	}
 	return 0;
 }
+#endif
 
 static int pfn_is_nosave(unsigned long pfn)
 {
@@ -460,7 +469,7 @@ static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
 		struct page *page;
 		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
 
-		if (!(pfn%200))
+		if (!(pfn%1000))
 			printk(".");
 		if (!pfn_valid(pfn))
 			continue;
@@ -548,7 +557,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
 		
 	while(nr_copy_pages--) {
 		p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
-		if(!p->address) {
+		if (!p->address) {
 			free_suspend_pagedir((unsigned long) pagedir);
 			return NULL;
 		}
@@ -589,10 +598,17 @@ static int suspend_prepare_image(void)
 	unsigned int nr_needed_pages = 0;
 
 	pagedir_nosave = NULL;
-	printk( "/critical section: Handling highmem" );
-	save_highmem();
+	printk( "/critical section: ");
+#ifdef CONFIG_HIGHMEM
+	printk( "handling highmem" );
+	if (save_highmem()) {
+		printk(KERN_CRIT "%sNot enough free pages for highmem\n", name_suspend);
+		return -ENOMEM;
+	}
+	printk(", ");
+#endif
 
-	printk(", counting pages to copy" );
+	printk("counting pages to copy" );
 	drain_local_pages();
 	nr_copy_pages = count_and_copy_data_pages(NULL);
 	nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
@@ -602,23 +618,22 @@ static int suspend_prepare_image(void)
 		printk(KERN_CRIT "%sCouldn't get enough free pages, on %d pages short\n",
 		       name_suspend, nr_needed_pages-nr_free_pages());
 		root_swap = 0xFFFF;
-		return 1;
+		return -ENOMEM;
 	}
 	si_swapinfo(&i);	/* FIXME: si_swapinfo(&i) returns all swap devices information.
 				   We should only consider resume_device. */
 	if (i.freeswap < nr_needed_pages)  {
 		printk(KERN_CRIT "%sThere's not enough swap space available, on %ld pages short\n",
 		       name_suspend, nr_needed_pages-i.freeswap);
-		return 1;
+		return -ENOSPC;
 	}
 
 	PRINTK( "Alloc pagedir\n" ); 
 	pagedir_save = pagedir_nosave = create_suspend_pagedir(nr_copy_pages);
-	if(!pagedir_nosave) {
-		/* Shouldn't happen */
-		printk(KERN_CRIT "%sCouldn't allocate enough pages\n",name_suspend);
-		panic("Really should not happen");
-		return 1;
+	if (!pagedir_nosave) {
+		/* Pagedir is big, one-chunk allocation. It is easily possible for this allocation to fail */
+		printk(KERN_CRIT "%sCouldn't allocate continuous pagedir\n", name_suspend);
+		return -ENOMEM;
 	}
 	nr_copy_pages_check = nr_copy_pages;
 	pagedir_order_check = pagedir_order;
@@ -702,8 +717,10 @@ asmlinkage void do_magic_resume_2(void)
 	PRINTK( "Freeing prev allocated pagedir\n" );
 	free_suspend_pagedir((unsigned long) pagedir_save);
 
+#ifdef CONFIG_HIGHMEM
 	printk( "Restoring highmem\n" );
 	restore_highmem();
+#endif
 	printk("done, devices\n");
 
 	device_power_up();
-- 
cgit v1.2.3


From d386fe6e327eb7d7808e5cf69fceaba671621ef8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:57:48 -0700
Subject: [PATCH] Swsusp should not wake up stopped processes

From: Pavel Machek <pavel@suse.cz>

If you stop process with ^Z, then suspend, process is awakened.  Thats a
bug.  Solution is to simply leave already stopped processes alone.  Plus we
no longer use TASK_STOPPED for processes in refrigerator.  Userland might
see us and get confused.
---
 kernel/power/process.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/process.c b/kernel/power/process.c
index 15c1b340c2ed..8225457183ed 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -30,7 +30,8 @@ static inline int freezeable(struct task_struct * p)
 	if ((p == current) || 
 	    (p->flags & PF_IOTHREAD) || 
 	    (p->state == TASK_ZOMBIE) ||
-	    (p->state == TASK_DEAD))
+	    (p->state == TASK_DEAD) ||
+	    (p->state == TASK_STOPPED))
 		return 0;
 	return 1;
 }
@@ -38,21 +39,19 @@ static inline int freezeable(struct task_struct * p)
 /* Refrigerator is place where frozen processes are stored :-). */
 void refrigerator(unsigned long flag)
 {
-	/* You need correct to work with real-time processes.
-	   OTOH, this way one process may see (via /proc/) some other
-	   process in stopped state (and thereby discovered we were
-	   suspended. We probably do not care. 
-	 */
+	/* Hmm, should we be allowed to suspend when there are realtime
+	   processes around? */
 	long save;
 	save = current->state;
-	current->state = TASK_STOPPED;
+	current->state = TASK_UNINTERRUPTIBLE;
 	pr_debug("%s entered refrigerator\n", current->comm);
 	printk("=");
 	current->flags &= ~PF_FREEZE;
-	if (flag)
-		flush_signals(current); /* We have signaled a kernel thread, which isn't normal behaviour
-					   and that may lead to 100%CPU sucking because those threads
-					   just don't manage signals. */
+
+	spin_lock_irq(&current->sighand->siglock);
+	recalc_sigpending(); /* We sent fake signal, clean it up */
+	spin_unlock_irq(&current->sighand->siglock);
+
 	current->flags |= PF_FROZEN;
 	while (current->flags & PF_FROZEN)
 		schedule();
-- 
cgit v1.2.3


From 2386894043fc3be2c0d2d25edb8c741f18c592ec Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:03:03 -0700
Subject: [PATCH] do_fork() error path memory leak

From: <john.l.byrne@hp.com>

In do_fork(), if an error occurs after the mm_struct for the child has been
allocated, it is never freed.  The exit_mm() meant to free it increments
the mm_count and this count is never decremented.  (For a running process
that is exitting, schedule() takes care this; however, the child process
being cleaned up is not running.) In the CLONE_VM case, the parent's
mm_struct will get an extra mm_count and so it will never be freed.

This patch should fix both the CLONE_VM and the not CLONE_VM case; the test
of p->active_mm prevents a panic in the case that a kernel-thread is being
cloned.
---
 kernel/fork.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index b4cbfd04847b..4f5b018777d8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1086,6 +1086,8 @@ bad_fork_cleanup_namespace:
 	exit_namespace(p);
 bad_fork_cleanup_mm:
 	exit_mm(p);
+	if (p->active_mm)
+		mmdrop(p->active_mm);
 bad_fork_cleanup_signal:
 	exit_signal(p);
 bad_fork_cleanup_sighand:
-- 
cgit v1.2.3


From 7285840f8745b467b111a908eb20a1d9b845c98c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:04:34 -0700
Subject: [PATCH] BSD accounting oops fix

oopses have been reported in do_acct_process(), with premption enabled, when
threaded applications are exitting.

It appears that we're racing with another thread which is nulling out
current->tty.  I think this race is still there after we moved current->tty
into current->signal->tty, so let's take the needed lock.
---
 kernel/acct.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/acct.c b/kernel/acct.c
index 8e32413c41f3..555e1e3c349f 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -347,7 +347,11 @@ static void do_acct_process(long exitcode, struct file *file)
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
-	ac.ac_tty = current->signal->tty ? old_encode_dev(tty_devnum(current->signal->tty)) : 0;
+
+	read_lock(&tasklist_lock);	/* pin current->signal */
+	ac.ac_tty = current->signal->tty ?
+		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
+	read_unlock(&tasklist_lock);
 
 	ac.ac_flag = 0;
 	if (current->flags & PF_FORKNOEXEC)
-- 
cgit v1.2.3


From 6451af4da3e2aee9c8d65f75002e7a80dfd185e5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:06:59 -0700
Subject: [PATCH] pmdisk is x86 only

Only x86 implements pmdisk_arch_suspend().  So mark pmdisk as ia32-only, to
avoid breaking allyesconfig.
---
 kernel/power/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 68bc8a8603ea..6bb62269f3eb 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -44,7 +44,7 @@ config SOFTWARE_SUSPEND
 
 config PM_DISK
 	bool "Suspend-to-Disk Support"
-	depends on PM && SWAP
+	depends on PM && SWAP && X86 && !X86_64
 	---help---
 	  Suspend-to-disk is a power management state in which the contents
 	  of memory are stored on disk and the entire system is shut down or
-- 
cgit v1.2.3