From 243c64b2cfea7e49e074c80db65fa7b90d765c6f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:39:51 -0700
Subject: [PATCH] feed devfs through Lindent

Nobody seems to have any outstanding work against devfs, so...
---
 include/linux/devfs_fs.h        | 32 +++++++++++++++-----------------
 include/linux/devfs_fs_kernel.h | 26 +++++++++++++-------------
 2 files changed, 28 insertions(+), 30 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/devfs_fs.h b/include/linux/devfs_fs.h
index 48da59012021..de236f431877 100644
--- a/include/linux/devfs_fs.h
+++ b/include/linux/devfs_fs.h
@@ -22,22 +22,20 @@
 #define DEVFSD_NOTIFY_CREATE        6
 #define DEVFSD_NOTIFY_DELETE        7
 
-#define DEVFS_PATHLEN               1024  /*  Never change this otherwise the
-					      binary interface will change   */
-
-struct devfsd_notify_struct
-{   /*  Use native C types to ensure same types in kernel and user space     */
-    unsigned int type;           /*  DEVFSD_NOTIFY_* value                   */
-    unsigned int mode;           /*  Mode of the inode or device entry       */
-    unsigned int major;          /*  Major number of device entry            */
-    unsigned int minor;          /*  Minor number of device entry            */
-    unsigned int uid;            /*  Uid of process, inode or device entry   */
-    unsigned int gid;            /*  Gid of process, inode or device entry   */
-    unsigned int overrun_count;  /*  Number of lost events                   */
-    unsigned int namelen;        /*  Number of characters not including '\0' */
-    /*  The device name MUST come last                                       */
-    char devname[DEVFS_PATHLEN]; /*  This will be '\0' terminated            */
+#define DEVFS_PATHLEN               1024	/*  Never change this otherwise the
+						   binary interface will change   */
+
+struct devfsd_notify_struct {	/*  Use native C types to ensure same types in kernel and user space     */
+	unsigned int type;	/*  DEVFSD_NOTIFY_* value                   */
+	unsigned int mode;	/*  Mode of the inode or device entry       */
+	unsigned int major;	/*  Major number of device entry            */
+	unsigned int minor;	/*  Minor number of device entry            */
+	unsigned int uid;	/*  Uid of process, inode or device entry   */
+	unsigned int gid;	/*  Gid of process, inode or device entry   */
+	unsigned int overrun_count;	/*  Number of lost events                   */
+	unsigned int namelen;	/*  Number of characters not including '\0' */
+	/*  The device name MUST come last                                       */
+	char devname[DEVFS_PATHLEN];	/*  This will be '\0' terminated            */
 };
 
-
-#endif  /*  _LINUX_DEVFS_FS_H  */
+#endif				/*  _LINUX_DEVFS_FS_H  */
diff --git a/include/linux/devfs_fs_kernel.h b/include/linux/devfs_fs_kernel.h
index 16c78f54f427..89810e73d256 100644
--- a/include/linux/devfs_fs_kernel.h
+++ b/include/linux/devfs_fs_kernel.h
@@ -12,18 +12,18 @@
 
 #ifdef CONFIG_DEVFS_FS
 extern int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...)
-	__attribute__((format (printf, 3, 4)));
+    __attribute__ ((format(printf, 3, 4)));
 extern int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...)
-	__attribute__((format (printf, 3, 4)));
+    __attribute__ ((format(printf, 3, 4)));
 extern int devfs_mk_symlink(const char *name, const char *link);
 extern int devfs_mk_dir(const char *fmt, ...)
-	__attribute__((format (printf, 1, 2)));
+    __attribute__ ((format(printf, 1, 2)));
 extern void devfs_remove(const char *fmt, ...)
-	__attribute__((format (printf, 1, 2)));
+    __attribute__ ((format(printf, 1, 2)));
 extern int devfs_register_tape(const char *name);
 extern void devfs_unregister_tape(int num);
 extern void mount_devfs_fs(void);
-#else  /*  CONFIG_DEVFS_FS  */
+#else				/*  CONFIG_DEVFS_FS  */
 static inline int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...)
 {
 	return 0;
@@ -32,9 +32,9 @@ static inline int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...)
 {
 	return 0;
 }
-static inline int devfs_mk_symlink (const char *name, const char *link)
+static inline int devfs_mk_symlink(const char *name, const char *link)
 {
-    return 0;
+	return 0;
 }
 static inline int devfs_mk_dir(const char *fmt, ...)
 {
@@ -43,16 +43,16 @@ static inline int devfs_mk_dir(const char *fmt, ...)
 static inline void devfs_remove(const char *fmt, ...)
 {
 }
-static inline int devfs_register_tape (const char *name)
+static inline int devfs_register_tape(const char *name)
 {
-    return -1;
+	return -1;
 }
 static inline void devfs_unregister_tape(int num)
 {
 }
-static inline void mount_devfs_fs (void)
+static inline void mount_devfs_fs(void)
 {
-    return;
+	return;
 }
-#endif  /*  CONFIG_DEVFS_FS  */
-#endif  /*  _LINUX_DEVFS_FS_KERNEL_H  */
+#endif				/*  CONFIG_DEVFS_FS  */
+#endif				/*  _LINUX_DEVFS_FS_KERNEL_H  */
-- 
cgit v1.2.3


From 0eb217f9b539fccf5aafaba8c9a06e170825f68b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:40:05 -0700
Subject: [PATCH] generalise system_running

From: Olof Johansson <olof@austin.ibm.com>

It's currently a boolean, but that means that system_running goes to zero
again when shutting down.  So we then use code (in the page allocator) which
is only designed to be used during bootup - it is marked __init.

So we need to be able to distinguish early boot state from late shutdown
state.  Rename system_running to system_state and give it the three
appropriate states.
---
 arch/ppc/platforms/pmac_nvram.c | 8 ++++----
 include/linux/kernel.h          | 8 +++++++-
 init/main.c                     | 8 ++------
 kernel/kmod.c                   | 2 +-
 kernel/printk.c                 | 3 ++-
 kernel/sched.c                  | 3 ++-
 kernel/sys.c                    | 8 ++++----
 mm/page_alloc.c                 | 2 +-
 8 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ppc/platforms/pmac_nvram.c b/arch/ppc/platforms/pmac_nvram.c
index f381f3f745f9..3b3f984fb929 100644
--- a/arch/ppc/platforms/pmac_nvram.c
+++ b/arch/ppc/platforms/pmac_nvram.c
@@ -154,11 +154,11 @@ static unsigned char __pmac pmu_nvram_read_byte(int addr)
 	struct adb_request req;
 	DECLARE_COMPLETION(req_complete); 
 	
-	req.arg = system_running ? &req_complete : NULL;
+	req.arg = system_state == SYSTEM_RUNNING ? &req_complete : NULL;
 	if (pmu_request(&req, pmu_nvram_complete, 3, PMU_READ_NVRAM,
 			(addr >> 8) & 0xff, addr & 0xff))
 		return 0xff;
-	if (system_running)
+	if (system_state == SYSTEM_RUNNING)
 		wait_for_completion(&req_complete);
 	while (!req.complete)
 		pmu_poll();
@@ -170,11 +170,11 @@ static void __pmac pmu_nvram_write_byte(int addr, unsigned char val)
 	struct adb_request req;
 	DECLARE_COMPLETION(req_complete); 
 	
-	req.arg = system_running ? &req_complete : NULL;
+	req.arg = system_state == SYSTEM_RUNNING ? &req_complete : NULL;
 	if (pmu_request(&req, pmu_nvram_complete, 4, PMU_WRITE_NVRAM,
 			(addr >> 8) & 0xff, addr & 0xff, val))
 		return;
-	if (system_running)
+	if (system_state == SYSTEM_RUNNING)
 		wait_for_completion(&req_complete);
 	while (!req.complete)
 		pmu_poll();
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e11e79199357..c1171e77c76b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -109,9 +109,15 @@ static inline void console_verbose(void)
 extern void bust_spinlocks(int yes);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_on_oops;
-extern int system_running;
+extern int system_state;		/* See values below */
 extern int tainted;
 extern const char *print_tainted(void);
+
+/* Values used for system_state */
+#define SYSTEM_BOOTING 0
+#define SYSTEM_RUNNING 1
+#define SYSTEM_SHUTDOWN 2
+
 #define TAINT_PROPRIETARY_MODULE	(1<<0)
 #define TAINT_FORCED_MODULE		(1<<1)
 #define TAINT_UNSAFE_SMP		(1<<2)
diff --git a/init/main.c b/init/main.c
index 9d1ed1de14c5..348ce7db30f3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -94,11 +94,7 @@ extern void driver_init(void);
 extern void tc_init(void);
 #endif
 
-/*
- * Are we up and running (ie do we have all the infrastructure
- * set up)
- */
-int system_running;
+int system_state;	/* SYSTEM_BOOTING/RUNNING/SHUTDOWN */
 
 /*
  * Boot command-line arguments
@@ -613,7 +609,7 @@ static int init(void * unused)
 	 */
 	free_initmem();
 	unlock_kernel();
-	system_running = 1;
+	system_state = SYSTEM_RUNNING;
 
 	if (sys_open("/dev/console", O_RDWR, 0) < 0)
 		printk("Warning: unable to open an initial console.\n");
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5261de82029b..0002fcd4c554 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -249,7 +249,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 	};
 	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
-	if (!system_running)
+	if (system_state != SYSTEM_RUNNING)
 		return -EBUSY;
 
 	if (path[0] == '\0')
diff --git a/kernel/printk.c b/kernel/printk.c
index a7be1f922f34..5f2b3c9bbd6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -522,7 +522,8 @@ asmlinkage int printk(const char *fmt, ...)
 			log_level_unknown = 1;
 	}
 
-	if (!cpu_online(smp_processor_id()) && !system_running) {
+	if (!cpu_online(smp_processor_id()) &&
+	    system_state != SYSTEM_RUNNING) {
 		/*
 		 * Some console drivers may assume that per-cpu resources have
 		 * been allocated.  So don't allow them to be called by this
diff --git a/kernel/sched.c b/kernel/sched.c
index d5f21712ffbb..9e19d4c0d4a9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2982,7 +2982,8 @@ void __might_sleep(char *file, int line)
 #if defined(in_atomic)
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) && system_running) {
+	if ((in_atomic() || irqs_disabled()) &&
+	    system_state == SYSTEM_RUNNING) {
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
diff --git a/kernel/sys.c b/kernel/sys.c
index 33a14e13079e..bc498b12edcc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -436,7 +436,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system.\n");
 		machine_restart(NULL);
@@ -452,7 +452,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 
 	case LINUX_REBOOT_CMD_HALT:
 		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "System halted.\n");
 		machine_halt();
@@ -462,7 +462,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 
 	case LINUX_REBOOT_CMD_POWER_OFF:
 		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Power down.\n");
 		machine_power_off();
@@ -478,7 +478,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		buffer[sizeof(buffer) - 1] = '\0';
 
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
 		machine_restart(buffer);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5d035d836c15..9764a4e78e45 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -734,7 +734,7 @@ fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int orde
 	struct page * page;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(!system_running))
+	if (unlikely(system_state == SYSTEM_BOOTING))
 		return get_boot_pages(gfp_mask, order);
 #endif
 	page = alloc_pages(gfp_mask, order);
-- 
cgit v1.2.3


From efffe9c8536bf9ee28f2f381bd285824bedcdbcd Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:40:55 -0700
Subject: [PATCH] Fix VT open/close race

The race is that con_close() can sleep, and drops the BKL while
tty->count==1.  But another thread can come into init_dev() and will take a
new ref against the tty and start using it.

But con_close() doesn't notice that new ref and proceeds to null out
tty->driver_data while someone else is using the resurrected tty.

So the patch serialises con_close() against init_dev() with tty_sem.


Here's a test app which reproduced the oops instantly on 2-way.  It really
needs to be run against all tty-capable devices.

/*
 * Run this against a tty which nobody currently has open, such as /dev/tty9
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kd.h>

/*
 * Open the tty named by filename read-write, issue one KDKBDREP ioctl
 * against it and close it again.  Exits with status 1 if the open fails.
 *
 * KDKBDREP takes a pointer to a struct kbd_repeat, which the kernel both
 * reads and writes back, so pass a full-sized, zeroed structure rather
 * than the address of an uninitialised int.  Per <linux/kd.h>, fields
 * that are <= 0 mean "don't change", so the keyboard repeat settings are
 * left alone.  The ioctl result is deliberately ignored: the call is only
 * there to touch the tty between open() and close().
 */
void doit(char *filename)
{
	struct kbd_repeat rep = { 0, 0 };
	int fd;

	fd = open(filename, O_RDWR);
	if (fd < 0) {
		perror("open");
		exit(1);
	}
	(void) ioctl(fd, KDKBDREP, &rep);
	close(fd);
}

/*
 * Usage: <prog> <tty-device>
 *
 * Calls doit() on the given tty in an endless loop; interrupt the program
 * to stop it.  Returns 1 with a usage message if no device was given, so
 * that a NULL filename is never handed to open().
 */
int main(int argc, char *argv[])
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <tty-device>\n",
			(argc > 0 && argv[0]) ? argv[0] : "vt-race");
		return 1;
	}

	for ( ; ; )
		doit(argv[1]);
}
---
 drivers/char/tty_io.c |  2 +-
 drivers/char/vt.c     | 14 ++++++++++++++
 include/linux/tty.h   |  3 +++
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 6bb5ae7e41a5..0ba52078f637 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -123,7 +123,7 @@ LIST_HEAD(tty_drivers);			/* linked list of tty drivers */
 struct tty_ldisc ldiscs[NR_LDISCS];	/* line disc dispatch table	*/
 
 /* Semaphore to protect creating and releasing a tty */
-static DECLARE_MUTEX(tty_sem);
+DECLARE_MUTEX(tty_sem);
 
 #ifdef CONFIG_UNIX98_PTYS
 extern struct tty_driver *ptm_driver;	/* Unix98 pty masters; for /dev/ptmx */
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index a5ddfc5ac9c1..2febed52e19f 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2480,8 +2480,16 @@ static int con_open(struct tty_struct *tty, struct file *filp)
 	return ret;
 }
 
+/*
+ * We take tty_sem in here to prevent another thread from coming in via init_dev
+ * and taking a ref against the tty while we're in the process of forgetting
+ * about it and cleaning things up.
+ *
+ * This is because vcs_remove_devfs() can sleep and will drop the BKL.
+ */
 static void con_close(struct tty_struct *tty, struct file *filp)
 {
+	down(&tty_sem);
 	acquire_console_sem();
 	if (tty && tty->count == 1) {
 		struct vt_struct *vt;
@@ -2492,9 +2500,15 @@ static void con_close(struct tty_struct *tty, struct file *filp)
 		tty->driver_data = 0;
 		release_console_sem();
 		vcs_remove_devfs(tty);
+		up(&tty_sem);
+		/*
+		 * tty_sem is released, but we still hold BKL, so there is
+		 * still exclusion against init_dev()
+		 */
 		return;
 	}
 	release_console_sem();
+	up(&tty_sem);
 }
 
 static void vc_init(unsigned int currcons, unsigned int rows,
diff --git a/include/linux/tty.h b/include/linux/tty.h
index fbcc401e8b28..6e61f3b27157 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -363,6 +363,9 @@ extern void tty_flip_buffer_push(struct tty_struct *tty);
 extern int tty_get_baud_rate(struct tty_struct *tty);
 extern int tty_termios_baud_rate(struct termios *termios);
 
+struct semaphore;
+extern struct semaphore tty_sem;
+
 /* n_tty.c */
 extern struct tty_ldisc tty_ldisc_N_TTY;
 
-- 
cgit v1.2.3


From ee28db843649533f5650186251ae4a8bd49a3da9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:41:07 -0700
Subject: [PATCH] i4l: kernelcapi receive workqueue and locking rework

From: Armin Schindler <armin@melware.de>

With this patch the ISDN kernel CAPI code uses a per-application workqueue
with proper locking to prevent message re-ordering, which could otherwise
happen because a workqueue may run on another CPU at the same time.  Also,
some locks for internal data are added.

Removed the global recv_queue work; use a per-application workqueue instead.
Added proper locking mechanisms for the application, controller and
application workqueue function.  Increased the maximum number of possible
applications and controllers.
---
 drivers/isdn/capi/kcapi.c  | 96 ++++++++++++++++++++++++++++++++--------------
 include/linux/kernelcapi.h | 11 ++++--
 2 files changed, 75 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c
index 064dc3003716..8524997b10b6 100644
--- a/drivers/isdn/capi/kcapi.c
+++ b/drivers/isdn/capi/kcapi.c
@@ -1,4 +1,4 @@
-/* $Id: kcapi.c,v 1.1.2.7 2004/03/16 08:01:47 armin Exp $
+/* $Id: kcapi.c,v 1.1.2.8 2004/03/26 19:57:20 armin Exp $
  * 
  * Kernel CAPI 2.0 Module
  * 
@@ -31,7 +31,7 @@
 #include <linux/b1lli.h>
 #endif
 
-static char *revision = "$Revision: 1.1.2.7 $";
+static char *revision = "$Revision: 1.1.2.8 $";
 
 /* ------------------------------------------------------------- */
 
@@ -63,13 +63,13 @@ static char capi_manufakturer[64] = "AVM Berlin";
 LIST_HEAD(capi_drivers);
 rwlock_t capi_drivers_list_lock = RW_LOCK_UNLOCKED;
 
+static rwlock_t application_lock = RW_LOCK_UNLOCKED;
+static DECLARE_MUTEX(controller_sem);
+
 struct capi20_appl *capi_applications[CAPI_MAXAPPL];
 struct capi_ctr *capi_cards[CAPI_MAXCONTR];
 
 static int ncards;
-static struct sk_buff_head recv_queue;
-
-static struct work_struct tq_recv_notify;
 
 /* -------- controller ref counting -------------------------------------- */
 
@@ -174,7 +174,7 @@ static void notify_up(u32 contr)
 
 	for (applid = 1; applid <= CAPI_MAXAPPL; applid++) {
 		ap = get_capi_appl_by_nr(applid);
-		if (ap && ap->callback)
+		if (ap && ap->callback && !ap->release_in_progress)
 			ap->callback(KCI_CONTRUP, contr, &card->profile);
 	}
 }
@@ -192,7 +192,7 @@ static void notify_down(u32 contr)
 
 	for (applid = 1; applid <= CAPI_MAXAPPL; applid++) {
 		ap = get_capi_appl_by_nr(applid);
-		if (ap && ap->callback)
+		if (ap && ap->callback && !ap->release_in_progress)
 			ap->callback(KCI_CONTRDOWN, contr, 0);
 	}
 }
@@ -237,38 +237,39 @@ static int notify_push(unsigned int cmd, u32 controller, u16 applid, u32 ncci)
 	
 /* -------- Receiver ------------------------------------------ */
 
-static void recv_handler(void *dummy)
+static void recv_handler(void *_ap)
 {
 	struct sk_buff *skb;
-	struct capi20_appl *ap;
+	struct capi20_appl *ap = (struct capi20_appl *) _ap;
 
-	while ((skb = skb_dequeue(&recv_queue)) != 0) {
-		ap = get_capi_appl_by_nr(CAPIMSG_APPID(skb->data));
-		if (!ap) {
-			printk(KERN_ERR "kcapi: recv_handler: applid %d ? (%s)\n",
-				CAPIMSG_APPID(skb->data), capi_message2str(skb->data));
-			kfree_skb(skb);
-			continue;
-		}
+	if ((!ap) || (ap->release_in_progress))
+		return;
 
+	down(&ap->recv_sem);
+	while ((skb = skb_dequeue(&ap->recv_queue))) {
 		if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_IND)
 			ap->nrecvdatapkt++;
 		else
 			ap->nrecvctlpkt++;
+
 		ap->recv_message(ap, skb);
 	}
+	up(&ap->recv_sem);
 }
 
 void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb)
 {
+	struct capi20_appl *ap;
 	int showctl = 0;
 	u8 cmd, subcmd;
+	unsigned long flags;
 
 	if (card->cardstate != CARD_RUNNING) {
 		printk(KERN_INFO "kcapi: controller %d not active, got: %s",
 		       card->cnr, capi_message2str(skb->data));
 		goto error;
 	}
+
 	cmd = CAPIMSG_COMMAND(skb->data);
         subcmd = CAPIMSG_SUBCOMMAND(skb->data);
 	if (cmd == CAPI_DATA_B3 && subcmd == CAPI_IND) {
@@ -293,8 +294,19 @@ void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *s
 		}
 
 	}
-	skb_queue_tail(&recv_queue, skb);
-	schedule_work(&tq_recv_notify);
+
+	read_lock_irqsave(&application_lock, flags);
+	ap = get_capi_appl_by_nr(CAPIMSG_APPID(skb->data));
+	if ((!ap) || (ap->release_in_progress)) {
+		read_unlock_irqrestore(&application_lock, flags);
+		printk(KERN_ERR "kcapi: handle_message: applid %d state released (%s)\n",
+			CAPIMSG_APPID(skb->data), capi_message2str(skb->data));
+		goto error;
+	}
+	skb_queue_tail(&ap->recv_queue, skb);
+	schedule_work(&ap->recv_work);
+	read_unlock_irqrestore(&application_lock, flags);
+
 	return;
 
 error:
@@ -310,11 +322,13 @@ void capi_ctr_ready(struct capi_ctr * card)
 
 	card->cardstate = CARD_RUNNING;
 
+	down(&controller_sem);
 	for (appl = 1; appl <= CAPI_MAXAPPL; appl++) {
 		ap = get_capi_appl_by_nr(appl);
-		if (!ap) continue;
+		if (!ap || ap->release_in_progress) continue;
 		register_appl(card, appl, &ap->rparam);
 	}
+	up(&controller_sem);
 
         printk(KERN_NOTICE "kcapi: card %d \"%s\" ready.\n",
 	       card->cnr, card->name);
@@ -342,7 +356,7 @@ void capi_ctr_reseted(struct capi_ctr * card)
 
 	for (appl = 1; appl <= CAPI_MAXAPPL; appl++) {
 		struct capi20_appl *ap = get_capi_appl_by_nr(appl);
-		if (!ap)
+		if (!ap || ap->release_in_progress)
 			continue;
 
 		capi_ctr_put(card);
@@ -382,16 +396,21 @@ attach_capi_ctr(struct capi_ctr *card)
 {
 	int i;
 
+	down(&controller_sem);
+
 	for (i = 0; i < CAPI_MAXCONTR; i++) {
 		if (capi_cards[i] == NULL)
 			break;
 	}
 	if (i == CAPI_MAXCONTR) {
+		up(&controller_sem);
 		printk(KERN_ERR "kcapi: out of controller slots\n");
 	   	return -EBUSY;
 	}
 	capi_cards[i] = card;
 
+	up(&controller_sem);
+
 	card->nrecvctlpkt = 0;
 	card->nrecvdatapkt = 0;
 	card->nsentctlpkt = 0;
@@ -480,18 +499,23 @@ u16 capi20_register(struct capi20_appl *ap)
 {
 	int i;
 	u16 applid;
+	unsigned long flags;
 
 	DBG("");
 
 	if (ap->rparam.datablklen < 128)
 		return CAPI_LOGBLKSIZETOSMALL;
 
+	write_lock_irqsave(&application_lock, flags);
+
 	for (applid = 1; applid <= CAPI_MAXAPPL; applid++) {
 		if (capi_applications[applid - 1] == NULL)
 			break;
 	}
-	if (applid > CAPI_MAXAPPL)
+	if (applid > CAPI_MAXAPPL) {
+		write_unlock_irqrestore(&application_lock, flags);
 		return CAPI_TOOMANYAPPLS;
+	}
 
 	ap->applid = applid;
 	capi_applications[applid - 1] = ap;
@@ -501,12 +525,21 @@ u16 capi20_register(struct capi20_appl *ap)
 	ap->nsentctlpkt = 0;
 	ap->nsentdatapkt = 0;
 	ap->callback = 0;
+	init_MUTEX(&ap->recv_sem);
+	skb_queue_head_init(&ap->recv_queue);
+	INIT_WORK(&ap->recv_work, recv_handler, (void *)ap);
+	ap->release_in_progress = 0;
+
+	write_unlock_irqrestore(&application_lock, flags);
 	
+	down(&controller_sem);
 	for (i = 0; i < CAPI_MAXCONTR; i++) {
 		if (!capi_cards[i] || capi_cards[i]->cardstate != CARD_RUNNING)
 			continue;
 		register_appl(capi_cards[i], applid, &ap->rparam);
 	}
+	up(&controller_sem);
+
 	if (showcapimsgs & 1) {
 		printk(KERN_DEBUG "kcapi: appl %d up\n", applid);
 	}
@@ -519,15 +552,26 @@ EXPORT_SYMBOL(capi20_register);
 u16 capi20_release(struct capi20_appl *ap)
 {
 	int i;
+	unsigned long flags;
 
 	DBG("applid %#x", ap->applid);
 
+	write_lock_irqsave(&application_lock, flags);
+	ap->release_in_progress = 1;
+	capi_applications[ap->applid - 1] = NULL;
+	write_unlock_irqrestore(&application_lock, flags);
+
+	down(&controller_sem);
 	for (i = 0; i < CAPI_MAXCONTR; i++) {
 		if (!capi_cards[i] || capi_cards[i]->cardstate != CARD_RUNNING)
 			continue;
 		release_appl(capi_cards[i], ap->applid);
 	}
-	capi_applications[ap->applid - 1] = NULL;
+	up(&controller_sem);
+
+	flush_scheduled_work();
+	skb_queue_purge(&ap->recv_queue);
+
 	if (showcapimsgs & 1) {
 		printk(KERN_DEBUG "kcapi: appl %d down\n", ap->applid);
 	}
@@ -547,7 +591,7 @@ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb)
  
 	if (ncards == 0)
 		return CAPI_REGNOTINSTALLED;
-	if (ap->applid == 0)
+	if ((ap->applid == 0) || ap->release_in_progress)
 		return CAPI_ILLAPPNR;
 	if (skb->len < 12
 	    || !capi_cmd_valid(CAPIMSG_COMMAND(skb->data))
@@ -925,10 +969,6 @@ static int __init kcapi_init(void)
 	char *p;
 	char rev[32];
 
-	skb_queue_head_init(&recv_queue);
-
-	INIT_WORK(&tq_recv_notify, recv_handler, NULL);
-
         kcapi_proc_init();
 
 	if ((p = strchr(revision, ':')) != 0 && p[1]) {
diff --git a/include/linux/kernelcapi.h b/include/linux/kernelcapi.h
index b982d5b77ae9..1d4b1b15d0b8 100644
--- a/include/linux/kernelcapi.h
+++ b/include/linux/kernelcapi.h
@@ -10,10 +10,8 @@
 #ifndef __KERNELCAPI_H__
 #define __KERNELCAPI_H__
 
-#include <linux/list.h>
-
-#define CAPI_MAXAPPL	128	/* maximum number of applications  */
-#define CAPI_MAXCONTR	16	/* maximum number of controller    */
+#define CAPI_MAXAPPL	240	/* maximum number of applications  */
+#define CAPI_MAXCONTR	32	/* maximum number of controller    */
 #define CAPI_MAXDATAWINDOW	8
 
 
@@ -47,6 +45,7 @@ typedef struct kcapi_carddef {
 
 #ifdef __KERNEL__
 
+#include <linux/list.h>
 #include <linux/skbuff.h>
 
 #define	KCI_CONTRUP	0	/* arg: struct capi_profile */
@@ -63,6 +62,10 @@ struct capi20_appl {
 	unsigned long nrecvdatapkt;
 	unsigned long nsentctlpkt;
 	unsigned long nsentdatapkt;
+	struct semaphore recv_sem;
+	struct sk_buff_head recv_queue;
+	struct work_struct recv_work;
+	int release_in_progress;
 
 	/* ugly hack to allow for notification of added/removed
 	 * controllers. The Right Way (tm) is known. XXX
-- 
cgit v1.2.3


From b283f09cf8f51c29bf90e42e22099f76d0f33378 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:41:20 -0700
Subject: [PATCH] Fix get_wchan() FIXME wrt. order of functions

From: William Lee Irwin III <wli@holomorphy.com>

This addresses the issue with get_wchan() that the various functions acting
as scheduling-related primitives are not, in fact, contiguous in the text
segment.  It creates an ELF section for scheduling primitives to be placed
in, and places currently-detected (i.e.  skipped during stack decoding)
scheduling primitives and others like io_schedule() and down(), which are
currently missed by get_wchan() code, into this section also.

The net effects are more reliability of get_wchan()'s results and the new
ability, made use of by this code, to arbitrarily place scheduling
primitives in the source code without disturbing get_wchan()'s accuracy.

Suggestions by Arnd Bergmann and Matthew Wilcox regarding reducing the
invasiveness of the patch were incorporated during prior rounds of review.
I've at least tried to sweep all arches in this patch.
---
 arch/alpha/kernel/process.c                |  2 --
 arch/alpha/kernel/semaphore.c              |  9 ++++----
 arch/alpha/kernel/vmlinux.lds.S            |  1 +
 arch/arm/kernel/process.c                  |  2 --
 arch/arm/kernel/semaphore.c                |  8 ++++---
 arch/arm/kernel/vmlinux.lds.S              |  1 +
 arch/arm26/kernel/process.c                |  2 --
 arch/arm26/kernel/semaphore.c              |  8 ++++---
 arch/arm26/kernel/vmlinux-arm26-xip.lds.in |  1 +
 arch/arm26/kernel/vmlinux-arm26.lds.in     |  1 +
 arch/cris/arch-v10/kernel/process.c        |  3 +--
 arch/cris/arch-v10/vmlinux.lds.S           |  1 +
 arch/cris/kernel/semaphore.c               |  5 ++--
 arch/h8300/kernel/process.c                |  3 ---
 arch/h8300/kernel/semaphore.c              |  5 ++--
 arch/h8300/kernel/vmlinux.lds.S            |  1 +
 arch/i386/kernel/process.c                 |  2 --
 arch/i386/kernel/semaphore.c               | 17 +++++++-------
 arch/i386/kernel/vmlinux.lds.S             |  1 +
 arch/ia64/kernel/process.c                 |  2 --
 arch/ia64/kernel/semaphore.c               |  7 +++---
 arch/ia64/kernel/vmlinux.lds.S             |  1 +
 arch/m68k/kernel/process.c                 |  5 ----
 arch/m68k/kernel/semaphore.c               |  5 ++--
 arch/m68k/kernel/vmlinux-std.lds           |  1 +
 arch/m68k/kernel/vmlinux-sun3.lds          |  1 +
 arch/m68knommu/kernel/process.c            |  5 ----
 arch/m68knommu/kernel/semaphore.c          |  5 ++--
 arch/m68knommu/kernel/vmlinux.lds.S        |  1 +
 arch/mips/kernel/process.c                 |  2 --
 arch/mips/kernel/semaphore.c               |  5 ++--
 arch/mips/kernel/vmlinux.lds.S             |  1 +
 arch/parisc/kernel/semaphore.c             |  5 ++--
 arch/parisc/kernel/vmlinux.lds.S           |  1 +
 arch/ppc/kernel/process.c                  |  2 --
 arch/ppc/kernel/semaphore.c                |  5 ++--
 arch/ppc/kernel/vmlinux.lds.S              |  1 +
 arch/ppc64/kernel/process.c                |  2 --
 arch/ppc64/kernel/semaphore.c              |  5 ++--
 arch/ppc64/kernel/vmlinux.lds.S            |  1 +
 arch/s390/kernel/process.c                 |  2 --
 arch/s390/kernel/semaphore.c               |  5 ++--
 arch/s390/kernel/vmlinux.lds.S             |  1 +
 arch/sh/kernel/process.c                   |  4 +---
 arch/sh/kernel/semaphore.c                 |  5 ++--
 arch/sh/kernel/vmlinux.lds.S               |  1 +
 arch/sparc/kernel/process.c                |  4 +---
 arch/sparc/kernel/semaphore.c              |  5 ++--
 arch/sparc/kernel/vmlinux.lds.S            |  1 +
 arch/sparc/lib/rwsem.S                     |  3 ++-
 arch/sparc64/kernel/process.c              |  4 +---
 arch/sparc64/kernel/semaphore.c            |  9 ++++----
 arch/sparc64/kernel/vmlinux.lds.S          |  1 +
 arch/sparc64/lib/rwsem.c                   |  5 ++--
 arch/v850/kernel/process.c                 |  3 ---
 arch/v850/kernel/semaphore.c               |  5 ++--
 arch/v850/kernel/vmlinux.lds.S             |  1 +
 arch/x86_64/kernel/process.c               |  2 --
 arch/x86_64/kernel/semaphore.c             |  5 ++--
 arch/x86_64/kernel/vmlinux.lds.S           |  1 +
 arch/x86_64/lib/thunk.S                    |  3 ++-
 include/asm-generic/vmlinux.lds.h          |  5 ++++
 include/linux/init.h                       |  2 ++
 include/linux/sched.h                      |  2 ++
 kernel/sched.c                             | 37 ++++++++++++++++--------------
 kernel/timer.c                             |  4 ++--
 lib/rwsem.c                                |  5 ++--
 67 files changed, 137 insertions(+), 124 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index e427bae12ffe..297e4b48bfe2 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -513,8 +513,6 @@ thread_saved_pc(task_t *t)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/alpha/kernel/semaphore.c b/arch/alpha/kernel/semaphore.c
index b52a0df303fe..4d60a0ccd6f7 100644
--- a/arch/alpha/kernel/semaphore.c
+++ b/arch/alpha/kernel/semaphore.c
@@ -7,6 +7,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 
 /*
  * This is basically the PPC semaphore scheme ported to use
@@ -60,7 +61,7 @@ static inline int __sem_update_count(struct semaphore *sem, int incr)
  * Either form may be used in conjunction with "up()".
  */
 
-void
+void __sched
 __down_failed(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
@@ -101,7 +102,7 @@ __down_failed(struct semaphore *sem)
 #endif
 }
 
-int
+int __sched
 __down_failed_interruptible(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
@@ -159,7 +160,7 @@ __up_wakeup(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-void
+void __sched
 down(struct semaphore *sem)
 {
 #if WAITQUEUE_DEBUG
@@ -173,7 +174,7 @@ down(struct semaphore *sem)
 	__down(sem);
 }
 
-int
+int __sched
 down_interruptible(struct semaphore *sem)
 {
 #if WAITQUEUE_DEBUG
diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index 7afd00d5d46b..d159b8f0d022 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -17,6 +17,7 @@ SECTIONS
   _text = .;					/* Text and read-only data */
   .text : { 
 	*(.text) 
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
   } :kernel
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 863c4076daad..8423921e821a 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -414,8 +414,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/arm/kernel/semaphore.c b/arch/arm/kernel/semaphore.c
index a50902e8bec7..da39eb3dca31 100644
--- a/arch/arm/kernel/semaphore.c
+++ b/arch/arm/kernel/semaphore.c
@@ -13,6 +13,7 @@
  */
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -54,7 +55,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -87,7 +88,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -176,7 +177,8 @@ int __down_trylock(struct semaphore * sem)
  * registers (r0 to r3 and lr), but not ip, as we use it as a return
  * value in some cases..
  */
-asm("	.align	5				\n\
+asm("	.section .sched.text			\n\
+	.align	5				\n\
 	.globl	__down_failed			\n\
 __down_failed:					\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 56af3401b34d..a5db0ddca6a4 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -73,6 +73,7 @@ SECTIONS
 	.text : {			/* Real text segment		*/
 		_text = .;		/* Text and read-only data	*/
 			*(.text)
+			SCHED_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 			*(.rodata)
diff --git a/arch/arm26/kernel/process.c b/arch/arm26/kernel/process.c
index 09a2f52ad8a8..ce23571617a1 100644
--- a/arch/arm26/kernel/process.c
+++ b/arch/arm26/kernel/process.c
@@ -400,8 +400,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/arm26/kernel/semaphore.c b/arch/arm26/kernel/semaphore.c
index e7964ce1d0d9..60591a738592 100644
--- a/arch/arm26/kernel/semaphore.c
+++ b/arch/arm26/kernel/semaphore.c
@@ -15,6 +15,7 @@
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -56,7 +57,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -89,7 +90,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -178,7 +179,8 @@ int __down_trylock(struct semaphore * sem)
  * registers (r0 to r3 and lr), but not ip, as we use it as a return
  * value in some cases..
  */
-asm("	.align	5				\n\
+asm("	.section .sched.text			\n\
+	.align	5				\n\
 	.globl	__down_failed			\n\
 __down_failed:					\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
diff --git a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in
index 602a77c022d7..61eedf0bc42f 100644
--- a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in
+++ b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in
@@ -66,6 +66,7 @@ SECTIONS
 	.text : {			/* Real text segment		*/
 		_text = .;		/* Text and read-only data	*/
 			*(.text)
+			SCHED_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 			*(.rodata)
diff --git a/arch/arm26/kernel/vmlinux-arm26.lds.in b/arch/arm26/kernel/vmlinux-arm26.lds.in
index 8782fe36f0a8..2393f3805a49 100644
--- a/arch/arm26/kernel/vmlinux-arm26.lds.in
+++ b/arch/arm26/kernel/vmlinux-arm26.lds.in
@@ -67,6 +67,7 @@ SECTIONS
 	.text : {			/* Real text segment		*/
 		_text = .;		/* Text and read-only data	*/
 			*(.text)
+			SCHED_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 			*(.rodata)
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index 62e3a4fbf33a..c785b54e6cbd 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -16,6 +16,7 @@
 #include <linux/err.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/init.h>
 
 #ifdef CONFIG_ETRAX_GPIO
 void etrax_gpio_wake_up_check(void); /* drivers/gpio.c */
@@ -216,8 +217,6 @@ asmlinkage int sys_execve(const char *fname, char **argv, char **envp,
  * These bracket the sleeping functions..
  */
 
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched     ((unsigned long) scheduling_functions_start_here)
 #define last_sched      ((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/cris/arch-v10/vmlinux.lds.S b/arch/cris/arch-v10/vmlinux.lds.S
index b2c27e147f29..6b73a2c0dad8 100644
--- a/arch/cris/arch-v10/vmlinux.lds.S
+++ b/arch/cris/arch-v10/vmlinux.lds.S
@@ -25,6 +25,7 @@ SECTIONS
 	__stext = .;
 	.text : {
 		*(.text)
+		SCHED_TEXT
 		*(.fixup)
 		*(.text.__*)
 	}
diff --git a/arch/cris/kernel/semaphore.c b/arch/cris/kernel/semaphore.c
index d62b355e1706..b884263d3cd4 100644
--- a/arch/cris/kernel/semaphore.c
+++ b/arch/cris/kernel/semaphore.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 /*
@@ -94,7 +95,7 @@ void __up(struct semaphore *sem)
 	tsk->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DOWN_VAR
 	DOWN_HEAD(TASK_UNINTERRUPTIBLE)
@@ -104,7 +105,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int ret = 0;
 	DOWN_VAR
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index bd6ccd542399..8640ea20dba0 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -264,8 +264,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -289,7 +287,6 @@ unsigned long get_wchan(struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
diff --git a/arch/h8300/kernel/semaphore.c b/arch/h8300/kernel/semaphore.c
index 690efce1e437..1ebb79baaa8c 100644
--- a/arch/h8300/kernel/semaphore.c
+++ b/arch/h8300/kernel/semaphore.c
@@ -5,6 +5,7 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 #ifndef CONFIG_RMW_INSNS
@@ -95,7 +96,7 @@ void __up(struct semaphore *sem)
 	current->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -106,7 +107,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 60787f07eb2b..3a643954a8fe 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -82,6 +82,7 @@ SECTIONS
 #endif
 	__stext = . ;
         	*(.text)
+	SCHED_TEXT
 	. = ALIGN(0x4) ;
 		*(.exit.text)
 		*(.text.*)
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 3495f1aedf67..7fed9d3823ed 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -632,8 +632,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 #define top_esp                (THREAD_SIZE - sizeof(unsigned long))
diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c
index 5acd544f0cbd..073912cfcf44 100644
--- a/arch/i386/kernel/semaphore.c
+++ b/arch/i386/kernel/semaphore.c
@@ -15,6 +15,7 @@
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <linux/init.h>
 #include <asm/semaphore.h>
 
 /*
@@ -53,7 +54,7 @@ asmlinkage void __up(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-asmlinkage void __down(struct semaphore * sem)
+asmlinkage void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -90,7 +91,7 @@ asmlinkage void __down(struct semaphore * sem)
 	tsk->state = TASK_RUNNING;
 }
 
-asmlinkage int __down_interruptible(struct semaphore * sem)
+asmlinkage int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -187,7 +188,7 @@ asmlinkage int __down_trylock(struct semaphore * sem)
  * value..
  */
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __down_failed\n"
 "__down_failed:\n\t"
@@ -210,7 +211,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __down_failed_interruptible\n"
 "__down_failed_interruptible:\n\t"
@@ -231,7 +232,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __down_failed_trylock\n"
 "__down_failed_trylock:\n\t"
@@ -252,7 +253,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __up_wakeup\n"
 "__up_wakeup:\n\t"
@@ -271,7 +272,7 @@ asm(
  */
 #if defined(CONFIG_SMP)
 asm(
-".text\n"
+".section .sched.text\n"
 ".align	4\n"
 ".globl	__write_lock_failed\n"
 "__write_lock_failed:\n\t"
@@ -285,7 +286,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align	4\n"
 ".globl	__read_lock_failed\n"
 "__read_lock_failed:\n\t"
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 3623d7e2934a..0253c586547b 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -16,6 +16,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x9090
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index a1d09d5c91c4..0d245cbcd1f6 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -660,8 +660,6 @@ get_wchan (struct task_struct *p)
 	/*
 	 * These bracket the sleeping functions..
 	 */
-	extern void scheduling_functions_start_here(void);
-	extern void scheduling_functions_end_here(void);
 #	define first_sched	((unsigned long) scheduling_functions_start_here)
 #	define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c
index f3926a3c4d73..2724ef3fbae2 100644
--- a/arch/ia64/kernel/semaphore.c
+++ b/arch/ia64/kernel/semaphore.c
@@ -24,6 +24,7 @@
  * <asm/semaphore.h> where we want to avoid any extra jumps and calls.
  */
 #include <linux/sched.h>
+#include <linux/init.h>
 
 #include <asm/errno.h>
 #include <asm/semaphore.h>
@@ -44,8 +45,7 @@ __up (struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-void
-__down (struct semaphore *sem)
+void __sched __down (struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -82,8 +82,7 @@ __down (struct semaphore *sem)
 	tsk->state = TASK_RUNNING;
 }
 
-int
-__down_interruptible (struct semaphore * sem)
+int __sched __down_interruptible (struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index e5589e49d9da..5c45718a9c82 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -41,6 +41,7 @@ SECTIONS
     {
 	*(.text.ivt)
 	*(.text)
+	SCHED_TEXT
 	*(.gnu.linkonce.t*)
     }
   .text2 : AT(ADDR(.text2) - LOAD_OFFSET)
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 8d72a5c5b0c7..fc2c753c332b 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -65,8 +65,6 @@ asmlinkage void ret_from_fork(void);
  */
 unsigned long thread_saved_pc(struct task_struct *tsk)
 {
-	extern void scheduling_functions_start_here(void);
-	extern void scheduling_functions_end_here(void);
 	struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp;
 	/* Check whether the thread is blocked in resume() */
 	if (sw->retpc > (unsigned long)scheduling_functions_start_here &&
@@ -387,8 +385,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -407,7 +403,6 @@ unsigned long get_wchan(struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
diff --git a/arch/m68k/kernel/semaphore.c b/arch/m68k/kernel/semaphore.c
index 690efce1e437..1ebb79baaa8c 100644
--- a/arch/m68k/kernel/semaphore.c
+++ b/arch/m68k/kernel/semaphore.c
@@ -5,6 +5,7 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 #ifndef CONFIG_RMW_INSNS
@@ -95,7 +96,7 @@ void __up(struct semaphore *sem)
 	current->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -106,7 +107,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds
index bd41fc992169..6dc62684c7b9 100644
--- a/arch/m68k/kernel/vmlinux-std.lds
+++ b/arch/m68k/kernel/vmlinux-std.lds
@@ -12,6 +12,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x4e75
diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds
index 2e81cde14987..f293e567192c 100644
--- a/arch/m68k/kernel/vmlinux-sun3.lds
+++ b/arch/m68k/kernel/vmlinux-sun3.lds
@@ -13,6 +13,7 @@ SECTIONS
   .text : {
 	*(.head)
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x4e75
diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c
index c8b87371641a..896d596a1bd8 100644
--- a/arch/m68knommu/kernel/process.c
+++ b/arch/m68knommu/kernel/process.c
@@ -406,8 +406,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -426,7 +424,6 @@ unsigned long get_wchan(struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
@@ -439,8 +436,6 @@ unsigned long get_wchan(struct task_struct *p)
  */
 unsigned long thread_saved_pc(struct task_struct *tsk)
 {
-	extern void scheduling_functions_start_here(void);
-	extern void scheduling_functions_end_here(void);
 	struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp;
 
 	/* Check whether the thread is blocked in resume() */
diff --git a/arch/m68knommu/kernel/semaphore.c b/arch/m68knommu/kernel/semaphore.c
index 33d704fcf883..c083f4772add 100644
--- a/arch/m68knommu/kernel/semaphore.c
+++ b/arch/m68knommu/kernel/semaphore.c
@@ -6,6 +6,7 @@
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 #ifndef CONFIG_RMW_INSNS
@@ -96,7 +97,7 @@ void __up(struct semaphore *sem)
 	current->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -107,7 +108,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index 1ab8a31ef964..a362870b6e4e 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -191,6 +191,7 @@ SECTIONS {
 	.text : {
 		_stext = . ;
         	*(.text)
+		SCHED_TEXT
         	*(.text.lock)
 
 		. = ALIGN(16);          /* Exception table              */
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index f8ba26770bf4..f4ab9c66b27f 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -283,8 +283,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/mips/kernel/semaphore.c b/arch/mips/kernel/semaphore.c
index 11b937f20604..51c3e772c029 100644
--- a/arch/mips/kernel/semaphore.c
+++ b/arch/mips/kernel/semaphore.c
@@ -6,6 +6,7 @@
 #include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/module.h>
+#include <linux/init.h>
 #include <linux/sched.h>
 
 #ifdef CONFIG_CPU_HAS_LLDSCD
@@ -104,7 +105,7 @@ static inline int waking_non_zero(struct semaphore *sem)
  * Either form may be used in conjunction with "up()".
  */
 
-void __down_failed(struct semaphore * sem)
+void __sched __down_failed(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	wait_queue_t wait;
@@ -227,7 +228,7 @@ static inline int waking_non_zero_interruptible(struct semaphore *sem,
 
 #endif /* !CONFIG_CPU_HAS_LLDSCD */
 
-int __down_failed_interruptible(struct semaphore * sem)
+int __sched __down_failed_interruptible(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	wait_queue_t wait;
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index b72639f8db65..098cfaa23c0e 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -28,6 +28,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
     *(.text)
+    SCHED_TEXT
     *(.fixup)
     *(.gnu.warning)
   } =0
diff --git a/arch/parisc/kernel/semaphore.c b/arch/parisc/kernel/semaphore.c
index ffb4851451fc..ee806bcc3726 100644
--- a/arch/parisc/kernel/semaphore.c
+++ b/arch/parisc/kernel/semaphore.c
@@ -5,6 +5,7 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 /*
  * Semaphores are complex as we wish to avoid using two variables.
@@ -58,7 +59,7 @@ void __up(struct semaphore *sem)
 	sem->count += (sem->count < 0) ? 1 : - 1;
 	
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DOWN_HEAD
 
@@ -74,7 +75,7 @@ void __down(struct semaphore * sem)
 	UPDATE_COUNT
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DOWN_HEAD
 
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 14d0882a19d2..e5d5aeef96e5 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -50,6 +50,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text ALIGN(16) : {
 	*(.text*)
+	SCHED_TEXT
 	*(.PARISC.unwind)
 	*(.fixup)
 	*(.lock.text)		/* out-of-line lock text */
diff --git a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c
index ada32baeda19..3363a030e00f 100644
--- a/arch/ppc/kernel/process.c
+++ b/arch/ppc/kernel/process.c
@@ -661,8 +661,6 @@ void __init ll_puts(const char *s)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched    ((unsigned long) scheduling_functions_start_here)
 #define last_sched     ((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/ppc/kernel/semaphore.c b/arch/ppc/kernel/semaphore.c
index 7bf51fba5c14..2fe429b27c14 100644
--- a/arch/ppc/kernel/semaphore.c
+++ b/arch/ppc/kernel/semaphore.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/errno.h>
@@ -69,7 +70,7 @@ void __up(struct semaphore *sem)
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __down(struct semaphore *sem)
+void __sched __down(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -99,7 +100,7 @@ void __down(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S
index 81b95d449a22..b710d55c5b08 100644
--- a/arch/ppc/kernel/vmlinux.lds.S
+++ b/arch/ppc/kernel/vmlinux.lds.S
@@ -31,6 +31,7 @@ SECTIONS
   .text      :
   {
     *(.text)
+    SCHED_TEXT
     *(.fixup)
     *(.got1)
     __got2_start = .;
diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c
index cec7225a6ac1..f74b14d7e58e 100644
--- a/arch/ppc64/kernel/process.c
+++ b/arch/ppc64/kernel/process.c
@@ -475,8 +475,6 @@ static inline int validate_sp(unsigned long sp, struct task_struct *p)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched    (*(unsigned long *)scheduling_functions_start_here)
 #define last_sched     (*(unsigned long *)scheduling_functions_end_here)
 
diff --git a/arch/ppc64/kernel/semaphore.c b/arch/ppc64/kernel/semaphore.c
index c977029e2465..d723632d59f3 100644
--- a/arch/ppc64/kernel/semaphore.c
+++ b/arch/ppc64/kernel/semaphore.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/errno.h>
@@ -70,7 +71,7 @@ void __up(struct semaphore *sem)
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __down(struct semaphore *sem)
+void __sched __down(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -99,7 +100,7 @@ void __down(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/ppc64/kernel/vmlinux.lds.S b/arch/ppc64/kernel/vmlinux.lds.S
index a8531b1f9ef2..1d9b61143aaa 100644
--- a/arch/ppc64/kernel/vmlinux.lds.S
+++ b/arch/ppc64/kernel/vmlinux.lds.S
@@ -13,6 +13,7 @@ SECTIONS
   /* Read-only sections, merged into text segment: */
   .text : {
 	*(.text .text.*)
+	SCHED_TEXT
 	*(.fixup)
 	. = ALIGN(4096);
 	_etext = .;
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 3676307d1d8a..050585ab5d2a 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -384,8 +384,6 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/s390/kernel/semaphore.c b/arch/s390/kernel/semaphore.c
index 8203f5e0228d..8dfb690c159f 100644
--- a/arch/s390/kernel/semaphore.c
+++ b/arch/s390/kernel/semaphore.c
@@ -11,6 +11,7 @@
  */
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -60,7 +61,7 @@ void __up(struct semaphore *sem)
  *   count > 0: decrement count, wake up queue and exit.
  *   count <= 0: set count to -1, go to sleep.
  */
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -82,7 +83,7 @@ void __down(struct semaphore * sem)
  *   count > 0: wake up queue and exit.
  *   count <= 0: set count to 0, wake up queue and exit.
  */
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index c9ca7a8e93b3..b4534b2867c3 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -23,6 +23,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x0700
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index 773006661b50..7d45ea0acd09 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -464,8 +464,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -481,7 +479,7 @@ unsigned long get_wchan(struct task_struct *p)
 	 * The same comment as on the Alpha applies here, too ...
 	 */
 	pc = thread_saved_pc(p);
-	if (pc >= (unsigned long) interruptible_sleep_on && pc < (unsigned long) add_timer) {
+	if (pc >= first_sched && pc < last_sched) {
 		schedule_frame = ((unsigned long *)(long)p->thread.sp)[1];
 		return (unsigned long)((unsigned long *)schedule_frame)[1];
 	}
diff --git a/arch/sh/kernel/semaphore.c b/arch/sh/kernel/semaphore.c
index 0943ad666a67..a3c24dcbf01d 100644
--- a/arch/sh/kernel/semaphore.c
+++ b/arch/sh/kernel/semaphore.c
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/init.h>
 #include <asm/semaphore.h>
 #include <asm/semaphore-helper.h>
 
@@ -103,7 +104,7 @@ void __up(struct semaphore *sem)
 	tsk->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DOWN_VAR
 	DOWN_HEAD(TASK_UNINTERRUPTIBLE)
@@ -113,7 +114,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int ret = 0;
 	DOWN_VAR
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 2cc86534c130..da0f5d728b3e 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -22,6 +22,7 @@ SECTIONS
 	} = 0
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x0009
diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c
index beae70a970e4..70261b211997 100644
--- a/arch/sparc/kernel/process.c
+++ b/arch/sparc/kernel/process.c
@@ -28,6 +28,7 @@
 #include <linux/reboot.h>
 #include <linux/delay.h>
 #include <linux/pm.h>
+#include <linux/init.h>
 
 #include <asm/auxio.h>
 #include <asm/oplib.h>
@@ -694,9 +695,6 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 	return retval;
 }
 
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-
 unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc, fp, bias = 0;
diff --git a/arch/sparc/kernel/semaphore.c b/arch/sparc/kernel/semaphore.c
index 5a8f3d176a8f..77e63b92ca30 100644
--- a/arch/sparc/kernel/semaphore.c
+++ b/arch/sparc/kernel/semaphore.c
@@ -4,6 +4,7 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -45,7 +46,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -78,7 +79,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 0862360d865d..8d4bbfaf304c 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -12,6 +12,7 @@ SECTIONS
   .text 0xf0004000 :
   {
     *(.text)
+    SCHED_TEXT
     *(.gnu.warning)
   } =0
   _etext = .;
diff --git a/arch/sparc/lib/rwsem.S b/arch/sparc/lib/rwsem.S
index 98b757cb67c6..e7578dc600b8 100644
--- a/arch/sparc/lib/rwsem.S
+++ b/arch/sparc/lib/rwsem.S
@@ -8,7 +8,7 @@
 #include <asm/ptrace.h>
 #include <asm/psr.h>
 
-	.text
+	.section .sched.text, "ax"	/* alloc + exec, same flags as .text */
 	.align	4
 
 	.globl		___down_read
@@ -113,6 +113,7 @@ ___down_write:
 	ba		2b
 	 restore	%l5, %g0, %g5
 
+	.text
 	.globl		___up_read
 ___up_read:
 	rd		%psr, %g3
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 1be2b97e4672..0caf962e8155 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -28,6 +28,7 @@
 #include <linux/config.h>
 #include <linux/reboot.h>
 #include <linux/delay.h>
+#include <linux/init.h>
 
 #include <asm/oplib.h>
 #include <asm/uaccess.h>
@@ -823,9 +824,6 @@ out:
 	return error;
 }
 
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-
 unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc, fp, bias = 0;
diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c
index a9e66d666ceb..9ddfcb9a1900 100644
--- a/arch/sparc64/kernel/semaphore.c
+++ b/arch/sparc64/kernel/semaphore.c
@@ -8,6 +8,7 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 /*
  * Atomically update sem->count.
@@ -90,7 +91,7 @@ void up(struct semaphore *sem)
 	: "g5", "g7", "memory", "cc");
 }
 
-static void __down(struct semaphore * sem)
+static void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -108,7 +109,7 @@ static void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
 {
 	might_sleep();
 	/* This atomically does:
@@ -192,7 +193,7 @@ int down_trylock(struct semaphore *sem)
 	return ret;
 }
 
-static int __down_interruptible(struct semaphore * sem)
+static int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -216,7 +217,7 @@ static int __down_interruptible(struct semaphore * sem)
 	return retval;
 }
 
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
 {
 	int ret = 0;
 	
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index ad95e88a3cbc..8faeee09fab2 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -15,6 +15,7 @@ SECTIONS
   .text 0x0000000000404000 :
   {
     *(.text)
+    SCHED_TEXT
     *(.gnu.warning)
   } =0
   _etext = .;
diff --git a/arch/sparc64/lib/rwsem.c b/arch/sparc64/lib/rwsem.c
index 8e1dfdda91fa..e19968dbc2d1 100644
--- a/arch/sparc64/lib/rwsem.c
+++ b/arch/sparc64/lib/rwsem.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/rwsem.h>
+#include <linux/init.h>
 #include <linux/module.h>
 
 extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem));
@@ -13,7 +14,7 @@ extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore
 extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *));
 extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *));
 
-void __down_read(struct rw_semaphore *sem)
+void __sched __down_read(struct rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"! beginning __down_read\n"
@@ -72,7 +73,7 @@ int __down_read_trylock(struct rw_semaphore *sem)
 }
 EXPORT_SYMBOL(__down_read_trylock);
 
-void __down_write(struct rw_semaphore *sem)
+void __sched __down_write(struct rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"! beginning __down_write\n\t"
diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c
index 5c29ae51a303..977d75772d81 100644
--- a/arch/v850/kernel/process.c
+++ b/arch/v850/kernel/process.c
@@ -203,8 +203,6 @@ int sys_execve (char *name, char **argv, char **envp, struct pt_regs *regs)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here (void);
-extern void scheduling_functions_end_here (void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -228,7 +226,6 @@ unsigned long get_wchan (struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
diff --git a/arch/v850/kernel/semaphore.c b/arch/v850/kernel/semaphore.c
index b78d714384db..2d20886863d8 100644
--- a/arch/v850/kernel/semaphore.c
+++ b/arch/v850/kernel/semaphore.c
@@ -15,6 +15,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -56,7 +57,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -89,7 +90,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S
index 028c224fa66a..07ab0f292d1c 100644
--- a/arch/v850/kernel/vmlinux.lds.S
+++ b/arch/v850/kernel/vmlinux.lds.S
@@ -64,6 +64,7 @@
 #define TEXT_CONTENTS							      \
 		__stext = . ;						      \
         	*(.text)						      \
+		SCHED_TEXT	/* sleeping functions (.sched.text) */	      \
 			*(.exit.text)	/* 2.5 convention */		      \
 			*(.text.exit)	/* 2.4 convention */		      \
 			*(.text.lock)					      \
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 7b2414765ca3..d1d9471581a8 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -576,8 +576,6 @@ asmlinkage long sys_vfork(struct pt_regs regs)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c
index 5e517814dd07..2bcd4a7ec38d 100644
--- a/arch/x86_64/kernel/semaphore.c
+++ b/arch/x86_64/kernel/semaphore.c
@@ -14,6 +14,7 @@
  */
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/errno.h>
 
 #include <asm/semaphore.h>
@@ -54,7 +55,7 @@ void __up(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -91,7 +92,7 @@ void __down(struct semaphore * sem)
 	tsk->state = TASK_RUNNING;
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 7b9e1beb360e..c612e4d213a1 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -15,6 +15,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x9090
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
index 876cb937f9f1..acc1e2ca7ed7 100644
--- a/arch/x86_64/lib/thunk.S
+++ b/arch/x86_64/lib/thunk.S
@@ -35,6 +35,7 @@
 	.endm
 	
 
+	.section .sched.text, "ax"	/* allocatable + executable, like .text */
 #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
 	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
 	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
@@ -65,7 +66,7 @@ restore_norax:
 
 #ifdef CONFIG_SMP
 /* Support for read/write spinlocks. */
-	
+	.text
 /* rax:	pointer to rwlock_t */	
 ENTRY(__write_lock_failed)
 	lock
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 59c2b950e8b8..a4b6c768cf49 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -51,3 +51,8 @@
 		*(.security_initcall.init) 				\
 		__security_initcall_end = .;				\
 	}
+
+#define SCHED_TEXT							\
+		__scheduling_functions_start_here = .;			\
+		*(.sched.text)						\
+		__scheduling_functions_end_here = .;
diff --git a/include/linux/init.h b/include/linux/init.h
index 45069e275b3d..c6842477243c 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -46,6 +46,8 @@
 #define __exitdata	__attribute__ ((__section__(".exit.data")))
 #define __exit_call	__attribute_used__ __attribute__ ((__section__ (".exitcall.exit")))
 
+#define __sched		__attribute__((__section__(".sched.text")))
+
 #ifdef MODULE
 #define __exit		__attribute__ ((__section__(".exit.text")))
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f5fa0c07a7f8..054b3c0d5962 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -170,6 +170,8 @@ extern void update_one_process(struct task_struct *p, unsigned long user,
 			       unsigned long system, int cpu);
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
+extern const unsigned long scheduling_functions_start_here;
+extern const unsigned long scheduling_functions_end_here;
 
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
diff --git a/kernel/sched.c b/kernel/sched.c
index 9e19d4c0d4a9..b42029abe679 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,6 +225,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+extern unsigned long __scheduling_functions_start_here;
+extern unsigned long __scheduling_functions_end_here;
+const unsigned long scheduling_functions_start_here =
+			(unsigned long)&__scheduling_functions_start_here;
+const unsigned long scheduling_functions_end_here =
+			(unsigned long)&__scheduling_functions_end_here;
+
 /*
  * Default context-switch locking:
  */
@@ -1587,12 +1594,10 @@ out:
 	rebalance_tick(rq, 0);
 }
 
-void scheduling_functions_start_here(void) { }
-
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	long *switch_count;
 	task_t *prev, *next;
@@ -1731,7 +1736,7 @@ EXPORT_SYMBOL(schedule);
  * off of preempt_enable.  Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
-asmlinkage void preempt_schedule(void)
+asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();
 
@@ -1869,7 +1874,7 @@ void fastcall complete_all(struct completion *x)
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
 
-void fastcall wait_for_completion(struct completion *x)
+void fastcall __sched wait_for_completion(struct completion *x)
 {
 	might_sleep();
 	spin_lock_irq(&x->wait.lock);
@@ -1907,7 +1912,7 @@ EXPORT_SYMBOL(wait_for_completion);
 	__remove_wait_queue(q, &wait);			\
 	spin_unlock_irqrestore(&q->lock, flags);
 
-void fastcall interruptible_sleep_on(wait_queue_head_t *q)
+void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR
 
@@ -1920,7 +1925,7 @@ void fastcall interruptible_sleep_on(wait_queue_head_t *q)
 
 EXPORT_SYMBOL(interruptible_sleep_on);
 
-long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR
 
@@ -1935,7 +1940,7 @@ long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
 
-void fastcall sleep_on(wait_queue_head_t *q)
+void fastcall __sched sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR
 
@@ -1948,7 +1953,7 @@ void fastcall sleep_on(wait_queue_head_t *q)
 
 EXPORT_SYMBOL(sleep_on);
 
-long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR
 
@@ -1963,8 +1968,6 @@ long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
-void scheduling_functions_end_here(void) { }
-
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
@@ -2424,7 +2427,7 @@ asmlinkage long sys_sched_yield(void)
 	return 0;
 }
 
-void __cond_resched(void)
+void __sched __cond_resched(void)
 {
 	set_current_state(TASK_RUNNING);
 	schedule();
@@ -2438,7 +2441,7 @@ EXPORT_SYMBOL(__cond_resched);
  * this is a shortcut for kernel-space yielding - it marks the
  * thread runnable and calls sys_sched_yield().
  */
-void yield(void)
+void __sched yield(void)
 {
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
@@ -2453,7 +2456,7 @@ EXPORT_SYMBOL(yield);
  * But don't do that if it is a deliberate, throttling IO wait (this task
  * has set its backing_dev_info: the queue against which it should throttle)
  */
-void io_schedule(void)
+void __sched io_schedule(void)
 {
 	struct runqueue *rq = this_rq();
 
@@ -2464,7 +2467,7 @@ void io_schedule(void)
 
 EXPORT_SYMBOL(io_schedule);
 
-long io_schedule_timeout(long timeout)
+long __sched io_schedule_timeout(long timeout)
 {
 	struct runqueue *rq = this_rq();
 	long ret;
@@ -3010,7 +3013,7 @@ EXPORT_SYMBOL(__might_sleep);
  *
  * Called inside preempt_disable().
  */
-void __preempt_spin_lock(spinlock_t *lock)
+void __sched __preempt_spin_lock(spinlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_spin_lock(lock);
@@ -3026,7 +3029,7 @@ void __preempt_spin_lock(spinlock_t *lock)
 
 EXPORT_SYMBOL(__preempt_spin_lock);
 
-void __preempt_write_lock(rwlock_t *lock)
+void __sched __preempt_write_lock(rwlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_write_lock(lock);
diff --git a/kernel/timer.c b/kernel/timer.c
index f53e0749b0d2..cbcb5522866d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -996,7 +996,7 @@ static void process_timeout(unsigned long __data)
  *
  * In all cases the return value is guaranteed to be non-negative.
  */
-fastcall signed long schedule_timeout(signed long timeout)
+fastcall signed long __sched schedule_timeout(signed long timeout)
 {
 	struct timer_list timer;
 	unsigned long expire;
@@ -1056,7 +1056,7 @@ asmlinkage long sys_gettid(void)
 	return current->pid;
 }
 
-static long nanosleep_restart(struct restart_block *restart)
+static long __sched nanosleep_restart(struct restart_block *restart)
 {
 	unsigned long expire = restart->arg0, now = jiffies;
 	struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
diff --git a/lib/rwsem.c b/lib/rwsem.c
index 95469d7fb796..85dcae7e9337 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -5,6 +5,7 @@
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <linux/module.h>
 
 struct rwsem_waiter {
@@ -162,7 +163,7 @@ static inline struct rw_semaphore *rwsem_down_failed_common(struct rw_semaphore
 /*
  * wait for the read lock to be granted
  */
-struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem)
+struct rw_semaphore fastcall __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 
@@ -178,7 +179,7 @@ struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem)
 /*
  * wait for the write lock to be granted
  */
-struct rw_semaphore fastcall *rwsem_down_write_failed(struct rw_semaphore *sem)
+struct rw_semaphore fastcall __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 
-- 
cgit v1.2.3


From ed678f13aec6fdd86c952b05200f741aa473dba8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:51:16 -0700
Subject: [PATCH] Quota locking fixes

From: Jan Kara <jack@ucw.cz>

Change the locking rules in the quota code to fix lock ordering, especially
with respect to the journal lock.  Some unnecessary spinlocking is also
removed.  The main locking changes are: dqptr_sem and dqio_sem are acquired
only after a transaction has been started, while dqonoff_sem is acquired
before a transaction is started.  This change requires some callbacks in ext3
(also implemented in this patch) to start a transaction before the locks are
acquired.
---
 fs/Kconfig               |   6 +-
 fs/dquot.c               | 204 ++++++++++++++++++++++++++---------------------
 fs/ext3/super.c          |  51 +++++++++---
 fs/inode.c               |  16 ++--
 include/linux/quotaops.h |  15 +---
 5 files changed, 165 insertions(+), 127 deletions(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index ef8e47fb1c39..c748a2ce35ee 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -417,7 +417,7 @@ config QFMT_V1
 	tristate "Old quota format support"
 	depends on QUOTA
 	help
-	  This quota format was (is) used by kernels earlier than 2.4.??. If
+	  This quota format was (is) used by kernels earlier than 2.4.22. If
 	  you have quota working and you don't want to convert to new quota
 	  format say Y here.
 
@@ -426,8 +426,8 @@ config QFMT_V2
 	depends on QUOTA
 	help
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
-	  need this functionality say Y here. Note that you will need latest
-	  quota utilities for new quota format with this kernel.
+	  need this functionality say Y here. Note that you will need recent
+	  quota utilities (>= 3.01) for new quota format with this kernel.
 
 config QUOTACTL
 	bool
diff --git a/fs/dquot.c b/fs/dquot.c
index b7b9b5c44277..e6b39e66207a 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -85,12 +85,31 @@
  * and quota formats and also dqstats structure containing statistics about the
  * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures
  * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
- * Note that we don't have to do the locking of i_blocks and i_bytes when the
- * quota is disabled - i_sem should serialize the access. dq_data_lock should
- * be always grabbed before dq_list_lock.
+ * i_blocks and i_bytes updates themselves are guarded by i_lock acquired
+ * directly in inode_add_bytes() and inode_sub_bytes().
+ *
+ * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock
  *
  * Note that some things (eg. sb pointer, type, id) doesn't change during
  * the life of the dquot structure and so needn't to be protected by a lock
+ *
+ * Any operation working on dquots via inode pointers must hold dqptr_sem.  If
+ * operation is just reading pointers from inode (or not using them at all) the
+ * read lock is enough. If pointers are altered function must hold write lock.
+ * If operation is holding reference to dquot in other way (e.g. quotactl ops)
+ * it must be guarded by dqonoff_sem.
+ * This locking assures that:
+ *   a) update/access to dquot pointers in inode is serialized
+ *   b) everyone is guarded against invalidate_dquots()
+ *
+ * Each dquot has its dq_lock semaphore. Locked dquots might not be referenced
+ * from inodes (dquot_alloc_space() and such don't check the dq_lock).
+ * Currently dquot is locked only when it is being read to memory on the first
+ * dqget(). Write operations on dquots don't hold dq_lock as they copy data
+ * under dq_data_lock spinlock to internal buffers before writing.
+ *
+ * Lock ordering (including journal_lock) is as follows:
+ *  dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > dqio_sem
  */
 spinlock_t dq_list_lock = SPIN_LOCK_UNLOCKED;
 spinlock_t dq_data_lock = SPIN_LOCK_UNLOCKED;
@@ -169,23 +188,6 @@ static void put_quota_format(struct quota_format_type *fmt)
  * mechanism to locate a specific dquot.
  */
 
-/*
- * Note that any operation which operates on dquot data (ie. dq_dqb) must
- * hold dq_data_lock.
- *
- * Any operation working with dquots must hold dqptr_sem. If operation is
- * just reading pointers from inodes than read lock is enough. If pointers
- * are altered function must hold write lock.
- *
- * Locked dquots might not be referenced in inodes. Currently dquot it locked
- * only once in its existence - when it's being read to memory on first dqget()
- * and at that time it can't be referenced from inode. Write operations on
- * dquots don't hold dquot lock as they copy data to internal buffers before
- * writing anyway and copying as well as any data update should be atomic. Also
- * nobody can change used entries in dquot structure as this is done only when
- * quota is destroyed and invalidate_dquots() is called only when dq_count == 0.
- */
-
 static LIST_HEAD(inuse_list);
 static LIST_HEAD(free_dquots);
 static struct list_head dquot_hash[NR_DQHASH];
@@ -286,9 +288,9 @@ static int commit_dqblk(struct dquot *dquot)
 }
 
 /* Invalidate all dquots on the list. Note that this function is called after
- * quota is disabled so no new quota might be created. Because we hold dqptr_sem
- * for writing and pointers were already removed from inodes we actually know that
- * no quota for this sb+type should be held. */
+ * quota is disabled so no new quota might be created. Because we hold
+ * dqonoff_sem and pointers were already removed from inodes we actually know
+ * that no quota for this sb+type should be held. */
 static void invalidate_dquots(struct super_block *sb, int type)
 {
 	struct dquot *dquot;
@@ -302,12 +304,11 @@ static void invalidate_dquots(struct super_block *sb, int type)
 			continue;
 		if (dquot->dq_type != type)
 			continue;
-#ifdef __DQUOT_PARANOIA	
-		/* There should be no users of quota - we hold dqptr_sem for writing */
+#ifdef __DQUOT_PARANOIA
 		if (atomic_read(&dquot->dq_count))
 			BUG();
 #endif
-		/* Quota now have no users and it has been written on last dqput() */
+		/* Quota now has no users and it has been written on last dqput() */
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
@@ -323,7 +324,7 @@ static int vfs_quota_sync(struct super_block *sb, int type)
 	struct quota_info *dqopt = sb_dqopt(sb);
 	int cnt;
 
-	down_read(&dqopt->dqptr_sem);
+	down(&dqopt->dqonoff_sem);
 restart:
 	/* At this point any dirty dquot will definitely be written so we can clear
 	   dirty flag from info */
@@ -359,7 +360,7 @@ restart:
 	spin_lock(&dq_list_lock);
 	dqstats.syncs++;
 	spin_unlock(&dq_list_lock);
-	up_read(&dqopt->dqptr_sem);
+	up(&dqopt->dqonoff_sem);
 
 	return 0;
 }
@@ -402,7 +403,7 @@ static int shrink_dqcache_memory(int nr, unsigned int gfp_mask)
 /*
  * Put reference to dquot
  * NOTE: If you change this function please check whether dqput_blocks() works right...
- * MUST be called with dqptr_sem held
+ * MUST be called with either dqptr_sem or dqonoff_sem held
  */
 static void dqput(struct dquot *dquot)
 {
@@ -467,7 +468,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 
 /*
  * Get reference to dquot
- * MUST be called with dqptr_sem held
+ * MUST be called with either dqptr_sem or dqonoff_sem held
  */
 static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
@@ -528,7 +529,7 @@ static int dqinit_needed(struct inode *inode, int type)
 	return 0;
 }
 
-/* This routine is guarded by dqptr_sem semaphore */
+/* This routine is guarded by dqonoff_sem semaphore */
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct list_head *p;
@@ -594,7 +595,7 @@ put_it:
 
 /* Free list of dquots - called from inode.c */
 /* dquots are removed from inodes, no new references can be got so we are the only ones holding reference */
-void put_dquot_list(struct list_head *tofree_head)
+static void put_dquot_list(struct list_head *tofree_head)
 {
 	struct list_head *act_head;
 	struct dquot *dquot;
@@ -609,6 +610,20 @@ void put_dquot_list(struct list_head *tofree_head)
 	}
 }
 
+/* Function in inode.c - remove pointers to dquots in icache */
+extern void remove_dquot_ref(struct super_block *, int, struct list_head *);
+
+/* Gather all references from inodes and drop them */
+static void drop_dquot_ref(struct super_block *sb, int type)
+{
+	LIST_HEAD(tofree_head);
+
+	down_write(&sb_dqopt(sb)->dqptr_sem);
+	remove_dquot_ref(sb, type, &tofree_head);
+	up_write(&sb_dqopt(sb)->dqptr_sem);
+	put_dquot_list(&tofree_head);
+}
+
 static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number)
 {
 	dquot->dq_dqb.dqb_curinodes += number;
@@ -804,6 +819,9 @@ void dquot_initialize(struct inode *inode, int type)
 	unsigned int id = 0;
 	int cnt;
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode))
+		return;
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	/* Having dqptr_sem we know NOQUOTA flags can't be altered... */
 	if (IS_NOQUOTA(inode)) {
@@ -831,50 +849,23 @@ void dquot_initialize(struct inode *inode, int type)
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 }
 
-/*
- *	Remove references to quota from inode
- *	This function needs dqptr_sem for writing
- */
-static void dquot_drop_iupdate(struct inode *inode, struct dquot **to_drop)
-{
-	int cnt;
-
-	inode->i_flags &= ~S_QUOTA;
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		to_drop[cnt] = inode->i_dquot[cnt];
-		inode->i_dquot[cnt] = NODQUOT;
-	}
-}
-
 /*
  * 	Release all quotas referenced by inode
+ *	A transaction must already be started on entry
  */
 void dquot_drop(struct inode *inode)
 {
-	struct dquot *to_drop[MAXQUOTAS];
 	int cnt;
 
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	dquot_drop_iupdate(inode, to_drop);
+	inode->i_flags &= ~S_QUOTA;
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt] != NODQUOT) {
+			dqput(inode->i_dquot[cnt]);
+			inode->i_dquot[cnt] = NODQUOT;
+		}
+	}
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (to_drop[cnt] != NODQUOT)
-			dqput(to_drop[cnt]);
-}
-
-/*
- *	Release all quotas referenced by inode.
- *	This function assumes dqptr_sem for writing
- */
-void dquot_drop_nolock(struct inode *inode)
-{
-	struct dquot *to_drop[MAXQUOTAS];
-	int cnt;
-
-	dquot_drop_iupdate(inode, to_drop);
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (to_drop[cnt] != NODQUOT)
-			dqput(to_drop[cnt]);
 }
 
 /*
@@ -885,11 +876,17 @@ int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode)) {
+		inode_add_bytes(inode, number);
+		return QUOTA_OK;
+	}
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = NOWARN;
 
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	spin_lock(&dq_data_lock);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode))
 		goto add_bytes;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -921,9 +918,13 @@ int dquot_alloc_inode(const struct inode *inode, unsigned long number)
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode))
+		return QUOTA_OK;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = NOWARN;
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode)) {
 		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 		return QUOTA_OK;
@@ -956,8 +957,14 @@ void dquot_free_space(struct inode *inode, qsize_t number)
 {
 	unsigned int cnt;
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode)) {
+		inode_sub_bytes(inode, number);
+		return;
+	}
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	spin_lock(&dq_data_lock);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode))
 		goto sub_bytes;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -978,7 +985,11 @@ void dquot_free_inode(const struct inode *inode, unsigned long number)
 {
 	unsigned int cnt;
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode))
+		return;
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode)) {
 		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 		return;
@@ -1007,14 +1018,20 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	    chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid;
 	char warntype[MAXQUOTAS];
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode))
+		return QUOTA_OK;
 	/* Clear the arrays */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		transfer_to[cnt] = transfer_from[cnt] = NODQUOT;
 		warntype[cnt] = NOWARN;
 	}
+	down(&sb_dqopt(inode->i_sb)->dqonoff_sem);
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? */
 		up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+		up(&sb_dqopt(inode->i_sb)->dqonoff_sem);
 		return QUOTA_OK;
 	}
 	/* First build the transfer_to list - here we can block on reading of dquots... */
@@ -1065,6 +1082,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	ret = QUOTA_OK;
 warn_put_all:
 	spin_unlock(&dq_data_lock);
+	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	flush_warnings(transfer_to, warntype);
 	
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1073,7 +1091,7 @@ warn_put_all:
 		if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT)
 			dqput(transfer_to[cnt]);
 	}
-	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	up(&sb_dqopt(inode->i_sb)->dqonoff_sem);
 	return ret;
 }
 
@@ -1121,9 +1139,6 @@ static inline void reset_enable_flags(struct quota_info *dqopt, int type)
 	}
 }
 
-/* Function in inode.c - remove pointers to dquots in icache */
-extern void remove_dquot_ref(struct super_block *, int);
-
 /*
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  */
@@ -1137,7 +1152,6 @@ int vfs_quota_off(struct super_block *sb, int type)
 
 	/* We need to serialize quota_off() for device */
 	down(&dqopt->dqonoff_sem);
-	down_write(&dqopt->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
@@ -1146,7 +1160,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 		reset_enable_flags(dqopt, cnt);
 
 		/* Note: these are blocking operations */
-		remove_dquot_ref(sb, cnt);
+		drop_dquot_ref(sb, cnt);
 		invalidate_dquots(sb, cnt);
 		/*
 		 * Now all dquots should be invalidated, all writes done so we should be only
@@ -1168,7 +1182,6 @@ int vfs_quota_off(struct super_block *sb, int type)
 		dqopt->info[cnt].dqi_bgrace = 0;
 		dqopt->ops[cnt] = NULL;
 	}
-	up_write(&dqopt->dqptr_sem);
 	up(&dqopt->dqonoff_sem);
 out:
 	return 0;
@@ -1180,7 +1193,8 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
 	struct inode *inode;
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct quota_format_type *fmt = find_quota_format(format_id);
-	int error;
+	int error, cnt;
+	struct dquot *to_drop[MAXQUOTAS];
 	unsigned int oldflags;
 
 	if (!fmt)
@@ -1202,7 +1216,6 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
 		goto out_f;
 
 	down(&dqopt->dqonoff_sem);
-	down_write(&dqopt->dqptr_sem);
 	if (sb_has_quota_enabled(sb, type)) {
 		error = -EBUSY;
 		goto out_lock;
@@ -1213,8 +1226,20 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
 	if (!fmt->qf_ops->check_quota_file(sb, type))
 		goto out_file_init;
 	/* We don't want quota and atime on quota files (deadlocks possible) */
-	dquot_drop_nolock(inode);
+	down_write(&dqopt->dqptr_sem);
 	inode->i_flags |= S_NOQUOTA | S_NOATIME;
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		to_drop[cnt] = inode->i_dquot[cnt];
+		inode->i_dquot[cnt] = NODQUOT;
+	}
+	inode->i_flags &= ~S_QUOTA;
+	up_write(&dqopt->dqptr_sem);
+	/* We must put dquots outside of dqptr_sem because we may need to
+	 * start transaction for write */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (to_drop[cnt])
+			dqput(to_drop[cnt]);
+	}
 
 	dqopt->ops[type] = fmt->qf_ops;
 	dqopt->info[type].dqi_format = fmt;
@@ -1225,7 +1250,6 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
 	}
 	up(&dqopt->dqio_sem);
 	set_enable_flags(dqopt, type);
-	up_write(&dqopt->dqptr_sem);
 
 	add_dquot_ref(sb, type);
 	up(&dqopt->dqonoff_sem);
@@ -1268,14 +1292,14 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 {
 	struct dquot *dquot;
 
-	down_read(&sb_dqopt(sb)->dqptr_sem);
+	down(&sb_dqopt(sb)->dqonoff_sem);
 	if (!(dquot = dqget(sb, id, type))) {
-		up_read(&sb_dqopt(sb)->dqptr_sem);
+		up(&sb_dqopt(sb)->dqonoff_sem);
 		return -ESRCH;
 	}
 	do_get_dqblk(dquot, di);
 	dqput(dquot);
-	up_read(&sb_dqopt(sb)->dqptr_sem);
+	up(&sb_dqopt(sb)->dqonoff_sem);
 	return 0;
 }
 
@@ -1337,14 +1361,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 {
 	struct dquot *dquot;
 
-	down_read(&sb_dqopt(sb)->dqptr_sem);
+	down(&sb_dqopt(sb)->dqonoff_sem);
 	if (!(dquot = dqget(sb, id, type))) {
-		up_read(&sb_dqopt(sb)->dqptr_sem);
+		up(&sb_dqopt(sb)->dqonoff_sem);
 		return -ESRCH;
 	}
 	do_set_dqblk(dquot, di);
 	dqput(dquot);
-	up_read(&sb_dqopt(sb)->dqptr_sem);
+	up(&sb_dqopt(sb)->dqonoff_sem);
 	return 0;
 }
 
@@ -1353,9 +1377,9 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
   
-	down_read(&sb_dqopt(sb)->dqptr_sem);
+	down(&sb_dqopt(sb)->dqonoff_sem);
 	if (!sb_has_quota_enabled(sb, type)) {
-		up_read(&sb_dqopt(sb)->dqptr_sem);
+		up(&sb_dqopt(sb)->dqonoff_sem);
 		return -ESRCH;
 	}
 	mi = sb_dqopt(sb)->info + type;
@@ -1365,7 +1389,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	ii->dqi_flags = mi->dqi_flags & DQF_MASK;
 	ii->dqi_valid = IIF_ALL;
 	spin_unlock(&dq_data_lock);
-	up_read(&sb_dqopt(sb)->dqptr_sem);
+	up(&sb_dqopt(sb)->dqonoff_sem);
 	return 0;
 }
 
@@ -1374,9 +1398,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
 
-	down_read(&sb_dqopt(sb)->dqptr_sem);
+	down(&sb_dqopt(sb)->dqonoff_sem);
 	if (!sb_has_quota_enabled(sb, type)) {
-		up_read(&sb_dqopt(sb)->dqptr_sem);
+		up(&sb_dqopt(sb)->dqonoff_sem);
 		return -ESRCH;
 	}
 	mi = sb_dqopt(sb)->info + type;
@@ -1389,7 +1413,7 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 		mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | (ii->dqi_flags & DQF_MASK);
 	mark_info_dirty(mi);
 	spin_unlock(&dq_data_lock);
-	up_read(&sb_dqopt(sb)->dqptr_sem);
+	up(&sb_dqopt(sb)->dqonoff_sem);
 	return 0;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index baf30c5045ec..e6ae6c9e0f46 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1958,6 +1958,18 @@ int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
 #define EXT3_V0_QFMT_BLOCKS 27
 
 static int (*old_write_dquot)(struct dquot *dquot);
+static void (*old_drop_dquot)(struct inode *inode);
+
+static int fmt_to_blocks(int fmt)
+{
+	switch (fmt) {
+		case QFMT_VFS_OLD:
+			return  EXT3_OLD_QFMT_BLOCKS;
+		case QFMT_VFS_V0:
+			return EXT3_V0_QFMT_BLOCKS;
+	}
+	return EXT3_MAX_TRANS_DATA;
+}
 
 static int ext3_write_dquot(struct dquot *dquot)
 {
@@ -1965,20 +1977,11 @@ static int ext3_write_dquot(struct dquot *dquot)
 	int ret;
 	int err;
 	handle_t *handle;
-	struct quota_info *dqops = sb_dqopt(dquot->dq_sb);
+	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
 	struct inode *qinode;
 
-	switch (dqops->info[dquot->dq_type].dqi_format->qf_fmt_id) {
-		case QFMT_VFS_OLD:
-			nblocks = EXT3_OLD_QFMT_BLOCKS;
-			break;
-		case QFMT_VFS_V0:
-			nblocks = EXT3_V0_QFMT_BLOCKS;
-			break;
-		default:
-			nblocks = EXT3_MAX_TRANS_DATA;
-	}
-	qinode = dqops->files[dquot->dq_type]->f_dentry->d_inode;
+	nblocks = fmt_to_blocks(dqopt->info[dquot->dq_type].dqi_format->qf_fmt_id);
+	qinode = dqopt->files[dquot->dq_type]->f_dentry->d_inode;
 	handle = ext3_journal_start(qinode, nblocks);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -1991,6 +1994,28 @@ static int ext3_write_dquot(struct dquot *dquot)
 out:
 	return ret;
 }
+
+static void ext3_drop_dquot(struct inode *inode)
+{
+	int nblocks, type;
+	struct quota_info *dqopt = sb_dqopt(inode->i_sb);
+	handle_t *handle;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (sb_has_quota_enabled(inode->i_sb, type))
+			break;
+	}
+	if (type < MAXQUOTAS)
+		nblocks = fmt_to_blocks(dqopt->info[type].dqi_format->qf_fmt_id);
+	else
+		nblocks = 0;	/* No quota => no drop */
+	handle = ext3_journal_start(inode, 2*nblocks);
+	if (IS_ERR(handle))
+		return;
+	old_drop_dquot(inode);
+	ext3_journal_stop(handle);
+	return;
+}
 #endif
 
 static struct super_block *ext3_get_sb(struct file_system_type *fs_type,
@@ -2018,7 +2043,9 @@ static int __init init_ext3_fs(void)
 #ifdef CONFIG_QUOTA
 	init_dquot_operations(&ext3_qops);
 	old_write_dquot = ext3_qops.write_dquot;
+	old_drop_dquot = ext3_qops.drop;
 	ext3_qops.write_dquot = ext3_write_dquot;
+	ext3_qops.drop = ext3_drop_dquot;
 #endif
         err = register_filesystem(&ext3_fs_type);
 	if (err)
diff --git a/fs/inode.c b/fs/inode.c
index 01c5740aacdd..d367d4629f3e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1216,15 +1216,13 @@ EXPORT_SYMBOL(inode_needs_sync);
  */
 #ifdef CONFIG_QUOTA
 
-/* Functions back in dquot.c */
-void put_dquot_list(struct list_head *);
+/* Function back in dquot.c */
 int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
 
-void remove_dquot_ref(struct super_block *sb, int type)
+void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head)
 {
 	struct inode *inode;
 	struct list_head *act_head;
-	LIST_HEAD(tofree_head);
 
 	if (!sb->dq_op)
 		return;	/* nothing to do */
@@ -1234,26 +1232,24 @@ void remove_dquot_ref(struct super_block *sb, int type)
 	list_for_each(act_head, &inode_in_use) {
 		inode = list_entry(act_head, struct inode, i_list);
 		if (inode->i_sb == sb && IS_QUOTAINIT(inode))
-			remove_inode_dquot_ref(inode, type, &tofree_head);
+			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
 	list_for_each(act_head, &inode_unused) {
 		inode = list_entry(act_head, struct inode, i_list);
 		if (inode->i_sb == sb && IS_QUOTAINIT(inode))
-			remove_inode_dquot_ref(inode, type, &tofree_head);
+			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
 	list_for_each(act_head, &sb->s_dirty) {
 		inode = list_entry(act_head, struct inode, i_list);
 		if (IS_QUOTAINIT(inode))
-			remove_inode_dquot_ref(inode, type, &tofree_head);
+			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
 	list_for_each(act_head, &sb->s_io) {
 		inode = list_entry(act_head, struct inode, i_list);
 		if (IS_QUOTAINIT(inode))
-			remove_inode_dquot_ref(inode, type, &tofree_head);
+			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
 	spin_unlock(&inode_lock);
-
-	put_dquot_list(&tofree_head);
 }
 
 #endif
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 155c9a2af016..e5a9e6bed751 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -64,11 +64,8 @@ static __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t
 		if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA)
 			return 1;
 	}
-	else {
-		spin_lock(&dq_data_lock);
+	else
 		inode_add_bytes(inode, nr);
-		spin_unlock(&dq_data_lock);
-	}
 	return 0;
 }
 
@@ -87,11 +84,8 @@ static __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
 		if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA)
 			return 1;
 	}
-	else {
-		spin_lock(&dq_data_lock);
+	else
 		inode_add_bytes(inode, nr);
-		spin_unlock(&dq_data_lock);
-	}
 	return 0;
 }
 
@@ -117,11 +111,8 @@ static __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
 {
 	if (sb_any_quota_enabled(inode->i_sb))
 		inode->i_sb->dq_op->free_space(inode, nr);
-	else {
-		spin_lock(&dq_data_lock);
+	else
 		inode_sub_bytes(inode, nr);
-		spin_unlock(&dq_data_lock);
-	}
 }
 
 static __inline__ void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr)
-- 
cgit v1.2.3


From 94b1c3ebf78bd58c2f45b78f2c24c7c939c34a9e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:52:32 -0700
Subject: [PATCH] knfsd: Remove name_lookup.h that no one is using anymore.

From: NeilBrown <neilb@cse.unsw.edu.au>
---
 include/linux/sunrpc/name_lookup.h | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 include/linux/sunrpc/name_lookup.h

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/name_lookup.h b/include/linux/sunrpc/name_lookup.h
deleted file mode 100644
index 0c97ec324ada..000000000000
--- a/include/linux/sunrpc/name_lookup.h
+++ /dev/null
@@ -1,38 +0,0 @@
-
-/*
- * map between user/group name and id for a given 'client' 
- */
-
-struct name_ent {
-	char name[20];
-};
-static inline int name_get_user(int uid, struct name_ent **namep)
-{
-	struct name_ent *n = kmalloc(sizeof(*n),GFP_KERNEL);
-	if (n) sprintf(n->name, "%d",uid);
-	*namep = n;
-	return n ? 0 : -ENOMEM;
-}
-static inline int name_get_group(int uid, struct name_ent **namep)
-{
-	struct name_ent *n = kmalloc(sizeof(*n),GFP_KERNEL);
-	if (n) sprintf(n->name, "%d",uid);
-	*namep = n;
-	return n ? 0 : -ENOMEM;
-}
-static inline int name_get_uid(char *name, int name_len, int *uidp)
-{
-	*uidp = simple_strtoul(name, NULL, 0);
-	return 0;
-}
-
-static inline int name_get_gid(char *name, int name_len, int *gidp)
-{
-	*gidp = simple_strtoul(name, NULL, 0);
-	return 0;
-}
-
-static inline void name_put(struct name_ent *ent) 
-{
-	kfree(ent);
-}
-- 
cgit v1.2.3


From c02c0886973521cd77904d8f07aa98d99c63cb3b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:52:44 -0700
Subject: [PATCH] knfsd: Add server-side support for the nfsv4
 mounted_on_fileid attribute.

From: NeilBrown <neilb@cse.unsw.edu.au>
---
 fs/nfsd/nfs4xdr.c         | 11 +++++++++++
 include/linux/nfs4.h      |  1 +
 include/linux/nfsd/nfsd.h |  2 +-
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d19b1c6b7f45..8908bfc17184 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1588,7 +1588,18 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 		WRITE32(stat.mtime.tv_sec);
 		WRITE32(stat.mtime.tv_nsec);
 	}
+	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
+		struct dentry *mnt_pnt, *mnt_root;
 
+		if ((buflen -= 8) < 0)
+                	goto out_resource;
+		mnt_root = exp->ex_mnt->mnt_root;
+		if (mnt_root->d_inode == dentry->d_inode) {
+			mnt_pnt = exp->ex_mnt->mnt_mountpoint;
+			WRITE64((u64) mnt_pnt->d_inode->i_ino);
+		} else
+                	WRITE64((u64) stat.ino);
+	}
 	*attrlenp = htonl((char *)p - (char *)attrlenp - 4);
 	*countp = p - buffer;
 	status = nfs_ok;
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index e8ea2239a213..520545881a52 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -274,6 +274,7 @@ enum lock_type4 {
 #define FATTR4_WORD1_TIME_METADATA      (1 << 20)
 #define FATTR4_WORD1_TIME_MODIFY        (1 << 21)
 #define FATTR4_WORD1_TIME_MODIFY_SET    (1 << 22)
+#define FATTR4_WORD1_MOUNTED_ON_FILEID  (1 << 23)
 
 #define NFSPROC4_NULL 0
 #define NFSPROC4_COMPOUND 1
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 6e6a66208308..418356558209 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -278,7 +278,7 @@ static inline int is_fsid(struct svc_fh *fh, struct knfsd_fh *reffh)
  | FATTR4_WORD1_SPACE_AVAIL     | FATTR4_WORD1_SPACE_FREE   | FATTR4_WORD1_SPACE_TOTAL      \
  | FATTR4_WORD1_SPACE_USED      | FATTR4_WORD1_TIME_ACCESS  | FATTR4_WORD1_TIME_ACCESS_SET  \
  | FATTR4_WORD1_TIME_CREATE     | FATTR4_WORD1_TIME_DELTA   | FATTR4_WORD1_TIME_METADATA    \
- | FATTR4_WORD1_TIME_MODIFY     | FATTR4_WORD1_TIME_MODIFY_SET)
+ | FATTR4_WORD1_TIME_MODIFY     | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
 
 /* These will return ERR_INVAL if specified in GETATTR or READDIR. */
 #define NFSD_WRITEONLY_ATTRS_WORD1							    \
-- 
cgit v1.2.3


From 238a06e203a96960843faec4ec8f553f453082b9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:53:09 -0700
Subject: [PATCH] knfsd: Export a symbol needed by auth_gss

From: NeilBrown <neilb@cse.unsw.edu.au>

From: "J. Bruce Fields" <bfields@fieldses.org>

Without this compiling auth_gss as module fails.
---
 include/linux/sunrpc/xdr.h | 1 +
 net/sunrpc/sunrpc_syms.c   | 1 +
 net/sunrpc/xdr.c           | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 0ccaff2cdee2..2b334dc19962 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -145,6 +145,7 @@ extern void _copy_from_pages(char *, struct page **, size_t, size_t);
 extern void xdr_buf_from_iov(struct iovec *, struct xdr_buf *);
 extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, int, int);
 extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, int);
+extern int read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len);
 
 /*
  * Helper structure for copying from an sk_buff.
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 9061f6498cc4..1ae41edbb0f1 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -134,6 +134,7 @@ EXPORT_SYMBOL(xdr_read_pages);
 EXPORT_SYMBOL(xdr_buf_from_iov);
 EXPORT_SYMBOL(xdr_buf_subsegment);
 EXPORT_SYMBOL(xdr_buf_read_netobj);
+EXPORT_SYMBOL(read_bytes_from_xdr_buf);
 
 /* Debugging symbols */
 #ifdef RPC_DEBUG
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index accfdd9284df..cae451e8db8d 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -799,7 +799,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
 }
 
 /* obj is assumed to point to allocated memory of size at least len: */
-static int
+int
 read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len)
 {
 	struct xdr_buf subbuf;
-- 
cgit v1.2.3


From 9abdc6608d7c5e3cb09c05bd6c726d04dc59ace4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:53:24 -0700
Subject: [PATCH] knfsd: Add data integrity to server side gss

From: NeilBrown <neilb@cse.unsw.edu.au>

From: "J. Bruce Fields" <bfields@fieldses.org>

rpcsec_gss supports three security levels:

1.  authentication only: sign the header of each rpc request and response.

2. integrity: sign the header and body of each rpc request and response.

3.  privacy: sign the header and encrypt the body of each rpc request and
   response.

The first 2 are already supported on the client; this adds integrity support
on the server.
---
 include/linux/sunrpc/svcauth_gss.h  |   9 --
 net/sunrpc/auth_gss/gss_krb5_mech.c |   2 +
 net/sunrpc/auth_gss/svcauth_gss.c   | 172 ++++++++++++++++++++++++++++++++++--
 3 files changed, 168 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h
index 73ca6ef2c4a8..a444c9edb9e9 100644
--- a/include/linux/sunrpc/svcauth_gss.h
+++ b/include/linux/sunrpc/svcauth_gss.h
@@ -22,14 +22,5 @@
 int gss_svc_init(void);
 int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name);
 
-
-struct gss_svc_data {
-	/* decoded gss client cred: */
-	struct rpc_gss_wire_cred	clcred;
-	/* pointer to the beginning of the procedure-specific results, which
-	 * may be encrypted/checksummed in svcauth_gss_release: */
-	u32				*body_start;
-};
-
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 42ceee1907d7..57c074a06970 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -236,6 +236,8 @@ static int __init init_kerberos_module(void)
 	gss_register_triple(RPC_AUTH_GSS_KRB5I, gm, 0, RPC_GSS_SVC_INTEGRITY);
 	if (svcauth_gss_register_pseudoflavor(RPC_AUTH_GSS_KRB5, "krb5"))
 		printk("Failed to register %s with server!\n", "krb5");
+	if (svcauth_gss_register_pseudoflavor(RPC_AUTH_GSS_KRB5I, "krb5i"))
+		printk("Failed to register %s with server!\n", "krb5i");
 	gss_mech_put(gm);
 	return 0;
 }
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 9e13aaa2bc79..2277667d3397 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -670,6 +670,68 @@ out:
 	return stat;
 }
 
+static inline int
+read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
+{
+	u32     raw;
+	int     status;
+
+	status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj));
+	if (status)
+		return status;
+	*obj = ntohl(raw);
+	return 0;
+}
+
+/* It would be nice if this bit of code could be shared with the client.
+ * Obstacles:
+ *	The client shouldn't malloc(), would have to pass in own memory.
+ *	The server uses base of head iovec as read pointer, while the
+ *	client uses separate pointer. */
+static int
+unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
+{
+	int stat = -EINVAL;
+	u32 integ_len, maj_stat;
+	struct xdr_netobj mic;
+	struct xdr_buf integ_buf;
+
+	integ_len = ntohl(svc_getu32(&buf->head[0]));
+	if (integ_len & 3)
+		goto out;
+	if (integ_len > buf->len)
+		goto out;
+	if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len))
+		BUG();
+	/* copy out mic... */
+	if (read_u32_from_xdr_buf(buf, integ_len, &mic.len))
+		BUG();
+	if (mic.len > RPC_MAX_AUTH_SIZE)
+		goto out;
+	mic.data = kmalloc(mic.len, GFP_KERNEL);
+	if (!mic.data)
+		goto out;
+	if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len))
+		goto out;
+	maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL);
+	if (maj_stat != GSS_S_COMPLETE)
+		goto out;
+	if (ntohl(svc_getu32(&buf->head[0])) != seq)
+		goto out;
+	stat = 0;
+out:
+	return stat;
+}
+
+struct gss_svc_data {
+	/* decoded gss client cred: */
+	struct rpc_gss_wire_cred	clcred;
+	/* pointer to the beginning of the procedure-specific results,
+	 * which may be encrypted/checksummed in svcauth_gss_release: */
+	u32				*body_start;
+	struct rsc			*rsci;
+};
+
 /*
  * Accept an rpcsec packet.
  * If context establishment, punt to user space
@@ -701,6 +763,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp)
 	if (!svcdata)
 		goto auth_err;
 	rqstp->rq_auth_data = svcdata;
+	svcdata->body_start = 0;
+	svcdata->rsci = NULL;
 	gc = &svcdata->clcred;
 
 	/* start of rpc packet is 7 u32's back from here:
@@ -754,9 +818,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp)
 		break;
 	case RPC_GSS_PROC_DATA:
 	case RPC_GSS_PROC_DESTROY:
-		/* integrity and privacy unsupported: */
-		if (gc->gc_svc != RPC_GSS_SVC_NONE)
-			goto auth_err;
 		*authp = rpcsec_gsserr_credproblem;
 		rsci = gss_svc_searchbyctx(&gc->gc_ctx);
 		if (!rsci)
@@ -841,10 +902,28 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp)
 		*authp = rpcsec_gsserr_ctxproblem;
 		if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
 			goto auth_err;
-		/* For use when wrapping: */
-		svcdata->body_start = resv->iov_base + 1;
 		rqstp->rq_cred = rsci->cred;
 		get_group_info(rsci->cred.cr_group_info);
+		*authp = rpc_autherr_badcred;
+		switch (gc->gc_svc) {
+		case RPC_GSS_SVC_NONE:
+			break;
+		case RPC_GSS_SVC_INTEGRITY:
+			if (unwrap_integ_data(&rqstp->rq_arg,
+					gc->gc_seq, rsci->mechctx))
+				goto auth_err;
+			svcdata->rsci = rsci;
+			cache_get(&rsci->h);
+			/* placeholders for length and seq. number: */
+			svcdata->body_start = resv->iov_base + resv->iov_len;
+			svc_putu32(resv, 0);
+			svc_putu32(resv, 0);
+			break;
+		case RPC_GSS_SVC_PRIVACY:
+			/* currently unsupported */
+		default:
+			goto auth_err;
+		}
 		ret = SVC_OK;
 		goto out;
 	}
@@ -867,14 +946,95 @@ out:
 static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
+	struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+	struct rpc_gss_wire_cred *gc = &gsd->clcred;
+	struct xdr_buf *resbuf = &rqstp->rq_res;
+	struct xdr_buf integ_buf;
+	struct xdr_netobj mic;
+	struct iovec *resv;
+	u32 *p;
+	int integ_offset, integ_len;
+	int stat = -EINVAL;
+
+	if (gc->gc_proc != RPC_GSS_PROC_DATA)
+		goto out;
+	/* Release can be called twice, but we only wrap once. */
+	if (gsd->body_start == 0)
+		goto out;
+	/* normally not set till svc_send, but we need it here: */
+	resbuf->len = resbuf->head[0].iov_len
+		+ resbuf->page_len + resbuf->tail[0].iov_len;
+	switch (gc->gc_svc) {
+	case RPC_GSS_SVC_NONE:
+		break;
+	case RPC_GSS_SVC_INTEGRITY:
+		p = gsd->body_start;
+		gsd->body_start = 0;
+		/* move accept_stat to right place: */
+		memcpy(p, p + 2, 4);
+		/* don't wrap in failure case: */
+		/* Note: counting on not getting here if call was not even
+		 * accepted! */
+		if (*p != rpc_success) {
+			resbuf->head[0].iov_len -= 2 * 4;
+			goto out;
+		}
+		p++;
+		integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
+		integ_len = resbuf->len - integ_offset;
+		BUG_ON(integ_len % 4);
+		*p++ = htonl(integ_len);
+		*p++ = htonl(gc->gc_seq);
+		if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
+					integ_len))
+			BUG();
+		if (resbuf->page_len == 0
+			&& resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE
+				< PAGE_SIZE) {
+			BUG_ON(resbuf->tail[0].iov_len);
+			/* Use head for everything */
+			resv = &resbuf->head[0];
+		} else if (resbuf->tail[0].iov_base == NULL) {
+			/* copied from nfsd4_encode_read */
+			svc_take_page(rqstp);
+			resbuf->tail[0].iov_base = page_address(rqstp
+					->rq_respages[rqstp->rq_resused-1]);
+			rqstp->rq_restailpage = rqstp->rq_resused-1;
+			resbuf->tail[0].iov_len = 0;
+			resv = &resbuf->tail[0];
+		} else {
+			resv = &resbuf->tail[0];
+		}
+		mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
+		if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic))
+			goto out_err;
+		svc_putu32(resv, htonl(mic.len));
+		memset(mic.data + mic.len, 0,
+				round_up_to_quad(mic.len) - mic.len);
+		resv->iov_len += XDR_QUADLEN(mic.len) << 2;
+		/* not strictly required: */
+		resbuf->len += XDR_QUADLEN(mic.len) << 2;
+		BUG_ON(resv->iov_len > PAGE_SIZE);
+		break;
+	case RPC_GSS_SVC_PRIVACY:
+	default:
+		goto out_err;
+	}
+
+out:
+	stat = 0;
+out_err:
 	if (rqstp->rq_client)
 		auth_domain_put(rqstp->rq_client);
 	rqstp->rq_client = NULL;
 	if (rqstp->rq_cred.cr_group_info)
 		put_group_info(rqstp->rq_cred.cr_group_info);
 	rqstp->rq_cred.cr_group_info = NULL;
+	if (gsd->rsci)
+		rsc_put(&gsd->rsci->h, &rsc_cache);
+	gsd->rsci = NULL;
 
-	return 0;
+	return stat;
 }
 
 static void
-- 
cgit v1.2.3


From c334f752d8e9d3847d4459d06f7544dea9a49923 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:53:50 -0700
Subject: [PATCH] posix message queues: code move

From: Manfred Spraul <manfred@colorfullife.com>

cleanup of sysv ipc as a preparation for posix message queues:

- replace !CONFIG_SYSVIPC wrappers for copy_semundo and exit_sem with
  static inline wrappers.  Now the whole ipc/util.c file is only used if
  CONFIG_SYSVIPC is set, use makefile magic instead of #ifdef.

- remove the prototypes for copy_semundo and exit_sem from kernel/fork.c;
  they belong in a header file.

- create a new msgutil.c with the helper functions for message queues.

- cleanup the helper functions: run Lindent, add __user tags.
---
 include/linux/msg.h |   3 --
 include/linux/sem.h |  17 ++++++-
 ipc/Makefile        |   4 +-
 ipc/msg.c           | 105 -------------------------------------------
 ipc/msgutil.c       | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 ipc/util.c          |  19 --------
 ipc/util.h          |  10 +++++
 kernel/fork.c       |   4 +-
 8 files changed, 155 insertions(+), 134 deletions(-)
 create mode 100644 ipc/msgutil.c

(limited to 'include/linux')

diff --git a/include/linux/msg.h b/include/linux/msg.h
index b235e862a3dd..2c4c6aa643ff 100644
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -74,9 +74,6 @@ struct msg_msg {
 	/* the actual message follows immediately */
 };
 
-#define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
-#define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
-
 /* one msq_queue structure for each present queue on the system */
 struct msg_queue {
 	struct kern_ipc_perm q_perm;
diff --git a/include/linux/sem.h b/include/linux/sem.h
index b337c509ac29..aaf45764a56e 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -134,7 +134,22 @@ struct sysv_sem {
 	struct sem_undo_list *undo_list;
 };
 
-void exit_sem(struct task_struct *p);
+#ifdef CONFIG_SYSVIPC
+
+extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
+extern void exit_sem(struct task_struct *tsk);
+
+#else
+static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+{
+	return 0;
+}
+
+static inline void exit_sem(struct task_struct *tsk)
+{
+	return;
+}
+#endif
 
 #endif /* __KERNEL__ */
 
diff --git a/ipc/Makefile b/ipc/Makefile
index ccc6c64c2493..6cd32a30f03f 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -2,7 +2,5 @@
 # Makefile for the linux ipc.
 #
 
-obj-y   := util.o
-
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
-obj-$(CONFIG_SYSVIPC) += msg.o sem.o shm.o
+obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
diff --git a/ipc/msg.c b/ipc/msg.c
index 709ff71bf5c1..37e2d3bb17cb 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -51,11 +51,6 @@ struct msg_sender {
 	struct task_struct* tsk;
 };
 
-struct msg_msgseg {
-	struct msg_msgseg* next;
-	/* the next part of the message follows immediately */
-};
-
 #define SEARCH_ANY		1
 #define SEARCH_EQUAL		2
 #define SEARCH_NOTEQUAL		3
@@ -129,106 +124,6 @@ static int newque (key_t key, int msgflg)
 	return msg_buildid(id,msq->q_perm.seq);
 }
 
-static void free_msg(struct msg_msg* msg)
-{
-	struct msg_msgseg* seg;
-
-	security_msg_msg_free(msg);
-
-	seg = msg->next;
-	kfree(msg);
-	while(seg != NULL) {
-		struct msg_msgseg* tmp = seg->next;
-		kfree(seg);
-		seg = tmp;
-	}
-}
-
-static struct msg_msg* load_msg(void* src, int len)
-{
-	struct msg_msg* msg;
-	struct msg_msgseg** pseg;
-	int err;
-	int alen;
-
-	alen = len;
-	if(alen > DATALEN_MSG)
-		alen = DATALEN_MSG;
-
-	msg = (struct msg_msg *) kmalloc (sizeof(*msg) + alen, GFP_KERNEL);
-	if(msg==NULL)
-		return ERR_PTR(-ENOMEM);
-
-	msg->next = NULL;
-	msg->security = NULL;
-
-	if (copy_from_user(msg+1, src, alen)) {
-		err = -EFAULT;
-		goto out_err;
-	}
-
-	len -= alen;
-	src = ((char*)src)+alen;
-	pseg = &msg->next;
-	while(len > 0) {
-		struct msg_msgseg* seg;
-		alen = len;
-		if(alen > DATALEN_SEG)
-			alen = DATALEN_SEG;
-		seg = (struct msg_msgseg *) kmalloc (sizeof(*seg) + alen, GFP_KERNEL);
-		if(seg==NULL) {
-			err=-ENOMEM;
-			goto out_err;
-		}
-		*pseg = seg;
-		seg->next = NULL;
-		if(copy_from_user (seg+1, src, alen)) {
-			err = -EFAULT;
-			goto out_err;
-		}
-		pseg = &seg->next;
-		len -= alen;
-		src = ((char*)src)+alen;
-	}
-	
-	err = security_msg_msg_alloc(msg);
-	if (err)
-		goto out_err;
-
-	return msg;
-
-out_err:
-	free_msg(msg);
-	return ERR_PTR(err);
-}
-
-static int store_msg(void* dest, struct msg_msg* msg, int len)
-{
-	int alen;
-	struct msg_msgseg *seg;
-
-	alen = len;
-	if(alen > DATALEN_MSG)
-		alen = DATALEN_MSG;
-	if(copy_to_user (dest, msg+1, alen))
-		return -1;
-
-	len -= alen;
-	dest = ((char*)dest)+alen;
-	seg = msg->next;
-	while(len > 0) {
-		alen = len;
-		if(alen > DATALEN_SEG)
-			alen = DATALEN_SEG;
-		if(copy_to_user (dest, seg+1, alen))
-			return -1;
-		len -= alen;
-		dest = ((char*)dest)+alen;
-		seg=seg->next;
-	}
-	return 0;
-}
-
 static inline void ss_add(struct msg_queue* msq, struct msg_sender* mss)
 {
 	mss->tsk=current;
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
new file mode 100644
index 000000000000..e48d777de2a3
--- /dev/null
+++ b/ipc/msgutil.c
@@ -0,0 +1,127 @@
+/*
+ * linux/ipc/msgutil.c
+ * Copyright (C) 1999, 2004 Manfred Spraul
+ *
+ * This file is released under GNU General Public Licence version 2 or
+ * (at your option) any later version.
+ *
+ * See the file COPYING for more details.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/ipc.h>
+#include <asm/uaccess.h>
+
+#include "util.h"
+
+struct msg_msgseg {
+	struct msg_msgseg* next;
+	/* the next part of the message follows immediately */
+};
+
+#define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
+#define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
+
+struct msg_msg *load_msg(void __user *src, int len)
+{
+	struct msg_msg *msg;
+	struct msg_msgseg **pseg;
+	int err;
+	int alen;
+
+	alen = len;
+	if (alen > DATALEN_MSG)
+		alen = DATALEN_MSG;
+
+	msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
+	if (msg == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	msg->next = NULL;
+	msg->security = NULL;
+
+	if (copy_from_user(msg + 1, src, alen)) {
+		err = -EFAULT;
+		goto out_err;
+	}
+
+	len -= alen;
+	src = ((char *)src) + alen;
+	pseg = &msg->next;
+	while (len > 0) {
+		struct msg_msgseg *seg;
+		alen = len;
+		if (alen > DATALEN_SEG)
+			alen = DATALEN_SEG;
+		seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen,
+						 GFP_KERNEL);
+		if (seg == NULL) {
+			err = -ENOMEM;
+			goto out_err;
+		}
+		*pseg = seg;
+		seg->next = NULL;
+		if (copy_from_user(seg + 1, src, alen)) {
+			err = -EFAULT;
+			goto out_err;
+		}
+		pseg = &seg->next;
+		len -= alen;
+		src = ((char *)src) + alen;
+	}
+
+	err = security_msg_msg_alloc(msg);
+	if (err)
+		goto out_err;
+
+	return msg;
+
+out_err:
+	free_msg(msg);
+	return ERR_PTR(err);
+}
+
+int store_msg(void __user *dest, struct msg_msg *msg, int len)
+{
+	int alen;
+	struct msg_msgseg *seg;
+
+	alen = len;
+	if (alen > DATALEN_MSG)
+		alen = DATALEN_MSG;
+	if (copy_to_user(dest, msg + 1, alen))
+		return -1;
+
+	len -= alen;
+	dest = ((char *)dest) + alen;
+	seg = msg->next;
+	while (len > 0) {
+		alen = len;
+		if (alen > DATALEN_SEG)
+			alen = DATALEN_SEG;
+		if (copy_to_user(dest, seg + 1, alen))
+			return -1;
+		len -= alen;
+		dest = ((char *)dest) + alen;
+		seg = seg->next;
+	}
+	return 0;
+}
+
+void free_msg(struct msg_msg *msg)
+{
+	struct msg_msgseg *seg;
+
+	security_msg_msg_free(msg);
+
+	seg = msg->next;
+	kfree(msg);
+	while (seg != NULL) {
+		struct msg_msgseg *tmp = seg->next;
+		kfree(seg);
+		seg = tmp;
+	}
+}
diff --git a/ipc/util.c b/ipc/util.c
index 6d94883edae0..f74c5eef57d0 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -25,8 +25,6 @@
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
 
-#if defined(CONFIG_SYSVIPC)
-
 #include "util.h"
 
 /**
@@ -531,20 +529,3 @@ int ipc_parse_version (int *cmd)
 }
 
 #endif /* __ia64__ */
-
-#else
-/*
- * Dummy functions when SYSV IPC isn't configured
- */
-
-int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
-{
-	return 0;
-}
-
-void exit_sem(struct task_struct *tsk)
-{
-	return;
-}
-
-#endif /* CONFIG_SYSVIPC */
diff --git a/ipc/util.h b/ipc/util.h
index 79c8fc901317..e6434942c097 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -4,6 +4,10 @@
  *
  * ipc helper functions (c) 1999 Manfred Spraul <manfreds@colorfullife.com>
  */
+
+#ifndef _IPC_UTIL_H
+#define _IPC_UTIL_H
+
 #define USHRT_MAX 0xffff
 #define SEQ_MULTIPLIER	(IPCMNI)
 
@@ -62,3 +66,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
 #else
 int ipc_parse_version (int *cmd);
 #endif
+
+extern void free_msg(struct msg_msg *msg);
+extern struct msg_msg *load_msg(void __user *src, int len);
+extern int store_msg(void __user *dest, struct msg_msg *msg, int len);
+
+#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b17a249c50d..a1f20cabbdd3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -21,6 +21,7 @@
 #include <linux/completion.h>
 #include <linux/namespace.h>
 #include <linux/personality.h>
+#include <linux/sem.h>
 #include <linux/file.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
@@ -39,9 +40,6 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
-extern void exit_sem(struct task_struct *tsk);
-
 /* The idle threads do not count..
  * Protected by write_lock_irq(&tasklist_lock)
  */
-- 
cgit v1.2.3


From c50142a5433ed504fff2b1af152f8f7628830dfb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:54:03 -0700
Subject: [PATCH] posix message queues: syscall stubs

From: Manfred Spraul <manfred@colorfullife.com>

Add -ENOSYS stubs for the posix message queue syscalls.  The API is a direct
mapping of the api from the unix spec, with two exceptions:

- mq_close() doesn't exist.  Message queue file descriptors can be closed
  with close().

- mq_notify(SIGEV_THREAD) cannot be implemented in the kernel.  The kernel
  returns a pollable file descriptor.  User space must poll (or read) this
  descriptor and call the notifier function if the file descriptor is
  signaled.
---
 arch/i386/kernel/entry.S  |  9 +++++++++
 include/asm-i386/unistd.h | 11 ++++++++++-
 include/linux/mqueue.h    | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/syscalls.h  |  9 +++++++++
 kernel/sys.c              |  6 ++++++
 5 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/mqueue.h

(limited to 'include/linux')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3024740ba84c..14e64d3ea25c 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -882,5 +882,14 @@ ENTRY(sys_call_table)
 	.long sys_utimes
  	.long sys_fadvise64_64
 	.long sys_ni_syscall	/* sys_vserver */
+	.long sys_ni_syscall	/* sys_mbind */
+	.long sys_ni_syscall	/* 275 sys_get_mempolicy */
+	.long sys_ni_syscall	/* sys_set_mempolicy */
+	.long sys_mq_open
+	.long sys_mq_unlink
+	.long sys_mq_timedsend
+	.long sys_mq_timedreceive	/* 280 */
+	.long sys_mq_notify
+	.long sys_mq_getsetattr
 
 syscall_table_size=(.-sys_call_table)
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index a2d58a99491e..620a232084f3 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -279,8 +279,17 @@
 #define __NR_utimes		271
 #define __NR_fadvise64_64	272
 #define __NR_vserver		273
+#define __NR_mbind		274
+#define __NR_get_mempolicy	275
+#define __NR_set_mempolicy	276
+#define __NR_mq_open 		277
+#define __NR_mq_unlink		(__NR_mq_open+1)
+#define __NR_mq_timedsend	(__NR_mq_open+2)
+#define __NR_mq_timedreceive	(__NR_mq_open+3)
+#define __NR_mq_notify		(__NR_mq_open+4)
+#define __NR_mq_getsetattr	(__NR_mq_open+5)
 
-#define NR_syscalls 274
+#define NR_syscalls 283
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
new file mode 100644
index 000000000000..c0c5fcc89f0e
--- /dev/null
+++ b/include/linux/mqueue.h
@@ -0,0 +1,36 @@
+/* Copyright (C) 2003 Krzysztof Benedyczak & Michal Wronski
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   It is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this software; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef _LINUX_MQUEUE_H
+#define _LINUX_MQUEUE_H
+
+#define MQ_PRIO_MAX 	32768
+
+typedef int mqd_t;
+
+struct mq_attr {
+	long	mq_flags;	/* message queue flags			*/
+	long	mq_maxmsg;	/* maximum number of messages		*/
+	long	mq_msgsize;	/* maximum message size			*/
+	long	mq_curmsgs;	/* number of messages currently queued	*/
+};
+
+#define NOTIFY_NONE	0
+#define NOTIFY_WOKENUP	1
+#define NOTIFY_REMOVED	2
+
+#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index aaf87aeacafb..7ee5f67abb5f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -48,6 +48,8 @@ struct timex;
 struct timezone;
 struct tms;
 struct utimbuf;
+typedef int mqd_t;
+struct mq_attr;
 
 #include <linux/config.h>
 #include <linux/types.h>
@@ -450,6 +452,13 @@ asmlinkage long sys_shmget(key_t key, size_t size, int flag);
 asmlinkage long sys_shmdt(char __user *shmaddr);
 asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
 
+asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr);
+asmlinkage long sys_mq_unlink(const char __user *name);
+asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
+asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
+asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
+asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
+
 asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn);
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
 				unsigned long off, unsigned long len,
diff --git a/kernel/sys.c b/kernel/sys.c
index bc498b12edcc..7d1bf5c57aca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -260,6 +260,12 @@ cond_syscall(sys_msgctl)
 cond_syscall(sys_shmget)
 cond_syscall(sys_shmdt)
 cond_syscall(sys_shmctl)
+cond_syscall(sys_mq_open)
+cond_syscall(sys_mq_unlink)
+cond_syscall(sys_mq_timedsend)
+cond_syscall(sys_mq_timedreceive)
+cond_syscall(sys_mq_notify)
+cond_syscall(sys_mq_getsetattr)
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read)
-- 
cgit v1.2.3


From f3ca8d5dd5c23594bda07893ae374bed7981d473 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:54:54 -0700
Subject: [PATCH] posix message queue update

From: Manfred Spraul <manfred@colorfullife.com>

My discussion with Ulrich had one result:

- mq_setattr can accept implementation defined flags.  Right now we have
  none, but we might add some later (e.g.  switch to CLOCK_MONOTONIC for
  mq_timed{send,receive} or something similar).  When we add flags, we
  might need the fields for additional information.  And they don't hurt.
  Therefore add four __reserved fields to mq_attr.

- fail mq_setattr if we get unknown flags - otherwise glibc can't detect
  if it's running on a future kernel that supports new features.

- use memset to initialize the mq_attr structure - theoretically we could
  leak kernel memory.

- Only set O_NONBLOCK in mq_attr, explicitly clear O_RDWR & friends.
  openposix uses getattr, attr |=O_NONBLOCK, setattr - a sane approach.
  Without clearing O_RDWR, this fails.

I've retested all openposix conformance tests with the new patch - the two
new FAILED tests check undefined behavior.  Note that I won't have net
access until Sunday - if the message queue patch breaks something important
either ask Krzysztof or drop it.

Ulrich had another good idea for SIGEV_THREAD, but I must think about it.
It would mean less complexity in glibc, but more code in the kernel.  I'm
not yet convinced that it's overall better.
---
 include/linux/mqueue.h | 1 +
 ipc/mqueue.c           | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
index c0c5fcc89f0e..535fe4b2f14b 100644
--- a/include/linux/mqueue.h
+++ b/include/linux/mqueue.h
@@ -27,6 +27,7 @@ struct mq_attr {
 	long	mq_maxmsg;	/* maximum number of messages		*/
 	long	mq_msgsize;	/* maximum message size			*/
 	long	mq_curmsgs;	/* number of messages currently queued	*/
+	long	__reserved[4];	/* ignored for input, zeroed for output */
 };
 
 #define NOTIFY_NONE	0
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c9a3e652a026..b5f731781f56 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -121,7 +121,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode)
 			INIT_LIST_HEAD(&info->e_wait_q[1].list);
 			info->notify_owner = 0;
 			info->qsize = 0;
-			info->attr.mq_curmsgs = 0;
+			memset(&info->attr, 0, sizeof(info->attr));
 			info->attr.mq_maxmsg = DFLT_MSGMAX;
 			info->attr.mq_msgsize = DFLT_MSGSIZEMAX;
 			info->messages = kmalloc(DFLT_MSGMAX * sizeof(struct msg_msg *), GFP_KERNEL);
@@ -1082,6 +1082,8 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes,
 	if (u_mqstat != NULL) {
 		if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
 			return -EFAULT;
+		if (mqstat.mq_flags & (~O_NONBLOCK))
+			return -EINVAL;
 	}
 
 	ret = -EBADF;
@@ -1097,7 +1099,7 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes,
 	spin_lock(&info->lock);
 
 	omqstat = info->attr;
-	omqstat.mq_flags = filp->f_flags;
+	omqstat.mq_flags = filp->f_flags & O_NONBLOCK;
 	if (u_mqstat) {
 		if (mqstat.mq_flags & O_NONBLOCK)
 			filp->f_flags |= O_NONBLOCK;
-- 
cgit v1.2.3


From ed6dcf4a49c1098e0701762f6cc52b194cb7f661 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:55:19 -0700
Subject: [PATCH] split netlink_unicast

From: Manfred Spraul <manfred@colorfullife.com>

The attached patch splits netlink_unicast into three steps:

- netlink_getsock{bypid,byfilp}: lookup the destination socket.

- netlink_attachskb: perform the nonblock checks, sleep if the socket
  queue is longer than the limit, etc.

- netlink_sendskb: actually send the skb.

jamal looked over it and didn't see a problem with the netlink change.  The
actual use from ipc/mqueue.c is still open (just send back whatever the C
library passed to mq_notify, add an nlmsghdr or perhaps even make it a
specialized netlink protocol), but the attached patch is independent of
the message queue change.

(acked by davem)
---
 include/linux/netlink.h  |   7 +++
 net/netlink/af_netlink.c | 120 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 101 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 4e5ea27305a2..e5e15ddadab5 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -119,6 +119,13 @@ extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code);
 extern int netlink_register_notifier(struct notifier_block *nb);
 extern int netlink_unregister_notifier(struct notifier_block *nb);
 
+/* finegrained unicast helpers: */
+struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid);
+struct sock *netlink_getsockbyfilp(struct file *filp);
+int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo);
+void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
+int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol);
+
 /*
  *	skb should fit one page. This choice is good for headerless malloc.
  *
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 38c27b9bb70a..398cd03f2d7b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -415,38 +415,65 @@ static void netlink_overrun(struct sock *sk)
 	}
 }
 
-int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
+struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid)
 {
-	struct sock *sk;
-	struct netlink_opt *nlk;
-	int len = skb->len;
 	int protocol = ssk->sk_protocol;
-	long timeo;
-        DECLARE_WAITQUEUE(wait, current);
-
-	timeo = sock_sndtimeo(ssk, nonblock);
+	struct sock *sock;
+	struct netlink_opt *nlk;
 
-retry:
-	sk = netlink_lookup(protocol, pid);
-	if (sk == NULL)
-		goto no_dst;
-	nlk = nlk_sk(sk);
+	sock = netlink_lookup(protocol, pid);
+	if (!sock)
+		return ERR_PTR(-ECONNREFUSED);
 
 	/* Don't bother queuing skb if kernel socket has no input function */
-        if (nlk->pid == 0 && !nlk->data_ready)
-        	goto no_dst;
+	nlk = nlk_sk(sock);
+	if (nlk->pid == 0 && !nlk->data_ready) {
+		sock_put(sock);
+		return ERR_PTR(-ECONNREFUSED);
+	}
+	return sock;
+}
+
+struct sock *netlink_getsockbyfilp(struct file *filp)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct socket *socket;
+	struct sock *sock;
+
+	if (!inode->i_sock || !(socket = SOCKET_I(inode)))
+		return ERR_PTR(-ENOTSOCK);
+
+	sock = socket->sk;
+	if (sock->sk_family != AF_NETLINK)
+		return ERR_PTR(-EINVAL);
+
+	sock_hold(sock);
+	return sock;
+}
+
+/*
+ * Attach a skb to a netlink socket.
+ * The caller must hold a reference to the destination socket. On error, the
+ * reference is dropped. The skb is not sent to the destination; only the
+ * error checks are performed and memory in the queue is reserved.
+ * Return values:
+ * < 0: error. skb freed, reference to sock dropped.
+ * 0: continue
+ * 1: repeat lookup - reference dropped while waiting for socket memory.
+ */
+int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo)
+{
+	struct netlink_opt *nlk;
+
+	nlk = nlk_sk(sk);
 
 #ifdef NL_EMULATE_DEV
-	if (nlk->handler) {
-		skb_orphan(skb);
-		len = nlk->handler(protocol, skb);
-		sock_put(sk);
-		return len;
-	}
+	if (nlk->handler)
+		return 0;
 #endif
-
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
 	    test_bit(0, &nlk->state)) {
+		DECLARE_WAITQUEUE(wait, current);
 		if (!timeo) {
 			if (!nlk->pid)
 				netlink_overrun(sk);
@@ -471,19 +498,60 @@ retry:
 			kfree_skb(skb);
 			return sock_intr_errno(timeo);
 		}
-		goto retry;
+		return 1;
 	}
-
 	skb_orphan(skb);
 	skb_set_owner_r(skb, sk);
+	return 0;
+}
+
+int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol)
+{
+	struct netlink_opt *nlk;
+	int len = skb->len;
+
+	nlk = nlk_sk(sk);
+#ifdef NL_EMULATE_DEV
+	if (nlk->handler) {
+		skb_orphan(skb);
+		len = nlk->handler(protocol, skb);
+		sock_put(sk);
+		return len;
+	}
+#endif
+
 	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk, len);
 	sock_put(sk);
 	return len;
+}
 
-no_dst:
+void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
+{
 	kfree_skb(skb);
-	return -ECONNREFUSED;
+	sock_put(sk);
+}
+
+int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
+{
+	struct sock *sk;
+	int err;
+	long timeo;
+
+	timeo = sock_sndtimeo(ssk, nonblock);
+retry:
+	sk = netlink_getsockbypid(ssk, pid);
+	if (IS_ERR(sk)) {
+		kfree_skb(skb);
+		return PTR_ERR(sk);	/* the error is encoded in sk, not in the freed skb */
+	}
+	err = netlink_attachskb(sk, skb, nonblock, timeo);
+	if (err == 1)	/* sock reference was dropped while waiting: look it up again */
+		goto retry;
+	if (err)
+		return err;
+
+	return netlink_sendskb(sk, skb, ssk->sk_protocol);
 }
 
 static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
-- 
cgit v1.2.3


From 34b98f223bb21673f4cab2f5079a763c34a67946 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:55:32 -0700
Subject: [PATCH] posix message queues: send notifications via netlink

From: Manfred Spraul <manfred@colorfullife.com>

SIGEV_THREAD means that a given callback should be called in the context of a
new thread.  This must be done by the C library.  The kernel must deliver a
notice of the event to the C library when the callback should be called.

This patch switches to a new, simpler interface: User space creates a socket
with socket(PF_NETLINK, SOCK_RAW,0) and passes the fd to the mq_notify call
together with a cookie.  When the mq_notify() condition is satisfied, the
kernel "writes" the cookie to the socket.  User space then reads the cookie
and calls the appropriate callback.
---
 include/linux/mqueue.h |  16 ++++
 ipc/mqueue.c           | 254 +++++++++++++++++++++----------------------------
 2 files changed, 123 insertions(+), 147 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
index 535fe4b2f14b..fdab3b8ee242 100644
--- a/include/linux/mqueue.h
+++ b/include/linux/mqueue.h
@@ -30,8 +30,24 @@ struct mq_attr {
 	long	__reserved[4];	/* ignored for input, zeroed for output */
 };
 
+/*
+ * SIGEV_THREAD implementation:
+ * SIGEV_THREAD must be implemented in user space. If SIGEV_THREAD is passed
+ * to mq_notify, then
+ * - sigev_signo must be the file descriptor of an AF_NETLINK socket. It's not
+ *   necessary that the socket is bound.
+ * - sigev_value.sival_ptr must point to a cookie that is NOTIFY_COOKIE_LEN
+ *   bytes long.
+ * If the notification is triggered, then the cookie is sent to the netlink
+ * socket. The last byte of the cookie is replaced with the NOTIFY_?? codes:
+ * NOTIFY_WOKENUP if the notification got triggered, NOTIFY_REMOVED if it was
+ * removed, either due to a close() on the message queue fd or due to a
+ * mq_notify() that removed the notification.
+ */
 #define NOTIFY_NONE	0
 #define NOTIFY_WOKENUP	1
 #define NOTIFY_REMOVED	2
 
+#define NOTIFY_COOKIE_LEN	32
+
 #endif
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index f0d78fefc28b..f81441d63564 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -20,6 +20,9 @@
 #include <linux/poll.h>
 #include <linux/mqueue.h>
 #include <linux/msg.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <net/sock.h>
 #include "util.h"
 
 #define MQUEUE_MAGIC	0x19800202
@@ -33,9 +36,6 @@
 #define STATE_PENDING	1
 #define STATE_READY	2
 
-#define NP_NONE		((void*)NOTIFY_NONE)
-#define NP_WOKENUP	((void*)NOTIFY_WOKENUP)
-#define NP_REMOVED	((void*)NOTIFY_REMOVED)
 /* used by sysctl */
 #define FS_MQUEUE 	1
 #define CTL_QUEUESMAX 	2
@@ -48,6 +48,8 @@
 #define HARD_MSGMAX 	(131072/sizeof(void*))
 #define DFLT_MSGSIZEMAX 16384	/* max message size */
 
+#define NOTIFY_COOKIE_LEN	32
+
 struct ext_wait_queue {		/* queue of sleeping tasks */
 	struct task_struct *task;
 	struct list_head list;
@@ -56,25 +58,26 @@ struct ext_wait_queue {		/* queue of sleeping tasks */
 };
 
 struct mqueue_inode_info {
-	struct mq_attr attr;
+	spinlock_t lock;
+	struct inode vfs_inode;
+	wait_queue_head_t wait_q;
+
 	struct msg_msg **messages;
+	struct mq_attr attr;
 
-	pid_t notify_owner;	/* != 0 means notification registered */
-	struct sigevent notify;
-	struct file *notify_filp;
+	struct sigevent notify; /* notify.sigev_notify == SIGEV_NONE means */
+	pid_t notify_owner;	/*           no notification registered */
+	struct sock *notify_sock;
+	struct sk_buff *notify_cookie;
 
 	/* for tasks waiting for free space and messages, respectively */
 	struct ext_wait_queue e_wait_q[2];
-	wait_queue_head_t wait_q;
 
 	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
-	spinlock_t lock;
-	struct inode vfs_inode;
 };
 
 static struct inode_operations mqueue_dir_inode_operations;
 static struct file_operations mqueue_file_operations;
-static struct file_operations mqueue_notify_fops;
 static struct super_operations mqueue_super_ops;
 static void remove_notification(struct mqueue_inode_info *info);
 
@@ -119,7 +122,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode)
 			init_waitqueue_head(&info->wait_q);
 			INIT_LIST_HEAD(&info->e_wait_q[0].list);
 			INIT_LIST_HEAD(&info->e_wait_q[1].list);
-			info->notify_owner = 0;
+			info->notify.sigev_notify = SIGEV_NONE;
 			info->qsize = 0;
 			memset(&info->attr, 0, sizeof(info->attr));
 			info->attr.mq_maxmsg = DFLT_MSGMAX;
@@ -283,10 +286,11 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 	snprintf(buffer, sizeof(buffer),
 			"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
 			info->qsize,
-			info->notify_owner ? info->notify.sigev_notify : SIGEV_NONE,
-			(info->notify_owner && info->notify.sigev_notify == SIGEV_SIGNAL ) ?
+			info->notify.sigev_notify,
+			(info->notify.sigev_notify == SIGEV_SIGNAL ) ?
 				info->notify.sigev_signo : 0,
-			info->notify_owner);
+			(info->notify.sigev_notify != SIGEV_NONE) ?
+				info->notify_owner : 0);
 	spin_unlock(&info->lock);
 	buffer[sizeof(buffer)-1] = '\0';
 	slen = strlen(buffer)+1;
@@ -299,7 +303,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 		count = slen - o;
 
 	if (copy_to_user(u_data, buffer + o, count))
-       		return -EFAULT;
+		return -EFAULT;
 
 	*off = o + count;
 	filp->f_dentry->d_inode->i_atime = filp->f_dentry->d_inode->i_ctime = CURRENT_TIME;
@@ -311,7 +315,8 @@ static int mqueue_flush_file(struct file *filp)
 	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
 
 	spin_lock(&info->lock);
-	if (current->tgid == info->notify_owner)
+	if (info->notify.sigev_notify != SIGEV_NONE &&
+			current->tgid == info->notify_owner)
 		remove_notification(info);
 
 	spin_unlock(&info->lock);
@@ -435,6 +440,11 @@ static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
 	return info->messages[info->attr.mq_curmsgs];
 }
 
+static inline void set_cookie(struct sk_buff *skb, char code)
+{
+	((char*)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
+}
+
 /*
  * The next function is only to split too long sys_mq_timedsend
  */
@@ -445,7 +455,8 @@ static void __do_notify(struct mqueue_inode_info *info)
 	 * waiting synchronously for message AND state of queue changed from
 	 * empty to not empty. Here we are sure that no one is waiting
 	 * synchronously. */
-	if (info->notify_owner && info->attr.mq_curmsgs == 1) {
+	if (info->notify.sigev_notify != SIGEV_NONE &&
+			info->attr.mq_curmsgs == 1) {
 		/* sends signal */
 		if (info->notify.sigev_notify == SIGEV_SIGNAL) {
 			struct siginfo sig_i;
@@ -460,10 +471,12 @@ static void __do_notify(struct mqueue_inode_info *info)
 			kill_proc_info(info->notify.sigev_signo,
 				       &sig_i, info->notify_owner);
 		} else if (info->notify.sigev_notify == SIGEV_THREAD) {
-			info->notify_filp->private_data = (void*)NP_WOKENUP;
+			set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
+			netlink_sendskb(info->notify_sock,
+					info->notify_cookie, 0);
 		}
 		/* after notification unregisters process */
-		info->notify_owner = 0;
+		info->notify.sigev_notify = SIGEV_NONE;
 	}
 	wake_up(&info->wait_q);
 }
@@ -499,90 +512,13 @@ static long prepare_timeout(const struct timespec __user *u_arg)
 	return timeout;
 }
 
-/*
- * File descriptor based notification, intended to be used to implement
- * SIGEV_THREAD:
- * SIGEV_THREAD means that a notification function should be called in the
- * context of a new thread. The kernel can't do that. Therefore mq_notify
- * calls with SIGEV_THREAD return a new file descriptor. A user space helper
- * must create a new thread and then read from the given file descriptor.
- * The read always returns one byte. If it's NOTIFY_WOKENUP, then it must
- * call the notification function. If it's NOTIFY_REMOVED, then the
- * notification was removed. The file descriptor supports poll, thus one
- * supervisor thread can manage multiple message queue notifications.
- *
- * The implementation must support multiple outstanding notifications:
- * It's possible that a new notification is added and signaled before user
- * space calls mqueue_notify_read for the previous notification.
- * Therefore the notification state is stored in the private_data field of
- * the file descriptor.
- */
-static unsigned int mqueue_notify_poll(struct file *filp,
-					struct poll_table_struct *poll_tab)
-{
-	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
-	int retval;
-
-	poll_wait(filp, &info->wait_q, poll_tab);
-
-	if (filp->private_data == NP_NONE)
-		retval = 0;
-	else
-		retval = POLLIN | POLLRDNORM;
-	return retval;
-}
-
-static ssize_t mqueue_notify_read(struct file *filp, char __user *buf,
-					size_t count, loff_t *ppos)
-{
-	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
-	char result;
-
-	if (!count)
-		return 0;
-	if (*ppos != 0)
-		return 0;
-	spin_lock(&info->lock);
-	while (filp->private_data == NP_NONE) {
-		DEFINE_WAIT(wait);
-		if (filp->f_flags & O_NONBLOCK) {
-			spin_unlock(&info->lock);
-			return -EAGAIN;
-		}
-		prepare_to_wait(&info->wait_q, &wait, TASK_INTERRUPTIBLE);
-		spin_unlock(&info->lock);
-		schedule();
-		finish_wait(&info->wait_q, &wait);
-		spin_lock(&info->lock);
-	}
-	spin_unlock(&info->lock);
-	result = (char)(unsigned long)filp->private_data;
-	if (put_user(result, buf))
-		return -EFAULT;
-	*ppos = 1;
-	return 1;
-}
-
-static int mqueue_notify_release(struct inode *inode, struct file *filp)
-{
-	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
-
-	spin_lock(&info->lock);
-	if (info->notify_owner && info->notify_filp == filp)
-		info->notify_owner = 0;
-	filp->private_data = NP_REMOVED;
-	spin_unlock(&info->lock);
-
-	return 0;
-}
-
 static void remove_notification(struct mqueue_inode_info *info)
 {
 	if (info->notify.sigev_notify == SIGEV_THREAD) {
-		info->notify_filp->private_data = NP_REMOVED;
-		wake_up(&info->wait_q);
+		set_cookie(info->notify_cookie, NOTIFY_REMOVED);
+		netlink_sendskb(info->notify_sock, info->notify_cookie, 0);
 	}
-	info->notify_owner = 0;
+	info->notify.sigev_notify = SIGEV_NONE;
 }
 
 /*
@@ -780,7 +716,8 @@ out_unlock:
  */
 
 /* pipelined_send() - send a message directly to the task waiting in
- * sys_mq_timedreceive() (without inserting message into a queue). */
+ * sys_mq_timedreceive() (without inserting message into a queue).
+ */
 static inline void pipelined_send(struct mqueue_inode_info *info,
 				  struct msg_msg *message,
 				  struct ext_wait_queue *receiver)
@@ -978,12 +915,16 @@ out:
 asmlinkage long sys_mq_notify(mqd_t mqdes,
 				const struct sigevent __user *u_notification)
 {
-	int ret, fd;
-	struct file *filp, *nfilp;
+	int ret;
+	struct file *filp;
+	struct sock *sock;
 	struct inode *inode;
 	struct sigevent notification;
 	struct mqueue_inode_info *info;
+	struct sk_buff *nc;
 
+	nc = NULL;
+	sock = NULL;
 	if (u_notification == NULL) {
 		notification.sigev_notify = SIGEV_NONE;
 	} else {
@@ -1000,6 +941,44 @@ asmlinkage long sys_mq_notify(mqd_t mqdes,
 			 notification.sigev_signo > _NSIG)) {
 			return -EINVAL;
 		}
+		if (notification.sigev_notify == SIGEV_THREAD) {
+			/* create the notify skb */
+			nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
+			ret = -ENOMEM;
+			if (!nc)
+				goto out;
+			ret = -EFAULT;
+			if (copy_from_user(nc->data,
+					notification.sigev_value.sival_ptr,
+					NOTIFY_COOKIE_LEN)) {
+				goto out;
+			}
+
+			/* TODO: add a header? */
+			skb_put(nc, NOTIFY_COOKIE_LEN);
+			/* and attach it to the socket */
+retry:
+			filp = fget(notification.sigev_signo);
+			ret = -EBADF;
+			if (!filp)
+				goto out;
+			sock = netlink_getsockbyfilp(filp);
+			fput(filp);
+			if (IS_ERR(sock)) {
+				ret = PTR_ERR(sock);
+				sock = NULL;
+				goto out;
+			}
+
+			ret = netlink_attachskb(sock, nc, 0, MAX_SCHEDULE_TIMEOUT);
+			if (ret == 1)
+		       		goto retry;
+			if (ret) {
+				sock = NULL;
+				nc = NULL;
+				goto out;
+			}
+		}
 	}
 
 	ret = -EBADF;
@@ -1013,47 +992,33 @@ asmlinkage long sys_mq_notify(mqd_t mqdes,
 	info = MQUEUE_I(inode);
 
 	ret = 0;
-	if (notification.sigev_notify == SIGEV_THREAD) {
-		ret = get_unused_fd();
-		if (ret < 0)
-			goto out_fput;
-		fd = ret;
-		nfilp = get_empty_filp();
-		if (!nfilp) {
-			ret = -ENFILE;
-			goto out_dropfd;
-		}
-		nfilp->private_data = NP_NONE;
-		nfilp->f_op = &mqueue_notify_fops;
-		nfilp->f_vfsmnt = mntget(mqueue_mnt);
-		nfilp->f_dentry = dget(filp->f_dentry);
-		nfilp->f_mapping = filp->f_dentry->d_inode->i_mapping;
-		nfilp->f_flags = O_RDONLY;
-		nfilp->f_mode = FMODE_READ;
-	} else {
-		nfilp = NULL;
-		fd = -1;
-	}
-
 	spin_lock(&info->lock);
-
-	if (notification.sigev_notify == SIGEV_NONE) {
-		if (info->notify_owner == current->tgid) {
+	switch (notification.sigev_notify) {
+	case SIGEV_NONE:
+		if (info->notify.sigev_notify != SIGEV_NONE &&
+				info->notify_owner == current->tgid) {
 			remove_notification(info);
 			inode->i_atime = inode->i_ctime = CURRENT_TIME;
 		}
-	} else if (info->notify_owner) {
-		ret = -EBUSY;
-	} else if (notification.sigev_notify == SIGEV_THREAD) {
-		info->notify_filp = nfilp;
-		fd_install(fd, nfilp);
-		ret = fd;
-		fd = -1;
-		nfilp = NULL;
+		break;
+	case SIGEV_THREAD:
+		if (info->notify.sigev_notify != SIGEV_NONE) {
+			ret = -EBUSY;
+			break;
+		}
+		info->notify_sock = sock;
+		info->notify_cookie = nc;
+		sock = NULL;
+		nc = NULL;
 		info->notify.sigev_notify = SIGEV_THREAD;
 		info->notify_owner = current->tgid;
 		inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	}  else {
+		break;
+	case SIGEV_SIGNAL:
+		if (info->notify.sigev_notify != SIGEV_NONE) {
+			ret = -EBUSY;
+			break;
+		}
 		info->notify.sigev_signo = notification.sigev_signo;
 		info->notify.sigev_value = notification.sigev_value;
 		info->notify.sigev_notify = SIGEV_SIGNAL;
@@ -1061,12 +1026,14 @@ asmlinkage long sys_mq_notify(mqd_t mqdes,
 		inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	}
 	spin_unlock(&info->lock);
-out_dropfd:
-	if (fd != -1)
-		put_unused_fd(fd);
 out_fput:
 	fput(filp);
 out:
+	if (sock) {
+		netlink_detachskb(sock, nc);
+	} else if (nc) {
+		dev_kfree_skb(nc);
+	}
 	return ret;
 }
 
@@ -1135,13 +1102,6 @@ static struct file_operations mqueue_file_operations = {
 	.read = mqueue_read_file,
 };
 
-static struct file_operations mqueue_notify_fops = {
-	.poll = mqueue_notify_poll,
-	.read = mqueue_notify_read,
-	.release = mqueue_notify_release,
-};
-
-
 static struct super_operations mqueue_super_ops = {
 	.alloc_inode = mqueue_alloc_inode,
 	.destroy_inode = mqueue_destroy_inode,
-- 
cgit v1.2.3


From 87c22e8470366e81aa82bcbadaf147c4ecdfb182 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:55:45 -0700
Subject: [PATCH] compat emulation for posix message queues

From: Arnd Bergmann <arnd@arndb.de>

I have tested the code with the open posix test suite and found the same
four failures for both 64-bit and compat mode, most tests pass.  The patch
is against -mc1, but I guess it also applies to the other trees around.

What worries me more than mq_attr compatibility is the conversion of struct
sigevent, which might turn out really hard when more fields in there are
used.  AFAICS, the only other part in the kernel ABI is sys_timer_create(),
so maybe it's not too late to deprecate the current structure and create a
structure that can be used properly for compat syscalls.
---
 arch/ia64/ia32/ia32_signal.c     |   7 +-
 arch/mips/kernel/signal32.c      |   7 +-
 arch/s390/kernel/compat_signal.c |   5 +-
 arch/sparc64/kernel/signal32.c   |   7 +-
 arch/x86_64/ia32/ia32_signal.c   |   6 +-
 include/asm-ppc64/ppc32.h        |  14 ---
 include/linux/compat.h           |  17 ++++
 include/linux/mqueue.h           |   4 +-
 include/linux/posix_types.h      |   1 +
 include/linux/syscalls.h         |   1 -
 include/linux/types.h            |   1 +
 ipc/Makefile                     |   3 +-
 ipc/compat_mq.c                  | 196 +++++++++++++++++++++++++++++++++++++++
 kernel/sys.c                     |   5 +
 14 files changed, 251 insertions(+), 23 deletions(-)
 create mode 100644 ipc/compat_mq.c

(limited to 'include/linux')

diff --git a/arch/ia64/ia32/ia32_signal.c b/arch/ia64/ia32/ia32_signal.c
index 8b1374c172b6..bb1e836fb227 100644
--- a/arch/ia64/ia32/ia32_signal.c
+++ b/arch/ia64/ia32/ia32_signal.c
@@ -114,7 +114,12 @@ copy_siginfo_from_user32 (siginfo_t *to, siginfo_t32 *from)
 			err |= __get_user(to->si_band, &from->si_band);
 			err |= __get_user(to->si_fd, &from->si_fd);
 			break;
-			/* case __SI_RT: This is not generated by the kernel as of now.  */
+		      case __SI_RT: /* This is not generated by the kernel as of now.  */
+		      case __SI_MESGQ:
+			err |= __get_user(to->si_pid, &from->si_pid);
+			err |= __get_user(to->si_uid, &from->si_uid);
+			err |= __get_user(to->si_int, &from->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index 5c1489f4fdc2..c52074f84300 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -358,7 +358,12 @@ static int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from)
 			err |= __put_user(from->si_band, &to->si_band);
 			err |= __put_user(from->si_fd, &to->si_fd);
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
+		case __SI_RT: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ:
+			err |= __put_user(from->si_pid, &to->si_pid);
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 44fe6e477e92..373040404a5a 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -74,6 +74,10 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from)
 		err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE);
 	else {
 		switch (from->si_code >> 16) {
+		case __SI_RT >> 16: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ >> 16:
+			err |= __put_user(from->si_int, &to->si_int);
+			/* fallthrough: si_pid and si_uid are copied by the __SI_KILL case */
 		case __SI_KILL >> 16:
 			err |= __put_user(from->si_pid, &to->si_pid);
 			err |= __put_user(from->si_uid, &to->si_uid);
@@ -96,7 +100,6 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from)
 			break;
 		default:
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
 		}
 	}
 	return err;
diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c
index cc3019d6dd65..e2f62a666d8c 100644
--- a/arch/sparc64/kernel/signal32.c
+++ b/arch/sparc64/kernel/signal32.c
@@ -129,7 +129,12 @@ int copy_siginfo_to_user32(siginfo_t32 __user *to, siginfo_t *from)
 			err |= __put_user(from->si_trapno, &to->si_trapno);
 			err |= __put_user((long)from->si_addr, &to->si_addr);
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
+		case __SI_RT: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ:
+			err |= __put_user(from->si_pid, &to->si_pid);
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index bce5fbc5be2c..1a828de6a55d 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -85,7 +85,11 @@ int ia32_copy_siginfo_to_user(siginfo_t32 __user *to, siginfo_t *from)
 			err |= __put_user(from->si_overrun, &to->si_overrun); 
 			err |= __put_user((u32)(u64)from->si_ptr, &to->si_ptr);
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
+		case __SI_RT: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ:
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h
index 53865a8c4f8d..7338ea298a19 100644
--- a/include/asm-ppc64/ppc32.h
+++ b/include/asm-ppc64/ppc32.h
@@ -141,20 +141,6 @@ struct ucontext32 {
 	struct mcontext32	uc_mcontext;
 };
 
-typedef struct compat_sigevent {
-	compat_sigval_t sigev_value;
-	int sigev_signo;
-	int sigev_notify;
-	union {
-		int _pad[SIGEV_PAD_SIZE];
-		int _tid;
-		struct {
-			compat_uptr_t _function;
-			compat_uptr_t _attribute;
-		} _sigev_thread;
-	} _sigev_un;
-} compat_sigevent_t;
-
 struct ipc_kludge_32 {
 	unsigned int msgp;
 	int msgtyp;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 7b82209ab4ab..796204f59bd9 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -13,6 +13,7 @@
 #include <linux/sem.h>
 
 #include <asm/compat.h>
+#include <asm/siginfo.h>
 
 #define compat_jiffies_to_clock_t(x)	\
 		(((unsigned long)(x) * COMPAT_USER_HZ) / HZ)
@@ -90,6 +91,22 @@ typedef union compat_sigval {
 	compat_uptr_t	sival_ptr;
 } compat_sigval_t;
 
+typedef struct compat_sigevent {	/* sigevent as passed by compat (32-bit) callers */
+	compat_sigval_t sigev_value;
+	compat_int_t sigev_signo;
+	compat_int_t sigev_notify;
+	union {
+		compat_int_t _pad[SIGEV_PAD_SIZE]; /* NOTE(review): SIGEV_PAD_SIZE is not defined here - confirm it gives the 32-bit size */
+		compat_int_t _tid;
+
+		struct {
+			compat_uptr_t _function;
+			compat_uptr_t _attribute;
+		} _sigev_thread;
+	} _sigev_un;
+} compat_sigevent_t;
+
+
 long compat_sys_semctl(int first, int second, int third, void __user *uptr);
 long compat_sys_msgsnd(int first, int second, int third, void __user *uptr);
 long compat_sys_msgrcv(int first, int second, int msgtyp, int third,
diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
index fdab3b8ee242..fc40b774b913 100644
--- a/include/linux/mqueue.h
+++ b/include/linux/mqueue.h
@@ -18,9 +18,9 @@
 #ifndef _LINUX_MQUEUE_H
 #define _LINUX_MQUEUE_H
 
-#define MQ_PRIO_MAX 	32768
+#include <linux/types.h>
 
-typedef int mqd_t;
+#define MQ_PRIO_MAX 	32768
 
 struct mq_attr {
 	long	mq_flags;	/* message queue flags			*/
diff --git a/include/linux/posix_types.h b/include/linux/posix_types.h
index 3ee2ed9de1db..f04c98cf44f3 100644
--- a/include/linux/posix_types.h
+++ b/include/linux/posix_types.h
@@ -42,6 +42,7 @@ typedef void (*__kernel_sighandler_t)(int);
 
 /* Type of a SYSV IPC key.  */
 typedef int __kernel_key_t;
+typedef int __kernel_mqd_t;
 
 #include <asm/posix_types.h>
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7ee5f67abb5f..89ffe55898f2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -48,7 +48,6 @@ struct timex;
 struct timezone;
 struct tms;
 struct utimbuf;
-typedef int mqd_t;
 struct mq_attr;
 
 #include <linux/config.h>
diff --git a/include/linux/types.h b/include/linux/types.h
index 3b407b06b48f..93f5f3653561 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -31,6 +31,7 @@ typedef __kernel_key_t		key_t;
 typedef __kernel_suseconds_t	suseconds_t;
 typedef __kernel_timer_t	timer_t;
 typedef __kernel_clockid_t	clockid_t;
+typedef __kernel_mqd_t		mqd_t;
 
 #ifdef __KERNEL__
 typedef __kernel_uid32_t	uid_t;
diff --git a/ipc/Makefile b/ipc/Makefile
index 913790207d85..0a6d626cd794 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -4,5 +4,6 @@
 
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
 obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
-obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o
+obj_mq-$(CONFIG_COMPAT) += compat_mq.o
+obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
 
diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c
new file mode 100644
index 000000000000..1520df89c424
--- /dev/null
+++ b/ipc/compat_mq.c
@@ -0,0 +1,196 @@
+/*
+ *  ipc/compat_mq.c
+ *    32 bit emulation for POSIX message queue system calls
+ *
+ *    Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *    Author: Arnd Bergmann <arnd@arndb.de>
+ */
+
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mqueue.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+struct compat_mq_attr {	/* compat counterpart of struct mq_attr: every field is a compat_long_t */
+	compat_long_t mq_flags;      /* message queue flags		     */
+	compat_long_t mq_maxmsg;     /* maximum number of messages	     */
+	compat_long_t mq_msgsize;    /* maximum message size		     */
+	compat_long_t mq_curmsgs;    /* number of messages currently queued  */
+	compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
+};
+
+static inline int get_compat_mq_attr(struct mq_attr *attr,
+			const struct compat_mq_attr __user *uattr)
+{	/* Fill *attr from a compat mq_attr in user space; callers treat nonzero as failure. */
+	if (verify_area(VERIFY_READ, uattr, sizeof *uattr))
+		return -EFAULT;
+	/* Only the four live fields are fetched; __reserved is never read or written. */
+	return __get_user(attr->mq_flags, &uattr->mq_flags)
+		| __get_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+		| __get_user(attr->mq_msgsize, &uattr->mq_msgsize)
+		| __get_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+static inline int put_compat_mq_attr(const struct mq_attr *attr,
+			struct compat_mq_attr __user *uattr)
+{	/* Write *attr out as a compat mq_attr: -EFAULT if the clear fails, else the OR of the stores. */
+	if (clear_user(uattr, sizeof *uattr))
+		return -EFAULT;
+	/* The clear above spans the whole struct, so __reserved is covered too. */
+	return __put_user(attr->mq_flags, &uattr->mq_flags)
+		| __put_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+		| __put_user(attr->mq_msgsize, &uattr->mq_msgsize)
+		| __put_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+asmlinkage long compat_sys_mq_open(const char __user *u_name,
+			int oflag, compat_mode_t mode,
+			struct compat_mq_attr __user *u_attr)
+{	/* Compat mq_open: converts the optional attr, then defers to sys_mq_open(). */
+	struct mq_attr attr;
+	mm_segment_t oldfs;
+	char *name;
+	long ret;
+
+	if ((oflag & O_CREAT) == 0 || !u_attr)
+		return sys_mq_open(u_name, oflag, mode, NULL);
+	/* From here on both attr and name are brought into kernel memory. */
+	if (get_compat_mq_attr(&attr, u_attr))
+		return -EFAULT;
+
+	name = getname(u_name);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+	/* sys_mq_open() is handed kernel pointers (name, &attr), so call it under KERNEL_DS. */
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_mq_open(name, oflag, mode, &attr);
+	set_fs(oldfs);
+
+	putname(name);
+	return ret;
+}
+
+static struct timespec __user *compat_prepare_timeout(
+			const struct compat_timespec __user *u_abs_timeout)
+{	/* Returns NULL for no timeout, ERR_PTR(-EFAULT) on a bad copy, else a user pointer to a native timespec. */
+	struct timespec ts, __user *u_ts;
+
+	if (!u_abs_timeout)
+		return NULL;
+	/* Convert into ts, then copy it to the area obtained from compat_alloc_user_space(). */
+	u_ts = compat_alloc_user_space(sizeof(*u_ts));
+	if (get_compat_timespec(&ts, u_abs_timeout)
+		|| copy_to_user(u_ts, &ts, sizeof(*u_ts)))
+		return ERR_PTR(-EFAULT);
+
+	return u_ts;
+}
+
+asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes,
+			const char __user *u_msg_ptr,
+			size_t msg_len, unsigned int msg_prio,
+			const struct compat_timespec __user *u_abs_timeout)
+{	/* Compat mq_timedsend: only the timeout is converted, everything else is passed through. */
+	struct timespec __user *u_ts;
+
+	u_ts = compat_prepare_timeout(u_abs_timeout);
+	if (IS_ERR(u_ts))
+		return -EFAULT;
+	/* u_ts is a null pointer when the caller supplied no timeout. */
+	return sys_mq_timedsend(mqdes, u_msg_ptr, msg_len,
+			msg_prio, u_ts);
+}
+
+asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes,
+			char __user *u_msg_ptr,
+			size_t msg_len, unsigned int __user *u_msg_prio,
+			const struct compat_timespec __user *u_abs_timeout)
+{	/* Compat mq_timedreceive: only the timeout is converted, everything else is passed through. */
+	struct timespec __user *u_ts;
+
+	u_ts = compat_prepare_timeout(u_abs_timeout);
+	if (IS_ERR(u_ts))
+		return -EFAULT;
+	/* u_ts is a null pointer when the caller supplied no timeout. */
+	return sys_mq_timedreceive(mqdes, u_msg_ptr, msg_len,
+			u_msg_prio, u_ts);
+}
+
+static int get_compat_sigevent(struct sigevent *event,
+		const struct compat_sigevent __user *u_event)
+{	/* Read a compat sigevent from user space into *event; the caller treats nonzero as failure. */
+	if (verify_area(VERIFY_READ, u_event, sizeof(*u_event)))
+		return -EFAULT;
+	/* The value is fetched as sival_int only; compat_sys_mq_notify() widens it when it is a pointer. */
+	return __get_user(event->sigev_value.sival_int,
+			  &u_event->sigev_value.sival_int)
+	     | __get_user(event->sigev_signo, &u_event->sigev_signo)
+	     | __get_user(event->sigev_notify, &u_event->sigev_notify)
+	     | __get_user(event->sigev_notify_thread_id,
+			  &u_event->sigev_notify_thread_id);
+}
+
+asmlinkage long compat_sys_mq_notify(mqd_t mqdes,
+			const struct compat_sigevent __user *u_notification)
+{	/* Compat mq_notify: converts the sigevent, then defers to sys_mq_notify(). */
+	mm_segment_t oldfs;
+	struct sigevent notification;
+	char cookie[NOTIFY_COOKIE_LEN];
+	compat_uptr_t u_cookie;
+	long ret;
+
+	if (!u_notification)
+		return sys_mq_notify(mqdes, NULL);
+
+	if (get_compat_sigevent(&notification, u_notification))
+		return -EFAULT;
+	/* For SIGEV_THREAD the value is a compat user pointer to a cookie: copy the cookie in. */
+	if (notification.sigev_notify == SIGEV_THREAD) {
+		u_cookie = (compat_uptr_t)notification.sigev_value.sival_int;
+		if (copy_from_user(cookie, compat_ptr(u_cookie),
+						NOTIFY_COOKIE_LEN)) {
+			return -EFAULT;
+		}
+		notification.sigev_value.sival_ptr = cookie;
+	}
+	/* notification (and cookie) are kernel-stack objects, so call under KERNEL_DS. */
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_mq_notify(mqdes, &notification);
+	set_fs(oldfs);
+
+	return ret;
+}
+
+asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes,
+			const struct compat_mq_attr __user *u_mqstat,
+			struct compat_mq_attr __user *u_omqstat)
+{	/* Compat mq_getsetattr: converts the new attr in and the old attr out around sys_mq_getsetattr(). */
+	struct mq_attr mqstat, omqstat;
+	struct mq_attr *p_mqstat = NULL, *p_omqstat = NULL;
+	mm_segment_t oldfs;
+	long ret;
+
+	if (u_mqstat) {
+		p_mqstat = &mqstat;
+		if (get_compat_mq_attr(p_mqstat, u_mqstat))
+			return -EFAULT;
+	}
+	/* A null user pointer stays a null kernel pointer, so optional arguments stay optional. */
+	if (u_omqstat)
+		p_omqstat = &omqstat;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_mq_getsetattr(mqdes, p_mqstat, p_omqstat);
+	set_fs(oldfs);
+
+	if (ret)
+		return ret;
+	/* The old attributes are copied out only after the native call succeeded. */
+	return (u_omqstat) ? put_compat_mq_attr(&omqstat, u_omqstat) : 0;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 7d1bf5c57aca..81f9e02f2071 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -266,6 +266,11 @@ cond_syscall(sys_mq_timedsend)
 cond_syscall(sys_mq_timedreceive)
 cond_syscall(sys_mq_notify)
 cond_syscall(sys_mq_getsetattr)
+cond_syscall(compat_sys_mq_open)
+cond_syscall(compat_sys_mq_timedsend)
+cond_syscall(compat_sys_mq_timedreceive)
+cond_syscall(compat_sys_mq_notify)
+cond_syscall(compat_sys_mq_getsetattr)
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read)
-- 
cgit v1.2.3


From 0ab2d6681c4e8502990523d46d928f37b764d52d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:56:34 -0700
Subject: [PATCH] IPMI driver updates

From: Corey Minyard <minyard@acm.org>

- Add support for messaging through an IPMI LAN interface, which is
  required for some system software that already exists on other IPMI
  drivers.  It also does some renaming and a lot of little cleanups.

- Add the "System Interface" driver.  The previous driver for system
  interfaces only supported the KCS interface, this driver supports all
  system interfaces defined in the IPMI standard.  It also does a much better
  job of handling ACPI and SMBIOS tables for detecting IPMI system
  interfaces.
---
 Documentation/IPMI.txt              |  218 +++-
 drivers/char/ipmi/Kconfig           |    8 +-
 drivers/char/ipmi/Makefile          |    9 +-
 drivers/char/ipmi/ipmi_bt_sm.c      |  513 +++++++++
 drivers/char/ipmi/ipmi_devintf.c    |  197 ++--
 drivers/char/ipmi/ipmi_kcs_intf.c   | 1305 ----------------------
 drivers/char/ipmi/ipmi_kcs_sm.c     |  156 +--
 drivers/char/ipmi/ipmi_kcs_sm.h     |   70 --
 drivers/char/ipmi/ipmi_msghandler.c | 1292 +++++++++++++++++++---
 drivers/char/ipmi/ipmi_si_intf.c    | 2052 +++++++++++++++++++++++++++++++++++
 drivers/char/ipmi/ipmi_si_sm.h      |  117 ++
 drivers/char/ipmi/ipmi_smic_sm.c    |  599 ++++++++++
 drivers/char/ipmi/ipmi_watchdog.c   |  122 +--
 include/linux/ipmi.h                |  131 ++-
 include/linux/ipmi_msgdefs.h        |   36 +-
 include/linux/ipmi_smi.h            |   14 +-
 16 files changed, 5013 insertions(+), 1826 deletions(-)
 create mode 100644 drivers/char/ipmi/ipmi_bt_sm.c
 delete mode 100644 drivers/char/ipmi/ipmi_kcs_intf.c
 delete mode 100644 drivers/char/ipmi/ipmi_kcs_sm.h
 create mode 100644 drivers/char/ipmi/ipmi_si_intf.c
 create mode 100644 drivers/char/ipmi/ipmi_si_sm.h
 create mode 100644 drivers/char/ipmi/ipmi_smic_sm.c

(limited to 'include/linux')

diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 825e83cb4acc..ec8a6fa2c34b 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -22,6 +22,58 @@ are not familiar with IPMI itself, see the web site at
 http://www.intel.com/design/servers/ipmi/index.htm.  IPMI is a big
 subject and I can't cover it all here!
 
+Configuration
+-------------
+
+The LinuxIPMI driver is modular, which means you have to pick several
+things to have it work right depending on your hardware.  Most of
+these are available in the 'Character Devices' menu.
+
+No matter what, you must pick 'IPMI top-level message handler' to use
+IPMI.  What you do beyond that depends on your needs and hardware.
+
+The message handler does not provide any user-level interfaces.
+Kernel code (like the watchdog) can still use it.  If you need access
+from userland, you need to select 'Device interface for IPMI' if you
+want access through a device driver.  Another interface is also
+available, you may select 'IPMI sockets' in the 'Networking Support'
+main menu.  This provides a socket interface to IPMI.  You may select
+both of these at the same time, they will both work together.
+
+The driver interface depends on your hardware.  If you have a board
+with a standard interface (These will generally be either "KCS",
+"SMIC", or "BT", consult your hardware manual), choose the 'IPMI SI
+handler' option.  A driver also exists for direct I2C access to the
+IPMI management controller.  Some boards support this, but it is
+unknown if it will work on every board.  For this, choose 'IPMI SMBus
+handler', but be ready to try to do some figuring to see if it will
+work.
+
+There is also a KCS-only driver interface supplied, but it is
+deprecated in favor of the SI interface.
+
+You should generally enable ACPI on your system, as systems with IPMI
+should have ACPI tables describing them.
+
+If you have a standard interface and the board manufacturer has done
+their job correctly, the IPMI controller should be automatically
+detected (via ACPI or SMBIOS tables) and should just work.  Sadly, many
+boards do not have this information.  The driver attempts standard
+defaults, but they may not work.  If you fall into this situation, you
+need to read the section below named 'The SI Driver' on how to
+hand-configure your system.
+
+IPMI defines a standard watchdog timer.  You can enable this with the
+'IPMI Watchdog Timer' config option.  If you compile the driver into
+the kernel, then via a kernel command-line option you can have the
+watchdog timer start as soon as it initializes.  It also has a lot
+of other options, see the 'Watchdog' section below for more details.
+Note that you can also have the watchdog continue to run if it is
+closed (by default it is disabled on close).  Go into the 'Watchdog
+Cards' menu, enable 'Watchdog Timer Support', and enable the option
+'Disable watchdog shutdown on close'.
+
+
 Basic Design
 ------------
 
@@ -41,18 +93,30 @@ ipmi_devintf - This provides a userland IOCTL interface for the IPMI
 driver, each open file for this device ties in to the message handler
 as an IPMI user.
 
-ipmi_kcs_drv - A driver for the KCS SMI.  Most system have a KCS
-interface for IPMI.
+ipmi_si - A driver for various system interfaces.  This supports
+KCS, SMIC, and may support BT in the future.  Unless you have your own
+custom interface, you probably need to use this.
+
+ipmi_smb - A driver for accessing BMCs on the SMBus. It uses the
+I2C kernel driver's SMBus interfaces to send and receive IPMI messages
+over the SMBus.
+
+af_ipmi - A network socket interface to IPMI.  This doesn't take up
+a character device in your system.
 
+Note that the KCS-only interface has been removed.
 
 Much documentation for the interface is in the include files.  The
 IPMI include files are:
 
-ipmi.h - Contains the user interface and IOCTL interface for IPMI.
+net/af_ipmi.h - Contains the socket interface.
 
-ipmi_smi.h - Contains the interface for SMI drivers to use.
+linux/ipmi.h - Contains the user interface and IOCTL interface for IPMI.
 
-ipmi_msgdefs.h - General definitions for base IPMI messaging.
+linux/ipmi_smi.h - Contains the interface for system management interfaces
+(things that interface to IPMI controllers) to use.
+
+linux/ipmi_msgdefs.h - General definitions for base IPMI messaging.
 
 
 Addressing
@@ -260,70 +324,131 @@ they register with the message handler.  They are generally assigned
 in the order they register, although if an SMI unregisters and then
 another one registers, all bets are off.
 
-The ipmi_smi.h defines the interface for SMIs, see that for more
-details.
+The ipmi_smi.h defines the interface for management interfaces, see
+that for more details.
 
 
-The KCS Driver
---------------
+The SI Driver
+-------------
 
-The KCS driver allows up to 4 KCS interfaces to be configured in the
-system.  By default, the driver will register one KCS interface at the
-spec-specified I/O port 0xca2 without interrupts.  You can change this
-at module load time (for a module) with:
+The SI driver allows up to 4 KCS or SMIC interfaces to be configured
+in the system.  By default, the driver scans the ACPI tables for
+interfaces, and if it doesn't find any it will attempt to register one KCS
+interface at the spec-specified I/O port 0xca2 without interrupts.
+You can change this at module load time (for a module) with:
+
+  modprobe ipmi_si.o type=<type1>,<type2>....
+       ports=<port1>,<port2>... addrs=<addr1>,<addr2>...
+       irqs=<irq1>,<irq2>... trydefaults=[0|1]
+
+Each of these except si_trydefaults is a list, the first item for the
+first interface, second item for the second interface, etc.
 
-  insmod ipmi_kcs_drv.o kcs_ports=<port1>,<port2>... kcs_addrs=<addr1>,<addr2>
-       kcs_irqs=<irq1>,<irq2>... kcs_trydefaults=[0|1]
+The si_type may be either "kcs", "smic", or "bt".  If you leave it blank, it
+defaults to "kcs".
 
-The KCS driver supports two types of interfaces, ports (for I/O port
-based KCS interfaces) and memory addresses (for KCS interfaces in
-memory).  The driver will support both of them simultaneously, setting
-the port to zero (or just not specifying it) will allow the memory
-address to be used.  The port will override the memory address if it
-is specified and non-zero.  kcs_trydefaults sets whether the standard
-IPMI interface at 0xca2 and any interfaces specified by ACPE are
-tried.  By default, the driver tries it, set this value to zero to
-turn this off.
+If you specify si_addrs as non-zero for an interface, the driver will
+use the memory address given as the address of the device.  This
+overrides si_ports.
+
+If you specify si_ports as non-zero for an interface, the driver will
+use the I/O port given as the device address.
+
+If you specify si_irqs as non-zero for an interface, the driver will
+attempt to use the given interrupt for the device.
+
+si_trydefaults sets whether the standard IPMI interface at 0xca2 and
+any interfaces specified by ACPI are tried.  By default, the driver
+tries it, set this value to zero to turn this off.
 
 When compiled into the kernel, the addresses can be specified on the
 kernel command line as:
 
-  ipmi_kcs=<bmc1>:<irq1>,<bmc2>:<irq2>....,[nodefault]
+  ipmi_si.type=<type1>,<type2>...
+       ipmi_si.ports=<port1>,<port2>... ipmi_si.addrs=<addr1>,<addr2>...
+       ipmi_si.irqs=<irq1>,<irq2>... ipmi_si.trydefaults=[0|1]
 
-The <bmcx> values is either "p<port>" or "m<addr>" for port or memory
-addresses.  So for instance, a KCS interface at port 0xca2 using
-interrupt 9 and a memory interface at address 0xf9827341 with no
-interrupt would be specified "ipmi_kcs=p0xca2:9,m0xf9827341".
-If you specify zero for in irq or don't specify it, the driver will
-run polled unless the software can detect the interrupt to use in the
-ACPI tables.
+It works the same as the module parameters of the same names.
 
-By default, the driver will attempt to detect a KCS device at the
-spec-specified 0xca2 address and any address specified by ACPI.  If
-you want to turn this off, use the "nodefault" option.
+By default, the driver will attempt to detect any device specified by
+ACPI, and if none of those then a KCS device at the spec-specified
+0xca2.  If you want to turn this off, set the "trydefaults" option to
+false.
 
 If you have high-res timers compiled into the kernel, the driver will
 use them to provide much better performance.  Note that if you do not
 have high-res timers enabled in the kernel and you don't have
 interrupts enabled, the driver will run VERY slowly.  Don't blame me,
-the KCS interface sucks.
+these interfaces suck.
+
+
+The SMBus Driver
+----------------
+
+The SMBus driver allows up to 4 SMBus devices to be configured in the
+system.  By default, the driver will register any SMBus interfaces it finds
+in the I2C address range of 0x20 to 0x4f on any adapter.  You can change this
+at module load time (for a module) with:
+
+  modprobe ipmi_smb.o
+	addr=<adapter1>,<i2caddr1>[,<adapter2>,<i2caddr2>[,...]]
+	dbg=<flags1>,<flags2>...
+	[defaultprobe=0] [dbg_probe=1]
+
+The addresses are specified in pairs, the first is the adapter ID and the
+second is the I2C address on that adapter.
+
+The debug flags are bit flags for each BMC found, they are:
+IPMI messages: 1, driver state: 2, timing: 4, I2C probe: 8
+
+Setting smb_defaultprobe to zero disables the default probing of SMBus
+interfaces at address range 0x20 to 0x4f.  This means that only the
+BMCs specified on the smb_addr line will be detected.
+
+Setting smb_dbg_probe to 1 will enable debugging of the probing and
+detection process for BMCs on the SMBusses.
+
+Discovering the IPMI compliant BMC on the SMBus can cause devices
+on the I2C bus to fail. The SMBus driver writes a "Get Device ID" IPMI
+message as a block write to the I2C bus and waits for a response.
+This action can be detrimental to some I2C devices. It is highly recommended
+that the known I2C address be given to the SMBus driver in the smb_addr
+parameter. The default address range will not be used when a smb_addr
+parameter is provided.
+
+When compiled into the kernel, the addresses can be specified on the
+kernel command line as:
+
+  ipmi_smb.addr=<adapter1>,<i2caddr1>[,<adapter2>,<i2caddr2>[,...]]
+	ipmi_smb.dbg=<flags1>,<flags2>...
+	ipmi_smb.defaultprobe=0 ipmi_smb.dbg_probe=1
+
+These are the same options as on the module command line.
+
+Note that you might need some I2C changes if CONFIG_IPMI_PANIC_EVENT
+is enabled along with this, so the I2C driver knows to run to
+completion during sending a panic event.
 
 
 Other Pieces
 ------------
 
 Watchdog
+--------
 
 A watchdog timer is provided that implements the Linux-standard
 watchdog timer interface.  It has three module parameters that can be
 used to control it:
 
-  insmod ipmi_watchdog timeout=<t> pretimeout=<t> action=<action type>
-      preaction=<preaction type> preop=<preop type>
+  modprobe ipmi_watchdog timeout=<t> pretimeout=<t> action=<action type>
+      preaction=<preaction type> preop=<preop type> start_now=x
 
 The timeout is the number of seconds to the action, and the pretimeout
 is the amount of seconds before the reset that the pre-timeout panic will
-occur (if pretimeout is zero, then pretimeout will not be enabled).
+occur (if pretimeout is zero, then pretimeout will not be enabled).  Note
+that the pretimeout is the time before the final timeout.  So if the
+timeout is 50 seconds and the pretimeout is 10 seconds, then the pretimeout
+will occur in 40 seconds (10 seconds before the timeout).
 
 The action may be "reset", "power_cycle", or "power_off", and
 specifies what to do when the timer times out, and defaults to
@@ -344,16 +469,19 @@ When preop is set to "preop_give_data", one byte comes ready to read
 on the device when the pretimeout occurs.  Select and fasync work on
 the device, as well.
 
+If start_now is set to 1, the watchdog timer will start running as
+soon as the driver is loaded.
+
 When compiled into the kernel, the kernel command line is available
 for configuring the watchdog:
 
-  ipmi_wdog=<timeout>[,<pretimeout>[,<option>[,<options>....]]]
+  ipmi_watchdog.timeout=<t> ipmi_watchdog.pretimeout=<t>
+	ipmi_watchdog.action=<action type>
+	ipmi_watchdog.preaction=<preaction type>
+	ipmi_watchdog.preop=<preop type>
+	ipmi_watchdog.start_now=x
 
-The options are the actions and preaction above (if an option
-controlling the same thing is specified twice, the last is taken).  An
-options "start_now" is also there, if included, the watchdog will
-start running immediately when all the drivers are ready, it doesn't
-have to have a user hooked up to start it.
+The options are the same as the module parameter options.
 
 The watchdog will panic and start a 120 second reset timeout if it
 gets a pre-action.  During a panic or a reboot, the watchdog will
diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig
index 9940b2dccbea..b632538fff36 100644
--- a/drivers/char/ipmi/Kconfig
+++ b/drivers/char/ipmi/Kconfig
@@ -43,11 +43,13 @@ config IPMI_DEVICE_INTERFACE
          This provides an IOCTL interface to the IPMI message handler so
 	 userland processes may use IPMI.  It supports poll() and select().
 
-config IPMI_KCS
-       tristate 'IPMI KCS handler'
+config IPMI_SI
+       tristate 'IPMI System Interface handler'
        depends on IPMI_HANDLER
        help
-         Provides a driver for a KCS-style interface to a BMC.
+         Provides a driver for System Interfaces (KCS, SMIC, BT).
+	 Currently, only KCS and SMIC are supported.  If
+	 you are using IPMI, you should probably say "y" here.
 
 config IPMI_WATCHDOG
        tristate 'IPMI Watchdog Timer'
diff --git a/drivers/char/ipmi/Makefile b/drivers/char/ipmi/Makefile
index 1f55b46a8188..b7d8230721a2 100644
--- a/drivers/char/ipmi/Makefile
+++ b/drivers/char/ipmi/Makefile
@@ -2,12 +2,13 @@
 # Makefile for the ipmi drivers.
 #
 
-ipmi_kcs_drv-objs := ipmi_kcs_sm.o ipmi_kcs_intf.o
+ipmi_si-objs := ipmi_si_intf.o ipmi_kcs_sm.o ipmi_smic_sm.o ipmi_bt_sm.o
 
 obj-$(CONFIG_IPMI_HANDLER) += ipmi_msghandler.o
 obj-$(CONFIG_IPMI_DEVICE_INTERFACE) += ipmi_devintf.o
-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs_drv.o
+obj-$(CONFIG_IPMI_SI) += ipmi_si.o
 obj-$(CONFIG_IPMI_WATCHDOG) += ipmi_watchdog.o
 
-ipmi_kcs_drv.o:	$(ipmi_kcs_drv-objs)
-	$(LD) -r -o $@ $(ipmi_kcs_drv-objs) 
+ipmi_si.o:	$(ipmi_si-objs)
+	$(LD) -r -o $@ $(ipmi_si-objs)
+
diff --git a/drivers/char/ipmi/ipmi_bt_sm.c b/drivers/char/ipmi/ipmi_bt_sm.c
new file mode 100644
index 000000000000..622456a52e5c
--- /dev/null
+++ b/drivers/char/ipmi/ipmi_bt_sm.c
@@ -0,0 +1,513 @@
+/*
+ *  ipmi_bt_sm.c
+ *
+ *  The state machine for an Open IPMI BT sub-driver under ipmi_si.c, part
+ *  of the driver architecture at http://sourceforge.net/project/openipmi
+ *
+ *  Author:	Rocky Craig <first.last@hp.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.  */
+
+#include <linux/kernel.h> /* For printk. */
+#include <linux/string.h>
+#include <linux/ipmi_msgdefs.h>		/* for completion codes */
+#include "ipmi_si_sm.h"
+
+#define IPMI_BT_VERSION "v31"
+
+static int bt_debug = 0x00;	/* Production value 0, see following flags */
+
+#define	BT_DEBUG_ENABLE	1
+#define BT_DEBUG_MSG	2
+#define BT_DEBUG_STATES	4
+
+/* Typical "Get BT Capabilities" values are 2-3 retries, 5-10 seconds,
+   and 64 byte buffers.  However, one HP implementation wants 255 bytes of
+   buffer (with a documented message of 160 bytes) so go for the max.
+   Since the Open IPMI architecture is single-message oriented at this
+   stage, the queue depth of BT is of no concern. */
+
+#define BT_NORMAL_TIMEOUT	2000000	/* 2 seconds, in microseconds */
+#define BT_RETRY_LIMIT		2	/* error_recovery() gives up beyond this */
+#define BT_RESET_DELAY		6000000	/* 6 seconds after warm reset */
+
+enum bt_states {
+	BT_STATE_IDLE,
+	BT_STATE_XACTION_START,
+	BT_STATE_WRITE_BYTES,
+	BT_STATE_WRITE_END,		/* named in state2txt(), no case in bt_event() */
+	BT_STATE_WRITE_CONSUME,
+	BT_STATE_B2H_WAIT,
+	BT_STATE_READ_END,
+	BT_STATE_RESET1,		/* These must come last: bt_event() only */
+	BT_STATE_RESET2,		/* runs error_recovery() on a timeout for */
+	BT_STATE_RESET3,		/* states ordered before BT_STATE_RESET1 */
+	BT_STATE_RESTART,
+	BT_STATE_HOSED
+};
+
+struct si_sm_data {
+	enum bt_states	state;
+	enum bt_states	last_state;	/* assist printing and resets */
+	unsigned char	seq;		/* BT sequence number */
+	struct si_sm_io	*io;
+	unsigned char	write_data[IPMI_MAX_MSG_LENGTH + 2]; /* + length, seq */
+	int		write_count;
+	unsigned char	read_data[IPMI_MAX_MSG_LENGTH];
+	int		read_count;
+	int		truncated;
+	long		timeout;
+	unsigned int	error_retries;	/* end of "common" fields */
+	int		nonzero_status;	/* hung BMCs stay all 0 */
+};
+
+#define BT_CLR_WR_PTR	0x01	/* See IPMI 1.5 table 11.6.4 */
+#define BT_CLR_RD_PTR	0x02
+#define BT_H2B_ATN	0x04
+#define BT_B2H_ATN	0x08
+#define BT_SMS_ATN	0x10
+#define BT_OEM0		0x20
+#define BT_H_BUSY	0x40
+#define BT_B_BUSY	0x80
+
+/* Some bits are toggled on each write: write once to set it, once
+   more to clear it; writing a zero does nothing.  To absolutely
+   clear it, check its state and write if set.  This avoids the "get
+   current then use as mask" scheme to modify one bit.  Note that the
+   variable "bt" is hardcoded into these macros. */
+
+#define BT_STATUS	bt->io->inputb(bt->io, 0)
+#define BT_CONTROL(x)	bt->io->outputb(bt->io, 0, x)
+
+#define BMC2HOST	bt->io->inputb(bt->io, 1)
+#define HOST2BMC(x)	bt->io->outputb(bt->io, 1, x)
+
+#define BT_INTMASK_R	bt->io->inputb(bt->io, 2)
+#define BT_INTMASK_W(x)	bt->io->outputb(bt->io, 2, x)
+
+/* Convenience routines for debugging.  These are not multi-open safe!
+   Note the macros have hardcoded variables in them. */
+
+static char *state2txt(unsigned char state)
+{
+	static char *names[] = {	/* indexed by enum bt_states value */
+		[BT_STATE_IDLE]			= "IDLE",
+		[BT_STATE_XACTION_START]	= "XACTION",
+		[BT_STATE_WRITE_BYTES]		= "WR_BYTES",
+		[BT_STATE_WRITE_END]		= "WR_END",
+		[BT_STATE_WRITE_CONSUME]	= "WR_CONSUME",
+		[BT_STATE_B2H_WAIT]		= "B2H_WAIT",
+		[BT_STATE_READ_END]		= "RD_END",
+		[BT_STATE_RESET1]		= "RESET1",
+		[BT_STATE_RESET2]		= "RESET2",
+		[BT_STATE_RESET3]		= "RESET3",
+		[BT_STATE_RESTART]		= "RESTART",
+		[BT_STATE_HOSED]		= "HOSED",
+	};
+	return (state <= BT_STATE_HOSED) ? names[state] : "BAD STATE";
+}
+#define STATE2TXT state2txt(bt->state)
+
+static char *status2txt(unsigned char status, char *buf)
+{
+	static const struct { unsigned char bit; const char *txt; } flag[] = {
+		{ BT_B_BUSY, "B_BUSY " }, { BT_H_BUSY, "H_BUSY " },
+		{ BT_OEM0, "OEM0 " }, { BT_SMS_ATN, "SMS " },
+		{ BT_B2H_ATN, "B2H " }, { BT_H2B_ATN, "H2B " } };
+	unsigned int n;
+	strcpy(buf, "[ ");
+	for (n = 0; n < sizeof(flag) / sizeof(flag[0]); n++)
+		if (status & flag[n].bit) strcat(buf, flag[n].txt);
+	return strcat(buf, "]");
+}
+#define STATUS2TXT(buf) status2txt(status, buf)
+
+/* This will be called from within this module on a hosed condition */
+#define FIRST_SEQ	0
+static unsigned int bt_init_data(struct si_sm_data *bt, struct si_sm_io *io)
+{
+	/* Bind the register accessors, then put every counter and flag
+	   back to its initial value. */
+	bt->io = io;
+	bt->seq = FIRST_SEQ;
+	bt->state = bt->last_state = BT_STATE_IDLE;
+	bt->write_count = bt->read_count = 0;
+	bt->truncated = bt->nonzero_status = 0;
+	bt->error_retries = 0;
+	bt->timeout = BT_NORMAL_TIMEOUT;
+
+	return 3; /* We claim 3 bytes of space; ought to check SPMI table */
+}
+
+/* Frame one request (NetFn/LUN, Cmd, data...) for bt_event() to send.
+   Returns 0, -1 for a bad size, -2 if the state is neither idle nor hosed. */
+static int bt_start_transaction(struct si_sm_data *bt, unsigned char *data,
+				unsigned int size)
+{
+	unsigned int i;
+
+	/* Length and seq bytes are added, so size + 2 bytes must fit too */
+	if ((size < 2) || (size > IPMI_MAX_MSG_LENGTH)
+	    || (size + 2 > sizeof(bt->write_data))) return -1;
+	if ((bt->state != BT_STATE_IDLE) && (bt->state != BT_STATE_HOSED))
+		return -2;
+
+	if (bt_debug & BT_DEBUG_MSG) {
+		printk(KERN_WARNING "+++++++++++++++++++++++++++++++++++++\n");
+		printk(KERN_WARNING "BT: write seq=0x%02X:", bt->seq);
+		for (i = 0; i < size; i ++) printk (" %02x", data[i]);
+		printk("\n");
+	}
+	bt->write_data[0] = size + 1;	/* all data plus seq byte */
+	bt->write_data[1] = *data;	/* NetFn/LUN */
+	bt->write_data[2] = bt->seq;
+	memcpy(bt->write_data + 3, data + 1, size - 1);
+	bt->write_count = size + 2;
+
+	bt->error_retries = 0;
+	bt->read_count = bt->truncated = bt->nonzero_status = 0;
+	bt->state = BT_STATE_XACTION_START;
+	bt->last_state = BT_STATE_IDLE;
+	bt->timeout = BT_NORMAL_TIMEOUT;
+	return 0;
+}
+
+/* After the upper state machine has been told SI_SM_TRANSACTION_COMPLETE
+   it calls this.  Strip out the length and seq bytes. */
+
+static int bt_get_result(struct si_sm_data *bt,
+			   unsigned char *data,
+			   unsigned int length)
+{
+	int n;
+	int msg_len = bt->read_count - 2;	/* drop the length & seq bytes */
+
+	bt->read_count = 0;	/* paranoia */
+	/* A usable response always carries NetFn, Cmd and cCode */
+	if (msg_len < 3 || msg_len > IPMI_MAX_MSG_LENGTH) {
+		printk(KERN_WARNING "BT results: bad msg_len = %d\n", msg_len);
+		data[0] = bt->write_data[1] | 0x4;	/* Kludge a response */
+		data[1] = bt->write_data[3];
+		data[2] = IPMI_ERR_UNSPECIFIED;
+		return 3;
+	}
+	data[0] = bt->read_data[1];
+	data[1] = bt->read_data[3];
+	if (length < msg_len) bt->truncated = 1;
+	if (!bt->truncated) {
+		memcpy(data + 2, bt->read_data + 4, msg_len - 2);
+	} else {	/* flag may also have been set in read_all_bytes() */
+		data[2] = IPMI_ERR_MSG_TRUNCATED;
+		msg_len = 3;
+	}
+	if (!(bt_debug & BT_DEBUG_MSG)) return msg_len;
+
+	printk (KERN_WARNING "BT: res (raw)");
+	for (n = 0; n < msg_len; n++) printk(" %02x", data[n]);
+	printk ("\n");
+	return msg_len;
+}
+
+/* This bit's functionality is optional */
+#define BT_BMC_HWRST	0x80
+
+static void reset_flags(struct si_sm_data *bt)
+{
+	if (BT_STATUS & BT_H_BUSY) BT_CONTROL(BT_H_BUSY);	/* toggle if set */
+	if (BT_STATUS & BT_B_BUSY) BT_CONTROL(BT_B_BUSY);	/* toggle if set */
+	BT_CONTROL(BT_CLR_WR_PTR);
+	BT_CONTROL(BT_SMS_ATN);
+	BT_INTMASK_W(BT_BMC_HWRST);
+#ifdef DEVELOPMENT_ONLY_NOT_FOR_PRODUCTION
+	if (BT_STATUS & BT_B2H_ATN) {	/* read and discard a pending message */
+		int i;
+		BT_CONTROL(BT_H_BUSY);
+		BT_CONTROL(BT_B2H_ATN);
+		BT_CONTROL(BT_CLR_RD_PTR);
+		for (i = 0; i < IPMI_MAX_MSG_LENGTH + 2; i++) BMC2HOST;
+		BT_CONTROL(BT_H_BUSY);
+	}
+#endif
+}
+
+static inline void write_all_bytes(struct si_sm_data *bt)
+{
+	int n = 0;
+
+	if (bt_debug & BT_DEBUG_MSG) {	/* hex dump of the framed message */
+		printk(KERN_WARNING "BT: write %d bytes seq=0x%02X",
+			bt->write_count, bt->seq);
+		while (n < bt->write_count)
+			printk (" %02x", bt->write_data[n++]);
+		printk ("\n");
+	}
+	for (n = 0; n < bt->write_count; n++) HOST2BMC(bt->write_data[n]);
+}
+
+/* Pull one response into read_data[].  Returns 0 if it does not match the
+   request (bt_event() then retries), 1 otherwise -- bad lengths included. */
+static inline int read_all_bytes(struct si_sm_data *bt)
+{
+	int i;	/* not unsigned char: must be able to count past 255 */
+
+	bt->read_data[0] = BMC2HOST;
+	bt->read_count = bt->read_data[0];
+	if (bt_debug & BT_DEBUG_MSG)
+		printk(KERN_WARNING "BT: read %d bytes:", bt->read_count);
+	/* minimum: length, NetFn, Seq, Cmd, cCode == 5 total, or 4 more
+	   following the length byte. */
+	if (bt->read_count < 4 || bt->read_count >= IPMI_MAX_MSG_LENGTH) {
+		if (bt_debug & BT_DEBUG_MSG)
+			printk("bad length %d\n", bt->read_count);
+		bt->truncated = 1;
+		return 1;	/* let next XACTION START clean it up */
+	}
+	for (i = 1; i <= bt->read_count; i++) bt->read_data[i] = BMC2HOST;
+	bt->read_count++;	/* account for the length byte */
+
+	if (bt_debug & BT_DEBUG_MSG) {
+		for (i = 0; i < bt->read_count; i++)
+			printk (" %02x", bt->read_data[i]);
+		printk ("\n");
+	}
+	if (bt->seq != bt->write_data[2])	/* idiot check */
+		printk(KERN_WARNING "BT: internal error: sequence mismatch\n");
+
+	/* per the spec, the (NetFn, Seq, Cmd) tuples should match */
+	if ((bt->read_data[3] == bt->write_data[3]) &&		/* Cmd */
+	    (bt->read_data[2] == bt->write_data[2]) &&		/* Sequence */
+	    ((bt->read_data[1] & 0xF8) == (bt->write_data[1] & 0xF8)))
+		return 1;
+	if (bt_debug & BT_DEBUG_MSG) printk(KERN_WARNING "BT: bad packet: "
+		"want 0x(%02X, %02X, %02X) got (%02X, %02X, %02X)\n",
+		bt->write_data[1], bt->write_data[2], bt->write_data[3],
+		bt->read_data[1],  bt->read_data[2],  bt->read_data[3]);
+	return 0;
+}
+
+/* Modifies bt->state appropriately, need to get into the bt_event() switch */
+
+static void error_recovery(struct si_sm_data *bt, char *reason)
+{
+	unsigned char status;	/* STATUS2TXT() expects this exact name */
+	char buf[40]; /* For getting status */
+
+	bt->timeout = BT_NORMAL_TIMEOUT; /* various places want to retry */
+	status = BT_STATUS;
+	printk(KERN_WARNING "BT: %s in %s %s ", reason, STATE2TXT,
+	       STATUS2TXT(buf));
+
+	if (++bt->error_retries <= BT_RETRY_LIMIT) {
+		/* Sometimes the BMC queues get in an "off-by-one" state...*/
+		if ((bt->state == BT_STATE_B2H_WAIT) && (status & BT_B2H_ATN)) {
+			printk("retry B2H_WAIT\n");
+			return;
+		}
+		printk("restart command\n");
+		bt->state = BT_STATE_RESTART;
+		return;
+	}
+
+	/* Out of retries: mark the interface hosed, or try a reset */
+	printk("retry limit (%d) exceeded\n", BT_RETRY_LIMIT);
+	bt->state = BT_STATE_HOSED;
+	if (!bt->nonzero_status) {
+		printk(KERN_ERR "IPMI: BT stuck, try power cycle\n");
+		return;
+	}
+	if (bt->seq == FIRST_SEQ + BT_RETRY_LIMIT) {
+		/* most likely during insmod */
+		printk(KERN_WARNING "IPMI: BT reset (takes 5 secs)\n");
+		bt->state = BT_STATE_RESET1;
+	}
+}
+
+/* Check the status and (possibly) advance the BT state machine.  The
+   default return is SI_SM_CALL_WITH_DELAY. */
+
+static enum si_sm_result bt_event(struct si_sm_data *bt, long time)
+{
+	unsigned char status;	/* name is hardcoded in STATUS2TXT() */
+	char buf[40]; /* For getting status */
+	int i;			/* result of read_all_bytes() */
+
+	status = BT_STATUS;
+	bt->nonzero_status |= status;	/* any bit ever seen set sticks */
+
+	if ((bt_debug & BT_DEBUG_STATES) && (bt->state != bt->last_state))
+		printk(KERN_WARNING "BT: %s %s TO=%ld - %ld \n",
+			STATE2TXT,
+			STATUS2TXT(buf),
+			bt->timeout,
+			time);
+	bt->last_state = bt->state;
+
+	if (bt->state == BT_STATE_HOSED) return SI_SM_HOSED;
+
+	if (bt->state != BT_STATE_IDLE) {	/* do timeout test */
+
+		/* Certain states, on error conditions, can lock up a CPU
+		   because they are effectively in an infinite loop with
+		   CALL_WITHOUT_DELAY (right back here with time == 0).
+		   Prevent infinite lockup by ALWAYS decrementing timeout. */
+
+    	/* FIXME: bt_event is sometimes called with time > BT_NORMAL_TIMEOUT
+              (noticed in ipmi_smic_sm.c January 2004) */
+
+		if ((time <= 0) || (time >= BT_NORMAL_TIMEOUT)) time = 100;
+		bt->timeout -= time;
+		if ((bt->timeout < 0) && (bt->state < BT_STATE_RESET1)) {
+			error_recovery(bt, "timed out");
+			return SI_SM_CALL_WITHOUT_DELAY;
+		}
+	}
+
+	switch (bt->state) {
+
+    	case BT_STATE_IDLE:	/* check for asynchronous messages */
+		if (status & BT_SMS_ATN) {
+			BT_CONTROL(BT_SMS_ATN);	/* clear it */
+			return SI_SM_ATTN;
+		}
+		return SI_SM_IDLE;
+
+	case BT_STATE_XACTION_START:
+		if (status & BT_H_BUSY) {
+			BT_CONTROL(BT_H_BUSY);
+			break;
+		}
+    		if (status & BT_B2H_ATN) break;
+		bt->state = BT_STATE_WRITE_BYTES;
+		return SI_SM_CALL_WITHOUT_DELAY;	/* for logging */
+
+	case BT_STATE_WRITE_BYTES:
+		if (status & (BT_B_BUSY | BT_H2B_ATN)) break;
+		BT_CONTROL(BT_CLR_WR_PTR);
+		write_all_bytes(bt);
+		BT_CONTROL(BT_H2B_ATN);	/* clears too fast to catch? */
+		bt->state = BT_STATE_WRITE_CONSUME;
+		return SI_SM_CALL_WITHOUT_DELAY; /* it MIGHT sail through */
+
+	case BT_STATE_WRITE_CONSUME: /* BMCs usually blow right thru here */
+        	if (status & (BT_H2B_ATN | BT_B_BUSY)) break;
+		bt->state = BT_STATE_B2H_WAIT;
+		/* fall through with status */
+
+	/* Stay in BT_STATE_B2H_WAIT until a packet matches.  However, spinning
+	   hard here, constantly reading status, seems to hold off the
+	   generation of B2H_ATN so ALWAYS return CALL_WITH_DELAY. */
+
+	case BT_STATE_B2H_WAIT:
+    		if (!(status & BT_B2H_ATN)) break;
+
+		/* Assume ordered, uncached writes: no need to wait */
+		if (!(status & BT_H_BUSY)) BT_CONTROL(BT_H_BUSY); /* set */
+		BT_CONTROL(BT_B2H_ATN);		/* clear it, ACK to the BMC */
+		BT_CONTROL(BT_CLR_RD_PTR);	/* reset the queue */
+		i = read_all_bytes(bt);
+		BT_CONTROL(BT_H_BUSY);		/* clear */
+		if (!i) break;			/* Try this state again */
+		bt->state = BT_STATE_READ_END;
+		return SI_SM_CALL_WITHOUT_DELAY;	/* for logging */
+
+    	case BT_STATE_READ_END:
+
+		/* I could wait on BT_H_BUSY to go clear for a truly clean
+		   exit.  However, this is already done in XACTION_START
+		   and the (possible) extra loop/status/possible wait affects
+		   performance.  So, as long as it works, just ignore H_BUSY */
+
+#ifdef MAKE_THIS_TRUE_IF_NECESSARY
+
+		if (status & BT_H_BUSY) break;
+#endif
+		bt->seq++;
+		bt->state = BT_STATE_IDLE;
+		return SI_SM_TRANSACTION_COMPLETE;
+
+	case BT_STATE_RESET1:		/* clear flags, arm the reset delay */
+    		reset_flags(bt);
+    		bt->timeout = BT_RESET_DELAY;;
+		bt->state = BT_STATE_RESET2;
+		break;
+
+	case BT_STATE_RESET2:		/* Send a soft reset */
+		BT_CONTROL(BT_CLR_WR_PTR);
+		HOST2BMC(3);		/* number of bytes following */
+		HOST2BMC(0x18);		/* NetFn/LUN == Application, LUN 0 */
+		HOST2BMC(42);		/* Sequence number */
+		HOST2BMC(3);		/* Cmd == Soft reset */
+		BT_CONTROL(BT_H2B_ATN);
+		bt->state = BT_STATE_RESET3;
+		break;
+
+	case BT_STATE_RESET3:		/* wait out BT_RESET_DELAY */
+		if (bt->timeout > 0) return SI_SM_CALL_WITH_DELAY;
+		bt->state = BT_STATE_RESTART;	/* printk in debug modes */
+		break;
+
+	case BT_STATE_RESTART:		/* don't reset retries! */
+		bt->write_data[2] = ++bt->seq;
+		bt->read_count = 0;
+		bt->nonzero_status = 0;
+		bt->timeout = BT_NORMAL_TIMEOUT;
+		bt->state = BT_STATE_XACTION_START;
+		break;
+
+	default:	/* HOSED is supposed to be caught much earlier */
+		error_recovery(bt, "internal logic error");
+		break;
+  	}
+  	return SI_SM_CALL_WITH_DELAY;
+}
+
+static int bt_detect(struct si_sm_data *bt)
+{
+	/* Reading a bogus address yields all 1's in both the status and
+	   interrupt mask registers, which a working, self-initialized BMC
+	   can never show.  Returns 1 for "nothing there", 0 when found. */
+	if ((BT_STATUS != 0xFF) || (BT_INTMASK_R != 0xFF)) {
+		reset_flags(bt);
+		return 0;
+	}
+	return 1;
+}
+
+static void bt_cleanup(struct si_sm_data *bt)
+{	/* Intentionally empty: nothing in this file allocates anything */
+}
+
+static int bt_size(void)
+{
+	return sizeof(struct si_sm_data);	/* size of the BT state struct */
+}
+
+struct si_sm_handlers bt_smi_handlers =
+{	/* the only non-static symbol in this file: the BT entry points */
+	.version           = IPMI_BT_VERSION,
+	.init_data         = bt_init_data,
+	.start_transaction = bt_start_transaction,
+	.get_result        = bt_get_result,
+	.event             = bt_event,
+	.detect            = bt_detect,
+	.cleanup           = bt_cleanup,
+	.size              = bt_size,
+};
diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index b69ff3d19284..afd1de325f93 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -33,6 +33,7 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/errno.h>
 #include <asm/system.h>
 #include <linux/sched.h>
@@ -44,6 +45,8 @@
 #include <asm/semaphore.h>
 #include <linux/init.h>
 
+#define IPMI_DEVINTF_VERSION "v31"
+
 struct ipmi_file_private
 {
 	ipmi_user_t          user;
@@ -53,6 +56,8 @@ struct ipmi_file_private
 	struct fasync_struct *fasync_queue;
 	wait_queue_head_t    wait;
 	struct semaphore     recv_sem;
+	int                  default_retries;
+	unsigned int         default_retry_time_ms;
 };
 
 static void file_receive_handler(struct ipmi_recv_msg *msg,
@@ -138,6 +143,10 @@ static int ipmi_open(struct inode *inode, struct file *file)
 	priv->fasync_queue = NULL;
 	sema_init(&(priv->recv_sem), 1);
 
+	/* Use the low-level defaults. */
+	priv->default_retries = -1;
+	priv->default_retry_time_ms = 0;
+
 	return 0;
 }
 
@@ -158,6 +167,63 @@ static int ipmi_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/* Copy the address and payload of a userland request into kernel memory
+   and submit it with the given retry settings.  Returns -EINVAL, -EFAULT,
+   -ENOMEM or -EMSGSIZE on local failures, otherwise the result of
+   ipmi_validate_addr() / ipmi_request_settime().  req->msg.data must not
+   be used afterwards: it is left pointing at the freed bounce buffer. */
+static int handle_send_req(ipmi_user_t     user,
+			   struct ipmi_req *req,
+			   int             retries,
+			   unsigned int    retry_time_ms)
+{
+	int              rv;
+	struct ipmi_addr addr;
+	unsigned char    *msgdata;
+
+	if (req->addr_len > sizeof(struct ipmi_addr))
+		return -EINVAL;
+
+	if (copy_from_user(&addr, req->addr, req->addr_len))
+		return -EFAULT;
+
+	msgdata = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL);
+	if (!msgdata)
+		return -ENOMEM;
+
+	/* From here out we cannot return, we must jump to "out" for
+	   error exits to free msgdata. */
+
+	rv = ipmi_validate_addr(&addr, req->addr_len);
+	if (rv)
+		goto out;
+
+	if (req->msg.data != NULL) {
+		if (req->msg.data_len > IPMI_MAX_MSG_LENGTH) {
+			rv = -EMSGSIZE;
+			goto out;
+		}
+
+		/* msgdata is the heap buffer itself; passing &msgdata would
+		   overwrite the pointer and the stack behind it instead. */
+		if (copy_from_user(msgdata, req->msg.data, req->msg.data_len)) {
+			rv = -EFAULT;
+			goto out;
+		}
+	} else {
+		req->msg.data_len = 0;
+	}
+	/* Point the message at the kernel copy before handing it on */
+	req->msg.data = msgdata;
+
+	rv = ipmi_request_settime(user, &addr, req->msgid, &(req->msg),
+				  NULL, 0, retries, retry_time_ms);
+
+ out:
+	kfree(msgdata);
+	return rv;
+}
+
 static int ipmi_ioctl(struct inode  *inode,
 		      struct file   *file,
 		      unsigned int  cmd,
@@ -170,54 +236,33 @@ static int ipmi_ioctl(struct inode  *inode,
 	{
 	case IPMICTL_SEND_COMMAND:
 	{
-		struct ipmi_req    req;
-		struct ipmi_addr   addr;
-		unsigned char msgdata[IPMI_MAX_MSG_LENGTH];
+		struct ipmi_req req;
 
 		if (copy_from_user(&req, (void *) data, sizeof(req))) {
 			rv = -EFAULT;
 			break;
 		}
 
-		if (req.addr_len > sizeof(struct ipmi_addr))
-		{
-			rv = -EINVAL;
-			break;
-		}
+		rv = handle_send_req(priv->user,
+				     &req,
+				     priv->default_retries,
+				     priv->default_retry_time_ms);
+		break;
+	}
+
+	case IPMICTL_SEND_COMMAND_SETTIME:
+	{
+		struct ipmi_req_settime req;
 
-		if (copy_from_user(&addr, req.addr, req.addr_len)) {
+		if (copy_from_user(&req, (void *) data, sizeof(req))) {
 			rv = -EFAULT;
 			break;
 		}
 
-		rv = ipmi_validate_addr(&addr, req.addr_len);
-		if (rv)
-			break;
-
-		if (req.msg.data != NULL) {
-			if (req.msg.data_len > IPMI_MAX_MSG_LENGTH) {
-				rv = -EMSGSIZE;
-				break;
-			}
-
-			if (copy_from_user(&msgdata,
-					   req.msg.data,
-					   req.msg.data_len))
-			{
-				rv = -EFAULT;
-				break;
-			}
-		} else {
-			req.msg.data_len = 0;
-		}
-
-		req.msg.data = msgdata;
-
-		rv = ipmi_request(priv->user,
-				  &addr,
-				  req.msgid,
-				  &(req.msg),
-				  0);
+		rv = handle_send_req(priv->user,
+				     &req.req,
+				     req.retries,
+				     req.retry_time_ms);
 		break;
 	}
 
@@ -416,7 +461,36 @@ static int ipmi_ioctl(struct inode  *inode,
 		rv = 0;
 		break;
 	}
+	case IPMICTL_SET_TIMING_PARMS_CMD:
+	{
+		struct ipmi_timing_parms parms;
+
+		if (copy_from_user(&parms, (void *) data, sizeof(parms))) {
+			rv = -EFAULT;
+			break;
+		}
+
+		priv->default_retries = parms.retries;
+		priv->default_retry_time_ms = parms.retry_time_ms;
+		rv = 0;
+		break;
+	}
+
+	case IPMICTL_GET_TIMING_PARMS_CMD:
+	{
+		struct ipmi_timing_parms parms;
+
+		parms.retries = priv->default_retries;
+		parms.retry_time_ms = priv->default_retry_time_ms;
 
+		if (copy_to_user((void *) data, &parms, sizeof(parms))) {
+			rv = -EFAULT;
+			break;
+		}
+
+		rv = 0;
+		break;
+	}
 	}
   
 	return rv;
@@ -435,29 +509,30 @@ static struct file_operations ipmi_fops = {
 #define DEVICE_NAME     "ipmidev"
 
 static int ipmi_major = 0;
-MODULE_PARM(ipmi_major, "i");
-
-#define MAX_DEVICES 10
+module_param(ipmi_major, int, 0);
+MODULE_PARM_DESC(ipmi_major, "Sets the major number of the IPMI device.  By"
+		 " default, or if you set it to zero, it will choose the next"
+		 " available device.  Setting it to -1 will disable the"
+		 " interface.  Other values will set the major device number"
+		 " to that value.");
 
 static void ipmi_new_smi(int if_num)
 {
-	if (if_num <= MAX_DEVICES) {
-		devfs_mk_cdev(MKDEV(ipmi_major, if_num),
-				S_IFCHR | S_IRUSR | S_IWUSR,
-				"ipmidev/%d", if_num);
-	}
+	devfs_mk_cdev(MKDEV(ipmi_major, if_num),
+		      S_IFCHR | S_IRUSR | S_IWUSR,
+		      "ipmidev/%d", if_num);
 }
 
 static void ipmi_smi_gone(int if_num)
 {
-	if (if_num <= MAX_DEVICES)
-		devfs_remove("ipmidev/%d", if_num);
+	devfs_remove("ipmidev/%d", if_num);
 }
 
 static struct ipmi_smi_watcher smi_watcher =
 {
-	.new_smi	= ipmi_new_smi,
-	.smi_gone	= ipmi_smi_gone,
+	.owner    = THIS_MODULE,
+	.new_smi  = ipmi_new_smi,
+	.smi_gone = ipmi_smi_gone,
 };
 
 static __init int init_ipmi_devintf(void)
@@ -467,6 +542,9 @@ static __init int init_ipmi_devintf(void)
 	if (ipmi_major < 0)
 		return -EINVAL;
 
+	printk(KERN_INFO "ipmi device interface version "
+	       IPMI_DEVINTF_VERSION "\n");
+
 	rv = register_chrdev(ipmi_major, DEVICE_NAME, &ipmi_fops);
 	if (rv < 0) {
 		printk(KERN_ERR "ipmi: can't get major %d\n", ipmi_major);
@@ -482,13 +560,10 @@ static __init int init_ipmi_devintf(void)
 	rv = ipmi_smi_watcher_register(&smi_watcher);
 	if (rv) {
 		unregister_chrdev(ipmi_major, DEVICE_NAME);
-		printk(KERN_WARNING "ipmi: can't register smi watcher");
+		printk(KERN_WARNING "ipmi: can't register smi watcher\n");
 		return rv;
 	}
 
-	printk(KERN_INFO "ipmi: device interface at char major %d\n",
-	       ipmi_major);
-
 	return 0;
 }
 module_init(init_ipmi_devintf);
@@ -500,21 +575,5 @@ static __exit void cleanup_ipmi(void)
 	unregister_chrdev(ipmi_major, DEVICE_NAME);
 }
 module_exit(cleanup_ipmi);
-#ifndef MODULE
-static __init int ipmi_setup (char *str)
-{
-	int x;
-
-	if (get_option (&str, &x)) {
-		/* ipmi=x sets the major number to x. */
-		ipmi_major = x;
-	} else if (!strcmp(str, "off")) {
-		ipmi_major = -1;
-	}
-
-	return 1;
-}
-#endif
 
-__setup("ipmi=", ipmi_setup);
 MODULE_LICENSE("GPL");
diff --git a/drivers/char/ipmi/ipmi_kcs_intf.c b/drivers/char/ipmi/ipmi_kcs_intf.c
deleted file mode 100644
index f215d7697160..000000000000
--- a/drivers/char/ipmi/ipmi_kcs_intf.c
+++ /dev/null
@@ -1,1305 +0,0 @@
-/*
- * ipmi_kcs_intf.c
- *
- * The interface to the IPMI driver for the KCS.
- *
- * Author: MontaVista Software, Inc.
- *         Corey Minyard <minyard@mvista.com>
- *         source@mvista.com
- *
- * Copyright 2002 MontaVista Software Inc.
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
- *
- *
- *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
- *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/*
- * This file holds the "policy" for the interface to the KCS state
- * machine.  It does the configuration, handles timers and interrupts,
- * and drives the real KCS state machine.
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <asm/system.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/errno.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/list.h>
-#include <linux/ioport.h>
-#ifdef CONFIG_HIGH_RES_TIMERS
-#include <linux/hrtime.h>
-#endif
-#include <linux/interrupt.h>
-#include <linux/rcupdate.h>
-#include <linux/ipmi_smi.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include "ipmi_kcs_sm.h"
-#include <linux/init.h>
-
-/* Measure times between events in the driver. */
-#undef DEBUG_TIMING
-
-/* Timing parameters.  Call every 10 ms when not doing anything,
-   otherwise call every KCS_SHORT_TIMEOUT_USEC microseconds. */
-#define KCS_TIMEOUT_TIME_USEC	10000
-#define KCS_USEC_PER_JIFFY	(1000000/HZ)
-#define KCS_TIMEOUT_JIFFIES	(KCS_TIMEOUT_TIME_USEC/KCS_USEC_PER_JIFFY)
-#define KCS_SHORT_TIMEOUT_USEC  250 /* .25ms when the SM request a
-                                       short timeout */
-
-#ifdef CONFIG_IPMI_KCS
-/* This forces a dependency to the config file for this option. */
-#endif
-
-enum kcs_intf_state {
-	KCS_NORMAL,
-	KCS_GETTING_FLAGS,
-	KCS_GETTING_EVENTS,
-	KCS_CLEARING_FLAGS,
-	KCS_CLEARING_FLAGS_THEN_SET_IRQ,
-	KCS_GETTING_MESSAGES,
-	KCS_ENABLE_INTERRUPTS1,
-	KCS_ENABLE_INTERRUPTS2
-	/* FIXME - add watchdog stuff. */
-};
-
-struct kcs_info
-{
-	ipmi_smi_t          intf;
-	struct kcs_data     *kcs_sm;
-	spinlock_t          kcs_lock;
-	spinlock_t          msg_lock;
-	struct list_head    xmit_msgs;
-	struct list_head    hp_xmit_msgs;
-	struct ipmi_smi_msg *curr_msg;
-	enum kcs_intf_state kcs_state;
-
-	/* Flags from the last GET_MSG_FLAGS command, used when an ATTN
-	   is set to hold the flags until we are done handling everything
-	   from the flags. */
-#define RECEIVE_MSG_AVAIL	0x01
-#define EVENT_MSG_BUFFER_FULL	0x02
-#define WDT_PRE_TIMEOUT_INT	0x08
-	unsigned char       msg_flags;
-
-	/* If set to true, this will request events the next time the
-	   state machine is idle. */
-	atomic_t            req_events;
-
-	/* If true, run the state machine to completion on every send
-	   call.  Generally used after a panic to make sure stuff goes
-	   out. */
-	int                 run_to_completion;
-
-	/* The I/O port of a KCS interface. */
-	int                 port;
-
-	/* zero if no irq; */
-	int                 irq;
-
-	/* The physical and remapped memory addresses of a KCS interface. */
-	unsigned long	    physaddr;
-	unsigned char	    *addr;
-
-	/* The timer for this kcs. */
-	struct timer_list   kcs_timer;
-
-	/* The time (in jiffies) the last timeout occurred at. */
-	unsigned long       last_timeout_jiffies;
-
-	/* Used to gracefully stop the timer without race conditions. */
-	volatile int        stop_operation;
-	volatile int        timer_stopped;
-
-	/* The driver will disable interrupts when it gets into a
-	   situation where it cannot handle messages due to lack of
-	   memory.  Once that situation clears up, it will re-enable
-	   interrupts. */
-	int                 interrupt_disabled;
-};
-
-static void kcs_restart_short_timer(struct kcs_info *kcs_info);
-
-static void deliver_recv_msg(struct kcs_info *kcs_info, struct ipmi_smi_msg *msg)
-{
-	/* Deliver the message to the upper layer with the lock
-           released. */
-	spin_unlock(&(kcs_info->kcs_lock));
-	ipmi_smi_msg_received(kcs_info->intf, msg);
-	spin_lock(&(kcs_info->kcs_lock));
-}
-
-static void return_hosed_msg(struct kcs_info *kcs_info)
-{
-	struct ipmi_smi_msg *msg = kcs_info->curr_msg;
-
-	/* Make it a reponse */
-	msg->rsp[0] = msg->data[0] | 4;
-	msg->rsp[1] = msg->data[1];
-	msg->rsp[2] = 0xFF; /* Unknown error. */
-	msg->rsp_size = 3;
-			
-	kcs_info->curr_msg = NULL;
-	deliver_recv_msg(kcs_info, msg);
-}
-
-static enum kcs_result start_next_msg(struct kcs_info *kcs_info)
-{
-	int              rv;
-	struct list_head *entry = NULL;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-#endif
-
-	/* No need to save flags, we aleady have interrupts off and we
-	   already hold the KCS lock. */
-	spin_lock(&(kcs_info->msg_lock));
-	
-	/* Pick the high priority queue first. */
-	if (! list_empty(&(kcs_info->hp_xmit_msgs))) {
-		entry = kcs_info->hp_xmit_msgs.next;
-	} else if (! list_empty(&(kcs_info->xmit_msgs))) {
-		entry = kcs_info->xmit_msgs.next;
-	}
-
-	if (!entry) {
-		kcs_info->curr_msg = NULL;
-		rv = KCS_SM_IDLE;
-	} else {
-		int err;
-
-		list_del(entry);
-		kcs_info->curr_msg = list_entry(entry,
-						struct ipmi_smi_msg,
-						link);
-#ifdef DEBUG_TIMING
-		do_gettimeofday(&t);
-		printk("**Start2: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-		err = start_kcs_transaction(kcs_info->kcs_sm,
-					   kcs_info->curr_msg->data,
-					   kcs_info->curr_msg->data_size);
-		if (err) {
-			return_hosed_msg(kcs_info);
-		}
-
-		rv = KCS_CALL_WITHOUT_DELAY;
-	}
-	spin_unlock(&(kcs_info->msg_lock));
-
-	return rv;
-}
-
-static void start_enable_irq(struct kcs_info *kcs_info)
-{
-	unsigned char msg[2];
-
-	/* If we are enabling interrupts, we have to tell the
-	   BMC to use them. */
-	msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-	msg[1] = IPMI_GET_BMC_GLOBAL_ENABLES_CMD;
-
-	start_kcs_transaction(kcs_info->kcs_sm, msg, 2);
-	kcs_info->kcs_state = KCS_ENABLE_INTERRUPTS1;
-}
-
-static void start_clear_flags(struct kcs_info *kcs_info)
-{
-	unsigned char msg[3];
-
-	/* Make sure the watchdog pre-timeout flag is not set at startup. */
-	msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-	msg[1] = IPMI_CLEAR_MSG_FLAGS_CMD;
-	msg[2] = WDT_PRE_TIMEOUT_INT;
-
-	start_kcs_transaction(kcs_info->kcs_sm, msg, 3);
-	kcs_info->kcs_state = KCS_CLEARING_FLAGS;
-}
-
-/* When we have a situtaion where we run out of memory and cannot
-   allocate messages, we just leave them in the BMC and run the system
-   polled until we can allocate some memory.  Once we have some
-   memory, we will re-enable the interrupt. */
-static inline void disable_kcs_irq(struct kcs_info *kcs_info)
-{
-	if ((kcs_info->irq) && (!kcs_info->interrupt_disabled)) {
-		disable_irq_nosync(kcs_info->irq);
-		kcs_info->interrupt_disabled = 1;
-	}
-}
-
-static inline void enable_kcs_irq(struct kcs_info *kcs_info)
-{
-	if ((kcs_info->irq) && (kcs_info->interrupt_disabled)) {
-		enable_irq(kcs_info->irq);
-		kcs_info->interrupt_disabled = 0;
-	}
-}
-
-static void handle_flags(struct kcs_info *kcs_info)
-{
-	if (kcs_info->msg_flags & WDT_PRE_TIMEOUT_INT) {
-		/* Watchdog pre-timeout */
-		start_clear_flags(kcs_info);
-		kcs_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT;
-		spin_unlock(&(kcs_info->kcs_lock));
-		ipmi_smi_watchdog_pretimeout(kcs_info->intf);
-		spin_lock(&(kcs_info->kcs_lock));
-	} else if (kcs_info->msg_flags & RECEIVE_MSG_AVAIL) {
-		/* Messages available. */
-		kcs_info->curr_msg = ipmi_alloc_smi_msg();
-		if (!kcs_info->curr_msg) {
-			disable_kcs_irq(kcs_info);
-			kcs_info->kcs_state = KCS_NORMAL;
-			return;
-		}
-		enable_kcs_irq(kcs_info);
-
-		kcs_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
-		kcs_info->curr_msg->data[1] = IPMI_GET_MSG_CMD;
-		kcs_info->curr_msg->data_size = 2;
-
-		start_kcs_transaction(kcs_info->kcs_sm,
-				      kcs_info->curr_msg->data,
-				      kcs_info->curr_msg->data_size);
-		kcs_info->kcs_state = KCS_GETTING_MESSAGES;
-	} else if (kcs_info->msg_flags & EVENT_MSG_BUFFER_FULL) {
-		/* Events available. */
-		kcs_info->curr_msg = ipmi_alloc_smi_msg();
-		if (!kcs_info->curr_msg) {
-			disable_kcs_irq(kcs_info);
-			kcs_info->kcs_state = KCS_NORMAL;
-			return;
-		}
-		enable_kcs_irq(kcs_info);
-
-		kcs_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
-		kcs_info->curr_msg->data[1] = IPMI_READ_EVENT_MSG_BUFFER_CMD;
-		kcs_info->curr_msg->data_size = 2;
-
-		start_kcs_transaction(kcs_info->kcs_sm,
-				      kcs_info->curr_msg->data,
-				      kcs_info->curr_msg->data_size);
-		kcs_info->kcs_state = KCS_GETTING_EVENTS;
-	} else {
-		kcs_info->kcs_state = KCS_NORMAL;
-	}
-}
-
-static void handle_transaction_done(struct kcs_info *kcs_info)
-{
-	struct ipmi_smi_msg *msg;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-
-	do_gettimeofday(&t);
-	printk("**Done: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-	switch (kcs_info->kcs_state) {
-	case KCS_NORMAL:
-		if (!kcs_info->curr_msg)
-			break;
-			
-		kcs_info->curr_msg->rsp_size
-			= kcs_get_result(kcs_info->kcs_sm,
-					 kcs_info->curr_msg->rsp,
-					 IPMI_MAX_MSG_LENGTH);
-		
-		/* Do this here becase deliver_recv_msg() releases the
-		   lock, and a new message can be put in during the
-		   time the lock is released. */
-		msg = kcs_info->curr_msg;
-		kcs_info->curr_msg = NULL;
-		deliver_recv_msg(kcs_info, msg);
-		break;
-		
-	case KCS_GETTING_FLAGS:
-	{
-		unsigned char msg[4];
-		unsigned int  len;
-
-		/* We got the flags from the KCS, now handle them. */
-		len = kcs_get_result(kcs_info->kcs_sm, msg, 4);
-		if (msg[2] != 0) {
-			/* Error fetching flags, just give up for
-			   now. */
-			kcs_info->kcs_state = KCS_NORMAL;
-		} else if (len < 3) {
-			/* Hmm, no flags.  That's technically illegal, but
-			   don't use uninitialized data. */
-			kcs_info->kcs_state = KCS_NORMAL;
-		} else {
-			kcs_info->msg_flags = msg[3];
-			handle_flags(kcs_info);
-		}
-		break;
-	}
-
-	case KCS_CLEARING_FLAGS:
-	case KCS_CLEARING_FLAGS_THEN_SET_IRQ:
-	{
-		unsigned char msg[3];
-
-		/* We cleared the flags. */
-		kcs_get_result(kcs_info->kcs_sm, msg, 3);
-		if (msg[2] != 0) {
-			/* Error clearing flags */
-			printk(KERN_WARNING
-			       "ipmi_kcs: Error clearing flags: %2.2x\n",
-			       msg[2]);
-		}
-		if (kcs_info->kcs_state == KCS_CLEARING_FLAGS_THEN_SET_IRQ)
-			start_enable_irq(kcs_info);
-		else
-			kcs_info->kcs_state = KCS_NORMAL;
-		break;
-	}
-
-	case KCS_GETTING_EVENTS:
-	{
-		kcs_info->curr_msg->rsp_size
-			= kcs_get_result(kcs_info->kcs_sm,
-					 kcs_info->curr_msg->rsp,
-					 IPMI_MAX_MSG_LENGTH);
-
-		/* Do this here becase deliver_recv_msg() releases the
-		   lock, and a new message can be put in during the
-		   time the lock is released. */
-		msg = kcs_info->curr_msg;
-		kcs_info->curr_msg = NULL;
-		if (msg->rsp[2] != 0) {
-			/* Error getting event, probably done. */
-			msg->done(msg);
-
-			/* Take off the event flag. */
-			kcs_info->msg_flags &= ~EVENT_MSG_BUFFER_FULL;
-		} else {
-			deliver_recv_msg(kcs_info, msg);
-		}
-		handle_flags(kcs_info);
-		break;
-	}
-
-	case KCS_GETTING_MESSAGES:
-	{
-		kcs_info->curr_msg->rsp_size
-			= kcs_get_result(kcs_info->kcs_sm,
-					 kcs_info->curr_msg->rsp,
-					 IPMI_MAX_MSG_LENGTH);
-
-		/* Do this here becase deliver_recv_msg() releases the
-		   lock, and a new message can be put in during the
-		   time the lock is released. */
-		msg = kcs_info->curr_msg;
-		kcs_info->curr_msg = NULL;
-		if (msg->rsp[2] != 0) {
-			/* Error getting event, probably done. */
-			msg->done(msg);
-
-			/* Take off the msg flag. */
-			kcs_info->msg_flags &= ~RECEIVE_MSG_AVAIL;
-		} else {
-			deliver_recv_msg(kcs_info, msg);
-		}
-		handle_flags(kcs_info);
-		break;
-	}
-
-	case KCS_ENABLE_INTERRUPTS1:
-	{
-		unsigned char msg[4];
-
-		/* We got the flags from the KCS, now handle them. */
-		kcs_get_result(kcs_info->kcs_sm, msg, 4);
-		if (msg[2] != 0) {
-			printk(KERN_WARNING
-			       "ipmi_kcs: Could not enable interrupts"
-			       ", failed get, using polled mode.\n");
-			kcs_info->kcs_state = KCS_NORMAL;
-		} else {
-			msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-			msg[1] = IPMI_SET_BMC_GLOBAL_ENABLES_CMD;
-			msg[2] = msg[3] | 1; /* enable msg queue int */
-			start_kcs_transaction(kcs_info->kcs_sm, msg,3);
-			kcs_info->kcs_state = KCS_ENABLE_INTERRUPTS2;
-		}
-		break;
-	}
-
-	case KCS_ENABLE_INTERRUPTS2:
-	{
-		unsigned char msg[4];
-
-		/* We got the flags from the KCS, now handle them. */
-		kcs_get_result(kcs_info->kcs_sm, msg, 4);
-		if (msg[2] != 0) {
-			printk(KERN_WARNING
-			       "ipmi_kcs: Could not enable interrupts"
-			       ", failed set, using polled mode.\n");
-		}
-		kcs_info->kcs_state = KCS_NORMAL;
-		break;
-	}
-	}
-}
-
-/* Called on timeouts and events.  Timeouts should pass the elapsed
-   time, interrupts should pass in zero. */
-static enum kcs_result kcs_event_handler(struct kcs_info *kcs_info, int time)
-{
-	enum kcs_result kcs_result;
-
- restart:
-	/* There used to be a loop here that waited a little while
-	   (around 25us) before giving up.  That turned out to be
-	   pointless, the minimum delays I was seeing were in the 300us
-	   range, which is far too long to wait in an interrupt.  So
-	   we just run until the state machine tells us something
-	   happened or it needs a delay. */
-	kcs_result = kcs_event(kcs_info->kcs_sm, time);
-	time = 0;
-	while (kcs_result == KCS_CALL_WITHOUT_DELAY)
-	{
-		kcs_result = kcs_event(kcs_info->kcs_sm, 0);
-	}
-
-	if (kcs_result == KCS_TRANSACTION_COMPLETE)
-	{
-		handle_transaction_done(kcs_info);
-		kcs_result = kcs_event(kcs_info->kcs_sm, 0);
-	}
-	else if (kcs_result == KCS_SM_HOSED)
-	{
-		if (kcs_info->curr_msg != NULL) {
-			/* If we were handling a user message, format
-                           a response to send to the upper layer to
-                           tell it about the error. */
-			return_hosed_msg(kcs_info);
-		}
-		kcs_result = kcs_event(kcs_info->kcs_sm, 0);
-		kcs_info->kcs_state = KCS_NORMAL;
-	}
-
-	/* We prefer handling attn over new messages. */
-	if (kcs_result == KCS_ATTN)
-	{
-		unsigned char msg[2];
-
-		/* Got a attn, send down a get message flags to see
-                   what's causing it.  It would be better to handle
-                   this in the upper layer, but due to the way
-                   interrupts work with the KCS, that's not really
-                   possible. */
-		msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-		msg[1] = IPMI_GET_MSG_FLAGS_CMD;
-
-		start_kcs_transaction(kcs_info->kcs_sm, msg, 2);
-		kcs_info->kcs_state = KCS_GETTING_FLAGS;
-		goto restart;
-	}
-
-	/* If we are currently idle, try to start the next message. */
-	if (kcs_result == KCS_SM_IDLE) {
-		kcs_result = start_next_msg(kcs_info);
-		if (kcs_result != KCS_SM_IDLE)
-			goto restart;
-        }
-
-	if ((kcs_result == KCS_SM_IDLE)
-	    && (atomic_read(&kcs_info->req_events)))
-	{
-		/* We are idle and the upper layer requested that I fetch
-		   events, so do so. */
-		unsigned char msg[2];
-
-		atomic_set(&kcs_info->req_events, 0);
-		msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-		msg[1] = IPMI_GET_MSG_FLAGS_CMD;
-
-		start_kcs_transaction(kcs_info->kcs_sm, msg, 2);
-		kcs_info->kcs_state = KCS_GETTING_FLAGS;
-		goto restart;
-	}
-
-	return kcs_result;
-}
-
-static void sender(void                *send_info,
-		   struct ipmi_smi_msg *msg,
-		   int                 priority)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) send_info;
-	enum kcs_result result;
-	unsigned long   flags;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-#endif
-
-	spin_lock_irqsave(&(kcs_info->msg_lock), flags);
-#ifdef DEBUG_TIMING
-	do_gettimeofday(&t);
-	printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-
-	if (kcs_info->run_to_completion) {
-		/* If we are running to completion, then throw it in
-		   the list and run transactions until everything is
-		   clear.  Priority doesn't matter here. */
-		list_add_tail(&(msg->link), &(kcs_info->xmit_msgs));
-
-		/* We have to release the msg lock and claim the kcs
-		   lock in this case, because of race conditions. */
-		spin_unlock_irqrestore(&(kcs_info->msg_lock), flags);
-
-		spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-		result = kcs_event_handler(kcs_info, 0);
-		while (result != KCS_SM_IDLE) {
-			udelay(KCS_SHORT_TIMEOUT_USEC);
-			result = kcs_event_handler(kcs_info,
-						   KCS_SHORT_TIMEOUT_USEC);
-		}
-		spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-		return;
-	} else {
-		if (priority > 0) {
-			list_add_tail(&(msg->link), &(kcs_info->hp_xmit_msgs));
-		} else {
-			list_add_tail(&(msg->link), &(kcs_info->xmit_msgs));
-		}
-	}
-	spin_unlock_irqrestore(&(kcs_info->msg_lock), flags);
-
-	spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-	if ((kcs_info->kcs_state == KCS_NORMAL)
-	    && (kcs_info->curr_msg == NULL))
-	{
-		start_next_msg(kcs_info);
-		kcs_restart_short_timer(kcs_info);
-	}
-	spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-}
-
-static void set_run_to_completion(void *send_info, int i_run_to_completion)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) send_info;
-	enum kcs_result result;
-	unsigned long   flags;
-
-	spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-
-	kcs_info->run_to_completion = i_run_to_completion;
-	if (i_run_to_completion) {
-		result = kcs_event_handler(kcs_info, 0);
-		while (result != KCS_SM_IDLE) {
-			udelay(KCS_SHORT_TIMEOUT_USEC);
-			result = kcs_event_handler(kcs_info,
-						   KCS_SHORT_TIMEOUT_USEC);
-		}
-	}
-
-	spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-}
-
-static void request_events(void *send_info)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) send_info;
-
-	atomic_set(&kcs_info->req_events, 1);
-}
-
-static int initialized = 0;
-
-/* Must be called with interrupts off and with the kcs_lock held. */
-static void kcs_restart_short_timer(struct kcs_info *kcs_info)
-{
-	if (del_timer(&(kcs_info->kcs_timer))) {
-#ifdef CONFIG_HIGH_RES_TIMERS
-		unsigned long jiffies_now;
-
-		/* If we don't delete the timer, then it will go off
-		   immediately, anyway.  So we only process if we
-		   actually delete the timer. */
-
-		/* We already have irqsave on, so no need for it
-                   here. */
-		read_lock(&xtime_lock);
-		jiffies_now = jiffies;
-		kcs_info->kcs_timer.expires = jiffies_now;
-
-		kcs_info->kcs_timer.sub_expires
-			= quick_update_jiffies_sub(jiffies_now);
-		read_unlock(&xtime_lock);
-
-		kcs_info->kcs_timer.sub_expires
-			+= usec_to_arch_cycles(KCS_SHORT_TIMEOUT_USEC);
-		while (kcs_info->kcs_timer.sub_expires >= cycles_per_jiffies) {
-			kcs_info->kcs_timer.expires++;
-			kcs_info->kcs_timer.sub_expires -= cycles_per_jiffies;
-		}
-#else
-		kcs_info->kcs_timer.expires = jiffies + 1;
-#endif
-		add_timer(&(kcs_info->kcs_timer));
-	}
-}
-
-static void kcs_timeout(unsigned long data)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) data;
-	enum kcs_result kcs_result;
-	unsigned long   flags;
-	unsigned long   jiffies_now;
-	unsigned long   time_diff;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-#endif
-
-	if (kcs_info->stop_operation) {
-		kcs_info->timer_stopped = 1;
-		return;
-	}
-
-	spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-#ifdef DEBUG_TIMING
-	do_gettimeofday(&t);
-	printk("**Timer: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-	jiffies_now = jiffies;
-
-	time_diff = ((jiffies_now - kcs_info->last_timeout_jiffies)
-		     * KCS_USEC_PER_JIFFY);
-	kcs_result = kcs_event_handler(kcs_info, time_diff);
-
-	kcs_info->last_timeout_jiffies = jiffies_now;
-
-	if ((kcs_info->irq) && (! kcs_info->interrupt_disabled)) {
-		/* Running with interrupts, only do long timeouts. */
-		kcs_info->kcs_timer.expires = jiffies + KCS_TIMEOUT_JIFFIES;
-		goto do_add_timer;
-	}
-
-	/* If the state machine asks for a short delay, then shorten
-           the timer timeout. */
-#ifdef CONFIG_HIGH_RES_TIMERS
-	if (kcs_result == KCS_CALL_WITH_DELAY) {
-		kcs_info->kcs_timer.sub_expires
-			+= usec_to_arch_cycles(KCS_SHORT_TIMEOUT_USEC);
-		while (kcs_info->kcs_timer.sub_expires >= cycles_per_jiffies) {
-			kcs_info->kcs_timer.expires++;
-			kcs_info->kcs_timer.sub_expires -= cycles_per_jiffies;
-		}
-	} else {
-		kcs_info->kcs_timer.expires = jiffies + KCS_TIMEOUT_JIFFIES;
-		kcs_info->kcs_timer.sub_expires = 0;
-	}
-#else
-	/* If requested, take the shortest delay possible */
-	if (kcs_result == KCS_CALL_WITH_DELAY) {
-		kcs_info->kcs_timer.expires = jiffies + 1;
-	} else {
-		kcs_info->kcs_timer.expires = jiffies + KCS_TIMEOUT_JIFFIES;
-	}
-#endif
-
- do_add_timer:
-	add_timer(&(kcs_info->kcs_timer));
-	spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-}
-
-static irqreturn_t kcs_irq_handler(int irq, void *data, struct pt_regs *regs)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) data;
-	unsigned long   flags;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-#endif
-
-	spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-	if (kcs_info->stop_operation)
-		goto out;
-
-#ifdef DEBUG_TIMING
-	do_gettimeofday(&t);
-	printk("**Interrupt: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-	kcs_event_handler(kcs_info, 0);
- out:
-	spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-	return IRQ_HANDLED;
-}
-
-static struct ipmi_smi_handlers handlers =
-{
-	.owner			= THIS_MODULE,
-	.sender			= sender,
-	.request_events		= request_events,
-	.set_run_to_completion	= set_run_to_completion,
-};
-
-static unsigned char ipmi_kcs_dev_rev;
-static unsigned char ipmi_kcs_fw_rev_major;
-static unsigned char ipmi_kcs_fw_rev_minor;
-static unsigned char ipmi_version_major;
-static unsigned char ipmi_version_minor;
-
-extern int kcs_dbg;
-static int ipmi_kcs_detect_hardware(unsigned int port,
-				    unsigned char *addr,
-				    struct kcs_data *data)
-{
-	unsigned char   msg[2];
-	unsigned char   resp[IPMI_MAX_MSG_LENGTH];
-	unsigned long   resp_len;
-	enum kcs_result kcs_result;
-
-	/* It's impossible for the KCS status register to be all 1's,
-	   (assuming a properly functioning, self-initialized BMC)
-	   but that's what you get from reading a bogus address, so we
-	   test that first. */
-
-	if (port) {
-		if (inb(port+1) == 0xff) return -ENODEV; 
-	} else { 
-		if (readb(addr+1) == 0xff) return -ENODEV; 
-	}
-
-	/* Do a Get Device ID command, since it comes back with some
-	   useful info. */
-	msg[0] = IPMI_NETFN_APP_REQUEST << 2;
-	msg[1] = IPMI_GET_DEVICE_ID_CMD;
-	start_kcs_transaction(data, msg, 2);
-	
-	kcs_result = kcs_event(data, 0);
-	for (;;)
-	{
-		if (kcs_result == KCS_CALL_WITH_DELAY) {
-			udelay(100);
-			kcs_result = kcs_event(data, 100);
-		}
-		else if (kcs_result == KCS_CALL_WITHOUT_DELAY)
-		{
-			kcs_result = kcs_event(data, 0);
-		}
-		else
-			break;
-	}
-	if (kcs_result == KCS_SM_HOSED) {
-		/* We couldn't get the state machine to run, so whatever's at
-		   the port is probably not an IPMI KCS interface. */
-		return -ENODEV;
-	}
-	/* Otherwise, we got some data. */
-	resp_len = kcs_get_result(data, resp, IPMI_MAX_MSG_LENGTH);
-	if (resp_len < 6)
-		/* That's odd, it should be longer. */
-		return -EINVAL;
-	
-	if ((resp[1] != IPMI_GET_DEVICE_ID_CMD) || (resp[2] != 0))
-		/* That's odd, it shouldn't be able to fail. */
-		return -EINVAL;
-	
-	ipmi_kcs_dev_rev = resp[4] & 0xf;
-	ipmi_kcs_fw_rev_major = resp[5] & 0x7f;
-	ipmi_kcs_fw_rev_minor = resp[6];
-	ipmi_version_major = resp[7] & 0xf;
-	ipmi_version_minor = resp[7] >> 4;
-
-	return 0;
-}
-
-/* There can be 4 IO ports passed in (with or without IRQs), 4 addresses,
-   a default IO port, and 1 ACPI/SPMI address.  That sets KCS_MAX_DRIVERS */
-
-#define KCS_MAX_PARMS 4
-#define KCS_MAX_DRIVERS ((KCS_MAX_PARMS * 2) + 2)
-static struct kcs_info *kcs_infos[KCS_MAX_DRIVERS] =
-{ NULL, NULL, NULL, NULL };
-
-#define DEVICE_NAME "ipmi_kcs"
-
-#define DEFAULT_IO_PORT 0xca2
-
-static int kcs_trydefaults = 1;
-static unsigned long kcs_addrs[KCS_MAX_PARMS] = { 0, 0, 0, 0 };
-static int kcs_ports[KCS_MAX_PARMS] = { 0, 0, 0, 0 };
-static int kcs_irqs[KCS_MAX_PARMS] = { 0, 0, 0, 0 };
-
-MODULE_PARM(kcs_trydefaults, "i");
-MODULE_PARM(kcs_addrs, "1-4l");
-MODULE_PARM(kcs_irqs, "1-4i");
-MODULE_PARM(kcs_ports, "1-4i");
-
-/* Returns 0 if initialized, or negative on an error. */
-static int init_one_kcs(int kcs_port, 
-			int irq, 
-			unsigned long kcs_physaddr,
-			struct kcs_info **kcs)
-{
-	int		rv;
-	struct kcs_info *new_kcs;
-
-	/* Did anything get passed in at all?  Both == zero disables the
-	   driver. */
-
-	if (!(kcs_port || kcs_physaddr)) 
-		return -ENODEV;
-	
-	/* Only initialize a port OR a physical address on this call.
-	   Also, IRQs can go with either ports or addresses. */
-
-	if (kcs_port && kcs_physaddr)
-		return -EINVAL;
-
-	new_kcs = kmalloc(sizeof(*new_kcs), GFP_KERNEL);
-	if (!new_kcs) {
-		printk(KERN_ERR "ipmi_kcs: out of memory\n");
-		return -ENOMEM;
-	}
-
-	/* So we know not to free it unless we have allocated one. */
-	new_kcs->kcs_sm = NULL;
-
-	new_kcs->addr = NULL;
-	new_kcs->physaddr = kcs_physaddr;
-	new_kcs->port = kcs_port;
-
-	if (kcs_port) {
-		if (request_region(kcs_port, 2, DEVICE_NAME) == NULL) {
-			kfree(new_kcs);
-			printk(KERN_ERR 
-			       "ipmi_kcs: can't reserve port @ 0x%4.4x\n",
-		       	       kcs_port);
-			return -EIO;
-		}
-	} else {
-		if (request_mem_region(kcs_physaddr, 2, DEVICE_NAME) == NULL) {
-			kfree(new_kcs);
-			printk(KERN_ERR 
-			       "ipmi_kcs: can't reserve memory @ 0x%lx\n",
-		       	       kcs_physaddr);
-			return -EIO;
-		}
-		if ((new_kcs->addr = ioremap(kcs_physaddr, 2)) == NULL) {
-			kfree(new_kcs);
-			printk(KERN_ERR 
-			       "ipmi_kcs: can't remap memory at 0x%lx\n",
-		       	       kcs_physaddr);
-			return -EIO;
-		}
-	}
-
-	new_kcs->kcs_sm = kmalloc(kcs_size(), GFP_KERNEL);
-	if (!new_kcs->kcs_sm) {
-		printk(KERN_ERR "ipmi_kcs: out of memory\n");
-		rv = -ENOMEM;
-		goto out_err;
-	}
-	init_kcs_data(new_kcs->kcs_sm, kcs_port, new_kcs->addr);
-	spin_lock_init(&(new_kcs->kcs_lock));
-	spin_lock_init(&(new_kcs->msg_lock));
-
-	rv = ipmi_kcs_detect_hardware(kcs_port, new_kcs->addr, new_kcs->kcs_sm);
-	if (rv) {
-		if (kcs_port) 
-			printk(KERN_ERR 
-			       "ipmi_kcs: No KCS @ port 0x%4.4x\n", 
-			       kcs_port);
-		else
-			printk(KERN_ERR 
-			       "ipmi_kcs: No KCS @ addr 0x%lx\n", 
-			       kcs_physaddr);
-		goto out_err;
-	}
-
-	if (irq != 0) {
-		rv = request_irq(irq,
-				 kcs_irq_handler,
-				 SA_INTERRUPT,
-				 DEVICE_NAME,
-				 new_kcs);
-		if (rv) {
-			printk(KERN_WARNING
-			       "ipmi_kcs: %s unable to claim interrupt %d,"
-			       " running polled\n",
-			       DEVICE_NAME, irq);
-			irq = 0;
-		}
-	}
-	new_kcs->irq = irq;
-
-	INIT_LIST_HEAD(&(new_kcs->xmit_msgs));
-	INIT_LIST_HEAD(&(new_kcs->hp_xmit_msgs));
-	new_kcs->curr_msg = NULL;
-	atomic_set(&new_kcs->req_events, 0);
-	new_kcs->run_to_completion = 0;
-
-	start_clear_flags(new_kcs);
-
-	if (irq) {
-		new_kcs->kcs_state = KCS_CLEARING_FLAGS_THEN_SET_IRQ;
-
-		printk(KERN_INFO 
-		       "ipmi_kcs: Acquiring BMC @ port=0x%x irq=%d\n",
-		       kcs_port, irq);
-
-	} else {
-		if (kcs_port)
-			printk(KERN_INFO 
-			       "ipmi_kcs: Acquiring BMC @ port=0x%x\n",
-		       	       kcs_port);
-		else
-			printk(KERN_INFO 
-			       "ipmi_kcs: Acquiring BMC @ addr=0x%lx\n",
-		       	       kcs_physaddr);
-	}
-
-	rv = ipmi_register_smi(&handlers,
-			       new_kcs,
-			       ipmi_version_major,
-			       ipmi_version_minor,
-			       &(new_kcs->intf));
-	if (rv) {
-		free_irq(irq, new_kcs);
-		printk(KERN_ERR 
-		       "ipmi_kcs: Unable to register device: error %d\n",
-		       rv);
-		goto out_err;
-	}
-
-	new_kcs->interrupt_disabled = 0;
-	new_kcs->timer_stopped = 0;
-	new_kcs->stop_operation = 0;
-
-	init_timer(&(new_kcs->kcs_timer));
-	new_kcs->kcs_timer.data = (long) new_kcs;
-	new_kcs->kcs_timer.function = kcs_timeout;
-	new_kcs->last_timeout_jiffies = jiffies;
-	new_kcs->kcs_timer.expires = jiffies + KCS_TIMEOUT_JIFFIES;
-	add_timer(&(new_kcs->kcs_timer));
-
-	*kcs = new_kcs;
-
-	return 0;
-
- out_err:
-	if (kcs_port) 
-		release_region (kcs_port, 2);
-	if (new_kcs->addr) 
-		iounmap(new_kcs->addr);
-	if (kcs_physaddr) 
-		release_mem_region(kcs_physaddr, 2);
-	if (new_kcs->kcs_sm)
-		kfree(new_kcs->kcs_sm);
-	kfree(new_kcs);
-	return rv;
-}
-
-#ifdef CONFIG_ACPI_INTERPRETER
-
-#include <linux/acpi.h>
-
-struct SPMITable {
-	s8      Signature[4];
-	u32     Length;
-	u8      Revision;
-	u8      Checksum;
-	s8      OEMID[6];
-	s8      OEMTableID[8];
-	s8      OEMRevision[4];
-	s8      CreatorID[4];
-	s8      CreatorRevision[4];
-	u8      InterfaceType[2];
-	s16     SpecificationRevision;
-
-	/*
-	 * Bit 0 - SCI interrupt supported
-	 * Bit 1 - I/O APIC/SAPIC
-	 */
-	u8      InterruptType;
-
-	/* If bit 0 of InterruptType is set, then this is the SCI
-	   interrupt in the GPEx_STS register. */
-	u8      GPE;
-
-	s16     Reserved;
-
-	/* If bit 1 of InterruptType is set, then this is the I/O
-	   APIC/SAPIC interrupt. */
-	u32     GlobalSystemInterrupt;
-
-	/* The actual register address. */
-	struct acpi_generic_address addr;
-
-	u8      UID[4];
-
-	s8      spmi_id[1]; /* A '\0' terminated array starts here. */
-};
-
-static int acpi_find_bmc(unsigned long *physaddr, int *port)
-{
-	acpi_status          status;
-	struct SPMITable     *spmi;
-	
-	status = acpi_get_firmware_table("SPMI", 1,
-					 ACPI_LOGICAL_ADDRESSING,
-					 (struct acpi_table_header **) &spmi);
-	if (status != AE_OK)
-		goto not_found;
-
-	if (spmi->InterfaceType[0] != 1)
-		/* Not IPMI. */
-		goto not_found;
-
-	if (spmi->InterfaceType[1] != 1)
-		/* Not KCS. */
-		goto not_found;
-
-	if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) {
-		*physaddr = spmi->addr.address;
-		printk("ipmi_kcs_intf: Found ACPI-specified state machine"
-		       " at memory address 0x%lx\n",
-		       (unsigned long) spmi->addr.address);
-	} else if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
-		*port = spmi->addr.address;
-		printk("ipmi_kcs_intf: Found ACPI-specified state machine"
-		       " at I/O address 0x%lx\n",
-		       (unsigned long) spmi->addr.address);
-	} else
-		goto not_found; /* Not an address type we recognise. */
-
-	return 0;
-
- not_found:
-	return -ENODEV;
-}
-#endif
-
-static __init int init_ipmi_kcs(void)
-{
-	int		rv = 0;
-	int		pos = 0;
-	int		i = 0;
-#ifdef CONFIG_ACPI_INTERPRETER
-	unsigned long	physaddr = 0;
-	int             port = 0;
-#endif
-
-	if (initialized)
-		return 0;
-	initialized = 1;
-
-	/* First do the "command-line" parameters */
-
-	for (i=0; i < KCS_MAX_PARMS; i++) {
-		rv = init_one_kcs(kcs_ports[i], 
-				  kcs_irqs[i], 
-				  0, 
-				  &(kcs_infos[pos]));
-		if (rv == 0)
-			pos++;
-
-		rv = init_one_kcs(0, 
-				  kcs_irqs[i], 
-				  kcs_addrs[i], 
-				  &(kcs_infos[pos]));
-		if (rv == 0)
-			pos++;
-	}
-
-	/* Only try the defaults if enabled and resources are available
-	   (because they weren't already specified above). */
-
-	if (kcs_trydefaults && (pos == 0)) {
-		rv = -EINVAL;
-#ifdef CONFIG_ACPI_INTERPRETER
-		if (rv && (physaddr = acpi_find_bmc(&physaddr, &port) == 0)) {
-			rv = init_one_kcs(port, 
-					  0, 
-					  physaddr, 
-					  &(kcs_infos[pos]));
-			if (rv == 0)
-				pos++;
-		}
-#endif
-		if (rv) {
-			rv = init_one_kcs(DEFAULT_IO_PORT, 
-					  0, 
-					  0, 
-					  &(kcs_infos[pos]));
-			if (rv == 0)
-				pos++;
-		}
-	}
-
-	if (kcs_infos[0] == NULL) {
-		printk("ipmi_kcs: Unable to find any KCS interfaces\n");
-		return -ENODEV;
-	} 
-
-	return 0;
-}
-module_init(init_ipmi_kcs);
-
-#ifdef MODULE
-void __exit cleanup_one_kcs(struct kcs_info *to_clean)
-{
-	int           rv;
-	unsigned long flags;
-
-	if (! to_clean)
-		return;
-
-	/* Tell the timer and interrupt handlers that we are shutting
-	   down. */
-	spin_lock_irqsave(&(to_clean->kcs_lock), flags);
-	spin_lock(&(to_clean->msg_lock));
-
-	to_clean->stop_operation = 1;
-
-	if (to_clean->irq != 0)
-		free_irq(to_clean->irq, to_clean);
-	if (to_clean->port) {
-		printk(KERN_INFO 
-		       "ipmi_kcs: Releasing BMC @ port=0x%x\n",
-		       to_clean->port);
-		release_region (to_clean->port, 2);
-	}
-	if (to_clean->addr) {
-		printk(KERN_INFO 
-		       "ipmi_kcs: Releasing BMC @ addr=0x%lx\n",
-		       to_clean->physaddr);
-		iounmap(to_clean->addr);
-		release_mem_region(to_clean->physaddr, 2);
-	}
-
-	spin_unlock(&(to_clean->msg_lock));
-	spin_unlock_irqrestore(&(to_clean->kcs_lock), flags);
-
-	/* Wait until we know that we are out of any interrupt
-	   handlers might have been running before we freed the
-	   interrupt. */
-	synchronize_kernel();
-
-	/* Wait for the timer to stop.  This avoids problems with race
-	   conditions removing the timer here. */
-	while (!to_clean->timer_stopped) {
-		schedule_timeout(1);
-	}
-
-	rv = ipmi_unregister_smi(to_clean->intf);
-	if (rv) {
-		printk(KERN_ERR 
-		       "ipmi_kcs: Unable to unregister device: errno=%d\n",
-		       rv);
-	}
-
-	initialized = 0;
-
-	kfree(to_clean->kcs_sm);
-	kfree(to_clean);
-}
-
-static __exit void cleanup_ipmi_kcs(void)
-{
-	int i;
-
-	if (!initialized)
-		return;
-
-	for (i=0; i<KCS_MAX_DRIVERS; i++) {
-		cleanup_one_kcs(kcs_infos[i]);
-	}
-}
-module_exit(cleanup_ipmi_kcs);
-#else
-
-/* Unfortunately, cmdline::get_options() only returns integers, not
-   longs.  Since we need ulongs (64-bit physical addresses) parse the 
-   comma-separated list manually.  Arguments can be one of these forms:
-   m0xaabbccddeeff	A physical memory address without an IRQ
-   m0xaabbccddeeff:cc	A physical memory address with an IRQ
-   p0xaabb		An IO port without an IRQ
-   p0xaabb:cc		An IO port with an IRQ
-   nodefaults		Suppress trying the default IO port or ACPI address 
-
-   For example, to pass one IO port with an IRQ, one address, and 
-   suppress the use of the default IO port and ACPI address,
-   use this option string: ipmi_kcs=p0xCA2:5,m0xFF5B0022,nodefaults
-
-   Remember, ipmi_kcs_setup() is passed the string after the equal sign. */
-
-static int __init ipmi_kcs_setup(char *str)
-{
-	unsigned long val;
-	char *cur, *colon;
-	int pos;
-
-	pos = 0;
-	
-	cur = strsep(&str, ",");
-	while ((cur) && (*cur) && (pos < KCS_MAX_PARMS)) {
-		switch (*cur) {
-		case 'n':
-			if (strcmp(cur, "nodefaults") == 0)
-				kcs_trydefaults = 0;
-			else
-				printk(KERN_INFO 
-				       "ipmi_kcs: bad parameter value %s\n",
-				       cur);
-			break;
-		
-		case 'm':
-		case 'p':
-			val = simple_strtoul(cur + 1,
-					     &colon,
-					     0);
-			if (*cur == 'p')
-				kcs_ports[pos] = val;
-			else
-				kcs_addrs[pos] = val;
-			if (*colon == ':') {
-				val = simple_strtoul(colon + 1,
-						     &colon,
-						     0);
-				kcs_irqs[pos] = val;
-			}
-			pos++;
-			break;
-
-		default:
-			printk(KERN_INFO 
-			       "ipmi_kcs: bad parameter value %s\n",
-			       cur);
-		}
-		cur = strsep(&str, ",");
-	}
-
-	return 1;
-}
-__setup("ipmi_kcs=", ipmi_kcs_setup);
-#endif
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c
index 29b14608e292..f4dd321e9638 100644
--- a/drivers/char/ipmi/ipmi_kcs_sm.c
+++ b/drivers/char/ipmi/ipmi_kcs_sm.c
@@ -37,13 +37,12 @@
  * that document.
  */
 
-#include <linux/types.h>
+#include <linux/kernel.h> /* For printk. */
+#include <linux/string.h>
+#include <linux/ipmi_msgdefs.h>		/* for completion codes */
+#include "ipmi_si_sm.h"
 
-#include <asm/io.h>
-#include <asm/string.h>		/* Gets rid of memcpy warning */
-#include <asm/system.h>
-
-#include "ipmi_kcs_sm.h"
+#define IPMI_KCS_VERSION "v31"
 
 /* Set this if you want a printout of why the state machine was hosed
    when it gets hosed. */
@@ -95,32 +94,28 @@ enum kcs_states {
 #define OBF_RETRY_TIMEOUT 1000000
 #define MAX_ERROR_RETRIES 10
 
-#define IPMI_ERR_MSG_TRUNCATED	0xc6
-#define IPMI_ERR_UNSPECIFIED	0xff
-
-struct kcs_data
+struct si_sm_data
 {
-	enum kcs_states state;
-	unsigned int    port;
-	unsigned char	*addr;
-	unsigned char   write_data[MAX_KCS_WRITE_SIZE];
-	int             write_pos;
-	int             write_count;
-	int             orig_write_count;
-	unsigned char   read_data[MAX_KCS_READ_SIZE];
-	int             read_pos;
-	int	        truncated;
+	enum kcs_states  state;
+	struct si_sm_io *io;
+	unsigned char    write_data[MAX_KCS_WRITE_SIZE];
+	int              write_pos;
+	int              write_count;
+	int              orig_write_count;
+	unsigned char    read_data[MAX_KCS_READ_SIZE];
+	int              read_pos;
+	int	         truncated;
 
 	unsigned int  error_retries;
 	long          ibf_timeout;
 	long          obf_timeout;
 };
 
-void init_kcs_data(struct kcs_data *kcs, unsigned int port, unsigned char *addr)
+static unsigned int init_kcs_data(struct si_sm_data *kcs,
+				  struct si_sm_io *io)
 {
 	kcs->state = KCS_IDLE;
-	kcs->port = port;
-	kcs->addr = addr;
+	kcs->io = io;
 	kcs->write_pos = 0;
 	kcs->write_count = 0;
 	kcs->orig_write_count = 0;
@@ -129,40 +124,29 @@ void init_kcs_data(struct kcs_data *kcs, unsigned int port, unsigned char *addr)
 	kcs->truncated = 0;
 	kcs->ibf_timeout = IBF_RETRY_TIMEOUT;
 	kcs->obf_timeout = OBF_RETRY_TIMEOUT;
-}
 
-/* Remember, init_one_kcs() insured port and addr can't both be set */
+	/* Reserve 2 I/O bytes. */
+	return 2;
+}
 
-static inline unsigned char read_status(struct kcs_data *kcs)
+static inline unsigned char read_status(struct si_sm_data *kcs)
 {
-        if (kcs->port)
-		return inb(kcs->port + 1);
-        else
-		return readb(kcs->addr + 1);
+	return kcs->io->inputb(kcs->io, 1);
 }
 
-static inline unsigned char read_data(struct kcs_data *kcs)
+static inline unsigned char read_data(struct si_sm_data *kcs)
 {
-        if (kcs->port)
-		return inb(kcs->port + 0);
-        else
-		return readb(kcs->addr + 0);
+	return kcs->io->inputb(kcs->io, 0);
 }
 
-static inline void write_cmd(struct kcs_data *kcs, unsigned char data)
+static inline void write_cmd(struct si_sm_data *kcs, unsigned char data)
 {
-        if (kcs->port)
-		outb(data, kcs->port + 1);
-        else
-		writeb(data, kcs->addr + 1);
+	kcs->io->outputb(kcs->io, 1, data);
 }
 
-static inline void write_data(struct kcs_data *kcs, unsigned char data)
+static inline void write_data(struct si_sm_data *kcs, unsigned char data)
 {
-        if (kcs->port)
-		outb(data, kcs->port + 0);
-        else
-		writeb(data, kcs->addr + 0);
+	kcs->io->outputb(kcs->io, 0, data);
 }
 
 /* Control codes. */
@@ -182,14 +166,14 @@ static inline void write_data(struct kcs_data *kcs, unsigned char data)
 #define GET_STATUS_OBF(status) ((status) & 0x01)
 
 
-static inline void write_next_byte(struct kcs_data *kcs)
+static inline void write_next_byte(struct si_sm_data *kcs)
 {
 	write_data(kcs, kcs->write_data[kcs->write_pos]);
 	(kcs->write_pos)++;
 	(kcs->write_count)--;
 }
 
-static inline void start_error_recovery(struct kcs_data *kcs, char *reason)
+static inline void start_error_recovery(struct si_sm_data *kcs, char *reason)
 {
 	(kcs->error_retries)++;
 	if (kcs->error_retries > MAX_ERROR_RETRIES) {
@@ -202,7 +186,7 @@ static inline void start_error_recovery(struct kcs_data *kcs, char *reason)
 	}
 }
 
-static inline void read_next_byte(struct kcs_data *kcs)
+static inline void read_next_byte(struct si_sm_data *kcs)
 {
 	if (kcs->read_pos >= MAX_KCS_READ_SIZE) {
 		/* Throw the data away and mark it truncated. */
@@ -215,9 +199,8 @@ static inline void read_next_byte(struct kcs_data *kcs)
 	write_data(kcs, KCS_READ_BYTE);
 }
 
-static inline int check_ibf(struct kcs_data *kcs,
-			    unsigned char   status,
-			    long            time)
+static inline int check_ibf(struct si_sm_data *kcs, unsigned char status,
+			    long time)
 {
 	if (GET_STATUS_IBF(status)) {
 		kcs->ibf_timeout -= time;
@@ -232,9 +215,8 @@ static inline int check_ibf(struct kcs_data *kcs,
 	return 1;
 }
 
-static inline int check_obf(struct kcs_data *kcs,
-			    unsigned char   status,
-			    long            time)
+static inline int check_obf(struct si_sm_data *kcs, unsigned char status,
+			    long time)
 {
 	if (! GET_STATUS_OBF(status)) {
 		kcs->obf_timeout -= time;
@@ -248,13 +230,13 @@ static inline int check_obf(struct kcs_data *kcs,
 	return 1;
 }
 
-static void clear_obf(struct kcs_data *kcs, unsigned char status)
+static void clear_obf(struct si_sm_data *kcs, unsigned char status)
 {
 	if (GET_STATUS_OBF(status))
 		read_data(kcs);
 }
 
-static void restart_kcs_transaction(struct kcs_data *kcs)
+static void restart_kcs_transaction(struct si_sm_data *kcs)
 {
 	kcs->write_count = kcs->orig_write_count;
 	kcs->write_pos = 0;
@@ -265,7 +247,8 @@ static void restart_kcs_transaction(struct kcs_data *kcs)
 	write_cmd(kcs, KCS_WRITE_START);
 }
 
-int start_kcs_transaction(struct kcs_data *kcs, char *data, unsigned int size)
+static int start_kcs_transaction(struct si_sm_data *kcs, unsigned char *data,
+				 unsigned int size)
 {
 	if ((size < 2) || (size > MAX_KCS_WRITE_SIZE)) {
 		return -1;
@@ -287,7 +270,8 @@ int start_kcs_transaction(struct kcs_data *kcs, char *data, unsigned int size)
 	return 0;
 }
 
-int kcs_get_result(struct kcs_data *kcs, unsigned char *data, int length)
+static int get_kcs_result(struct si_sm_data *kcs, unsigned char *data,
+			  unsigned int length)
 {
 	if (length < kcs->read_pos) {
 		kcs->read_pos = length;
@@ -316,7 +300,7 @@ int kcs_get_result(struct kcs_data *kcs, unsigned char *data, int length)
 /* This implements the state machine defined in the IPMI manual, see
    that for details on how this works.  Divide that flowchart into
    sections delimited by "Wait for IBF" and this will become clear. */
-enum kcs_result kcs_event(struct kcs_data *kcs, long time)
+static enum si_sm_result kcs_event(struct si_sm_data *kcs, long time)
 {
 	unsigned char status;
 	unsigned char state;
@@ -328,7 +312,7 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 #endif
 	/* All states wait for ibf, so just do it here. */
 	if (!check_ibf(kcs, status, time))
-		return KCS_CALL_WITH_DELAY;
+		return SI_SM_CALL_WITH_DELAY;
 
 	/* Just about everything looks at the KCS state, so grab that, too. */
 	state = GET_STATUS_STATE(status);
@@ -339,9 +323,9 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 		clear_obf(kcs, status);
 
 		if (GET_STATUS_ATN(status))
-			return KCS_ATTN;
+			return SI_SM_ATTN;
 		else
-			return KCS_SM_IDLE;
+			return SI_SM_IDLE;
 
 	case KCS_START_OP:
 		if (state != KCS_IDLE) {
@@ -408,7 +392,7 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 
 		if (state == KCS_READ_STATE) {
 			if (! check_obf(kcs, status, time))
-				return KCS_CALL_WITH_DELAY;
+				return SI_SM_CALL_WITH_DELAY;
 			read_next_byte(kcs);
 		} else {
 			/* We don't implement this exactly like the state
@@ -421,7 +405,7 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 			clear_obf(kcs, status);
 			kcs->orig_write_count = 0;
 			kcs->state = KCS_IDLE;
-			return KCS_TRANSACTION_COMPLETE;
+			return SI_SM_TRANSACTION_COMPLETE;
 		}
 		break;
 
@@ -444,7 +428,7 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 			break;
 		}
 		if (! check_obf(kcs, status, time))
-			return KCS_CALL_WITH_DELAY;
+			return SI_SM_CALL_WITH_DELAY;
 
 		clear_obf(kcs, status);
 		write_data(kcs, KCS_READ_BYTE);
@@ -459,14 +443,14 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 		}
 
 		if (! check_obf(kcs, status, time))
-			return KCS_CALL_WITH_DELAY;
+			return SI_SM_CALL_WITH_DELAY;
 
 		clear_obf(kcs, status);
 		if (kcs->orig_write_count) {
 			restart_kcs_transaction(kcs);
 		} else {
 			kcs->state = KCS_IDLE;
-			return KCS_TRANSACTION_COMPLETE;
+			return SI_SM_TRANSACTION_COMPLETE;
 		}
 		break;
 			
@@ -475,14 +459,42 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 	}
 
 	if (kcs->state == KCS_HOSED) {
-		init_kcs_data(kcs, kcs->port, kcs->addr);
-		return KCS_SM_HOSED;
+		init_kcs_data(kcs, kcs->io);
+		return SI_SM_HOSED;
 	}
 
-	return KCS_CALL_WITHOUT_DELAY;
+	return SI_SM_CALL_WITHOUT_DELAY;
 }
 
-int kcs_size(void)
+static int kcs_size(void)
 {
-	return sizeof(struct kcs_data);
+	return sizeof(struct si_sm_data);
 }
+
+static int kcs_detect(struct si_sm_data *kcs)
+{
+	/* It's impossible for the KCS status register to be all 1's,
+	   (assuming a properly functioning, self-initialized BMC)
+	   but that's what you get from reading a bogus address, so we
+	   test that first. */
+	if (read_status(kcs) == 0xff)
+		return 1;
+
+	return 0;
+}
+
+static void kcs_cleanup(struct si_sm_data *kcs)
+{
+}
+
+struct si_sm_handlers kcs_smi_handlers =
+{
+	.version           = IPMI_KCS_VERSION,
+	.init_data         = init_kcs_data,
+	.start_transaction = start_kcs_transaction,
+	.get_result        = get_kcs_result,
+	.event             = kcs_event,
+	.detect            = kcs_detect,
+	.cleanup           = kcs_cleanup,
+	.size              = kcs_size,
+};
diff --git a/drivers/char/ipmi/ipmi_kcs_sm.h b/drivers/char/ipmi/ipmi_kcs_sm.h
deleted file mode 100644
index 81cf952f6314..000000000000
--- a/drivers/char/ipmi/ipmi_kcs_sm.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * ipmi_kcs_sm.h
- *
- * State machine for handling IPMI KCS interfaces.
- *
- * Author: MontaVista Software, Inc.
- *         Corey Minyard <minyard@mvista.com>
- *         source@mvista.com
- *
- * Copyright 2002 MontaVista Software Inc.
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
- *
- *
- *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
- *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-struct kcs_data;
-
-void init_kcs_data(struct kcs_data *kcs,
-		   unsigned int    port,
-		   unsigned char   *addr);
-
-/* Start a new transaction in the state machine.  This will return -2
-   if the state machine is not idle, -1 if the size is invalid (to
-   large or too small), or 0 if the transaction is successfully
-   completed. */
-int start_kcs_transaction(struct kcs_data *kcs, char *data, unsigned int size);
-
-/* Return the results after the transaction.  This will return -1 if
-   the buffer is too small, zero if no transaction is present, or the
-   actual length of the result data. */
-int kcs_get_result(struct kcs_data *kcs, unsigned char *data, int length);
-
-enum kcs_result
-{
-	KCS_CALL_WITHOUT_DELAY, /* Call the driver again immediately */
-	KCS_CALL_WITH_DELAY,	/* Delay some before calling again. */
-	KCS_TRANSACTION_COMPLETE, /* A transaction is finished. */
-	KCS_SM_IDLE,		/* The SM is in idle state. */
-	KCS_SM_HOSED,		/* The hardware violated the state machine. */
-	KCS_ATTN		/* The hardware is asserting attn and the
-				   state machine is idle. */
-};
-
-/* Call this periodically (for a polled interface) or upon receiving
-   an interrupt (for a interrupt-driven interface).  If interrupt
-   driven, you should probably poll this periodically when not in idle
-   state.  This should be called with the time that passed since the
-   last call, if it is significant.  Time is in microseconds. */
-enum kcs_result kcs_event(struct kcs_data *kcs, long time);
-
-/* Return the size of the KCS structure in bytes. */
-int kcs_size(void);
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index ebbd8032fa9a..c1e4abf1463b 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -44,16 +44,21 @@
 #include <linux/ipmi_smi.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
+#include <linux/proc_fs.h>
+
+#define IPMI_MSGHANDLER_VERSION "v31"
 
 struct ipmi_recv_msg *ipmi_alloc_recv_msg(void);
 static int ipmi_init_msghandler(void);
 
 static int initialized = 0;
 
+static struct proc_dir_entry *proc_ipmi_root = NULL;
+
 #define MAX_EVENTS_IN_QUEUE	25
 
 /* Don't let a message sit in a queue forever, always time it with at lest
-   the max message timer. */
+   the max message timer.  This is in milliseconds. */
 #define MAX_MSG_TIMEOUT		60000
 
 struct ipmi_user
@@ -82,7 +87,8 @@ struct cmd_rcvr
 
 struct seq_table
 {
-	int                  inuse : 1;
+	unsigned int         inuse : 1;
+	unsigned int         broadcast : 1;
 
 	unsigned long        timeout;
 	unsigned long        orig_timeout;
@@ -111,10 +117,19 @@ struct seq_table
 
 #define NEXT_SEQID(seqid) (((seqid) + 1) & 0x3fffff)
 
+struct ipmi_channel
+{
+	unsigned char medium;
+	unsigned char protocol;
+};
 
 #define IPMI_IPMB_NUM_SEQ	64
+#define IPMI_MAX_CHANNELS       8
 struct ipmi_smi
 {
+	/* What interface number are we? */
+	int intf_num;
+
 	/* The list of upper layers that are using me.  We read-lock
            this when delivering messages to the upper layer to keep
            the user from going away while we are processing the
@@ -123,6 +138,9 @@ struct ipmi_smi
 	rwlock_t                users_lock;
 	struct list_head        users;
 
+	/* Used for wake ups at startup. */
+	wait_queue_head_t waitq;
+
 	/* The IPMI version of the BMC on the other end. */
 	unsigned char       version_major;
 	unsigned char       version_minor;
@@ -182,6 +200,86 @@ struct ipmi_smi
 	   it.  Note that the message will still be freed by the
 	   caller.  This only works on the system interface. */
 	void (*null_user_handler)(ipmi_smi_t intf, struct ipmi_smi_msg *msg);
+
+	/* When we are scanning the channels for an SMI, this will
+	   tell which channel we are scanning. */
+	int curr_channel;
+
+	/* Channel information */
+	struct ipmi_channel channels[IPMI_MAX_CHANNELS];
+
+	/* Proc FS stuff. */
+	struct proc_dir_entry *proc_dir;
+	char                  proc_dir_name[10];
+
+	spinlock_t   counter_lock; /* For making counters atomic. */
+
+	/* Commands we got that were invalid. */
+	unsigned int sent_invalid_commands;
+
+	/* Commands we sent to the MC. */
+	unsigned int sent_local_commands;
+	/* Responses from the MC that were delivered to a user. */
+	unsigned int handled_local_responses;
+	/* Responses from the MC that were not delivered to a user. */
+	unsigned int unhandled_local_responses;
+
+	/* Commands we sent out to the IPMB bus. */
+	unsigned int sent_ipmb_commands;
+	/* Commands sent on the IPMB that had errors on the SEND CMD */
+	unsigned int sent_ipmb_command_errs;
+	/* Each retransmit increments this count. */
+	unsigned int retransmitted_ipmb_commands;
+	/* When a message times out (runs out of retransmits) this is
+           incremented. */
+	unsigned int timed_out_ipmb_commands;
+
+	/* This is like above, but for broadcasts.  Broadcasts are
+           *not* included in the above count (they are expected to
+           time out). */
+	unsigned int timed_out_ipmb_broadcasts;
+
+	/* Responses I have sent to the IPMB bus. */
+	unsigned int sent_ipmb_responses;
+
+	/* The response was delivered to the user. */
+	unsigned int handled_ipmb_responses;
+	/* The response had invalid data in it. */
+	unsigned int invalid_ipmb_responses;
+	/* The response didn't have anyone waiting for it. */
+	unsigned int unhandled_ipmb_responses;
+
+	/* Commands we sent out to LAN addresses. */
+	unsigned int sent_lan_commands;
+	/* Commands sent to LAN addresses that had errors on the SEND CMD */
+	unsigned int sent_lan_command_errs;
+	/* Each retransmit increments this count. */
+	unsigned int retransmitted_lan_commands;
+	/* When a message times out (runs out of retransmits) this is
+           incremented. */
+	unsigned int timed_out_lan_commands;
+
+	/* Responses I have sent to LAN addresses. */
+	unsigned int sent_lan_responses;
+
+	/* The response was delivered to the user. */
+	unsigned int handled_lan_responses;
+	/* The response had invalid data in it. */
+	unsigned int invalid_lan_responses;
+	/* The response didn't have anyone waiting for it. */
+	unsigned int unhandled_lan_responses;
+
+	/* The command was delivered to the user. */
+	unsigned int handled_commands;
+	/* The command had invalid data in it. */
+	unsigned int invalid_commands;
+	/* The command didn't have anyone waiting for it. */
+	unsigned int unhandled_commands;
+
+	/* Invalid data in an event. */
+	unsigned int invalid_events;
+	/* Events that were received with the proper format. */
+	unsigned int events;
 };
 
 int
@@ -264,6 +362,21 @@ int ipmi_smi_watcher_unregister(struct ipmi_smi_watcher *watcher)
 	return 0;
 }
 
+static void
+call_smi_watchers(int i)
+{
+	struct ipmi_smi_watcher *w;
+
+	down_read(&smi_watchers_sem);
+	list_for_each_entry(w, &smi_watchers, link) {
+		if (try_module_get(w->owner)) {
+			w->new_smi(i);
+			module_put(w->owner);
+		}
+	}
+	up_read(&smi_watchers_sem);
+}
+
 int
 ipmi_addr_equal(struct ipmi_addr *addr1, struct ipmi_addr *addr2)
 {
@@ -293,6 +406,19 @@ ipmi_addr_equal(struct ipmi_addr *addr1, struct ipmi_addr *addr2)
 			&& (ipmb_addr1->lun == ipmb_addr2->lun));
 	}
 
+	if (addr1->addr_type == IPMI_LAN_ADDR_TYPE) {
+		struct ipmi_lan_addr *lan_addr1
+			= (struct ipmi_lan_addr *) addr1;
+		struct ipmi_lan_addr *lan_addr2
+		    = (struct ipmi_lan_addr *) addr2;
+
+		return ((lan_addr1->remote_SWID == lan_addr2->remote_SWID)
+			&& (lan_addr1->local_SWID == lan_addr2->local_SWID)
+			&& (lan_addr1->session_handle
+			    == lan_addr2->session_handle)
+			&& (lan_addr1->lun == lan_addr2->lun));
+	}
+
 	return 1;
 }
 
@@ -322,6 +448,13 @@ int ipmi_validate_addr(struct ipmi_addr *addr, int len)
 		return 0;
 	}
 
+	if (addr->addr_type == IPMI_LAN_ADDR_TYPE) {
+		if (len < sizeof(struct ipmi_lan_addr)) {
+			return -EINVAL;
+		}
+		return 0;
+	}
+
 	return -EINVAL;
 }
 
@@ -341,7 +474,7 @@ unsigned int ipmi_addr_length(int addr_type)
 
 static void deliver_response(struct ipmi_recv_msg *msg)
 {
-    msg->user->handler->ipmi_recv_hndl(msg, msg->user->handler_data);
+	msg->user->handler->ipmi_recv_hndl(msg, msg->user->handler_data);
 }
 
 /* Find the next sequence number not being used and add the given
@@ -351,6 +484,7 @@ static int intf_next_seq(ipmi_smi_t           intf,
 			 struct ipmi_recv_msg *recv_msg,
 			 unsigned long        timeout,
 			 int                  retries,
+			 int                  broadcast,
 			 unsigned char        *seq,
 			 long                 *seqid)
 {
@@ -373,6 +507,7 @@ static int intf_next_seq(ipmi_smi_t           intf,
 		intf->seq_table[i].timeout = MAX_MSG_TIMEOUT;
 		intf->seq_table[i].orig_timeout = timeout;
 		intf->seq_table[i].retries_left = retries;
+		intf->seq_table[i].broadcast = broadcast;
 		intf->seq_table[i].inuse = 1;
 		intf->seq_table[i].seqid = NEXT_SEQID(intf->seq_table[i].seqid);
 		*seq = i;
@@ -425,8 +560,8 @@ static int intf_find_seq(ipmi_smi_t           intf,
 
 
 /* Start the timer for a specific sequence table entry. */
-static int intf_start_seq_timer(ipmi_smi_t           intf,
-				long                 msgid)
+static int intf_start_seq_timer(ipmi_smi_t intf,
+				long       msgid)
 {
 	int           rv = -ENODEV;
 	unsigned long flags;
@@ -451,6 +586,46 @@ static int intf_start_seq_timer(ipmi_smi_t           intf,
 	return rv;
 }
 
+/* Got an error for the send message for a specific sequence number. */
+static int intf_err_seq(ipmi_smi_t   intf,
+			long         msgid,
+			unsigned int err)
+{
+	int                  rv = -ENODEV;
+	unsigned long        flags;
+	unsigned char        seq;
+	unsigned long        seqid;
+	struct ipmi_recv_msg *msg = NULL;
+
+
+	GET_SEQ_FROM_MSGID(msgid, seq, seqid);
+
+	spin_lock_irqsave(&(intf->seq_lock), flags);
+	/* We do this verification because the user can be deleted
+           while a message is outstanding. */
+	if ((intf->seq_table[seq].inuse)
+	    && (intf->seq_table[seq].seqid == seqid))
+	{
+		struct seq_table *ent = &(intf->seq_table[seq]);
+
+		ent->inuse = 0;
+		msg = ent->recv_msg;
+		rv = 0;
+	}
+	spin_unlock_irqrestore(&(intf->seq_lock), flags);
+
+	if (msg) {
+		msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
+		msg->msg_data[0] = err;
+		msg->msg.netfn |= 1; /* Convert to a response. */
+		msg->msg.data_len = 1;
+		msg->msg.data = msg->msg_data;
+		deliver_response(msg);
+	}
+
+	return rv;
+}
+
 
 int ipmi_create_user(unsigned int          if_num,
 		     struct ipmi_user_hndl *handler,
@@ -523,15 +698,14 @@ static int ipmi_destroy_user_nolock(ipmi_user_t user)
 {
 	int              rv = -ENODEV;
 	ipmi_user_t      t_user;
-	struct list_head *entry, *entry2;
+	struct cmd_rcvr  *rcvr, *rcvr2;
 	int              i;
 	unsigned long    flags;
 
 	/* Find the user and delete them from the list. */
-	list_for_each(entry, &(user->intf->users)) {
-		t_user = list_entry(entry, struct ipmi_user, link);
+	list_for_each_entry(t_user, &(user->intf->users), link) {
 		if (t_user == user) {
-			list_del(entry);
+			list_del(&t_user->link);
 			rv = 0;
 			break;
 		}
@@ -554,11 +728,9 @@ static int ipmi_destroy_user_nolock(ipmi_user_t user)
 
 	/* Remove the user from the command receiver's table. */
 	write_lock_irqsave(&(user->intf->cmd_rcvr_lock), flags);
-	list_for_each_safe(entry, entry2, &(user->intf->cmd_rcvrs)) {
-		struct cmd_rcvr *rcvr;
-		rcvr = list_entry(entry, struct cmd_rcvr, link);
+	list_for_each_entry_safe(rcvr, rcvr2, &(user->intf->cmd_rcvrs), link) {
 		if (rcvr->user == user) {
-			list_del(entry);
+			list_del(&rcvr->link);
 			kfree(rcvr);
 		}
 	}
@@ -621,8 +793,7 @@ unsigned char ipmi_get_my_LUN(ipmi_user_t user)
 int ipmi_set_gets_events(ipmi_user_t user, int val)
 {
 	unsigned long         flags;
-	struct list_head      *e, *e2;
-	struct ipmi_recv_msg  *msg;
+	struct ipmi_recv_msg  *msg, *msg2;
 
 	read_lock(&(user->intf->users_lock));
 	spin_lock_irqsave(&(user->intf->events_lock), flags);
@@ -630,9 +801,8 @@ int ipmi_set_gets_events(ipmi_user_t user, int val)
 
 	if (val) {
 		/* Deliver any queued events. */
-		list_for_each_safe(e, e2, &(user->intf->waiting_events)) {
-			msg = list_entry(e, struct ipmi_recv_msg, link);
-			list_del(e);
+		list_for_each_entry_safe(msg, msg2, &(user->intf->waiting_events), link) {
+			list_del(&msg->link);
 			msg->user = user;
 			deliver_response(msg);
 		}
@@ -648,7 +818,7 @@ int ipmi_register_for_cmd(ipmi_user_t   user,
 			  unsigned char netfn,
 			  unsigned char cmd)
 {
-	struct list_head *entry;
+	struct cmd_rcvr  *cmp;
 	unsigned long    flags;
 	struct cmd_rcvr  *rcvr;
 	int              rv = 0;
@@ -666,9 +836,7 @@ int ipmi_register_for_cmd(ipmi_user_t   user,
 	}
 
 	/* Make sure the command/netfn is not already registered. */
-	list_for_each(entry, &(user->intf->cmd_rcvrs)) {
-		struct cmd_rcvr *cmp;
-		cmp = list_entry(entry, struct cmd_rcvr, link);
+	list_for_each_entry(cmp, &(user->intf->cmd_rcvrs), link) {
 		if ((cmp->netfn == netfn) && (cmp->cmd == cmd)) {
 			rv = -EBUSY;
 			break;
@@ -695,7 +863,6 @@ int ipmi_unregister_for_cmd(ipmi_user_t   user,
 			    unsigned char netfn,
 			    unsigned char cmd)
 {
-	struct list_head *entry;
 	unsigned long    flags;
 	struct cmd_rcvr  *rcvr;
 	int              rv = -ENOENT;
@@ -703,11 +870,10 @@ int ipmi_unregister_for_cmd(ipmi_user_t   user,
 	read_lock(&(user->intf->users_lock));
 	write_lock_irqsave(&(user->intf->cmd_rcvr_lock), flags);
 	/* Make sure the command/netfn is not already registered. */
-	list_for_each(entry, &(user->intf->cmd_rcvrs)) {
-		rcvr = list_entry(entry, struct cmd_rcvr, link);
+	list_for_each_entry(rcvr, &(user->intf->cmd_rcvrs), link) {
 		if ((rcvr->netfn == netfn) && (rcvr->cmd == cmd)) {
 			rv = 0;
-			list_del(entry);
+			list_del(&rcvr->link);
 			kfree(rcvr);
 			break;
 		}
@@ -771,6 +937,43 @@ static inline void format_ipmb_msg(struct ipmi_smi_msg   *smi_msg,
 	smi_msg->msgid = msgid;
 }
 
+static inline void format_lan_msg(struct ipmi_smi_msg   *smi_msg,
+				  struct ipmi_msg       *msg,
+				  struct ipmi_lan_addr  *lan_addr,
+				  long                  msgid,
+				  unsigned char         ipmb_seq,
+				  unsigned char         source_lun)
+{
+	/* Format the Send Message header for a LAN address. */
+	smi_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
+	smi_msg->data[1] = IPMI_SEND_MSG_CMD;
+	smi_msg->data[2] = lan_addr->channel;
+	smi_msg->data[3] = lan_addr->session_handle;
+	smi_msg->data[4] = lan_addr->remote_SWID;
+	smi_msg->data[5] = (msg->netfn << 2) | (lan_addr->lun & 0x3);
+	smi_msg->data[6] = ipmb_checksum(&(smi_msg->data[4]), 2);
+	smi_msg->data[7] = lan_addr->local_SWID;
+	smi_msg->data[8] = (ipmb_seq << 2) | source_lun;
+	smi_msg->data[9] = msg->cmd;
+
+	/* Now tack on the data to the message. */
+	if (msg->data_len > 0)
+		memcpy(&(smi_msg->data[10]), msg->data,
+		       msg->data_len);
+	smi_msg->data_size = msg->data_len + 10;
+
+	/* Now calculate the checksum and tack it on. */
+	smi_msg->data[smi_msg->data_size]
+		= ipmb_checksum(&(smi_msg->data[7]),
+				smi_msg->data_size-7);
+
+	/* Add on the checksum size (LAN messages have no
+	   broadcast offset). */
+	smi_msg->data_size += 1;
+
+	smi_msg->msgid = msgid;
+}
+
 /* Separate from ipmi_request so that the user does not have to be
    supplied in certain circumstances (mainly at panic time).  If
    messages are supplied, they will be freed, even if an error
@@ -780,11 +983,14 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 				 struct ipmi_addr     *addr,
 				 long                 msgid,
 				 struct ipmi_msg      *msg,
+				 void                 *user_msg_data,
 				 void                 *supplied_smi,
 				 struct ipmi_recv_msg *supplied_recv,
 				 int                  priority,
 				 unsigned char        source_address,
-				 unsigned char        source_lun)
+				 unsigned char        source_lun,
+				 int                  retries,
+				 unsigned int         retry_time_ms)
 {
 	int                  rv = 0;
 	struct ipmi_smi_msg  *smi_msg;
@@ -800,6 +1006,7 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 			return -ENOMEM;
 		}
 	}
+	recv_msg->user_msg_data = user_msg_data;
 
 	if (supplied_smi) {
 		smi_msg = (struct ipmi_smi_msg *) supplied_smi;
@@ -811,11 +1018,6 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		}
 	}
 
-	if (addr->channel > IPMI_NUM_CHANNELS) {
-	    rv = -EINVAL;
-	    goto out_err;
-	}
-
 	recv_msg->user = user;
 	recv_msg->msgid = msgid;
 	/* Store the message to send in the receive message so timeout
@@ -825,10 +1027,20 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 	if (addr->addr_type == IPMI_SYSTEM_INTERFACE_ADDR_TYPE) {
 		struct ipmi_system_interface_addr *smi_addr;
 
+		if (msg->netfn & 1) {
+			/* Responses are not allowed to the SMI. */
+			rv = -EINVAL;
+			goto out_err;
+		}
 
 		smi_addr = (struct ipmi_system_interface_addr *) addr;
-		if (smi_addr->lun > 3)
-			return -EINVAL;
+		if (smi_addr->lun > 3) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
 
 		memcpy(&recv_msg->addr, smi_addr, sizeof(*smi_addr));
 
@@ -839,11 +1051,17 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		{
 			/* We don't let the user do these, since we manage
 			   the sequence numbers. */
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EINVAL;
 			goto out_err;
 		}
 
 		if ((msg->data_len + 2) > IPMI_MAX_MSG_LENGTH) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EMSGSIZE;
 			goto out_err;
 		}
@@ -855,41 +1073,69 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		if (msg->data_len > 0)
 			memcpy(&(smi_msg->data[2]), msg->data, msg->data_len);
 		smi_msg->data_size = msg->data_len + 2;
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->sent_local_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 	} else if ((addr->addr_type == IPMI_IPMB_ADDR_TYPE)
 		   || (addr->addr_type == IPMI_IPMB_BROADCAST_ADDR_TYPE))
 	{
 		struct ipmi_ipmb_addr *ipmb_addr;
 		unsigned char         ipmb_seq;
 		long                  seqid;
-		int                   broadcast;
-		int                   retries;
+		int                   broadcast = 0;
+
+		if (addr->channel > IPMI_NUM_CHANNELS) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
 
-		if (addr == NULL) {
+		if (intf->channels[addr->channel].medium
+		    != IPMI_CHANNEL_MEDIUM_IPMB)
+		{
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EINVAL;
 			goto out_err;
 		}
 
+		if (retries < 0) {
+		    if (addr->addr_type == IPMI_IPMB_BROADCAST_ADDR_TYPE)
+			retries = 0; /* Don't retry broadcasts. */
+		    else
+			retries = 4;
+		}
 		if (addr->addr_type == IPMI_IPMB_BROADCAST_ADDR_TYPE) {
 		    /* Broadcasts add a zero at the beginning of the
 		       message, but otherwise is the same as an IPMB
 		       address. */
 		    addr->addr_type = IPMI_IPMB_ADDR_TYPE;
 		    broadcast = 1;
-		    retries = 0; /* Don't retry broadcasts. */
-		} else {
-		    broadcast = 0;
-		    retries = 4;
 		}
 
+
+		/* Default to 1 second retries. */
+		if (retry_time_ms == 0)
+		    retry_time_ms = 1000;
+
 		/* 9 for the header and 1 for the checksum, plus
                    possibly one for the broadcast. */
 		if ((msg->data_len + 10 + broadcast) > IPMI_MAX_MSG_LENGTH) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EMSGSIZE;
 			goto out_err;
 		}
 
 		ipmb_addr = (struct ipmi_ipmb_addr *) addr;
 		if (ipmb_addr->lun > 3) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EINVAL;
 			goto out_err;
 		}
@@ -899,21 +1145,32 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		if (recv_msg->msg.netfn & 0x1) {
 			/* It's a response, so use the user's sequence
                            from msgid. */
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_ipmb_responses++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			format_ipmb_msg(smi_msg, msg, ipmb_addr, msgid,
 					msgid, broadcast,
 					source_address, source_lun);
+
+			/* Save the receive message so we can use it
+			   to deliver the response. */
+			smi_msg->user_data = recv_msg;
 		} else {
 			/* It's a command, so get a sequence for it. */
 
 			spin_lock_irqsave(&(intf->seq_lock), flags);
 
+			spin_lock(&intf->counter_lock);
+			intf->sent_ipmb_commands++;
+			spin_unlock(&intf->counter_lock);
+
 			/* Create a sequence number with a 1 second
                            timeout and 4 retries. */
-			/* FIXME - magic number for the timeout. */
 			rv = intf_next_seq(intf,
 					   recv_msg,
-					   1000,
+					   retry_time_ms,
 					   retries,
+					   broadcast,
 					   &ipmb_seq,
 					   &seqid);
 			if (rv) {
@@ -939,6 +1196,117 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 			recv_msg->msg.data = recv_msg->msg_data;
 			recv_msg->msg.data_len = smi_msg->data_size;
 
+			/* We don't unlock until here, because we need
+                           to copy the completed message into the
+                           recv_msg before we release the lock.
+                           Otherwise, race conditions may bite us.  I
+                           know that's pretty paranoid, but I prefer
+                           to be correct. */
+			spin_unlock_irqrestore(&(intf->seq_lock), flags);
+		}
+	} else if (addr->addr_type == IPMI_LAN_ADDR_TYPE) {
+		struct ipmi_lan_addr  *lan_addr;
+		unsigned char         ipmb_seq;
+		long                  seqid;
+
+		if (addr->channel > IPMI_NUM_CHANNELS) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
+
+		if ((intf->channels[addr->channel].medium
+		    != IPMI_CHANNEL_MEDIUM_8023LAN)
+		    && (intf->channels[addr->channel].medium
+			!= IPMI_CHANNEL_MEDIUM_ASYNC))
+		{
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
+
+		retries = 4;
+
+		/* Default to 1 second retries. */
+		if (retry_time_ms == 0)
+		    retry_time_ms = 1000;
+
+		/* 11 for the header and 1 for the checksum. */
+		if ((msg->data_len + 12) > IPMI_MAX_MSG_LENGTH) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EMSGSIZE;
+			goto out_err;
+		}
+
+		lan_addr = (struct ipmi_lan_addr *) addr;
+		if (lan_addr->lun > 3) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
+
+		memcpy(&recv_msg->addr, lan_addr, sizeof(*lan_addr));
+
+		if (recv_msg->msg.netfn & 0x1) {
+			/* It's a response, so use the user's sequence
+                           from msgid. */
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_lan_responses++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			format_lan_msg(smi_msg, msg, lan_addr, msgid,
+				       msgid, source_lun);
+
+			/* Save the receive message so we can use it
+			   to deliver the response. */
+			smi_msg->user_data = recv_msg;
+		} else {
+			/* It's a command, so get a sequence for it. */
+
+			spin_lock_irqsave(&(intf->seq_lock), flags);
+
+			spin_lock(&intf->counter_lock);
+			intf->sent_lan_commands++;
+			spin_unlock(&intf->counter_lock);
+
+			/* Create a sequence number with the requested
+                           retry time (default 1 second) and 4 retries. */
+			rv = intf_next_seq(intf,
+					   recv_msg,
+					   retry_time_ms,
+					   retries,
+					   0,
+					   &ipmb_seq,
+					   &seqid);
+			if (rv) {
+				/* We have used up all the sequence numbers,
+				   probably, so abort. */
+				spin_unlock_irqrestore(&(intf->seq_lock),
+						       flags);
+				goto out_err;
+			}
+
+			/* Store the sequence number in the message,
+                           so that when the send message response
+                           comes back we can start the timer. */
+			format_lan_msg(smi_msg, msg, lan_addr,
+				       STORE_SEQ_IN_MSGID(ipmb_seq, seqid),
+				       ipmb_seq, source_lun);
+
+			/* Copy the message into the recv message data, so we
+			   can retransmit it later if necessary. */
+			memcpy(recv_msg->msg_data, smi_msg->data,
+			       smi_msg->data_size);
+			recv_msg->msg.data = recv_msg->msg_data;
+			recv_msg->msg.data_len = smi_msg->data_size;
+
 			/* We don't unlock until here, because we need
                            to copy the completed message into the
                            recv_msg before we release the lock.
@@ -949,16 +1317,19 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		}
 	} else {
 	    /* Unknown address type. */
-	    rv = -EINVAL;
-	    goto out_err;
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->sent_invalid_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+		rv = -EINVAL;
+		goto out_err;
 	}
 
 #if DEBUG_MSGING
 	{
-	    int m;
-	    for (m=0; m<smi_msg->data_size; m++)
-		printk(" %2.2x", smi_msg->data[m]);
-	    printk("\n");
+		int m;
+		for (m=0; m<smi_msg->data_size; m++)
+			printk(" %2.2x", smi_msg->data[m]);
+		printk("\n");
 	}
 #endif
 	intf->handlers->sender(intf->send_info, smi_msg, priority);
@@ -975,6 +1346,7 @@ int ipmi_request(ipmi_user_t      user,
 		 struct ipmi_addr *addr,
 		 long             msgid,
 		 struct ipmi_msg  *msg,
+		 void             *user_msg_data,
 		 int              priority)
 {
 	return i_ipmi_request(user,
@@ -982,16 +1354,42 @@ int ipmi_request(ipmi_user_t      user,
 			      addr,
 			      msgid,
 			      msg,
+			      user_msg_data,
+			      NULL, NULL,
+			      priority,
+			      user->intf->my_address,
+			      user->intf->my_lun,
+			      -1, 0);
+}
+
+int ipmi_request_settime(ipmi_user_t      user,
+			 struct ipmi_addr *addr,
+			 long             msgid,
+			 struct ipmi_msg  *msg,
+			 void             *user_msg_data,
+			 int              priority,
+			 int              retries,
+			 unsigned int     retry_time_ms)
+{
+	return i_ipmi_request(user,
+			      user->intf,
+			      addr,
+			      msgid,
+			      msg,
+			      user_msg_data,
 			      NULL, NULL,
 			      priority,
 			      user->intf->my_address,
-			      user->intf->my_lun);
+			      user->intf->my_lun,
+			      retries,
+			      retry_time_ms);
 }
 
 int ipmi_request_supply_msgs(ipmi_user_t          user,
 			     struct ipmi_addr     *addr,
 			     long                 msgid,
 			     struct ipmi_msg      *msg,
+			     void                 *user_msg_data,
 			     void                 *supplied_smi,
 			     struct ipmi_recv_msg *supplied_recv,
 			     int                  priority)
@@ -1001,17 +1399,20 @@ int ipmi_request_supply_msgs(ipmi_user_t          user,
 			      addr,
 			      msgid,
 			      msg,
+			      user_msg_data,
 			      supplied_smi,
 			      supplied_recv,
 			      priority,
 			      user->intf->my_address,
-			      user->intf->my_lun);
+			      user->intf->my_lun,
+			      -1, 0);
 }
 
 int ipmi_request_with_source(ipmi_user_t      user,
 			     struct ipmi_addr *addr,
 			     long             msgid,
 			     struct ipmi_msg  *msg,
+			     void             *user_msg_data,
 			     int              priority,
 			     unsigned char    source_address,
 			     unsigned char    source_lun)
@@ -1021,10 +1422,215 @@ int ipmi_request_with_source(ipmi_user_t      user,
 			      addr,
 			      msgid,
 			      msg,
+			      user_msg_data,
 			      NULL, NULL,
 			      priority,
 			      source_address,
-			      source_lun);
+			      source_lun,
+			      -1, 0);
+}
+
+static int ipmb_file_read_proc(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	/* proc read handler: print the interface's my_address in hex. */
+	ipmi_smi_t smi = data;
+
+	return sprintf(page, "%x\n", smi->my_address);
+}
+
+static int version_file_read_proc(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{
+	/* proc read handler: print the version as "major.minor". */
+	ipmi_smi_t smi = data;
+
+	return sprintf(page, "%d.%d\n",
+		       smi->version_major, smi->version_minor);
+}
+
+static int stat_file_read_proc(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	/* proc read handler: dump every counter, one "name: value" per line. */
+	char       *pos = page;
+	ipmi_smi_t smi = data;
+
+	pos += sprintf(pos, "sent_invalid_commands:       %d\n",
+		       smi->sent_invalid_commands);
+	pos += sprintf(pos, "sent_local_commands:         %d\n",
+		       smi->sent_local_commands);
+	pos += sprintf(pos, "handled_local_responses:     %d\n",
+		       smi->handled_local_responses);
+	pos += sprintf(pos, "unhandled_local_responses:   %d\n",
+		       smi->unhandled_local_responses);
+	pos += sprintf(pos, "sent_ipmb_commands:          %d\n",
+		       smi->sent_ipmb_commands);
+	pos += sprintf(pos, "sent_ipmb_command_errs:      %d\n",
+		       smi->sent_ipmb_command_errs);
+	pos += sprintf(pos, "retransmitted_ipmb_commands: %d\n",
+		       smi->retransmitted_ipmb_commands);
+	pos += sprintf(pos, "timed_out_ipmb_commands:     %d\n",
+		       smi->timed_out_ipmb_commands);
+	pos += sprintf(pos, "timed_out_ipmb_broadcasts:   %d\n",
+		       smi->timed_out_ipmb_broadcasts);
+	pos += sprintf(pos, "sent_ipmb_responses:         %d\n",
+		       smi->sent_ipmb_responses);
+	pos += sprintf(pos, "handled_ipmb_responses:      %d\n",
+		       smi->handled_ipmb_responses);
+	pos += sprintf(pos, "invalid_ipmb_responses:      %d\n",
+		       smi->invalid_ipmb_responses);
+	pos += sprintf(pos, "unhandled_ipmb_responses:    %d\n",
+		       smi->unhandled_ipmb_responses);
+	pos += sprintf(pos, "sent_lan_commands:           %d\n",
+		       smi->sent_lan_commands);
+	pos += sprintf(pos, "sent_lan_command_errs:       %d\n",
+		       smi->sent_lan_command_errs);
+	pos += sprintf(pos, "retransmitted_lan_commands:  %d\n",
+		       smi->retransmitted_lan_commands);
+	pos += sprintf(pos, "timed_out_lan_commands:      %d\n",
+		       smi->timed_out_lan_commands);
+	pos += sprintf(pos, "sent_lan_responses:          %d\n",
+		       smi->sent_lan_responses);
+	pos += sprintf(pos, "handled_lan_responses:       %d\n",
+		       smi->handled_lan_responses);
+	pos += sprintf(pos, "invalid_lan_responses:       %d\n",
+		       smi->invalid_lan_responses);
+	pos += sprintf(pos, "unhandled_lan_responses:     %d\n",
+		       smi->unhandled_lan_responses);
+	pos += sprintf(pos, "handled_commands:            %d\n",
+		       smi->handled_commands);
+	pos += sprintf(pos, "invalid_commands:            %d\n",
+		       smi->invalid_commands);
+	pos += sprintf(pos, "unhandled_commands:          %d\n",
+		       smi->unhandled_commands);
+	pos += sprintf(pos, "invalid_events:              %d\n",
+		       smi->invalid_events);
+	pos += sprintf(pos, "events:                      %d\n", smi->events);
+
+	return pos - page;
+}
+
+/* Create the proc file "name" in smi's proc directory and attach the
+   given handlers to it.  Returns 0, or -ENOMEM if creation failed. */
+int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
+			    read_proc_t *read_proc, write_proc_t *write_proc,
+			    void *data, struct module *owner)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry(name, 0, smi->proc_dir);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->nlink = 1;
+	entry->data = data;
+	entry->read_proc = read_proc;
+	entry->write_proc = write_proc;
+	entry->owner = owner;
+
+	return 0;
+}
+
+/* Create the per-interface proc directory, named after the number "num",
+   and populate it with the "stats", "ipmb" and "version" files.
+   Returns 0 on success or the first error encountered. */
+static int add_proc_entries(ipmi_smi_t smi, int num)
+{
+	int err;
+
+	sprintf(smi->proc_dir_name, "%d", num);
+	smi->proc_dir = proc_mkdir(smi->proc_dir_name, proc_ipmi_root);
+	if (!smi->proc_dir)
+		return -ENOMEM;
+	smi->proc_dir->owner = THIS_MODULE;
+
+	err = ipmi_smi_add_proc_entry(smi, "stats",
+				      stat_file_read_proc, NULL,
+				      smi, THIS_MODULE);
+	if (err)
+		return err;
+
+	err = ipmi_smi_add_proc_entry(smi, "ipmb",
+				      ipmb_file_read_proc, NULL,
+				      smi, THIS_MODULE);
+	if (err)
+		return err;
+
+	return ipmi_smi_add_proc_entry(smi, "version",
+				       version_file_read_proc, NULL,
+				       smi, THIS_MODULE);
+}
+
+/* Send a Get Channel Info request for channel "chan" to the BMC channel
+   using a system interface address.  No user is attached to the request;
+   returns whatever i_ipmi_request() returns. */
+static int send_channel_info_cmd(ipmi_smi_t intf, int chan)
+{
+	struct ipmi_system_interface_addr bmc_addr;
+	struct ipmi_msg                   req;
+	unsigned char                     payload[1];
+
+	/* The only request data byte is the channel number. */
+	payload[0] = chan;
+
+	req.netfn = IPMI_NETFN_APP_REQUEST;
+	req.cmd = IPMI_GET_CHANNEL_INFO_CMD;
+	req.data = payload;
+	req.data_len = 1;
+
+	bmc_addr.addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE;
+	bmc_addr.channel = IPMI_BMC_CHANNEL;
+	bmc_addr.lun = 0;
+
+	return i_ipmi_request(NULL, intf,
+			      (struct ipmi_addr *) &bmc_addr,
+			      0, &req,
+			      NULL, NULL, NULL,
+			      0,
+			      intf->my_address, intf->my_lun,
+			      -1, 0);
+}
+
+/* null_user_handler used while scanning channels at registration: record
+   the medium/protocol from a Get Channel Info response, then query the next
+   channel, waking the waiter once every channel is done or a send fails. */
+static void channel_handler(ipmi_smi_t intf, struct ipmi_smi_msg *msg)
+{
+	int rv = 0;
+	int chan;
+
+	if ((msg->rsp[0] == (IPMI_NETFN_APP_RESPONSE << 2))
+	    && (msg->rsp[1] == IPMI_GET_CHANNEL_INFO_CMD)) {
+		/* It's the one we want */
+		if (msg->rsp[2] != 0) {
+			/* Got an error from the channel, just go on. */
+			goto next_channel;
+		}
+		if (msg->rsp_size < 6) {
+			/* Message not big enough, just go on. */
+			goto next_channel;
+		}
+		chan = intf->curr_channel;
+		intf->channels[chan].medium = msg->rsp[4] & 0x7f;
+		intf->channels[chan].protocol = msg->rsp[5] & 0x1f;
+
+	next_channel:
+		intf->curr_channel++;
+		if (intf->curr_channel >= IPMI_MAX_CHANNELS)
+			wake_up(&intf->waitq);
+		else
+			rv = send_channel_info_cmd(intf, intf->curr_channel);
+
+		if (rv) {
+			/* Got an error somehow, just give up. */
+			intf->curr_channel = IPMI_MAX_CHANNELS;
+			wake_up(&intf->waitq);
+
+			printk(KERN_WARNING "ipmi_msghandler: Error sending"
+			       " channel information: 0x%x\n", rv);
+		}
+	}
 }
 
 int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
@@ -1036,7 +1642,6 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	int              i, j;
 	int              rv;
 	ipmi_smi_t       new_intf;
-	struct list_head *entry;
 	unsigned long    flags;
 
 
@@ -1055,12 +1660,16 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	new_intf = kmalloc(sizeof(*new_intf), GFP_KERNEL);
 	if (!new_intf)
 		return -ENOMEM;
+	memset(new_intf, 0, sizeof(*new_intf));
+
+	new_intf->proc_dir = NULL;
 
 	rv = -ENOMEM;
 
 	down_write(&interfaces_sem);
 	for (i=0; i<MAX_IPMI_INTERFACES; i++) {
 		if (ipmi_interfaces[i] == NULL) {
+			new_intf->intf_num = i;
 			new_intf->version_major = version_major;
 			new_intf->version_minor = version_minor;
 			new_intf->my_address = IPMI_BMC_SLAVE_ADDR;
@@ -1081,9 +1690,12 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 			INIT_LIST_HEAD(&(new_intf->waiting_events));
 			new_intf->waiting_events_count = 0;
 			rwlock_init(&(new_intf->cmd_rcvr_lock));
+			init_waitqueue_head(&new_intf->waitq);
 			INIT_LIST_HEAD(&(new_intf->cmd_rcvrs));
 			new_intf->all_cmd_rcvr = NULL;
 
+			spin_lock_init(&(new_intf->counter_lock));
+
 			spin_lock_irqsave(&interfaces_lock, flags);
 			ipmi_interfaces[i] = new_intf;
 			spin_unlock_irqrestore(&interfaces_lock, flags);
@@ -1096,46 +1708,71 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 
 	downgrade_write(&interfaces_sem);
 
+	if (rv == 0)
+		rv = add_proc_entries(*intf, i);
+
 	if (rv == 0) {
-		/* Call all the watcher interfaces to tell them that a
-		   new interface is available. */
-		down_read(&smi_watchers_sem);
-		list_for_each(entry, &smi_watchers) {
-			struct ipmi_smi_watcher *w;
-			w = list_entry(entry, struct ipmi_smi_watcher, link);
-			w->new_smi(i);
-		}
-		up_read(&smi_watchers_sem);
+		if ((version_major > 1)
+		    || ((version_major == 1) && (version_minor >= 5)))
+		{
+			/* Start scanning the channels to see what is
+			   available. */
+			(*intf)->null_user_handler = channel_handler;
+			(*intf)->curr_channel = 0;
+			rv = send_channel_info_cmd(*intf, 0);
+			if (rv)
+				goto out;
+
+			/* Wait for the channel info to be read. */
+			up_read(&interfaces_sem);
+			wait_event((*intf)->waitq,
+				   ((*intf)->curr_channel>=IPMI_MAX_CHANNELS));
+			down_read(&interfaces_sem);
+
+			if (ipmi_interfaces[i] != new_intf)
+				/* Well, it went away.  Just return. */
+				goto out;
+		} else {
+			/* Assume a single IPMB channel at zero. */
+			(*intf)->channels[0].medium = IPMI_CHANNEL_MEDIUM_IPMB;
+			(*intf)->channels[0].protocol
+				= IPMI_CHANNEL_PROTOCOL_IPMB;
+  		}
+
+		/* Call all the watcher interfaces to tell
+		   them that a new interface is available. */
+		call_smi_watchers(i);
 	}
 
+ out:
 	up_read(&interfaces_sem);
 
-	if (rv)
+	if (rv) {
+		if (new_intf->proc_dir)
+			remove_proc_entry(new_intf->proc_dir_name,
+					  proc_ipmi_root);
 		kfree(new_intf);
+	}
 
 	return rv;
 }
 
 static void free_recv_msg_list(struct list_head *q)
 {
-	struct list_head     *entry, *entry2;
-	struct ipmi_recv_msg *msg;
+	struct ipmi_recv_msg *msg, *msg2;
 
-	list_for_each_safe(entry, entry2, q) {
-		msg = list_entry(entry, struct ipmi_recv_msg, link);
-		list_del(entry);
+	list_for_each_entry_safe(msg, msg2, q, link) {
+		list_del(&msg->link);
 		ipmi_free_recv_msg(msg);
 	}
 }
 
 static void free_cmd_rcvr_list(struct list_head *q)
 {
-	struct list_head *entry, *entry2;
-	struct cmd_rcvr  *rcvr;
+	struct cmd_rcvr  *rcvr, *rcvr2;
 
-	list_for_each_safe(entry, entry2, q) {
-		rcvr = list_entry(entry, struct cmd_rcvr, link);
-		list_del(entry);
+	list_for_each_entry_safe(rcvr, rcvr2, q, link) {
+		list_del(&rcvr->link);
 		kfree(rcvr);
 	}
 }
@@ -1159,16 +1796,18 @@ static void clean_up_interface_data(ipmi_smi_t intf)
 
 int ipmi_unregister_smi(ipmi_smi_t intf)
 {
-	int              rv = -ENODEV;
-	int              i;
-	struct list_head *entry;
-	unsigned long    flags;
+	int                     rv = -ENODEV;
+	int                     i;
+	struct ipmi_smi_watcher *w;
+	unsigned long           flags;
 
 	down_write(&interfaces_sem);
 	if (list_empty(&(intf->users)))
 	{
 		for (i=0; i<MAX_IPMI_INTERFACES; i++) {
 			if (ipmi_interfaces[i] == intf) {
+				remove_proc_entry(intf->proc_dir_name,
+						  proc_ipmi_root);
 				spin_lock_irqsave(&interfaces_lock, flags);
 				ipmi_interfaces[i] = NULL;
 				clean_up_interface_data(intf);
@@ -1191,11 +1830,7 @@ int ipmi_unregister_smi(ipmi_smi_t intf)
 	/* Call all the watcher interfaces to tell them that
 	   an interface is gone. */
 	down_read(&smi_watchers_sem);
-	list_for_each(entry, &smi_watchers) {
-		struct ipmi_smi_watcher *w;
-		w = list_entry(entry,
-			       struct ipmi_smi_watcher,
-			       link);
+	list_for_each_entry(w, &smi_watchers, link) {
 		w->smi_gone(i);
 	}
 	up_read(&smi_watchers_sem);
@@ -1203,20 +1838,28 @@ int ipmi_unregister_smi(ipmi_smi_t intf)
 	return 0;
 }
 
-static int handle_get_msg_rsp(ipmi_smi_t          intf,
-			      struct ipmi_smi_msg *msg)
+static int handle_ipmb_get_msg_rsp(ipmi_smi_t          intf,
+				   struct ipmi_smi_msg *msg)
 {
 	struct ipmi_ipmb_addr ipmb_addr;
 	struct ipmi_recv_msg  *recv_msg;
+	unsigned long         flags;
 
 	
-	if (msg->rsp_size < 11)
+	/* This is 11, not 10, because the response must contain a
+	 * completion code. */
+	if (msg->rsp_size < 11) {
 		/* Message not big enough, just ignore it. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_ipmb_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		return 0;
+	}
 
-	if (msg->rsp[2] != 0)
+	if (msg->rsp[2] != 0) {
 		/* An error getting the response, just ignore it. */
 		return 0;
+	}
 
 	ipmb_addr.addr_type = IPMI_IPMB_ADDR_TYPE;
 	ipmb_addr.slave_addr = msg->rsp[6];
@@ -1235,6 +1878,9 @@ static int handle_get_msg_rsp(ipmi_smi_t          intf,
 	{
 		/* We were unable to find the sequence number,
 		   so just nuke the message. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_ipmb_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		return 0;
 	}
 
@@ -1248,26 +1894,33 @@ static int handle_get_msg_rsp(ipmi_smi_t          intf,
 	recv_msg->msg.data = recv_msg->msg_data;
 	recv_msg->msg.data_len = msg->rsp_size - 10;
 	recv_msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
+	spin_lock_irqsave(&intf->counter_lock, flags);
+	intf->handled_ipmb_responses++;
+	spin_unlock_irqrestore(&intf->counter_lock, flags);
 	deliver_response(recv_msg);
 
 	return 0;
 }
 
-static int handle_get_msg_cmd(ipmi_smi_t          intf,
-			      struct ipmi_smi_msg *msg)
+static int handle_ipmb_get_msg_cmd(ipmi_smi_t          intf,
+				   struct ipmi_smi_msg *msg)
 {
-	struct list_head *entry;
 	struct cmd_rcvr       *rcvr;
-	int              rv = 0;
-	unsigned char    netfn;
-	unsigned char    cmd;
-	ipmi_user_t      user = NULL;
+	int                   rv = 0;
+	unsigned char         netfn;
+	unsigned char         cmd;
+	ipmi_user_t           user = NULL;
 	struct ipmi_ipmb_addr *ipmb_addr;
 	struct ipmi_recv_msg  *recv_msg;
+	unsigned long         flags;
 
-	if (msg->rsp_size < 10)
+	if (msg->rsp_size < 10) {
 		/* Message not big enough, just ignore it. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		return 0;
+	}
 
 	if (msg->rsp[2] != 0) {
 		/* An error getting the response, just ignore it. */
@@ -1283,8 +1936,7 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
 		user = intf->all_cmd_rcvr;
 	} else {
 		/* Find the command/netfn. */
-		list_for_each(entry, &(intf->cmd_rcvrs)) {
-			rcvr = list_entry(entry, struct cmd_rcvr, link);
+		list_for_each_entry(rcvr, &(intf->cmd_rcvrs), link) {
 			if ((rcvr->netfn == netfn) && (rcvr->cmd == cmd)) {
 				user = rcvr->user;
 				break;
@@ -1295,6 +1947,10 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
 
 	if (user == NULL) {
 		/* We didn't find a user, deliver an error response. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+
 		msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
 		msg->data[1] = IPMI_SEND_MSG_CMD;
 		msg->data[2] = msg->rsp[3];
@@ -1309,12 +1965,25 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
 		msg->data[10] = ipmb_checksum(&(msg->data[6]), 4);
 		msg->data_size = 11;
 
+#if DEBUG_MSGING
+	{
+		int m;
+		printk("Invalid command:");
+		for (m=0; m<msg->data_size; m++)
+			printk(" %2.2x", msg->data[m]);
+		printk("\n");
+	}
+#endif
 		intf->handlers->sender(intf->send_info, msg, 0);
 
 		rv = -1; /* We used the message, so return the value that
 			    causes it to not be freed or queued. */
 	} else {
 		/* Deliver the message to the user. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->handled_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+
 		recv_msg = ipmi_alloc_recv_msg();
 		if (! recv_msg) {
 			/* We couldn't allocate memory for the
@@ -1322,18 +1991,24 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
                            later. */
 			rv = 1;
 		} else {
+			/* Extract the source address from the data. */
 			ipmb_addr = (struct ipmi_ipmb_addr *) &recv_msg->addr;
 			ipmb_addr->addr_type = IPMI_IPMB_ADDR_TYPE;
 			ipmb_addr->slave_addr = msg->rsp[6];
 			ipmb_addr->lun = msg->rsp[7] & 3;
-			ipmb_addr->channel = msg->rsp[3];
+			ipmb_addr->channel = msg->rsp[3] & 0xf;
 
+			/* Extract the rest of the message information
+			   from the IPMB header.*/
 			recv_msg->user = user;
 			recv_msg->recv_type = IPMI_CMD_RECV_TYPE;
 			recv_msg->msgid = msg->rsp[7] >> 2;
 			recv_msg->msg.netfn = msg->rsp[4] >> 2;
 			recv_msg->msg.cmd = msg->rsp[8];
 			recv_msg->msg.data = recv_msg->msg_data;
+
+			/* We chop off 10, not 9 bytes because the checksum
+			   at the end also needs to be removed. */
 			recv_msg->msg.data_len = msg->rsp_size - 10;
 			memcpy(recv_msg->msg_data,
 			       &(msg->rsp[9]),
@@ -1345,6 +2020,169 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
 	return rv;
 }
 
+/* Handle a Get Message response that carries a reply, received on a LAN or
+   async channel, to a command sent earlier.  Always returns 0. */
+static int handle_lan_get_msg_rsp(ipmi_smi_t          intf,
+				  struct ipmi_smi_msg *msg)
+{
+	struct ipmi_lan_addr  src;
+	struct ipmi_recv_msg  *reply;
+	unsigned long         flags;
+
+	/* The minimum is 13 bytes rather than 12 because a response must
+	   carry a completion code. */
+	if (msg->rsp_size < 13) {
+		/* Too short to be valid: count it and drop it. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_lan_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+		return 0;
+	}
+
+	/* An error was reported retrieving the response, so there is
+	   nothing to deliver; ignore it. */
+	if (msg->rsp[2] != 0)
+		return 0;
+
+	/* Build the address the reply came from. */
+	src.addr_type = IPMI_LAN_ADDR_TYPE;
+	src.channel = msg->rsp[3] & 0x0f;
+	src.privilege = msg->rsp[3] >> 4;
+	src.session_handle = msg->rsp[4];
+	src.local_SWID = msg->rsp[5];
+	src.remote_SWID = msg->rsp[8];
+	src.lun = msg->rsp[9] & 3;
+
+	/* It's a response from a remote entity.  Look up the outstanding
+	   request it belongs to. */
+	if (intf_find_seq(intf,
+			  msg->rsp[9] >> 2,
+			  msg->rsp[3] & 0x0f,
+			  msg->rsp[10],
+			  (msg->rsp[6] >> 2) & (~1),
+			  (struct ipmi_addr *) &src,
+			  &reply)) {
+		/* No matching request, so count the reply as unhandled
+		   and drop it. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_lan_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+		return 0;
+	}
+
+	/* Hand the payload (rsp[11] onward) to the receive message. */
+	memcpy(reply->msg_data, &(msg->rsp[11]), msg->rsp_size - 11);
+	reply->msg.data = reply->msg_data;
+	reply->msg.data_len = msg->rsp_size - 12;
+	/* The request's other fields already match; only netfn must be
+	   replaced with the response value that was actually returned. */
+	reply->msg.netfn = msg->rsp[6] >> 2;
+	reply->recv_type = IPMI_RESPONSE_RECV_TYPE;
+
+	spin_lock_irqsave(&intf->counter_lock, flags);
+	intf->handled_lan_responses++;
+	spin_unlock_irqrestore(&intf->counter_lock, flags);
+	deliver_response(reply);
+
+	return 0;
+}
+
+/* Handle a Get Message response that carries a command sent to us over a
+   LAN or async channel.  Returns 0 if the SMI message can be freed, or 1
+   to have it requeued because no receive message could be allocated. */
+static int handle_lan_get_msg_cmd(ipmi_smi_t          intf,
+				  struct ipmi_smi_msg *msg)
+{
+	struct cmd_rcvr       *rcvr;
+	unsigned char         netfn;
+	unsigned char         cmd;
+	ipmi_user_t           user = NULL;
+	struct ipmi_lan_addr  *lan_addr;
+	struct ipmi_recv_msg  *recv_msg;
+	unsigned long         flags;
+
+	if (msg->rsp_size < 12) {
+		/* Too short to hold a command: count it and drop it. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+		return 0;
+	}
+
+	/* An error was reported retrieving the message; ignore it. */
+	if (msg->rsp[2] != 0)
+		return 0;
+
+	/* Identify the command so a receiver can be looked up. */
+	netfn = msg->rsp[6] >> 2;
+	cmd = msg->rsp[10];
+
+	/* Pick the receiving user: intf->all_cmd_rcvr if it is set,
+	   otherwise the user of the cmd_rcvrs entry for this netfn/cmd. */
+	read_lock(&(intf->cmd_rcvr_lock));
+	if (intf->all_cmd_rcvr) {
+		user = intf->all_cmd_rcvr;
+	} else {
+		list_for_each_entry(rcvr, &(intf->cmd_rcvrs), link) {
+			if ((rcvr->netfn != netfn) || (rcvr->cmd != cmd))
+				continue;
+			user = rcvr->user;
+			break;
+		}
+	}
+	read_unlock(&(intf->cmd_rcvr_lock));
+
+	if (user == NULL) {
+		/* We didn't find a user.  Count the command and return 0
+		   so the message is simply allowed to be freed. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+		return 0;
+	}
+
+	/* Deliver the message to the user. */
+	spin_lock_irqsave(&intf->counter_lock, flags);
+	intf->handled_commands++;
+	spin_unlock_irqrestore(&intf->counter_lock, flags);
+
+	recv_msg = ipmi_alloc_recv_msg();
+	if (! recv_msg) {
+		/* We couldn't allocate memory for the message, so ask
+		   for it to be requeued and handled later. */
+		return 1;
+	}
+
+	/* Extract the source address from the data. */
+	lan_addr = (struct ipmi_lan_addr *) &recv_msg->addr;
+	lan_addr->addr_type = IPMI_LAN_ADDR_TYPE;
+	lan_addr->channel = msg->rsp[3] & 0xf;
+	lan_addr->privilege = msg->rsp[3] >> 4;
+	lan_addr->session_handle = msg->rsp[4];
+	lan_addr->local_SWID = msg->rsp[5];
+	lan_addr->remote_SWID = msg->rsp[8];
+	lan_addr->lun = msg->rsp[9] & 3;
+
+	/* Extract the rest of the message information from the
+	   message header. */
+	recv_msg->user = user;
+	recv_msg->recv_type = IPMI_CMD_RECV_TYPE;
+	recv_msg->msgid = msg->rsp[9] >> 2;
+	recv_msg->msg.netfn = msg->rsp[6] >> 2;
+	recv_msg->msg.cmd = msg->rsp[10];
+	recv_msg->msg.data = recv_msg->msg_data;
+
+	/* We chop off 12, not 11 bytes because the checksum at the
+	   end also needs to be removed. */
+	recv_msg->msg.data_len = msg->rsp_size - 12;
+	memcpy(recv_msg->msg_data,
+	       &(msg->rsp[11]),
+	       msg->rsp_size - 12);
+	deliver_response(recv_msg);
+
+	return 0;
+}
+
 static void copy_event_into_recv_msg(struct ipmi_recv_msg *recv_msg,
 				     struct ipmi_smi_msg  *msg)
 {
@@ -1368,9 +2206,8 @@ static void copy_event_into_recv_msg(struct ipmi_recv_msg *recv_msg,
 static int handle_read_event_rsp(ipmi_smi_t          intf,
 				 struct ipmi_smi_msg *msg)
 {
-	struct ipmi_recv_msg *recv_msg;
+	struct ipmi_recv_msg *recv_msg, *recv_msg2;
 	struct list_head     msgs;
-	struct list_head     *entry, *entry2;
 	ipmi_user_t          user;
 	int                  rv = 0;
 	int                  deliver_count = 0;
@@ -1378,6 +2215,9 @@ static int handle_read_event_rsp(ipmi_smi_t          intf,
 
 	if (msg->rsp_size < 19) {
 		/* Message is too small to be an IPMB event. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_events++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		return 0;
 	}
 
@@ -1390,21 +2230,20 @@ static int handle_read_event_rsp(ipmi_smi_t          intf,
 
 	spin_lock_irqsave(&(intf->events_lock), flags);
 
+	spin_lock(&intf->counter_lock);
+	intf->events++;
+	spin_unlock(&intf->counter_lock);
+
 	/* Allocate and fill in one message for every user that is getting
 	   events. */
-	list_for_each(entry, &(intf->users)) {
-		user = list_entry(entry, struct ipmi_user, link);
-
+	list_for_each_entry(user, &(intf->users), link) {
 		if (! user->gets_events)
 			continue;
 
 		recv_msg = ipmi_alloc_recv_msg();
 		if (! recv_msg) {
-			list_for_each_safe(entry, entry2, &msgs) {
-				recv_msg = list_entry(entry,
-						      struct ipmi_recv_msg,
-						      link);
-				list_del(entry);
+			list_for_each_entry_safe(recv_msg, recv_msg2, &msgs, link) {
+				list_del(&recv_msg->link);
 				ipmi_free_recv_msg(recv_msg);
 			}
 			/* We couldn't allocate memory for the
@@ -1423,11 +2262,8 @@ static int handle_read_event_rsp(ipmi_smi_t          intf,
 
 	if (deliver_count) {
 		/* Now deliver all the messages. */
-		list_for_each_safe(entry, entry2, &msgs) {
-			recv_msg = list_entry(entry,
-					      struct ipmi_recv_msg,
-					      link);
-			list_del(entry);
+		list_for_each_entry_safe(recv_msg, recv_msg2, &msgs, link) {
+			list_del(&recv_msg->link);
 			deliver_response(recv_msg);
 		}
 	} else if (intf->waiting_events_count < MAX_EVENTS_IN_QUEUE) {
@@ -1462,15 +2298,14 @@ static int handle_bmc_rsp(ipmi_smi_t          intf,
 {
 	struct ipmi_recv_msg *recv_msg;
 	int                  found = 0;
-	struct list_head     *entry;
+	struct ipmi_user     *user;
+	unsigned long        flags;
 
 	recv_msg = (struct ipmi_recv_msg *) msg->user_data;
 
 	/* Make sure the user still exists. */
-	list_for_each(entry, &(intf->users)) {
-		if (list_entry(entry, struct ipmi_user, link)
-		    == recv_msg->user)
-		{
+	list_for_each_entry(user, &(intf->users), link) {
+		if (user == recv_msg->user) {
 			/* Found it, so we can deliver it */
 			found = 1;
 			break;
@@ -1482,10 +2317,16 @@ static int handle_bmc_rsp(ipmi_smi_t          intf,
 		if (!recv_msg->user && intf->null_user_handler)
 			intf->null_user_handler(intf, msg);
 		/* The user for the message went away, so give up. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_local_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		ipmi_free_recv_msg(recv_msg);
 	} else {
 		struct ipmi_system_interface_addr *smi_addr;
 
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->handled_local_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		recv_msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
 		recv_msg->msgid = msg->msgid;
 		smi_addr = ((struct ipmi_system_interface_addr *)
@@ -1513,28 +2354,86 @@ static int handle_new_recv_msg(ipmi_smi_t          intf,
 			       struct ipmi_smi_msg *msg)
 {
 	int requeue;
+	int chan;
 
+#if DEBUG_MSGING
+	int m;
+	printk("Recv:");
+	for (m=0; m<msg->rsp_size; m++)
+		printk(" %2.2x", msg->rsp[m]);
+	printk("\n");
+#endif
 	if (msg->rsp_size < 2) {
 		/* Message is too small to be correct. */
 		requeue = 0;
-	} else if (msg->rsp[1] == IPMI_GET_MSG_CMD) {
-#if DEBUG_MSGING
-		int m;
-		printk("Response:");
-		for (m=0; m<msg->rsp_size; m++)
-			printk(" %2.2x", msg->rsp[m]);
-		printk("\n");
-#endif
+	} else if ((msg->rsp[0] == ((IPMI_NETFN_APP_REQUEST|1) << 2))
+		   && (msg->rsp[1] == IPMI_SEND_MSG_CMD)
+		   && (msg->user_data != NULL))
+	{
+		/* It's a response to a response we sent.  For this we
+		   deliver a send message response to the user. */
+		struct ipmi_recv_msg *recv_msg = msg->user_data;
+
+		requeue = 0;
+		if (msg->rsp_size < 2)
+			/* Message is too small to be correct. */
+			goto out;
+
+		chan = msg->data[2] & 0x0f;
+		if (chan >= IPMI_MAX_CHANNELS)
+			/* Invalid channel number */
+			goto out;
+
+		if (recv_msg) {
+			recv_msg->recv_type = IPMI_RESPONSE_RESPONSE_TYPE;
+			recv_msg->msg.data = recv_msg->msg_data;
+			recv_msg->msg.data_len = 1;
+			recv_msg->msg_data[0] = msg->rsp[2];
+			deliver_response(recv_msg);
+		}
+	} else if ((msg->rsp[0] == ((IPMI_NETFN_APP_REQUEST|1) << 2))
+		   && (msg->rsp[1] == IPMI_GET_MSG_CMD))
+	{
 		/* It's from the receive queue. */
-		if (msg->rsp[4] & 0x04) {
-			/* It's a response, so find the
-			   requesting message and send it up. */
-			requeue = handle_get_msg_rsp(intf, msg);
-		} else {
-			/* It's a command to the SMS from some other
-			   entity.  Handle that. */
-			requeue = handle_get_msg_cmd(intf, msg);
+		chan = msg->rsp[3] & 0xf;
+		if (chan >= IPMI_MAX_CHANNELS) {
+			/* Invalid channel number */
+			requeue = 0;
+			goto out;
+		}
+
+		switch (intf->channels[chan].medium) {
+		case IPMI_CHANNEL_MEDIUM_IPMB:
+			if (msg->rsp[4] & 0x04) {
+				/* It's a response, so find the
+				   requesting message and send it up. */
+				requeue = handle_ipmb_get_msg_rsp(intf, msg);
+			} else {
+				/* It's a command to the SMS from some other
+				   entity.  Handle that. */
+				requeue = handle_ipmb_get_msg_cmd(intf, msg);
+			}
+			break;
+
+		case IPMI_CHANNEL_MEDIUM_8023LAN:
+		case IPMI_CHANNEL_MEDIUM_ASYNC:
+			if (msg->rsp[6] & 0x04) {
+				/* It's a response, so find the
+				   requesting message and send it up. */
+				requeue = handle_lan_get_msg_rsp(intf, msg);
+			} else {
+				/* It's a command to the SMS from some other
+				   entity.  Handle that. */
+				requeue = handle_lan_get_msg_cmd(intf, msg);
+			}
+			break;
+
+		default:
+			/* We don't handle the channel type, so just
+			 * free the message. */
+			requeue = 0;
 		}
+
 	} else if (msg->rsp[1] == IPMI_READ_EVENT_MSG_BUFFER_CMD) {
 		/* It's an asyncronous event. */
 		requeue = handle_read_event_rsp(intf, msg);
@@ -1543,6 +2442,7 @@ static int handle_new_recv_msg(ipmi_smi_t          intf,
 		requeue = handle_bmc_rsp(intf, msg);
 	}
 
+ out:
 	return requeue;
 }
 
@@ -1558,10 +2458,43 @@ void ipmi_smi_msg_received(ipmi_smi_t          intf,
 	   working on it. */
 	read_lock(&(intf->users_lock));
 
-	if ((msg->data_size >= 2) && (msg->data[1] == IPMI_SEND_MSG_CMD)) {
-		/* This is the local response to a send, start the
-                   timer for these. */
-		intf_start_seq_timer(intf, msg->msgid);
+	if ((msg->data_size >= 2)
+	    && (msg->data[0] == (IPMI_NETFN_APP_REQUEST << 2))
+	    && (msg->data[1] == IPMI_SEND_MSG_CMD)
+	    && (msg->user_data == NULL)) {
+		/* This is the local response to a command send, start
+                   the timer for these.  The user_data will not be
+                   NULL if this is a response send, and we will let
+                   response sends just go through. */
+
+		/* Check for errors, if we get certain errors (ones
+                   that mean basically we can try again later), we
+                   ignore them and start the timer.  Otherwise we
+                   report the error immediately. */
+		if ((msg->rsp_size >= 3) && (msg->rsp[2] != 0)
+		    && (msg->rsp[2] != IPMI_NODE_BUSY_ERR)
+		    && (msg->rsp[2] != IPMI_LOST_ARBITRATION_ERR))
+		{
+			int chan = msg->rsp[3] & 0xf;
+
+			/* Got an error sending the message, handle it. */
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			if (chan >= IPMI_MAX_CHANNELS)
+				; /* This shouldn't happen */
+			else if ((intf->channels[chan].medium
+				  == IPMI_CHANNEL_MEDIUM_8023LAN)
+				 || (intf->channels[chan].medium
+				     == IPMI_CHANNEL_MEDIUM_ASYNC))
+				intf->sent_lan_command_errs++;
+			else
+				intf->sent_ipmb_command_errs++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			intf_err_seq(intf, msg->msgid, msg->rsp[2]);
+		} else {
+			/* The message was sent, start the timer. */
+			intf_start_seq_timer(intf, msg->msgid);
+		}
+
 		ipmi_free_smi_msg(msg);
 		goto out_unlock;
 	}
@@ -1593,13 +2526,10 @@ void ipmi_smi_msg_received(ipmi_smi_t          intf,
 
 void ipmi_smi_watchdog_pretimeout(ipmi_smi_t intf)
 {
-	struct list_head *entry;
-	ipmi_user_t      user;
+	ipmi_user_t user;
 
 	read_lock(&(intf->users_lock));
-	list_for_each(entry, &(intf->users)) {
-		user = list_entry(entry, struct ipmi_user, link);
-
+	list_for_each_entry(user, &(intf->users), link) {
 		if (! user->handler->ipmi_watchdog_pretimeout)
 			continue;
 
@@ -1657,10 +2587,9 @@ ipmi_timeout_handler(long timeout_period)
 {
 	ipmi_smi_t           intf;
 	struct list_head     timeouts;
-	struct ipmi_recv_msg *msg;
-	struct ipmi_smi_msg  *smi_msg;
+	struct ipmi_recv_msg *msg, *msg2;
+	struct ipmi_smi_msg  *smi_msg, *smi_msg2;
 	unsigned long        flags;
-	struct list_head     *entry, *entry2;
 	int                  i, j;
 
 	INIT_LIST_HEAD(&timeouts);
@@ -1675,10 +2604,9 @@ ipmi_timeout_handler(long timeout_period)
 
 		/* See if any waiting messages need to be processed. */
 		spin_lock_irqsave(&(intf->waiting_msgs_lock), flags);
-		list_for_each_safe(entry, entry2, &(intf->waiting_msgs)) {
-			smi_msg = list_entry(entry, struct ipmi_smi_msg, link);
+		list_for_each_entry_safe(smi_msg, smi_msg2, &(intf->waiting_msgs), link) {
 			if (! handle_new_recv_msg(intf, smi_msg)) {
-				list_del(entry);
+				list_del(&smi_msg->link);
 				ipmi_free_smi_msg(smi_msg);
 			} else {
 				/* To preserve message order, quit if we
@@ -1706,6 +2634,15 @@ ipmi_timeout_handler(long timeout_period)
 				ent->inuse = 0;
 				msg = ent->recv_msg;
 				list_add_tail(&(msg->link), &timeouts);
+				spin_lock(&intf->counter_lock);
+				if (ent->broadcast)
+					intf->timed_out_ipmb_broadcasts++;
+				else if (ent->recv_msg->addr.addr_type
+					 == IPMI_LAN_ADDR_TYPE)
+					intf->timed_out_lan_commands++;
+				else
+					intf->timed_out_ipmb_commands++;
+				spin_unlock(&intf->counter_lock);
 			} else {
 				/* More retries, send again. */
 
@@ -1715,12 +2652,18 @@ ipmi_timeout_handler(long timeout_period)
 				ent->retries_left--;
 				send_from_recv_msg(intf, ent->recv_msg, NULL,
 						   j, ent->seqid);
+				spin_lock(&intf->counter_lock);
+				if (ent->recv_msg->addr.addr_type
+				    == IPMI_LAN_ADDR_TYPE)
+					intf->retransmitted_lan_commands++;
+				else
+					intf->retransmitted_ipmb_commands++;
+				spin_unlock(&intf->counter_lock);
 			}
 		}
 		spin_unlock_irqrestore(&(intf->seq_lock), flags);
 
-		list_for_each_safe(entry, entry2, &timeouts) {
-			msg = list_entry(entry, struct ipmi_recv_msg, link);
+		list_for_each_entry_safe(msg, msg2, &timeouts, link) {
 			handle_msg_timeout(msg);
 		}
 
@@ -1747,13 +2690,16 @@ static void ipmi_request_event(void)
 
 static struct timer_list ipmi_timer;
 
-/* Call every 100 ms. */
+/* Call every ~100 ms. */
 #define IPMI_TIMEOUT_TIME	100
-#define IPMI_TIMEOUT_JIFFIES	((IPMI_TIMEOUT_TIME * HZ)/1000)
 
-/* Request events from the queue every second.  Hopefully, in the
-   future, IPMI will add a way to know immediately if an event is
-   in the queue. */
+/* How many jiffies does it take to get to the timeout time. */
+#define IPMI_TIMEOUT_JIFFIES	((IPMI_TIMEOUT_TIME * HZ) / 1000)
+
+/* Request events from the queue every second (this is the number of
+   IPMI_TIMEOUT_TIMES between event requests).  Hopefully, in the
+   future, IPMI will add a way to know immediately if an event is in
+   the queue and this silliness can go away. */
 #define IPMI_REQUEST_EV_TIME	(1000 / (IPMI_TIMEOUT_TIME))
 
 static volatile int stop_operation = 0;
@@ -1796,6 +2742,7 @@ struct ipmi_smi_msg *ipmi_alloc_smi_msg(void)
 	rv = kmalloc(sizeof(struct ipmi_smi_msg), GFP_ATOMIC);
 	if (rv) {
 		rv->done = free_smi_msg;
+		rv->user_data = NULL;
 		atomic_inc(&smi_msg_inuse_count);
 	}
 	return rv;
@@ -1907,11 +2854,13 @@ static void send_panic_events(char *str)
 			       &addr,
 			       0,
 			       &msg,
+			       NULL,
 			       &smi_msg,
 			       &recv_msg,
 			       0,
 			       intf->my_address,
-			       intf->my_lun);
+			       intf->my_lun,
+			       0, 1); /* Don't retry, and don't wait. */
 	}
 
 #ifdef CONFIG_IPMI_PANIC_STRING
@@ -1951,11 +2900,13 @@ static void send_panic_events(char *str)
 			       &addr,
 			       0,
 			       &msg,
+			       NULL,
 			       &smi_msg,
 			       &recv_msg,
 			       0,
 			       intf->my_address,
-			       intf->my_lun);
+			       intf->my_lun,
+			       0, 1); /* Don't retry, and don't wait. */
 
 		if (intf->local_event_generator) {
 			/* Request the event receiver from the local MC. */
@@ -1969,11 +2920,13 @@ static void send_panic_events(char *str)
 				       &addr,
 				       0,
 				       &msg,
+				       NULL,
 				       &smi_msg,
 				       &recv_msg,
 				       0,
 				       intf->my_address,
-				       intf->my_lun);
+				       intf->my_lun,
+				       0, 1); /* no retry, and no wait. */
 		}
 		intf->null_user_handler = NULL;
 
@@ -2029,11 +2982,13 @@ static void send_panic_events(char *str)
 				       &addr,
 				       0,
 				       &msg,
+				       NULL,
 				       &smi_msg,
 				       &recv_msg,
 				       0,
 				       intf->my_address,
-				       intf->my_lun);
+				       intf->my_lun,
+				       0, 1); /* no retry, and no wait. */
 		}
 	}	
 #endif /* CONFIG_IPMI_PANIC_STRING */
@@ -2075,7 +3030,6 @@ static struct notifier_block panic_block = {
 	200   /* priority: INT_MAX >= x >= 0 */
 };
 
-
 static __init int ipmi_init_msghandler(void)
 {
 	int i;
@@ -2083,10 +3037,21 @@ static __init int ipmi_init_msghandler(void)
 	if (initialized)
 		return 0;
 
+	printk(KERN_INFO "ipmi message handler version "
+	       IPMI_MSGHANDLER_VERSION "\n");
+
 	for (i=0; i<MAX_IPMI_INTERFACES; i++) {
 		ipmi_interfaces[i] = NULL;
 	}
 
+	proc_ipmi_root = proc_mkdir("ipmi", 0);
+	if (!proc_ipmi_root) {
+	    printk("Unable to create IPMI proc dir");
+	    return -ENOMEM;
+	}
+
+	proc_ipmi_root->owner = THIS_MODULE;
+
 	init_timer(&ipmi_timer);
 	ipmi_timer.data = 0;
 	ipmi_timer.function = ipmi_timeout;
@@ -2097,8 +3062,6 @@ static __init int ipmi_init_msghandler(void)
 
 	initialized = 1;
 
-	printk(KERN_INFO "ipmi: message handler initialized\n");
-
 	return 0;
 }
 
@@ -2118,9 +3081,12 @@ static __exit void cleanup_ipmi(void)
 	   problems with race conditions removing the timer here. */
 	stop_operation = 1;
 	while (!timer_stopped) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(1);
 	}
 
+	remove_proc_entry(proc_ipmi_root->name, &proc_root);
+
 	initialized = 0;
 
 	/* Check for buffer leaks. */
@@ -2143,6 +3109,7 @@ EXPORT_SYMBOL(ipmi_create_user);
 EXPORT_SYMBOL(ipmi_destroy_user);
 EXPORT_SYMBOL(ipmi_get_version);
 EXPORT_SYMBOL(ipmi_request);
+EXPORT_SYMBOL(ipmi_request_settime);
 EXPORT_SYMBOL(ipmi_request_supply_msgs);
 EXPORT_SYMBOL(ipmi_request_with_source);
 EXPORT_SYMBOL(ipmi_register_smi);
@@ -2164,3 +3131,4 @@ EXPORT_SYMBOL(ipmi_set_my_address);
 EXPORT_SYMBOL(ipmi_get_my_address);
 EXPORT_SYMBOL(ipmi_set_my_LUN);
 EXPORT_SYMBOL(ipmi_get_my_LUN);
+EXPORT_SYMBOL(ipmi_smi_add_proc_entry);
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
new file mode 100644
index 000000000000..42b7e5d22de9
--- /dev/null
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -0,0 +1,2052 @@
+/*
+ * ipmi_si_intf.c
+ *
+ * The interface to the IPMI driver for the system interfaces (KCS, SMIC,
+ * BT).
+ *
+ * Author: MontaVista Software, Inc.
+ *         Corey Minyard <minyard@mvista.com>
+ *         source@mvista.com
+ *
+ * Copyright 2002 MontaVista Software Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * This file holds the "policy" for the interface to the SMI state
+ * machine.  It does the configuration, handles timers and interrupts,
+ * and drives the real SMI state machine.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <asm/system.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/ioport.h>
+#ifdef CONFIG_HIGH_RES_TIMERS
+#include <linux/hrtime.h>
+# if defined(schedule_next_int)
+/* Old high-res timer code, do translations. */
+#  define get_arch_cycles(a) quick_update_jiffies_sub(a)
+#  define arch_cycles_per_jiffy cycles_per_jiffies
+# endif
+static inline void add_usec_to_timer(struct timer_list *t, long v)
+{
+	/* Add v microseconds, then carry whole jiffies into expires. */
+	t->sub_expires += nsec_to_arch_cycle(v * 1000);
+	while (t->sub_expires >= arch_cycles_per_jiffy) {
+		t->sub_expires -= arch_cycles_per_jiffy;
+		t->expires++;
+	}
+}
+#endif
+#include <linux/interrupt.h>
+#include <linux/rcupdate.h>
+#include <linux/ipmi_smi.h>
+#include <asm/io.h>
+#include "ipmi_si_sm.h"
+#include <linux/init.h>
+
+#define IPMI_SI_VERSION "v31"
+
+/* Measure times between events in the driver. */
+#undef DEBUG_TIMING
+
+/* Call every 10 ms. */
+#define SI_TIMEOUT_TIME_USEC	10000
+#define SI_USEC_PER_JIFFY	(1000000/HZ)
+#define SI_TIMEOUT_JIFFIES	(SI_TIMEOUT_TIME_USEC/SI_USEC_PER_JIFFY)
+#define SI_SHORT_TIMEOUT_USEC  250 /* .25ms when the SM requests a
+                                       short timeout */
+
+enum si_intf_state {	/* What the current transaction, if any, is for. */
+	SI_NORMAL,		/* Idle or running a queued user message. */
+	SI_GETTING_FLAGS,	/* GET_MSG_FLAGS sent, awaiting the flags. */
+	SI_GETTING_EVENTS,	/* READ_EVENT_MSG_BUFFER in progress. */
+	SI_CLEARING_FLAGS,	/* CLEAR_MSG_FLAGS in progress. */
+	SI_CLEARING_FLAGS_THEN_SET_IRQ,	/* As above, then start_enable_irq(). */
+	SI_GETTING_MESSAGES,	/* GET_MSG in progress. */
+	SI_ENABLE_INTERRUPTS1,	/* GET_BMC_GLOBAL_ENABLES in progress. */
+	SI_ENABLE_INTERRUPTS2	/* SET_BMC_GLOBAL_ENABLES in progress. */
+	/* FIXME - add watchdog stuff. */
+};
+
+enum si_type {	/* Flavor of system interface: KCS, SMIC or BT. */
+    SI_KCS, SI_SMIC, SI_BT
+};
+
+struct smi_info		/* Driver state for one system interface. */
+{
+	ipmi_smi_t             intf;		/* Upper layer handle for upcalls. */
+	struct si_sm_data      *si_sm;		/* State machine private data. */
+	struct si_sm_handlers  *handlers;	/* State machine operations. */
+	enum si_type           si_type;
+	spinlock_t             si_lock;		/* Held while running the SM. */
+	spinlock_t             msg_lock;	/* Guards the two xmit queues. */
+	struct list_head       xmit_msgs;	/* Normal priority send queue. */
+	struct list_head       hp_xmit_msgs;	/* High priority, picked first. */
+	struct ipmi_smi_msg    *curr_msg;	/* Message in flight, or NULL. */
+	enum si_intf_state     si_state;	/* What the SM is busy with. */
+
+	/* Used to handle the various types of I/O that can occur with
+	   IPMI. */
+	struct si_sm_io io;
+	int (*io_setup)(struct smi_info *info);
+	void (*io_cleanup)(struct smi_info *info);
+	int (*irq_setup)(struct smi_info *info);
+	void (*irq_cleanup)(struct smi_info *info);
+	unsigned int io_size;
+
+	/* Flags from the last GET_MSG_FLAGS command, used when an ATTN
+	   is set to hold the flags until we are done handling everything
+	   from the flags. */
+#define RECEIVE_MSG_AVAIL	0x01
+#define EVENT_MSG_BUFFER_FULL	0x02
+#define WDT_PRE_TIMEOUT_INT	0x08
+	unsigned char       msg_flags;
+
+	/* If set to true, this will request events the next time the
+	   state machine is idle. */
+	atomic_t            req_events;
+
+	/* If true, run the state machine to completion on every send
+	   call.  Generally used after a panic to make sure stuff goes
+	   out. */
+	int                 run_to_completion;
+
+	/* The I/O port of an SI interface. */
+	int                 port;
+
+	/* zero if no irq; */
+	int                 irq;
+
+	/* The timer for this si. */
+	struct timer_list   si_timer;
+
+	/* The time (in jiffies) the last timeout occurred at. */
+	unsigned long       last_timeout_jiffies;
+
+	/* Used to gracefully stop the timer without race conditions. */
+	volatile int        stop_operation;	/* Set to ask the timer to stop. */
+	volatile int        timer_stopped;	/* Set by the timer once it has. */
+
+	/* The driver will disable interrupts when it gets into a
+	   situation where it cannot handle messages due to lack of
+	   memory.  Once that situation clears up, it will re-enable
+	   interrupts. */
+	int interrupt_disabled;
+
+	unsigned char ipmi_si_dev_rev;
+	unsigned char ipmi_si_fw_rev_major;
+	unsigned char ipmi_si_fw_rev_minor;
+	unsigned char ipmi_version_major;
+	unsigned char ipmi_version_minor;
+
+	/* Counters and things for the proc filesystem. */
+	spinlock_t count_lock;			/* Taken around counter updates. */
+	unsigned long short_timeouts;
+	unsigned long long_timeouts;
+	unsigned long timeout_restarts;
+	unsigned long idles;
+	unsigned long interrupts;
+	unsigned long attentions;
+	unsigned long flag_fetches;
+	unsigned long hosed_count;
+	unsigned long complete_transactions;
+	unsigned long events;
+	unsigned long watchdog_pretimeouts;
+	unsigned long incoming_messages;
+};
+
+static void si_restart_short_timer(struct smi_info *smi_info);
+
+static void deliver_recv_msg(struct smi_info *smi_info,
+			     struct ipmi_smi_msg *msg)
+{
+	/* Hand the message to the upper layer with si_lock released,
+	   then retake the lock before returning to the caller. */
+	spin_unlock(&smi_info->si_lock);
+	ipmi_smi_msg_received(smi_info->intf, msg);
+	spin_lock(&smi_info->si_lock);
+}
+
+static void return_hosed_msg(struct smi_info *smi_info)
+{
+	struct ipmi_smi_msg *failed = smi_info->curr_msg;
+
+	/* Detach the message first, then fake up an error response. */
+	smi_info->curr_msg = NULL;
+	failed->rsp_size = 3;
+	failed->rsp[0] = failed->data[0] | 4;	/* Make it a response. */
+	failed->rsp[1] = failed->data[1];
+	failed->rsp[2] = 0xFF;			/* Unknown error. */
+
+	deliver_recv_msg(smi_info, failed);
+}
+
+static enum si_sm_result start_next_msg(struct smi_info *smi_info)
+{
+	int              rv;		/* SI_SM_IDLE if nothing was queued. */
+	struct list_head *entry = NULL;	/* Queue entry chosen to send. */
+#ifdef DEBUG_TIMING
+	struct timeval t;
+#endif
+
+	/* No need to save flags, we already have interrupts off and we
+	   already hold the SMI lock. */
+	spin_lock(&(smi_info->msg_lock));
+
+	/* Pick the high priority queue first. */
+	if (! list_empty(&(smi_info->hp_xmit_msgs))) {
+		entry = smi_info->hp_xmit_msgs.next;
+	} else if (! list_empty(&(smi_info->xmit_msgs))) {
+		entry = smi_info->xmit_msgs.next;
+	}
+
+	if (!entry) {
+		smi_info->curr_msg = NULL;
+		rv = SI_SM_IDLE;
+	} else {
+		int err;
+
+		list_del(entry);	/* It becomes curr_msg from here on. */
+		smi_info->curr_msg = list_entry(entry,
+						struct ipmi_smi_msg,
+						link);
+#ifdef DEBUG_TIMING
+		do_gettimeofday(&t);
+		printk("**Start2: %d.%9.9d\n", t.tv_sec, t.tv_usec);
+#endif
+		err = smi_info->handlers->start_transaction(
+			smi_info->si_sm,
+			smi_info->curr_msg->data,
+			smi_info->curr_msg->data_size);
+		if (err) {
+			return_hosed_msg(smi_info); /* NOTE(review): upcall with msg_lock held */
+		}
+
+		rv = SI_SM_CALL_WITHOUT_DELAY;	/* Returned even if err was set. */
+	}
+	spin_unlock(&(smi_info->msg_lock));
+
+	return rv;
+}
+
+static void start_enable_irq(struct smi_info *smi_info)
+{
+	/* Enabling interrupts means telling the BMC to use them; begin
+	   by asking the BMC for its current global enables. */
+	unsigned char req[2] = {
+		IPMI_NETFN_APP_REQUEST << 2,
+		IPMI_GET_BMC_GLOBAL_ENABLES_CMD
+	};
+
+	smi_info->handlers->start_transaction(smi_info->si_sm, req, 2);
+	smi_info->si_state = SI_ENABLE_INTERRUPTS1;
+}
+
+static void start_clear_flags(struct smi_info *smi_info)
+{
+	/* Make sure the watchdog pre-timeout flag is not set at startup. */
+	unsigned char req[3] = {
+		IPMI_NETFN_APP_REQUEST << 2,
+		IPMI_CLEAR_MSG_FLAGS_CMD,
+		WDT_PRE_TIMEOUT_INT
+	};
+
+	smi_info->handlers->start_transaction(smi_info->si_sm, req, 3);
+	smi_info->si_state = SI_CLEARING_FLAGS;
+}
+
+/* When we have a situation where we run out of memory and cannot
+   allocate messages, we just leave them in the BMC and run the system
+   polled until we can allocate some memory.  Once we have some
+   memory, we will re-enable the interrupt. */
+static inline void disable_si_irq(struct smi_info *smi_info)
+{
+	if (!smi_info->irq || smi_info->interrupt_disabled)
+		return;	/* No irq, or it is already off. */
+	disable_irq_nosync(smi_info->irq);
+	smi_info->interrupt_disabled = 1;
+}
+
+static inline void enable_si_irq(struct smi_info *smi_info)
+{
+	if (!smi_info->irq || !smi_info->interrupt_disabled)
+		return;	/* No irq, or it is not currently off. */
+	enable_irq(smi_info->irq);
+	smi_info->interrupt_disabled = 0;
+}
+
+static void handle_flags(struct smi_info *smi_info)
+{	/* Act on one pending msg_flags bit; the branch order is the priority. */
+	if (smi_info->msg_flags & WDT_PRE_TIMEOUT_INT) {
+		/* Watchdog pre-timeout */
+		spin_lock(&smi_info->count_lock);
+		smi_info->watchdog_pretimeouts++;
+		spin_unlock(&smi_info->count_lock);
+
+		start_clear_flags(smi_info);	/* Leaves us in SI_CLEARING_FLAGS. */
+		smi_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT;
+		spin_unlock(&(smi_info->si_lock)); /* Upcall runs unlocked. */
+		ipmi_smi_watchdog_pretimeout(smi_info->intf);
+		spin_lock(&(smi_info->si_lock));
+	} else if (smi_info->msg_flags & RECEIVE_MSG_AVAIL) {
+		/* Messages available. */
+		smi_info->curr_msg = ipmi_alloc_smi_msg();
+		if (!smi_info->curr_msg) {
+			disable_si_irq(smi_info); /* No memory: leave it in the BMC. */
+			smi_info->si_state = SI_NORMAL;
+			return;
+		}
+		enable_si_irq(smi_info);	/* No-op unless it had been disabled. */
+
+		smi_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
+		smi_info->curr_msg->data[1] = IPMI_GET_MSG_CMD;
+		smi_info->curr_msg->data_size = 2;
+
+		smi_info->handlers->start_transaction(
+			smi_info->si_sm,
+			smi_info->curr_msg->data,
+			smi_info->curr_msg->data_size);
+		smi_info->si_state = SI_GETTING_MESSAGES;
+	} else if (smi_info->msg_flags & EVENT_MSG_BUFFER_FULL) {
+		/* Events available. */
+		smi_info->curr_msg = ipmi_alloc_smi_msg();
+		if (!smi_info->curr_msg) {
+			disable_si_irq(smi_info); /* No memory: leave it in the BMC. */
+			smi_info->si_state = SI_NORMAL;
+			return;
+		}
+		enable_si_irq(smi_info);	/* No-op unless it had been disabled. */
+
+		smi_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
+		smi_info->curr_msg->data[1] = IPMI_READ_EVENT_MSG_BUFFER_CMD;
+		smi_info->curr_msg->data_size = 2;
+
+		smi_info->handlers->start_transaction(
+			smi_info->si_sm,
+			smi_info->curr_msg->data,
+			smi_info->curr_msg->data_size);
+		smi_info->si_state = SI_GETTING_EVENTS;
+	} else {
+		smi_info->si_state = SI_NORMAL;	/* None of the handled flags is set. */
+	}
+}
+
+/* Called with si_lock held when the state machine reports a completed
+   transaction.  Fetches the result and acts on it according to what
+   si_state says the transaction was for. */
+static void handle_transaction_done(struct smi_info *smi_info)
+{
+	struct ipmi_smi_msg *msg;
+#ifdef DEBUG_TIMING
+	struct timeval t;
+
+	do_gettimeofday(&t);
+	printk("**Done: %d.%9.9d\n", t.tv_sec, t.tv_usec);
+#endif
+	switch (smi_info->si_state) {
+	case SI_NORMAL:
+		if (!smi_info->curr_msg)
+			break;
+
+		smi_info->curr_msg->rsp_size
+			= smi_info->handlers->get_result(
+				smi_info->si_sm,
+				smi_info->curr_msg->rsp,
+				IPMI_MAX_MSG_LENGTH);
+
+		/* Detach curr_msg first: deliver_recv_msg() drops the lock and
+		   a new message may be started while it is released. */
+		msg = smi_info->curr_msg;
+		smi_info->curr_msg = NULL;
+		deliver_recv_msg(smi_info, msg);
+		break;
+
+	case SI_GETTING_FLAGS:
+	{
+		unsigned char msg[4];
+		unsigned int  len;
+
+		/* We got the flags from the SMI, now handle them. */
+		len = smi_info->handlers->get_result(smi_info->si_sm, msg, 4);
+		if ((len < 3) || (msg[2] != 0)) {
+			/* Truncated response or error fetching flags,
+			   just give up for now. */
+			smi_info->si_state = SI_NORMAL;
+		} else if (len < 4) {
+			/* Hmm, no flags.  That's technically illegal, but
+			   don't use uninitialized data (msg[3]). */
+			smi_info->si_state = SI_NORMAL;
+		} else {
+			smi_info->msg_flags = msg[3];
+			handle_flags(smi_info);
+		}
+		break;
+	}
+
+	case SI_CLEARING_FLAGS:
+	case SI_CLEARING_FLAGS_THEN_SET_IRQ:
+	{
+		unsigned char msg[3];
+
+		/* We cleared the flags. */
+		smi_info->handlers->get_result(smi_info->si_sm, msg, 3);
+		if (msg[2] != 0) {
+			/* Error clearing flags */
+			printk(KERN_WARNING
+			       "ipmi_si: Error clearing flags: %2.2x\n",
+			       msg[2]);
+		}
+		if (smi_info->si_state == SI_CLEARING_FLAGS_THEN_SET_IRQ)
+			start_enable_irq(smi_info);
+		else
+			smi_info->si_state = SI_NORMAL;
+		break;
+	}
+
+	case SI_GETTING_EVENTS:
+	{
+		smi_info->curr_msg->rsp_size
+			= smi_info->handlers->get_result(
+				smi_info->si_sm,
+				smi_info->curr_msg->rsp,
+				IPMI_MAX_MSG_LENGTH);
+
+		/* Detach curr_msg first: deliver_recv_msg() drops the lock and
+		   a new message may be started while it is released. */
+		msg = smi_info->curr_msg;
+		smi_info->curr_msg = NULL;
+		if (msg->rsp[2] != 0) {
+			/* Error getting event, probably done. */
+			msg->done(msg);
+
+			/* Take off the event flag. */
+			smi_info->msg_flags &= ~EVENT_MSG_BUFFER_FULL;
+		} else {
+			spin_lock(&smi_info->count_lock);
+			smi_info->events++;
+			spin_unlock(&smi_info->count_lock);
+
+			deliver_recv_msg(smi_info, msg);
+		}
+		handle_flags(smi_info);
+		break;
+	}
+
+	case SI_GETTING_MESSAGES:
+	{
+		smi_info->curr_msg->rsp_size
+			= smi_info->handlers->get_result(
+				smi_info->si_sm,
+				smi_info->curr_msg->rsp,
+				IPMI_MAX_MSG_LENGTH);
+
+		/* Detach curr_msg first: deliver_recv_msg() drops the lock and
+		   a new message may be started while it is released. */
+		msg = smi_info->curr_msg;
+		smi_info->curr_msg = NULL;
+		if (msg->rsp[2] != 0) {
+			/* Error getting message, probably done. */
+			msg->done(msg);
+
+			/* Take off the msg flag. */
+			smi_info->msg_flags &= ~RECEIVE_MSG_AVAIL;
+		} else {
+			spin_lock(&smi_info->count_lock);
+			smi_info->incoming_messages++;
+			spin_unlock(&smi_info->count_lock);
+
+			deliver_recv_msg(smi_info, msg);
+		}
+		handle_flags(smi_info);
+		break;
+	}
+
+	case SI_ENABLE_INTERRUPTS1:
+	{
+		unsigned char msg[4];
+
+		/* We got the current global enables, now set the one we need. */
+		smi_info->handlers->get_result(smi_info->si_sm, msg, 4);
+		if (msg[2] != 0) {
+			printk(KERN_WARNING
+			       "ipmi_si: Could not enable interrupts"
+			       ", failed get, using polled mode.\n");
+			smi_info->si_state = SI_NORMAL;
+		} else {
+			msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
+			msg[1] = IPMI_SET_BMC_GLOBAL_ENABLES_CMD;
+			msg[2] = msg[3] | 1; /* enable msg queue int */
+			smi_info->handlers->start_transaction(
+				smi_info->si_sm, msg, 3);
+			smi_info->si_state = SI_ENABLE_INTERRUPTS2;
+		}
+		break;
+	}
+
+	case SI_ENABLE_INTERRUPTS2:
+	{
+		unsigned char msg[4];
+
+		/* The set of the global enables is done, see if it worked. */
+		smi_info->handlers->get_result(smi_info->si_sm, msg, 4);
+		if (msg[2] != 0) {
+			printk(KERN_WARNING
+			       "ipmi_si: Could not enable interrupts"
+			       ", failed set, using polled mode.\n");
+		}
+		smi_info->si_state = SI_NORMAL;
+		break;
+	}
+	}
+}
+
+/* Called on timeouts and events.  Timeouts should pass the elapsed
+   time, interrupts should pass in zero. */
+static enum si_sm_result smi_event_handler(struct smi_info *smi_info,
+					   int time)
+{
+	enum si_sm_result si_sm_result;	/* Latest verdict from the SM. */
+
+ restart:
+	/* There used to be a loop here that waited a little while
+	   (around 25us) before giving up.  That turned out to be
+	   pointless, the minimum delays I was seeing were in the 300us
+	   range, which is far too long to wait in an interrupt.  So
+	   we just run until the state machine tells us something
+	   happened or it needs a delay. */
+	si_sm_result = smi_info->handlers->event(smi_info->si_sm, time);
+	time = 0;	/* Only the first pass is credited with elapsed time. */
+	while (si_sm_result == SI_SM_CALL_WITHOUT_DELAY)
+	{
+		si_sm_result = smi_info->handlers->event(smi_info->si_sm, 0);
+	}
+
+	if (si_sm_result == SI_SM_TRANSACTION_COMPLETE)
+	{
+		spin_lock(&smi_info->count_lock);
+		smi_info->complete_transactions++;
+		spin_unlock(&smi_info->count_lock);
+
+		handle_transaction_done(smi_info);
+		si_sm_result = smi_info->handlers->event(smi_info->si_sm, 0);
+	}
+	else if (si_sm_result == SI_SM_HOSED)
+	{
+		spin_lock(&smi_info->count_lock);
+		smi_info->hosed_count++;
+		spin_unlock(&smi_info->count_lock);
+
+		if (smi_info->curr_msg != NULL) {
+			/* If we were handling a user message, format
+                           a response to send to the upper layer to
+                           tell it about the error. */
+			return_hosed_msg(smi_info);
+		}
+		si_sm_result = smi_info->handlers->event(smi_info->si_sm, 0);
+		smi_info->si_state = SI_NORMAL;
+	}
+
+	/* We prefer handling attn over new messages. */
+	if (si_sm_result == SI_SM_ATTN)
+	{
+		unsigned char msg[2];
+
+		spin_lock(&smi_info->count_lock);
+		smi_info->attentions++;
+		spin_unlock(&smi_info->count_lock);
+
+		/* Got an attn, send down a get message flags to see
+                   what's causing it.  It would be better to handle
+                   this in the upper layer, but due to the way
+                   interrupts work with the SMI, that's not really
+                   possible. */
+		msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
+		msg[1] = IPMI_GET_MSG_FLAGS_CMD;
+
+		smi_info->handlers->start_transaction(
+			smi_info->si_sm, msg, 2);
+		smi_info->si_state = SI_GETTING_FLAGS;
+		goto restart;
+	}
+
+	/* If we are currently idle, try to start the next message. */
+	if (si_sm_result == SI_SM_IDLE) {
+		spin_lock(&smi_info->count_lock);
+		smi_info->idles++;
+		spin_unlock(&smi_info->count_lock);
+
+		si_sm_result = start_next_msg(smi_info);
+		if (si_sm_result != SI_SM_IDLE)
+			goto restart;	/* A message was started, go run it. */
+        }
+
+	if ((si_sm_result == SI_SM_IDLE)
+	    && (atomic_read(&smi_info->req_events)))
+	{
+		/* We are idle and the upper layer requested that I fetch
+		   events, so do so. */
+		unsigned char msg[2];
+
+		spin_lock(&smi_info->count_lock);
+		smi_info->flag_fetches++;
+		spin_unlock(&smi_info->count_lock);
+
+		atomic_set(&smi_info->req_events, 0);	/* Request consumed. */
+		msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
+		msg[1] = IPMI_GET_MSG_FLAGS_CMD;
+
+		smi_info->handlers->start_transaction(
+			smi_info->si_sm, msg, 2);
+		smi_info->si_state = SI_GETTING_FLAGS;
+		goto restart;
+	}
+
+	return si_sm_result;
+}
+
+static void sender(void                *send_info,
+		   struct ipmi_smi_msg *msg,
+		   int                 priority)
+{	/* Queue msg for transmission; priority > 0 selects the hp queue. */
+	struct smi_info   *smi_info = send_info;
+	enum si_sm_result result;
+	unsigned long     flags;
+#ifdef DEBUG_TIMING
+	struct timeval    t;
+#endif
+
+	spin_lock_irqsave(&(smi_info->msg_lock), flags);
+#ifdef DEBUG_TIMING
+	do_gettimeofday(&t);
+	printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec);
+#endif
+
+	if (smi_info->run_to_completion) {
+		/* If we are running to completion, then throw it in
+		   the list and run transactions until everything is
+		   clear.  Priority doesn't matter here. */
+		list_add_tail(&(msg->link), &(smi_info->xmit_msgs));
+
+		/* We have to release the msg lock and claim the smi
+		   lock in this case, because of race conditions. */
+		spin_unlock_irqrestore(&(smi_info->msg_lock), flags);
+
+		spin_lock_irqsave(&(smi_info->si_lock), flags);
+		result = smi_event_handler(smi_info, 0);
+		while (result != SI_SM_IDLE) {
+			udelay(SI_SHORT_TIMEOUT_USEC);	/* Busy-wait, then poll. */
+			result = smi_event_handler(smi_info,
+						   SI_SHORT_TIMEOUT_USEC);
+		}
+		spin_unlock_irqrestore(&(smi_info->si_lock), flags);
+		return;
+	} else {
+		if (priority > 0) {
+			list_add_tail(&(msg->link), &(smi_info->hp_xmit_msgs));
+		} else {
+			list_add_tail(&(msg->link), &(smi_info->xmit_msgs));
+		}
+	}
+	spin_unlock_irqrestore(&(smi_info->msg_lock), flags);
+
+	spin_lock_irqsave(&(smi_info->si_lock), flags);
+	if ((smi_info->si_state == SI_NORMAL)
+	    && (smi_info->curr_msg == NULL))
+	{	/* Nothing in flight, so start what was just queued. */
+		start_next_msg(smi_info);
+		si_restart_short_timer(smi_info);
+	}
+	spin_unlock_irqrestore(&(smi_info->si_lock), flags);
+}
+
+static void set_run_to_completion(void *send_info, int i_run_to_completion)
+{
+	struct smi_info *info = send_info;
+	unsigned long   irq_flags;
+	int             elapsed = 0;
+
+	spin_lock_irqsave(&info->si_lock, irq_flags);
+
+	info->run_to_completion = i_run_to_completion;
+	if (i_run_to_completion) {
+		/* Poll the state machine by hand until it goes idle,
+		   crediting it with the delay we just spun for. */
+		while (smi_event_handler(info, elapsed) != SI_SM_IDLE) {
+			elapsed = SI_SHORT_TIMEOUT_USEC;
+			udelay(elapsed);
+		}
+	}
+
+	spin_unlock_irqrestore(&info->si_lock, irq_flags);
+}
+
+static void request_events(void *send_info)
+{
+	/* Just raise the flag; smi_event_handler() acts on it (and
+	   clears it) the next time the state machine is idle. */
+	atomic_set(&((struct smi_info *) send_info)->req_events, 1);
+}
+
+static int initialized = 0;
+
+/* Must be called with interrupts off and with the si_lock held. */
+static void si_restart_short_timer(struct smi_info *smi_info)
+{
+#if defined(CONFIG_HIGH_RES_TIMERS)
+	unsigned long flags;
+	unsigned long jiffies_now;
+
+	if (del_timer(&(smi_info->si_timer))) {
+		/* Re-arm the timer to fire SI_SHORT_TIMEOUT_USEC from
+		   now.  If del_timer() found nothing pending, the
+		   timer will go off immediately anyway, so there is
+		   nothing to do in that case.
+		   We already have irqsave on, so no need for it here;
+		   xtime_lock is held only while sampling the time. */
+		read_lock(&xtime_lock);
+		jiffies_now = jiffies;
+		smi_info->si_timer.expires = jiffies_now;
+		smi_info->si_timer.sub_expires = get_arch_cycles(jiffies_now);
+		read_unlock(&xtime_lock);
+		add_usec_to_timer(&smi_info->si_timer, SI_SHORT_TIMEOUT_USEC);
+
+		add_timer(&(smi_info->si_timer));
+		spin_lock_irqsave(&smi_info->count_lock, flags);
+		smi_info->timeout_restarts++;
+		spin_unlock_irqrestore(&smi_info->count_lock, flags);
+	}
+#endif
+}
+
+static void smi_timeout(unsigned long data)
+{	/* Timer callback: run the state machine, then re-arm the timer. */
+	struct smi_info   *smi_info = (struct smi_info *) data;
+	enum si_sm_result smi_result;
+	unsigned long     flags;
+	unsigned long     jiffies_now;
+	unsigned long     time_diff;	/* usec since the last timeout. */
+#ifdef DEBUG_TIMING
+	struct timeval    t;
+#endif
+
+	if (smi_info->stop_operation) {
+		smi_info->timer_stopped = 1;	/* Acknowledge; do not re-arm. */
+		return;
+	}
+
+	spin_lock_irqsave(&(smi_info->si_lock), flags);
+#ifdef DEBUG_TIMING
+	do_gettimeofday(&t);
+	printk("**Timer: %d.%9.9d\n", t.tv_sec, t.tv_usec);
+#endif
+	jiffies_now = jiffies;
+	time_diff = ((jiffies_now - smi_info->last_timeout_jiffies)
+		     * SI_USEC_PER_JIFFY);
+	smi_result = smi_event_handler(smi_info, time_diff);
+
+	spin_unlock_irqrestore(&(smi_info->si_lock), flags);
+
+	smi_info->last_timeout_jiffies = jiffies_now;
+
+	if ((smi_info->irq) && (! smi_info->interrupt_disabled)) {
+		/* Running with interrupts, only do long timeouts. */
+		smi_info->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES;
+		spin_lock_irqsave(&smi_info->count_lock, flags);
+		smi_info->long_timeouts++;
+		spin_unlock_irqrestore(&smi_info->count_lock, flags);
+		goto do_add_timer;
+	}
+
+	/* If the state machine asks for a short delay, then shorten
+	   the timer timeout. */
+	if (smi_result == SI_SM_CALL_WITH_DELAY) {
+		spin_lock_irqsave(&smi_info->count_lock, flags);
+		smi_info->short_timeouts++;
+		spin_unlock_irqrestore(&smi_info->count_lock, flags);
+#if defined(CONFIG_HIGH_RES_TIMERS)
+		read_lock(&xtime_lock);
+                smi_info->si_timer.expires = jiffies;
+                smi_info->si_timer.sub_expires
+                        = get_arch_cycles(smi_info->si_timer.expires);
+                read_unlock(&xtime_lock);
+		add_usec_to_timer(&smi_info->si_timer, SI_SHORT_TIMEOUT_USEC);
+#else
+		smi_info->si_timer.expires = jiffies + 1; /* Next jiffy. */
+#endif
+	} else {
+		spin_lock_irqsave(&smi_info->count_lock, flags);
+		smi_info->long_timeouts++;
+		spin_unlock_irqrestore(&smi_info->count_lock, flags);
+		smi_info->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES;
+#if defined(CONFIG_HIGH_RES_TIMERS)
+		smi_info->si_timer.sub_expires = 0;
+#endif
+	}
+
+ do_add_timer:
+	add_timer(&(smi_info->si_timer));
+}
+
+static irqreturn_t si_irq_handler(int irq, void *data, struct pt_regs *regs)
+{
+	struct smi_info *smi_info = data;
+	unsigned long   flags;
+#ifdef DEBUG_TIMING
+	struct timeval  t;
+#endif
+
+	spin_lock_irqsave(&(smi_info->si_lock), flags);
+	/* Count every interrupt, even those ignored during shutdown. */
+	spin_lock(&smi_info->count_lock);
+	smi_info->interrupts++;
+	spin_unlock(&smi_info->count_lock);
+
+	if (smi_info->stop_operation)
+		goto out;
+
+#ifdef DEBUG_TIMING
+	do_gettimeofday(&t);
+	printk("**Interrupt: %ld.%9.9ld\n", t.tv_sec, t.tv_usec);
+#endif
+	smi_event_handler(smi_info, 0);
+ out:
+	spin_unlock_irqrestore(&(smi_info->si_lock), flags);
+	return IRQ_HANDLED;
+}
+
+static struct ipmi_smi_handlers handlers =	/* passed to ipmi_register_smi() */
+{
+	.owner                  = THIS_MODULE,
+	.sender			= sender,
+	.request_events		= request_events,
+	.set_run_to_completion  = set_run_to_completion
+};
+
+/* There can be 4 IO ports passed in (with or without IRQs), 4 addresses,
+   a default IO port, and 1 ACPI/SPMI address.  That sets SI_MAX_DRIVERS */
+
+#define SI_MAX_PARMS 4
+#define SI_MAX_DRIVERS ((SI_MAX_PARMS * 2) + 2)
+static struct smi_info *smi_infos[SI_MAX_DRIVERS] =
+{ NULL, NULL, NULL, NULL };
+
+#define DEVICE_NAME "ipmi_si"
+
+#define DEFAULT_KCS_IO_PORT 0xca2
+#define DEFAULT_SMIC_IO_PORT 0xca9
+#define DEFAULT_BT_IO_PORT   0xe4
+
+static int           si_trydefaults = 1;	/* "trydefaults" module parameter */
+static char          *si_type[SI_MAX_PARMS] = { NULL, NULL, NULL, NULL };
+#define MAX_SI_TYPE_STR 30
+static char          si_type_str[MAX_SI_TYPE_STR];	/* raw "type" parameter, split into si_type[] by init_ipmi_si() */
+static unsigned long addrs[SI_MAX_PARMS] = { 0, 0, 0, 0 };
+static int num_addrs = 0;
+static unsigned int  ports[SI_MAX_PARMS] = { 0, 0, 0, 0 };
+static int num_ports = 0;
+static int           irqs[SI_MAX_PARMS] = { 0, 0, 0, 0 };
+static int num_irqs = 0;
+
+
+module_param_named(trydefaults, si_trydefaults, bool, 0);
+MODULE_PARM_DESC(trydefaults, "Setting this to 'false' will disable the"
+		 " default scan of the KCS, SMIC and BT interfaces at the"
+		 " standard addresses");
+module_param_string(type, si_type_str, MAX_SI_TYPE_STR, 0);
+MODULE_PARM_DESC(type, "Defines the type of each interface, each"
+		 " interface separated by commas.  The types are 'kcs',"
+		 " 'smic', and 'bt'.  For example type=kcs,bt will set"
+		 " the first interface to kcs and the second to bt");
+module_param_array(addrs, long, num_addrs, 0);
+MODULE_PARM_DESC(addrs, "Sets the memory address of each interface, the"
+		 " addresses separated by commas.  Only use if an interface"
+		 " is in memory.  Otherwise, set it to zero or leave"
+		 " it blank.");
+module_param_array(ports, int, num_ports, 0);
+MODULE_PARM_DESC(ports, "Sets the port address of each interface, the"
+		 " addresses separated by commas.  Only use if an interface"
+		 " is a port.  Otherwise, set it to zero or leave"
+		 " it blank.");
+module_param_array(irqs, int, num_irqs, 0);
+MODULE_PARM_DESC(irqs, "Sets the interrupt of each interface, the"
+		 " interrupts separated by commas.  Only use if an interface"
+		 " has an interrupt.  Otherwise, set it to zero or leave"
+		 " it blank.");
+
+
+#if defined(CONFIG_ACPI_INTERPRETER) || defined(CONFIG_X86) || defined(CONFIG_PCI)
+#define IPMI_MEM_ADDR_SPACE 1
+#define IPMI_IO_ADDR_SPACE  2
+/* Returns 0 if base_addr matches a typed slot other than intf, else 1. */
+static int is_new_interface(int intf, u8 addr_space, unsigned long base_addr)
+{
+	int i;
+
+	for (i = 0; i < SI_MAX_PARMS; ++i) {
+		/* Don't check our address. */
+		if (i == intf)
+			continue;
+		if (si_type[i] != NULL) {
+			if ((addr_space == IPMI_MEM_ADDR_SPACE &&
+			     base_addr == addrs[i]) ||
+			    (addr_space == IPMI_IO_ADDR_SPACE &&
+			     base_addr == ports[i]))
+				return 0;
+		} else
+			break;
+	}
+
+	return 1;
+}
+#endif
+
+static int std_irq_setup(struct smi_info *info)
+{
+	int err;
+
+	/* Nothing to claim when no interrupt is configured. */
+	if (info->irq == 0)
+		return 0;
+
+	err = request_irq(info->irq, si_irq_handler, SA_INTERRUPT,
+			  DEVICE_NAME, info);
+	if (err == 0) {
+		printk("  Using irq %d\n", info->irq);
+		return 0;
+	}
+
+	/* The request failed: clear irq so the interface is polled. */
+	printk(KERN_WARNING
+	       "ipmi_si: %s unable to claim interrupt %d,"
+	       " running polled\n",
+	       DEVICE_NAME, info->irq);
+	info->irq = 0;
+
+	return err;
+}
+
+/* Free the interrupt line when info->irq is set. */
+static void std_irq_cleanup(struct smi_info *info)
+{
+	/* Same dev_id cookie (info) that std_irq_setup() registered. */
+	if (info->irq)
+		free_irq(info->irq, info);
+}
+
+/* Read one byte from the I/O port at (*io->info) + offset. */
+static unsigned char port_inb(struct si_sm_io *io, unsigned int offset)
+{
+	unsigned int base = *(unsigned int *) io->info;
+	return inb(base + offset);
+}
+
+/* Write byte b to the I/O port at (*io->info) + offset. */
+static void port_outb(struct si_sm_io *io, unsigned int offset,
+		      unsigned char b)
+{
+	unsigned int base = *(unsigned int *) io->info;
+	outb(b, base + offset);
+}
+
+/* Reserve the port range: -ENODEV if no port is set, -EIO if unavailable. */
+static int port_setup(struct smi_info *info)
+{
+	unsigned int *port = info->io.info;
+
+	if (port == NULL || *port == 0)
+		return -ENODEV;
+	if (!request_region(*port, info->io_size, DEVICE_NAME))
+		return -EIO;
+	return 0;
+}
+
+/* Release the port range, then free info itself. */
+static void port_cleanup(struct smi_info *info)
+{
+	unsigned int *port = info->io.info;
+	if (port != NULL && *port != 0)
+		release_region(*port, info->io_size);
+	kfree(info);
+}
+
+/* Build an smi_info for the I/O port configured in ports[intf_num]. */
+static int try_init_port(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info *info;
+
+	if (ports[intf_num] == 0)
+		return -ENODEV;
+	if (!is_new_interface(intf_num, IPMI_IO_ADDR_SPACE, ports[intf_num]))
+		return -ENODEV;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (info == NULL) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (1)\n");
+		return -ENOMEM;
+	}
+	memset(info, 0, sizeof(*info));
+
+	/* Port accessors; io.info points at the port number itself. */
+	info->io.info = &(ports[intf_num]);
+	info->io.addr = NULL;
+	info->io.inputb = port_inb;
+	info->io.outputb = port_outb;
+	info->io_setup = port_setup;
+	info->io_cleanup = port_cleanup;
+	/* No irq handler yet; init_one_smi() installs the standard one. */
+	info->irq = 0;
+	info->irq_setup = NULL;
+	*new_info = info;
+
+	if (si_type[intf_num] == NULL)
+		si_type[intf_num] = "kcs";
+	printk("ipmi_si: Trying \"%s\" at I/O port 0x%x\n",
+	       si_type[intf_num], ports[intf_num]);
+	return 0;
+}
+
+/* Byte accessors for the region that mem_setup() maps at io->addr. */
+static unsigned char mem_inb(struct si_sm_io *io, unsigned int offset)
+{
+	return readb(io->addr + offset);
+}
+
+static void mem_outb(struct si_sm_io *io, unsigned int offset, unsigned char b)
+{
+	writeb(b, io->addr + offset);
+}
+
+/* Reserve and map the memory region; sets info->io.addr on success. */
+static int mem_setup(struct smi_info *info)
+{
+	unsigned long *base = info->io.info;
+
+	if (base == NULL || *base == 0)
+		return -ENODEV;
+	if (!request_mem_region(*base, info->io_size, DEVICE_NAME))
+		return -EIO;
+	info->io.addr = ioremap(*base, info->io_size);
+	if (info->io.addr != NULL)
+		return 0;
+
+	/* Mapping failed: give the reserved region back. */
+	release_mem_region(*base, info->io_size);
+	return -EIO;
+}
+
+/* Unmap and release the region if it was mapped, then free info itself. */
+static void mem_cleanup(struct smi_info *info)
+{
+	unsigned long *base = info->io.info;
+	if (info->io.addr != NULL) {
+		iounmap(info->io.addr);
+		release_mem_region(*base, info->io_size);
+	}
+	kfree(info);
+}
+
+/* Build an smi_info for the memory address configured in addrs[intf_num]. */
+static int try_init_mem(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info *info;
+
+	if (!addrs[intf_num])
+		return -ENODEV;
+	if (!is_new_interface(intf_num, IPMI_MEM_ADDR_SPACE, addrs[intf_num]))
+		return -ENODEV;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (2)\n");
+		return -ENOMEM;
+	}
+	memset(info, 0, sizeof(*info));
+
+	info->io_setup = mem_setup;
+	info->io_cleanup = mem_cleanup;
+	info->io.inputb = mem_inb;
+	info->io.outputb = mem_outb;
+	/* mem_setup() and mem_cleanup() dereference io.info, so it must
+	   point at the address slot rather than hold the address value. */
+	info->io.info = &(addrs[intf_num]);
+	info->io.addr = NULL;
+	info->irq = 0;
+	info->irq_setup = NULL;
+	*new_info = info;
+
+	if (si_type[intf_num] == NULL)
+		si_type[intf_num] = "kcs";
+	printk("ipmi_si: Trying \"%s\" at memory address 0x%lx\n",
+	       si_type[intf_num], addrs[intf_num]);
+	return 0;
+}
+
+
+#ifdef CONFIG_ACPI_INTERPRETER
+
+#include <linux/acpi.h>
+
+/* Once we get an ACPI failure, we don't try any more, because we go
+   through the tables sequentially.  Once we don't find a table, there
+   are no more. */
+static int acpi_failure = 0;
+
+/* For GPE-type interrupts. */
+void ipmi_acpi_gpe(void *context)
+{
+	struct smi_info *smi_info = context;
+	unsigned long   flags;
+#ifdef DEBUG_TIMING
+	struct timeval t;
+#endif
+
+	spin_lock_irqsave(&(smi_info->si_lock), flags);
+	/* Count every event, even those ignored during shutdown. */
+	spin_lock(&smi_info->count_lock);
+	smi_info->interrupts++;
+	spin_unlock(&smi_info->count_lock);
+
+	if (smi_info->stop_operation)
+		goto out;
+
+#ifdef DEBUG_TIMING
+	do_gettimeofday(&t);
+	printk("**ACPI_GPE: %ld.%9.9ld\n", t.tv_sec, t.tv_usec);
+#endif
+	smi_event_handler(smi_info, 0);
+ out:
+	spin_unlock_irqrestore(&(smi_info->si_lock), flags);
+}
+
+/* Install ipmi_acpi_gpe() as the handler for the GPE stored in info->irq. */
+static int acpi_gpe_irq_setup(struct smi_info *info)
+{
+	acpi_status status;
+
+	if (!info->irq)
+		return 0;
+
+	/* FIXME - is level triggered right? */
+	status = acpi_install_gpe_handler(NULL, info->irq,
+					  ACPI_GPE_LEVEL_TRIGGERED,
+					  ipmi_acpi_gpe, info);
+	if (status == AE_OK) {
+		printk("  Using ACPI GPE %d\n", info->irq);
+		return 0;
+	}
+
+	/* Could not claim the GPE: clear irq so the interface is polled. */
+	printk(KERN_WARNING
+	       "ipmi_si: %s unable to claim ACPI GPE %d,"
+	       " running polled\n",
+	       DEVICE_NAME, info->irq);
+	info->irq = 0;
+
+	return -EINVAL;
+}
+
+/* Remove the ipmi_acpi_gpe() handler from the GPE in info->irq. */
+static void acpi_gpe_irq_cleanup(struct smi_info *info)
+{
+	/* An irq of 0 means no GPE is configured for this interface. */
+	if (info->irq)
+		acpi_remove_gpe_handler(NULL, info->irq, ipmi_acpi_gpe);
+}
+
+/*
+ * Defined at
+ * http://h21007.www2.hp.com/dspp/files/unprotected/devresource/Docs/TechPapers/IA64/hpspmi.pdf
+ */
+struct SPMITable {
+	s8	Signature[4];
+	u32	Length;
+	u8	Revision;
+	u8	Checksum;
+	s8	OEMID[6];
+	s8	OEMTableID[8];
+	s8	OEMRevision[4];
+	s8	CreatorID[4];
+	s8	CreatorRevision[4];
+	u8	InterfaceType;	/* try_init_acpi(): 1 = KCS, 2 = SMIC, 3 = BT */
+	u8	IPMIlegacy;	/* try_init_acpi() rejects the table unless 1 */
+	s16	SpecificationRevision;
+
+	/*
+	 * Bit 0 - SCI interrupt supported
+	 * Bit 1 - I/O APIC/SAPIC
+	 */
+	u8	InterruptType;
+
+	/* If bit 0 of InterruptType is set, then this is the SCI
+	   interrupt in the GPEx_STS register. */
+	u8	GPE;
+
+	s16	Reserved;
+
+	/* If bit 1 of InterruptType is set, then this is the I/O
+	   APIC/SAPIC interrupt. */
+	u32	GlobalSystemInterrupt;
+
+	/* The actual register address. */
+	struct acpi_generic_address addr;
+
+	u8	UID[4];
+
+	s8      spmi_id[1]; /* A '\0' terminated array starts here. */
+};
+
+static int try_init_acpi(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info  *info;
+	acpi_status      status;
+	struct SPMITable *spmi;
+	char             *io_type;
+	u8 		 addr_space;
+	/* Probe ACPI SPMI table number intf_num+1 and build *new_info from it. */
+	if (acpi_failure)
+		return -ENODEV;
+	/* A failed lookup latches acpi_failure, ending all later ACPI probes. */
+	status = acpi_get_firmware_table("SPMI", intf_num+1,
+					 ACPI_LOGICAL_ADDRESSING,
+					 (struct acpi_table_header **) &spmi);
+	if (status != AE_OK) {
+		acpi_failure = 1;
+		return -ENODEV;
+	}
+
+	if (spmi->IPMIlegacy != 1) {
+	    printk(KERN_INFO "IPMI: Bad SPMI legacy %d\n", spmi->IPMIlegacy);
+  	    return -ENODEV;
+	}
+	/* Skip an address some typed slot already uses (-1 exempts no slot). */
+	if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
+		addr_space = IPMI_MEM_ADDR_SPACE;
+	else
+		addr_space = IPMI_IO_ADDR_SPACE;
+	if (!is_new_interface(-1, addr_space, spmi->addr.address))
+		return -ENODEV;
+
+	/* Figure out the interface type. */
+	switch (spmi->InterfaceType)
+	{
+	case 1:	/* KCS */
+		si_type[intf_num] = "kcs";
+		break;
+
+	case 2:	/* SMIC */
+		si_type[intf_num] = "smic";
+		break;
+
+	case 3:	/* BT */
+		si_type[intf_num] = "bt";
+		break;
+
+	default:
+		printk(KERN_INFO "ipmi_si: Unknown ACPI/SPMI SI type %d\n",
+			spmi->InterfaceType);
+		return -EIO;
+	}
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (3)\n");
+		return -ENOMEM;
+	}
+	memset(info, 0, sizeof(*info));
+
+	if (spmi->InterruptType & 1) {
+		/* We've got a GPE interrupt. */
+		info->irq = spmi->GPE;
+		info->irq_setup = acpi_gpe_irq_setup;
+		info->irq_cleanup = acpi_gpe_irq_cleanup;
+	} else if (spmi->InterruptType & 2) {
+		/* We've got an APIC/SAPIC interrupt. */
+		info->irq = spmi->GlobalSystemInterrupt;
+		info->irq_setup = std_irq_setup;
+		info->irq_cleanup = std_irq_cleanup;
+	} else {
+		/* Use the default interrupt setting. */
+		info->irq = 0;
+		info->irq_setup = NULL;
+	}
+
+	if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) {
+		io_type = "memory";
+		info->io_setup = mem_setup;
+		info->io_cleanup = mem_cleanup;
+		addrs[intf_num] = spmi->addr.address;
+		info->io.inputb = mem_inb;
+		info->io.outputb = mem_outb;
+		info->io.info = &(addrs[intf_num]);
+	} else if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
+		io_type = "I/O";
+		info->io_setup = port_setup;
+		info->io_cleanup = port_cleanup;
+		ports[intf_num] = spmi->addr.address;
+		info->io.inputb = port_inb;
+		info->io.outputb = port_outb;
+		info->io.info = &(ports[intf_num]);
+	} else {
+		kfree(info);
+		printk("ipmi_si: Unknown ACPI I/O Address type\n");
+		return -EIO;
+	}
+
+	*new_info = info;
+
+	printk("ipmi_si: ACPI/SPMI specifies \"%s\" %s SI @ 0x%lx\n",
+	       si_type[intf_num], io_type, (unsigned long) spmi->addr.address);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_X86
+
+typedef struct dmi_ipmi_data
+{
+	u8   		type;		/* decode_dmi(): record byte 0x04 */
+	u8   		addr_space;	/* IPMI_MEM_ADDR_SPACE or IPMI_IO_ADDR_SPACE */
+	unsigned long	base_addr;	/* decode_dmi(): read from record offset 0x08 */
+	u8   		irq;		/* decode_dmi(): record byte 0x11 */
+}dmi_ipmi_data_t;
+
+typedef struct dmi_header
+{
+	u8	type;		/* dmi_table() decodes records whose type is 38 */
+	u8	length;		/* dmi_table() advances by this many bytes first */
+	u16	handle;
+}dmi_header_t;
+
+/* Decode one record into ipmi_data.  Returns 0 if its address is not in
+   use yet; otherwise zeroes ipmi_data and returns -1. */
+static int decode_dmi(dmi_header_t *dm, dmi_ipmi_data_t *ipmi_data)
+{
+	u8		*rec = (u8 *)dm;
+	unsigned long	addr;
+
+	ipmi_data->type = rec[0x04];
+
+	memcpy(&addr, &rec[0x08], sizeof(unsigned long));
+	if (addr & 1) {
+		/* Bit 0 set: I/O space; keep bits 1-15 as the port base. */
+		addr &= 0xFFFE;
+		ipmi_data->addr_space = IPMI_IO_ADDR_SPACE;
+	} else {
+		/* Bit 0 clear: memory-mapped. */
+		ipmi_data->addr_space = IPMI_MEM_ADDR_SPACE;
+	}
+
+	ipmi_data->base_addr = addr;
+	ipmi_data->irq = rec[0x11];
+
+	if (is_new_interface(-1, ipmi_data->addr_space, ipmi_data->base_addr))
+		return 0;
+
+	memset(ipmi_data, 0, sizeof(dmi_ipmi_data_t));
+	return -1;
+}
+
+static int dmi_table(u32 base, int len, int num,
+	dmi_ipmi_data_t *ipmi_data)
+{
+	u8 		  *buf;
+	struct dmi_header *dm;
+	u8 		  *data;
+	int 		  i=1;
+	int		  status=-1;
+	/* Map len bytes at base and walk the records for a usable type 38. */
+	buf = ioremap(base, len);
+	if(buf==NULL)
+		return -1;
+
+	data = buf;
+	/* NOTE(review): i starts at 1, so at most num-1 records are visited. */
+	while(i<num && (data - buf) < len)
+	{
+		dm=(dmi_header_t *)data;
+
+		if((data-buf+dm->length) >= len)
+        		break;
+		/* Stop at the first type 38 record that decode_dmi() accepts. */
+		if (dm->type == 38) {
+			if (decode_dmi(dm, ipmi_data) == 0) {
+				status = 0;
+				break;
+			}
+		}
+		/* Step over dm->length bytes, then past the next two zero bytes. */
+	        data+=dm->length;
+		while((data-buf) < len && (*data || data[1]))
+			data++;
+		data+=2;
+		i++;
+	}
+	iounmap(buf);
+
+	return status;
+}
+
+/* Nonzero when the 15 bytes at buf sum to zero modulo 256. */
+inline static int dmi_checksum(u8 *buf)
+{
+	u8 *end = buf + 15;
+	u8 total = 0;
+	while (buf != end)
+		total += *buf++;
+	return (total == 0);
+}
+
+/* Scan 0xF0000-0xFFFFF in 16-byte steps for a checksummed "_DMI_" anchor. */
+static int dmi_iterator(dmi_ipmi_data_t *ipmi_data)
+{
+	u8   buf[15];
+	u32  fp=0xF0000;
+#ifdef CONFIG_SIMNOW
+	return -1;
+#endif
+
+	while(fp < 0xFFFFF)
+	{
+		isa_memcpy_fromio(buf, fp, 15);
+		if(memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf))
+		{
+			u16 num=buf[13]<<8|buf[12];
+			u16 len=buf[7]<<8|buf[6];
+			/* Widen first: buf[11]<<24 as int can hit the sign bit. */
+			u32 base=(u32)buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8];
+			if(dmi_table(base, len, num, ipmi_data) == 0)
+				return 0;
+		}
+		fp+=16;
+	}
+
+	return -1;
+}
+
+/* Build an smi_info from the SMBIOS IPMI record that dmi_iterator() finds. */
+static int try_init_smbios(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info   *info;
+	dmi_ipmi_data_t   ipmi_data;
+	char              *io_type;
+	int               status;
+
+	status = dmi_iterator(&ipmi_data);
+	if (status < 0)
+		return -ENODEV;
+
+	switch(ipmi_data.type) {
+		case 0x01: /* KCS */
+			si_type[intf_num] = "kcs";
+			break;
+		case 0x02: /* SMIC */
+			si_type[intf_num] = "smic";
+			break;
+		case 0x03: /* BT */
+			si_type[intf_num] = "bt";
+			break;
+		default:
+			printk("ipmi_si: Unknown SMBIOS SI type.\n");
+			return -EIO;
+	}
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (4)\n");
+		return -ENOMEM;
+	}
+	memset(info, 0, sizeof(*info));
+
+	if (ipmi_data.addr_space == IPMI_MEM_ADDR_SPACE) {
+		io_type = "memory";
+		info->io_setup = mem_setup;
+		info->io_cleanup = mem_cleanup;
+		addrs[intf_num] = ipmi_data.base_addr;
+		info->io.inputb = mem_inb;
+		info->io.outputb = mem_outb;
+		info->io.info = &(addrs[intf_num]);
+	} else if (ipmi_data.addr_space == IPMI_IO_ADDR_SPACE) {
+		io_type = "I/O";
+		info->io_setup = port_setup;
+		info->io_cleanup = port_cleanup;
+		ports[intf_num] = ipmi_data.base_addr;
+		info->io.inputb = port_inb;
+		info->io.outputb = port_outb;
+		info->io.info = &(ports[intf_num]);
+	} else {
+		kfree(info);
+		printk("ipmi_si: Unknown SMBIOS I/O Address type.\n");
+		return -EIO;
+	}
+
+	irqs[intf_num] = ipmi_data.irq;
+
+	*new_info = info;
+
+	printk("ipmi_si: Found SMBIOS-specified state machine at %s"
+	       " address 0x%lx\n",
+	       io_type, (unsigned long)ipmi_data.base_addr);
+	return 0;
+}
+#endif /* CONFIG_X86 */
+
+#ifdef CONFIG_PCI
+
+#define PCI_ERMC_CLASSCODE  0x0C0700
+#define PCI_HP_VENDOR_ID    0x103C
+#define PCI_MMC_DEVICE_ID   0x121A
+#define PCI_MMC_ADDR_CW     0x10
+
+/* Avoid more than one attempt to probe pci smic. */
+static int pci_smic_checked = 0;
+
+static int find_pci_smic(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info  *info;
+	int              error;
+	struct pci_dev   *pci_dev = NULL;
+	u16    		 base_addr;
+	int              fe_rmc = 0;
+	/* One-shot PCI probe; on success fills ports[], irqs[] and si_type[]. */
+	if (pci_smic_checked)
+		return -ENODEV;
+
+	pci_smic_checked = 1;
+	/* Match the vendor/device id first, then fall back to the class code. */
+	if ((pci_dev = pci_find_device(PCI_HP_VENDOR_ID, PCI_MMC_DEVICE_ID,
+				       NULL)))
+		;
+	else if ((pci_dev = pci_find_class(PCI_ERMC_CLASSCODE, NULL)) &&
+		 pci_dev->subsystem_vendor == PCI_HP_VENDOR_ID)
+		fe_rmc = 1;
+	else
+		return -ENODEV;
+
+	error = pci_read_config_word(pci_dev, PCI_MMC_ADDR_CW, &base_addr);
+	if (error)
+	{
+		printk(KERN_ERR
+		       "ipmi_si: pci_read_config_word() failed (%d).\n",
+		       error);
+		return -ENODEV;
+	}
+
+	/* Bit 0: 1 specifies programmed I/O, 0 specifies memory mapped I/O */
+	if (!(base_addr & 0x0001))
+	{
+		printk(KERN_ERR
+		       "ipmi_si: memory mapped I/O not supported for PCI"
+		       " smic.\n");
+		return -ENODEV;
+	}
+
+	base_addr &= 0xFFFE;
+	if (!fe_rmc)
+		/* Data register starts at base address + 1 in eRMC */
+		++base_addr;
+	/* NOTE(review): +1 is applied when fe_rmc == 0 -- confirm vs. comment. */
+	if (!is_new_interface(-1, IPMI_IO_ADDR_SPACE, base_addr))
+	    return -ENODEV;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (5)\n");
+		return -ENOMEM;
+	}
+	memset(info, 0, sizeof(*info));
+
+	info->io_setup = port_setup;
+	info->io_cleanup = port_cleanup;
+	ports[intf_num] = base_addr;
+	info->io.inputb = port_inb;
+	info->io.outputb = port_outb;
+	info->io.info = &(ports[intf_num]);
+
+	*new_info = info;
+
+	irqs[intf_num] = pci_dev->irq;
+	si_type[intf_num] = "smic";
+
+	printk("ipmi_si: Found PCI SMIC at I/O address 0x%lx\n",
+		(long unsigned int) base_addr);
+
+	return 0;
+}
+#endif /* CONFIG_PCI */
+
+static int try_init_plug_and_play(int intf_num, struct smi_info **new_info)
+{
+	int rv = -ENODEV;
+#ifdef CONFIG_PCI
+	if (find_pci_smic(intf_num, new_info) == 0)
+		rv = 0;
+#endif
+	/* Other probe methods go here; -ENODEV means none succeeded. */
+	return rv;
+}
+
+
+/* Send Get Device ID by polling the state machine and record the
+   revision fields of the reply.  Returns 0, or a negative errno. */
+static int try_get_dev_id(struct smi_info *smi_info)
+{
+	unsigned char      msg[2];
+	unsigned char      *resp;
+	unsigned long      resp_len;
+	enum si_sm_result smi_result;
+	int               rv = 0;
+
+	resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL);
+	if (!resp)
+		return -ENOMEM;
+
+	/* Do a Get Device ID command, since it comes back with some
+	   useful info. */
+	msg[0] = IPMI_NETFN_APP_REQUEST << 2;
+	msg[1] = IPMI_GET_DEVICE_ID_CMD;
+	smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2);
+
+	/* Drive the state machine until it stops asking to be called. */
+	smi_result = smi_info->handlers->event(smi_info->si_sm, 0);
+	for (;;) {
+		if (smi_result == SI_SM_CALL_WITH_DELAY) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(1);
+			smi_result = smi_info->handlers->event(
+				smi_info->si_sm, 100);
+		} else if (smi_result == SI_SM_CALL_WITHOUT_DELAY) {
+			smi_result = smi_info->handlers->event(
+				smi_info->si_sm, 0);
+		} else
+			break;
+	}
+	if (smi_result == SI_SM_HOSED) {
+		/* We couldn't get the state machine to run, so whatever's at
+		   the port is probably not an IPMI SMI interface. */
+		rv = -ENODEV;
+		goto out;
+	}
+
+	/* Otherwise, we got some data. */
+	resp_len = smi_info->handlers->get_result(smi_info->si_sm,
+						  resp, IPMI_MAX_MSG_LENGTH);
+	if (resp_len < 8) {
+		/* That's odd, it should be longer.  resp[7] is the last
+		   byte read below, so at least 8 bytes are required. */
+		rv = -EINVAL;
+		goto out;
+	}
+
+	if ((resp[1] != IPMI_GET_DEVICE_ID_CMD) || (resp[2] != 0)) {
+		/* That's odd, it shouldn't be able to fail. */
+		rv = -EINVAL;
+		goto out;
+	}
+
+	/* Record info from the get device id, in case we need it. */
+	smi_info->ipmi_si_dev_rev = resp[4] & 0xf;
+	smi_info->ipmi_si_fw_rev_major = resp[5] & 0x7f;
+	smi_info->ipmi_si_fw_rev_minor = resp[6];
+	smi_info->ipmi_version_major = resp[7] & 0xf;
+	smi_info->ipmi_version_minor = resp[7] >> 4;
+
+ out:
+	kfree(resp);
+	return rv;
+}
+
+/* Read handler for the "type" proc entry; writes nothing if unknown. */
+static int type_file_read_proc(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	struct smi_info *smi = data;
+	const char      *name;
+
+	if (smi->si_type == SI_KCS)
+		name = "kcs\n";
+	else if (smi->si_type == SI_SMIC)
+		name = "smic\n";
+	else if (smi->si_type == SI_BT)
+		name = "bt\n";
+	else
+		return 0;
+	return sprintf(page, "%s", name);
+}
+
+/* Read handler for the "si_stats" proc entry; returns bytes written. */
+static int stat_file_read_proc(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	struct smi_info *smi = data;
+	int             len = 0;
+
+	len += sprintf(page + len, "interrupts_enabled:    %d\n",
+		       smi->irq && !smi->interrupt_disabled);
+	len += sprintf(page + len, "short_timeouts:        %ld\n",
+		       smi->short_timeouts);
+	len += sprintf(page + len, "long_timeouts:         %ld\n",
+		       smi->long_timeouts);
+	len += sprintf(page + len, "timeout_restarts:      %ld\n",
+		       smi->timeout_restarts);
+	len += sprintf(page + len, "idles:                 %ld\n",
+		       smi->idles);
+	len += sprintf(page + len, "interrupts:            %ld\n",
+		       smi->interrupts);
+	len += sprintf(page + len, "attentions:            %ld\n",
+		       smi->attentions);
+	len += sprintf(page + len, "flag_fetches:          %ld\n",
+		       smi->flag_fetches);
+	len += sprintf(page + len, "hosed_count:           %ld\n",
+		       smi->hosed_count);
+	len += sprintf(page + len, "complete_transactions: %ld\n",
+		       smi->complete_transactions);
+	len += sprintf(page + len, "events:                %ld\n",
+		       smi->events);
+	len += sprintf(page + len, "watchdog_pretimeouts:  %ld\n",
+		       smi->watchdog_pretimeouts);
+	len += sprintf(page + len, "incoming_messages:     %ld\n",
+		       smi->incoming_messages);
+	return len;
+}
+
+/* Returns 0 if initialized, or negative on an error. */
+static int init_one_smi(int intf_num, struct smi_info **smi)
+{
+	int		rv;
+	struct smi_info *new_smi;
+	int		io_ready = 0;	/* io_setup() has succeeded */
+	int		irq_ready = 0;	/* irq_setup() has been called */
+
+	rv = try_init_mem(intf_num, &new_smi);
+	if (rv)
+		rv = try_init_port(intf_num, &new_smi);
+#ifdef CONFIG_ACPI_INTERPRETER
+	if ((rv) && (si_trydefaults))
+		rv = try_init_acpi(intf_num, &new_smi);
+#endif
+#ifdef CONFIG_X86
+	if ((rv) && (si_trydefaults))
+		rv = try_init_smbios(intf_num, &new_smi);
+#endif
+	if ((rv) && (si_trydefaults))
+		rv = try_init_plug_and_play(intf_num, &new_smi);
+
+	if (rv)
+		return rv;
+
+	/* So we know not to free it unless we have allocated one. */
+	new_smi->intf = NULL;
+	new_smi->si_sm = NULL;
+	new_smi->handlers = NULL;
+
+	if (!new_smi->irq_setup) {
+		new_smi->irq = irqs[intf_num];
+		new_smi->irq_setup = std_irq_setup;
+		new_smi->irq_cleanup = std_irq_cleanup;
+	}
+
+	/* Default to KCS if no type is specified. */
+	if (si_type[intf_num] == NULL) {
+		if (si_trydefaults)
+			si_type[intf_num] = "kcs";
+		else {
+			rv = -EINVAL;
+			goto out_err;
+		}
+	}
+
+	/* Set up the state machine to use. */
+	if (strcmp(si_type[intf_num], "kcs") == 0) {
+		new_smi->handlers = &kcs_smi_handlers;
+		new_smi->si_type = SI_KCS;
+	} else if (strcmp(si_type[intf_num], "smic") == 0) {
+		new_smi->handlers = &smic_smi_handlers;
+		new_smi->si_type = SI_SMIC;
+	} else if (strcmp(si_type[intf_num], "bt") == 0) {
+		new_smi->handlers = &bt_smi_handlers;
+		new_smi->si_type = SI_BT;
+	} else {
+		/* No support for anything else yet. */
+		rv = -EIO;
+		goto out_err;
+	}
+
+	/* Allocate the state machine's data and initialize it. */
+	new_smi->si_sm = kmalloc(new_smi->handlers->size(), GFP_KERNEL);
+	if (!new_smi->si_sm) {
+		printk(" Could not allocate state machine memory\n");
+		rv = -ENOMEM;
+		goto out_err;
+	}
+	new_smi->io_size = new_smi->handlers->init_data(new_smi->si_sm,
+							&new_smi->io);
+
+	/* Now that we know the I/O size, we can set up the I/O. */
+	rv = new_smi->io_setup(new_smi);
+	if (rv) {
+		printk(" Could not set up I/O space\n");
+		goto out_err;
+	}
+	io_ready = 1;
+
+	spin_lock_init(&(new_smi->si_lock));
+	spin_lock_init(&(new_smi->msg_lock));
+	spin_lock_init(&(new_smi->count_lock));
+
+	/* Do low-level detection first. */
+	if (new_smi->handlers->detect(new_smi->si_sm)) {
+		rv = -ENODEV;
+		goto out_err;
+	}
+
+	/* Attempt a get device id command.  If it fails, we probably
+	   don't have a SMI here. */
+	rv = try_get_dev_id(new_smi);
+	if (rv)
+		goto out_err;
+
+	/* Try to claim any interrupts; the result is deliberately ignored. */
+	new_smi->irq_setup(new_smi);
+	irq_ready = 1;
+
+	INIT_LIST_HEAD(&(new_smi->xmit_msgs));
+	INIT_LIST_HEAD(&(new_smi->hp_xmit_msgs));
+	new_smi->curr_msg = NULL;
+	atomic_set(&new_smi->req_events, 0);
+	new_smi->run_to_completion = 0;
+
+	rv = ipmi_register_smi(&handlers, new_smi,
+			       new_smi->ipmi_version_major,
+			       new_smi->ipmi_version_minor,
+			       &(new_smi->intf));
+	if (rv) {
+		printk(KERN_ERR
+		       "ipmi_si: Unable to register device: error %d\n",
+		       rv);
+		goto out_err;
+	}
+
+	rv = ipmi_smi_add_proc_entry(new_smi->intf, "type",
+				     type_file_read_proc, NULL,
+				     new_smi, THIS_MODULE);
+	if (rv) {
+		printk(KERN_ERR "ipmi_si: Unable to create proc entry: %d\n",
+		       rv);
+		goto out_err;
+	}
+
+	rv = ipmi_smi_add_proc_entry(new_smi->intf, "si_stats",
+				     stat_file_read_proc, NULL,
+				     new_smi, THIS_MODULE);
+	if (rv) {
+		printk(KERN_ERR "ipmi_si: Unable to create proc entry: %d\n",
+		       rv);
+		goto out_err;
+	}
+
+	start_clear_flags(new_smi);
+
+	/* IRQ is defined to be set when non-zero. */
+	if (new_smi->irq)
+		new_smi->si_state = SI_CLEARING_FLAGS_THEN_SET_IRQ;
+
+	new_smi->interrupt_disabled = 0;
+	new_smi->timer_stopped = 0;
+	new_smi->stop_operation = 0;
+
+	init_timer(&(new_smi->si_timer));
+	new_smi->si_timer.data = (long) new_smi;
+	new_smi->si_timer.function = smi_timeout;
+	new_smi->last_timeout_jiffies = jiffies;
+	new_smi->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES;
+	add_timer(&(new_smi->si_timer));
+
+	*smi = new_smi;
+
+	printk(" IPMI %s interface initialized\n", si_type[intf_num]);
+
+	return 0;
+
+ out_err:
+	if (new_smi->intf)
+		ipmi_unregister_smi(new_smi->intf);
+	if (irq_ready)
+		new_smi->irq_cleanup(new_smi);
+	if (new_smi->si_sm) {
+		if (new_smi->handlers)
+			new_smi->handlers->cleanup(new_smi->si_sm);
+		kfree(new_smi->si_sm);
+	}
+	/* io_cleanup() also frees new_smi; without it, free it here. */
+	if (io_ready)
+		new_smi->io_cleanup(new_smi);
+	else
+		kfree(new_smi);
+	return rv;
+}
+
+static __init int init_ipmi_si(void)
+{
+	int  rv = 0;
+	int  pos = 0;
+	int  i;
+	char *str;
+	/* Module init: split the type string, then probe each parameter slot. */
+	if (initialized)
+		return 0;
+	initialized = 1;
+
+	/* Parse out the si_type string into its components. */
+	str = si_type_str;
+	if (*str != '\0') {
+		for (i=0; (i<SI_MAX_PARMS) && (*str != '\0'); i++) {
+			si_type[i] = str;
+			str = strchr(str, ',');
+			if (str) {
+				*str = '\0';
+				str++;
+			} else {
+				break;
+			}
+		}
+	}
+
+	printk(KERN_INFO "IPMI System Interface driver version "
+	       IPMI_SI_VERSION);
+	if (kcs_smi_handlers.version)
+		printk(", KCS version %s", kcs_smi_handlers.version);
+	if (smic_smi_handlers.version)
+		printk(", SMIC version %s", smic_smi_handlers.version);
+	if (bt_smi_handlers.version)
+   	        printk(", BT version %s", bt_smi_handlers.version);
+	printk("\n");
+	/* Slot 0 also falls back to the default KCS, SMIC and BT ports. */
+	rv = init_one_smi(0, &(smi_infos[pos]));
+	if (rv && !ports[0] && si_trydefaults) {
+		/* If we are trying defaults and the initial port is
+                   not set, then set it. */
+		si_type[0] = "kcs";
+		ports[0] = DEFAULT_KCS_IO_PORT;
+		rv = init_one_smi(0, &(smi_infos[pos]));
+		if (rv) {
+			/* No KCS - try SMIC */
+			si_type[0] = "smic";
+			ports[0] = DEFAULT_SMIC_IO_PORT;
+			rv = init_one_smi(0, &(smi_infos[pos]));
+		}
+		if (rv) {
+			/* No SMIC - try BT */
+			si_type[0] = "bt";
+			ports[0] = DEFAULT_BT_IO_PORT;
+			rv = init_one_smi(0, &(smi_infos[pos]));
+		}
+	}
+	if (rv == 0)
+		pos++;
+	/* pos only advances on success, so smi_infos[] fills without gaps. */
+	for (i=1; i < SI_MAX_PARMS; i++) {
+		rv = init_one_smi(i, &(smi_infos[pos]));
+		if (rv == 0)
+			pos++;
+	}
+
+	if (smi_infos[0] == NULL) {
+		printk("ipmi_si: Unable to find any System Interface(s)\n");
+		return -ENODEV;
+	}
+
+	return 0;
+}
+module_init(init_ipmi_si);
+
+void __exit cleanup_one_si(struct smi_info *to_clean)
+{
+	int           rv;
+	unsigned long flags;
+
+	if (! to_clean)
+		return;
+
+	/* Tell the timer and interrupt handlers that we are shutting
+	   down. */
+	spin_lock_irqsave(&(to_clean->si_lock), flags);
+	spin_lock(&(to_clean->msg_lock));
+	to_clean->stop_operation = 1;
+	spin_unlock(&(to_clean->msg_lock));
+	spin_unlock_irqrestore(&(to_clean->si_lock), flags);
+
+	/* Release the interrupt with the locks dropped: free_irq() may wait
+	   for a running handler, and our handler takes si_lock. */
+	to_clean->irq_cleanup(to_clean);
+
+	/* Wait until we know that we are out of any interrupt
+	   handlers might have been running before we freed the
+	   interrupt. */
+	synchronize_kernel();
+
+	/* Wait for the timer to stop.  This avoids problems with race
+	   conditions removing the timer here. */
+	while (!to_clean->timer_stopped) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(1);
+	}
+
+	rv = ipmi_unregister_smi(to_clean->intf);
+	if (rv) {
+		printk(KERN_ERR
+		       "ipmi_si: Unable to unregister device: errno=%d\n",
+		       rv);
+	}
+
+	to_clean->handlers->cleanup(to_clean->si_sm);
+
+	kfree(to_clean->si_sm);
+
+	to_clean->io_cleanup(to_clean);
+}
+
+/* Module exit: hand every smi_infos[] slot to cleanup_one_si(). */
+static __exit void cleanup_ipmi_si(void)
+{
+	int slot = 0;
+
+	if (!initialized)
+		return;
+
+	while (slot < SI_MAX_DRIVERS)
+		cleanup_one_si(smi_infos[slot++]);
+}
+module_exit(cleanup_ipmi_si);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/char/ipmi/ipmi_si_sm.h b/drivers/char/ipmi/ipmi_si_sm.h
new file mode 100644
index 000000000000..f3506552c5a5
--- /dev/null
+++ b/drivers/char/ipmi/ipmi_si_sm.h
@@ -0,0 +1,117 @@
+/*
+ * ipmi_si_sm.h
+ *
+ * State machine interface for low-level IPMI system management
+ * interface state machines.  This code is the interface between
+ * the ipmi_smi code (that handles the policy of a KCS, SMIC, or
+ * BT interface) and the actual low-level state machine.
+ *
+ * Author: MontaVista Software, Inc.
+ *         Corey Minyard <minyard@mvista.com>
+ *         source@mvista.com
+ *
+ * Copyright 2002 MontaVista Software Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* This is defined by the state machines themselves, it is an opaque
+   data type for them to use. */
+struct si_sm_data;
+
+/* The structure for doing I/O in the state machine.  The state
+   machine doesn't have the actual I/O routines, they are done through
+   this interface. */
+struct si_sm_io
+{
+	unsigned char (*inputb)(struct si_sm_io *io, unsigned int offset);
+	void (*outputb)(struct si_sm_io *io,
+			unsigned int  offset,
+			unsigned char b);
+
+	/* Generic info used by the actual handling routines, the
+           state machine shouldn't touch these. */
+	void *info;
+	void *addr;
+};
+
+/* Results of SMI events. */
+enum si_sm_result
+{
+	SI_SM_CALL_WITHOUT_DELAY, /* Call the driver again immediately */
+	SI_SM_CALL_WITH_DELAY,	/* Delay some before calling again. */
+	SI_SM_TRANSACTION_COMPLETE, /* A transaction is finished. */
+	SI_SM_IDLE,		/* The SM is in idle state. */
+	SI_SM_HOSED,		/* The hardware violated the state machine. */
+	SI_SM_ATTN		/* The hardware is asserting attn and the
+				   state machine is idle. */
+};
+
+/* Handlers for the SMI state machine. */
+struct si_sm_handlers
+{
+	/* Put the version number of the state machine here so the
+           upper layer can print it. */
+	char *version;
+
+	/* Initialize the data and return the amount of I/O space to
+           reserve for the state machine. */
+	unsigned int (*init_data)(struct si_sm_data *smi,
+				  struct si_sm_io   *io);
+
+	/* Start a new transaction in the state machine.  This will
+	   return -2 if the state machine is not idle, -1 if the size
+	   is invalid (too large or too small), or 0 if the transaction
+	   is successfully started. */
+	int (*start_transaction)(struct si_sm_data *smi,
+				 unsigned char *data, unsigned int size);
+
+	/* Return the results after the transaction.  This will return
+	   -1 if the buffer is too small, zero if no transaction is
+	   present, or the actual length of the result data. */
+	int (*get_result)(struct si_sm_data *smi,
+			  unsigned char *data, unsigned int length);
+
+	/* Call this periodically (for a polled interface) or upon
+	   receiving an interrupt (for an interrupt-driven interface).
+	   If interrupt driven, you should probably poll this
+	   periodically when not in idle state.  This should be called
+	   with the time that passed since the last call, if it is
+	   significant.  Time is in microseconds. */
+	enum si_sm_result (*event)(struct si_sm_data *smi, long time);
+
+	/* Attempt to detect an SMI.  Returns 0 on success or nonzero
+           on failure. */
+	int (*detect)(struct si_sm_data *smi);
+
+	/* The interface is shutting down, so clean it up. */
+	void (*cleanup)(struct si_sm_data *smi);
+
+	/* Return the size of the SMI structure in bytes. */
+	int (*size)(void);
+};
+
+/* Current state machines that we can use. */
+extern struct si_sm_handlers kcs_smi_handlers;
+extern struct si_sm_handlers smic_smi_handlers;
+extern struct si_sm_handlers bt_smi_handlers;
+
diff --git a/drivers/char/ipmi/ipmi_smic_sm.c b/drivers/char/ipmi/ipmi_smic_sm.c
new file mode 100644
index 000000000000..7bd7041e3d2f
--- /dev/null
+++ b/drivers/char/ipmi/ipmi_smic_sm.c
@@ -0,0 +1,599 @@
+/*
+ * ipmi_smic_sm.c
+ *
+ * The state-machine driver for an IPMI SMIC interface
+ *
+ * It started as a copy of Corey Minyard's driver for the KCS interface
+ * and the kernel patch "mmcdev-patch-245" by HP
+ *
+ * modified by:	Hannes Schulz <schulz@schwaar.com>
+ *		ipmi@schwaar.com
+ *
+ *
+ * Corey Minyard's driver for the KCS interface has the following
+ * copyright notice:
+ *   Copyright 2002 MontaVista Software Inc.
+ *
+ * the kernel patch "mmcdev-patch-245" by HP has the following
+ * copyright notice:
+ * (c) Copyright 2001 Grant Grundler (c) Copyright
+ * 2001 Hewlett-Packard Company
+ *
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.  */
+
+#include <linux/kernel.h> /* For printk. */
+#include <linux/string.h>
+#include <linux/ipmi_msgdefs.h>		/* for completion codes */
+#include "ipmi_si_sm.h"
+
+#define IPMI_SMIC_VERSION "v31"
+
+/* smic_debug is a bit-field
+ *	SMIC_DEBUG_ENABLE -	turned on for now
+ *	SMIC_DEBUG_MSG -	commands and their responses
+ *	SMIC_DEBUG_STATES -	state machine
+*/
+#define SMIC_DEBUG_STATES	4
+#define SMIC_DEBUG_MSG		2
+#define	SMIC_DEBUG_ENABLE	1
+
+static int smic_debug = 1;
+
+enum smic_states {
+	SMIC_IDLE,
+	SMIC_START_OP,
+	SMIC_OP_OK,
+	SMIC_WRITE_START,
+	SMIC_WRITE_NEXT,
+	SMIC_WRITE_END,
+	SMIC_WRITE2READ,
+	SMIC_READ_START,
+	SMIC_READ_NEXT,
+	SMIC_READ_END,
+	SMIC_HOSED
+};
+
+#define MAX_SMIC_READ_SIZE 80
+#define MAX_SMIC_WRITE_SIZE 80
+#define SMIC_MAX_ERROR_RETRIES 3
+
+/* Timeouts in microseconds. */
+#define SMIC_RETRY_TIMEOUT 100000
+
+/* SMIC Flags Register Bits */
+#define SMIC_RX_DATA_READY	0x80
+#define SMIC_TX_DATA_READY	0x40
+#define SMIC_SMI		0x10
+#define SMIC_EVM_DATA_AVAIL	0x08
+#define SMIC_SMS_DATA_AVAIL	0x04
+#define SMIC_FLAG_BSY		0x01
+
+/* SMIC Error Codes */
+#define	EC_NO_ERROR		0x00
+#define	EC_ABORTED		0x01
+#define	EC_ILLEGAL_CONTROL	0x02
+#define	EC_NO_RESPONSE		0x03
+#define	EC_ILLEGAL_COMMAND	0x04
+#define	EC_BUFFER_FULL		0x05
+
+struct si_sm_data
+{
+	enum smic_states state;
+	struct si_sm_io *io;
+        unsigned char	 write_data[MAX_SMIC_WRITE_SIZE];
+        int		 write_pos;
+        int		 write_count;
+        int		 orig_write_count;
+        unsigned char	 read_data[MAX_SMIC_READ_SIZE];
+        int		 read_pos;
+        int		 truncated;
+        unsigned int	 error_retries;
+        long		 smic_timeout;
+};
+
+static unsigned int init_smic_data (struct si_sm_data *smic,
+				    struct si_sm_io *io)
+{
+	smic->state = SMIC_IDLE;
+	smic->io = io;
+	smic->write_pos = 0;
+	smic->write_count = 0;
+	smic->orig_write_count = 0;
+	smic->read_pos = 0;
+	smic->error_retries = 0;
+	smic->truncated = 0;
+	smic->smic_timeout = SMIC_RETRY_TIMEOUT;
+
+	/* We use 3 bytes of I/O. */
+	return 3;
+}
+
+static int start_smic_transaction(struct si_sm_data *smic,
+				  unsigned char *data, unsigned int size)
+{
+	unsigned int i;
+
+	if ((size < 2) || (size > MAX_SMIC_WRITE_SIZE)) {
+		return -1;
+	}
+	if ((smic->state != SMIC_IDLE) && (smic->state != SMIC_HOSED)) {
+		return -2;
+	}
+	if (smic_debug & SMIC_DEBUG_MSG) {
+		printk(KERN_INFO "start_smic_transaction -");
+		for (i = 0; i < size; i ++) {
+			printk (" %02x", (unsigned char) (data [i]));
+		}
+		printk ("\n");
+	}
+	smic->error_retries = 0;
+	memcpy(smic->write_data, data, size);
+	smic->write_count = size;
+	smic->orig_write_count = size;
+	smic->write_pos = 0;
+	smic->read_pos = 0;
+	smic->state = SMIC_START_OP;
+	smic->smic_timeout = SMIC_RETRY_TIMEOUT;
+	return 0;
+}
+
+static int smic_get_result(struct si_sm_data *smic,
+			   unsigned char *data, unsigned int length)
+{
+	int i;
+
+	/* Copy the response into data, at most length bytes; return the count. */
+	if (smic_debug & SMIC_DEBUG_MSG) {
+		printk (KERN_INFO "smic_get result -");
+		for (i = 0; i < smic->read_pos; i ++)
+			printk (" %02x", (smic->read_data [i]));
+		printk ("\n");
+	}
+	if (length < smic->read_pos) {
+		smic->read_pos = length;
+		smic->truncated = 1;
+	}
+	memcpy(data, smic->read_data, smic->read_pos);
+
+	if ((length >= 3) && (smic->read_pos < 3)) {
+		data[2] = IPMI_ERR_UNSPECIFIED;
+		smic->read_pos = 3;
+	}
+	/* data[2] only exists when the caller's buffer holds >= 3 bytes. */
+	if (smic->truncated && (length >= 3))
+		data[2] = IPMI_ERR_MSG_TRUNCATED;
+	smic->truncated = 0;
+	return smic->read_pos;
+}
+
+static inline unsigned char read_smic_flags(struct si_sm_data *smic)
+{
+	return smic->io->inputb(smic->io, 2);
+}
+
+static inline unsigned char read_smic_status(struct si_sm_data *smic)
+{
+	return smic->io->inputb(smic->io, 1);
+}
+
+static inline unsigned char read_smic_data(struct si_sm_data *smic)
+{
+	return smic->io->inputb(smic->io, 0);
+}
+
+static inline void write_smic_flags(struct si_sm_data *smic,
+				    unsigned char   flags)
+{
+	smic->io->outputb(smic->io, 2, flags);
+}
+
+static inline void write_smic_control(struct si_sm_data *smic,
+				      unsigned char   control)
+{
+	smic->io->outputb(smic->io, 1, control);
+}
+
+static inline void write_si_sm_data (struct si_sm_data *smic,
+				   unsigned char   data)
+{
+	smic->io->outputb(smic->io, 0, data);
+}
+
+static inline void start_error_recovery(struct si_sm_data *smic, char *reason)
+{
+	(smic->error_retries)++;
+	if (smic->error_retries > SMIC_MAX_ERROR_RETRIES) {
+		if (smic_debug & SMIC_DEBUG_ENABLE) {
+			printk(KERN_WARNING
+			       "ipmi_smic_drv: smic hosed: %s\n", reason);
+		}
+		smic->state = SMIC_HOSED;
+	} else {
+		smic->write_count = smic->orig_write_count;
+		smic->write_pos = 0;
+		smic->read_pos = 0;
+		smic->state = SMIC_START_OP;
+		smic->smic_timeout = SMIC_RETRY_TIMEOUT;
+	}
+}
+
+static inline void write_next_byte(struct si_sm_data *smic)
+{
+	write_si_sm_data(smic, smic->write_data[smic->write_pos]);
+	(smic->write_pos)++;
+	(smic->write_count)--;
+}
+
+static inline void read_next_byte (struct si_sm_data *smic)
+{
+	if (smic->read_pos >= MAX_SMIC_READ_SIZE) {
+		read_smic_data (smic);
+		smic->truncated = 1;
+	} else {
+		smic->read_data[smic->read_pos] = read_smic_data(smic);
+		(smic->read_pos)++;
+	}
+}
+
+/*  SMIC Control/Status Code Components */
+#define	SMIC_GET_STATUS		0x00	/* Control form's name */
+#define	SMIC_READY		0x00	/* Status  form's name */
+#define	SMIC_WR_START		0x01	/* Unified Control/Status names... */
+#define	SMIC_WR_NEXT		0x02
+#define	SMIC_WR_END		0x03
+#define	SMIC_RD_START		0x04
+#define	SMIC_RD_NEXT		0x05
+#define	SMIC_RD_END		0x06
+#define	SMIC_CODE_MASK		0x0f
+
+#define	SMIC_CONTROL		0x00
+#define	SMIC_STATUS		0x80
+#define	SMIC_CS_MASK		0x80
+
+#define	SMIC_SMS		0x40
+#define	SMIC_SMM		0x60
+#define	SMIC_STREAM_MASK	0x60
+
+/*  SMIC Control Codes */
+#define	SMIC_CC_SMS_GET_STATUS	(SMIC_CONTROL|SMIC_SMS|SMIC_GET_STATUS)
+#define	SMIC_CC_SMS_WR_START	(SMIC_CONTROL|SMIC_SMS|SMIC_WR_START)
+#define	SMIC_CC_SMS_WR_NEXT	(SMIC_CONTROL|SMIC_SMS|SMIC_WR_NEXT)
+#define	SMIC_CC_SMS_WR_END	(SMIC_CONTROL|SMIC_SMS|SMIC_WR_END)
+#define	SMIC_CC_SMS_RD_START	(SMIC_CONTROL|SMIC_SMS|SMIC_RD_START)
+#define	SMIC_CC_SMS_RD_NEXT	(SMIC_CONTROL|SMIC_SMS|SMIC_RD_NEXT)
+#define	SMIC_CC_SMS_RD_END	(SMIC_CONTROL|SMIC_SMS|SMIC_RD_END)
+
+#define	SMIC_CC_SMM_GET_STATUS	(SMIC_CONTROL|SMIC_SMM|SMIC_GET_STATUS)
+#define	SMIC_CC_SMM_WR_START	(SMIC_CONTROL|SMIC_SMM|SMIC_WR_START)
+#define	SMIC_CC_SMM_WR_NEXT	(SMIC_CONTROL|SMIC_SMM|SMIC_WR_NEXT)
+#define	SMIC_CC_SMM_WR_END	(SMIC_CONTROL|SMIC_SMM|SMIC_WR_END)
+#define	SMIC_CC_SMM_RD_START	(SMIC_CONTROL|SMIC_SMM|SMIC_RD_START)
+#define	SMIC_CC_SMM_RD_NEXT	(SMIC_CONTROL|SMIC_SMM|SMIC_RD_NEXT)
+#define	SMIC_CC_SMM_RD_END	(SMIC_CONTROL|SMIC_SMM|SMIC_RD_END)
+
+/*  SMIC Status Codes */
+#define	SMIC_SC_SMS_READY	(SMIC_STATUS|SMIC_SMS|SMIC_READY)
+#define	SMIC_SC_SMS_WR_START	(SMIC_STATUS|SMIC_SMS|SMIC_WR_START)
+#define	SMIC_SC_SMS_WR_NEXT	(SMIC_STATUS|SMIC_SMS|SMIC_WR_NEXT)
+#define	SMIC_SC_SMS_WR_END	(SMIC_STATUS|SMIC_SMS|SMIC_WR_END)
+#define	SMIC_SC_SMS_RD_START	(SMIC_STATUS|SMIC_SMS|SMIC_RD_START)
+#define	SMIC_SC_SMS_RD_NEXT	(SMIC_STATUS|SMIC_SMS|SMIC_RD_NEXT)
+#define	SMIC_SC_SMS_RD_END	(SMIC_STATUS|SMIC_SMS|SMIC_RD_END)
+
+#define	SMIC_SC_SMM_READY	(SMIC_STATUS|SMIC_SMM|SMIC_READY)
+#define	SMIC_SC_SMM_WR_START	(SMIC_STATUS|SMIC_SMM|SMIC_WR_START)
+#define	SMIC_SC_SMM_WR_NEXT	(SMIC_STATUS|SMIC_SMM|SMIC_WR_NEXT)
+#define	SMIC_SC_SMM_WR_END	(SMIC_STATUS|SMIC_SMM|SMIC_WR_END)
+#define	SMIC_SC_SMM_RD_START	(SMIC_STATUS|SMIC_SMM|SMIC_RD_START)
+#define	SMIC_SC_SMM_RD_NEXT	(SMIC_STATUS|SMIC_SMM|SMIC_RD_NEXT)
+#define	SMIC_SC_SMM_RD_END	(SMIC_STATUS|SMIC_SMM|SMIC_RD_END)
+
+/* these are the control/status codes we actually use
+	SMIC_CC_SMS_GET_STATUS	0x40
+	SMIC_CC_SMS_WR_START	0x41
+	SMIC_CC_SMS_WR_NEXT	0x42
+	SMIC_CC_SMS_WR_END	0x43
+	SMIC_CC_SMS_RD_START	0x44
+	SMIC_CC_SMS_RD_NEXT	0x45
+	SMIC_CC_SMS_RD_END	0x46
+
+	SMIC_SC_SMS_READY	0xC0
+	SMIC_SC_SMS_WR_START	0xC1
+	SMIC_SC_SMS_WR_NEXT	0xC2
+	SMIC_SC_SMS_WR_END	0xC3
+	SMIC_SC_SMS_RD_START	0xC4
+	SMIC_SC_SMS_RD_NEXT	0xC5
+	SMIC_SC_SMS_RD_END	0xC6
+*/
+
+/* Run one step of the SMIC state machine.  "time" is the number of
+   microseconds since the last call.  The result asks to be called again
+   (with or without delay) or reports idle, attn, complete or hosed. */
+static enum si_sm_result smic_event (struct si_sm_data *smic, long time)
+{
+	unsigned char status;
+	unsigned char flags;
+	unsigned char data;
+
+	if (smic->state == SMIC_HOSED) {
+		init_smic_data(smic, smic->io);
+		return SI_SM_HOSED;
+	}
+	if (smic->state != SMIC_IDLE) {
+		if (smic_debug & SMIC_DEBUG_STATES) {
+			printk(KERN_INFO
+			       "smic_event - smic->smic_timeout = %ld,"
+			       " time = %ld\n",
+			       smic->smic_timeout, time);
+		}
+/* FIXME: smic_event is sometimes called with time > SMIC_RETRY_TIMEOUT */
+		if (time < SMIC_RETRY_TIMEOUT) {
+			smic->smic_timeout -= time;
+			if (smic->smic_timeout < 0) {
+				start_error_recovery(smic, "smic timed out.");
+				return SI_SM_CALL_WITH_DELAY;
+			}
+		}
+	}
+	flags = read_smic_flags(smic);
+	if (flags & SMIC_FLAG_BSY)
+		return SI_SM_CALL_WITH_DELAY;
+
+	status = read_smic_status (smic);
+	if (smic_debug & SMIC_DEBUG_STATES)
+		printk(KERN_INFO
+		       "smic_event - state = %d, flags = 0x%02x,"
+		       " status = 0x%02x\n",
+		       smic->state, flags, status);
+
+	switch (smic->state) {
+	case SMIC_IDLE:
+		/* in IDLE we check for available messages */
+		if (flags & (SMIC_SMI |
+			     SMIC_EVM_DATA_AVAIL | SMIC_SMS_DATA_AVAIL))
+		{
+			return SI_SM_ATTN;
+		}
+		return SI_SM_IDLE;
+
+	case SMIC_START_OP:
+		/* sanity check whether smic is really idle */
+		write_smic_control(smic, SMIC_CC_SMS_GET_STATUS);
+		write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+		smic->state = SMIC_OP_OK;
+		break;
+
+	case SMIC_OP_OK:
+		if (status != SMIC_SC_SMS_READY) {
+				/* this should not happen */
+			start_error_recovery(smic,
+					     "state = SMIC_OP_OK,"
+					     " status != SMIC_SC_SMS_READY");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		/* OK so far; smic is idle let us start ... */
+		write_smic_control(smic, SMIC_CC_SMS_WR_START);
+		write_next_byte(smic);
+		write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+		smic->state = SMIC_WRITE_START;
+		break;
+
+	case SMIC_WRITE_START:
+		if (status != SMIC_SC_SMS_WR_START) {
+			start_error_recovery(smic,
+					     "state = SMIC_WRITE_START, "
+					     "status != SMIC_SC_SMS_WR_START");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		/* we must not issue WR_(NEXT|END) unless
+                   TX_DATA_READY is set */
+		if (flags & SMIC_TX_DATA_READY) {
+			if (smic->write_count == 1) {
+				/* last byte */
+				write_smic_control(smic, SMIC_CC_SMS_WR_END);
+				smic->state = SMIC_WRITE_END;
+			} else {
+				write_smic_control(smic, SMIC_CC_SMS_WR_NEXT);
+				smic->state = SMIC_WRITE_NEXT;
+			}
+			write_next_byte(smic);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+		} else {
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_WRITE_NEXT:
+		if (status != SMIC_SC_SMS_WR_NEXT) {
+			start_error_recovery(smic,
+					     "state = SMIC_WRITE_NEXT, "
+					     "status != SMIC_SC_SMS_WR_NEXT");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		/* this is the same code as in SMIC_WRITE_START */
+		if (flags & SMIC_TX_DATA_READY) {
+			if (smic->write_count == 1) {
+				write_smic_control(smic, SMIC_CC_SMS_WR_END);
+				smic->state = SMIC_WRITE_END;
+			} else {
+				write_smic_control(smic, SMIC_CC_SMS_WR_NEXT);
+				smic->state = SMIC_WRITE_NEXT;
+			}
+			write_next_byte(smic);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+		} else {
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_WRITE_END:
+		if (status != SMIC_SC_SMS_WR_END) {
+			start_error_recovery (smic,
+					      "state = SMIC_WRITE_END, "
+					      "status != SMIC_SC_SMS_WR_END");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		/* data register holds an error code */
+		data = read_smic_data(smic);
+		if (data != 0) {
+			if (smic_debug & SMIC_DEBUG_ENABLE) {
+				printk(KERN_INFO
+				       "SMIC_WRITE_END: data = %02x\n", data);
+			}
+			start_error_recovery(smic,
+					     "state = SMIC_WRITE_END, "
+					     "data != SUCCESS");
+			return SI_SM_CALL_WITH_DELAY;
+		} else {
+			smic->state = SMIC_WRITE2READ;
+		}
+		break;
+
+	case SMIC_WRITE2READ:
+		/* we must wait for RX_DATA_READY to be set before we
+                   can continue */
+		if (flags & SMIC_RX_DATA_READY) {
+			write_smic_control(smic, SMIC_CC_SMS_RD_START);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+			smic->state = SMIC_READ_START;
+		} else {
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_READ_START:
+		if (status != SMIC_SC_SMS_RD_START) {
+			start_error_recovery(smic,
+					     "state = SMIC_READ_START, "
+					     "status != SMIC_SC_SMS_RD_START");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		if (flags & SMIC_RX_DATA_READY) {
+			read_next_byte(smic);
+			write_smic_control(smic, SMIC_CC_SMS_RD_NEXT);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+			smic->state = SMIC_READ_NEXT;
+		} else {
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_READ_NEXT:
+		switch (status) {
+		/* smic tells us that this is the last byte to be read
+                   --> clean up */
+		case SMIC_SC_SMS_RD_END:
+			read_next_byte(smic);
+			write_smic_control(smic, SMIC_CC_SMS_RD_END);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+			smic->state = SMIC_READ_END;
+			break;
+		case SMIC_SC_SMS_RD_NEXT:
+			if (flags & SMIC_RX_DATA_READY) {
+				read_next_byte(smic);
+				write_smic_control(smic, SMIC_CC_SMS_RD_NEXT);
+				write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+				smic->state = SMIC_READ_NEXT;
+			} else {
+				return SI_SM_CALL_WITH_DELAY;
+			}
+			break;
+		default:
+			start_error_recovery(
+				smic,
+				"state = SMIC_READ_NEXT, "
+				"status != SMIC_SC_SMS_RD_(NEXT|END)");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_READ_END:
+		if (status != SMIC_SC_SMS_READY) {
+			start_error_recovery(smic,
+					     "state = SMIC_READ_END, "
+					     "status != SMIC_SC_SMS_READY");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		data = read_smic_data(smic);
+		/* data register holds an error code */
+		if (data != 0) {
+			if (smic_debug & SMIC_DEBUG_ENABLE) {
+				printk(KERN_INFO
+				       "SMIC_READ_END: data = %02x\n", data);
+			}
+			start_error_recovery(smic,
+					     "state = SMIC_READ_END, "
+					     "data != SUCCESS");
+			return SI_SM_CALL_WITH_DELAY;
+		} else {
+			smic->state = SMIC_IDLE;
+			return SI_SM_TRANSACTION_COMPLETE;
+		}
+
+	case SMIC_HOSED:
+		init_smic_data(smic, smic->io);
+		return SI_SM_HOSED;
+
+	default:
+		/* Recover from an unknown state even when debug is off. */
+		if (smic_debug & SMIC_DEBUG_ENABLE)
+			printk(KERN_WARNING "smic->state = %d\n", smic->state);
+		start_error_recovery(smic, "state = UNKNOWN");
+		return SI_SM_CALL_WITH_DELAY;
+	}
+	smic->smic_timeout = SMIC_RETRY_TIMEOUT;
+	return SI_SM_CALL_WITHOUT_DELAY;
+}
+
+static int smic_detect(struct si_sm_data *smic)
+{
+	/* It's impossible for the SMIC flags register to be all 1's,
+	   (assuming a properly functioning, self-initialized BMC)
+	   but that's what you get from reading a bogus address, so we
+	   test that first. */
+	if (read_smic_flags(smic) == 0xff)
+		return 1;
+
+	return 0;
+}
+
+static void smic_cleanup(struct si_sm_data *kcs)
+{
+}
+
+static int smic_size(void)
+{
+	return sizeof(struct si_sm_data);
+}
+
+struct si_sm_handlers smic_smi_handlers =
+{
+	.version           = IPMI_SMIC_VERSION,
+	.init_data         = init_smic_data,
+	.start_transaction = start_smic_transaction,
+	.get_result        = smic_get_result,
+	.event             = smic_event,
+	.detect            = smic_detect,
+	.cleanup           = smic_cleanup,
+	.size              = smic_size,
+};
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index eb213e416d23..50aa9590be30 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -33,6 +33,7 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/ipmi.h>
 #include <linux/ipmi_smi.h>
 #include <linux/watchdog.h>
@@ -50,6 +51,8 @@
 #include <asm/apic.h>
 #endif
 
+#define IPMI_WATCHDOG_VERSION "v31"
+
 /*
  * The IPMI command/response information for the watchdog timer.
  */
@@ -137,26 +140,41 @@ static int pretimeout = 0;
 /* Default action is to reset the board on a timeout. */
 static unsigned char action_val = WDOG_TIMEOUT_RESET;
 
-static char *action = "reset";
+static char action[16] = "reset";
 
 static unsigned char preaction_val = WDOG_PRETIMEOUT_NONE;
 
-static char *preaction = "pre_none";
+static char preaction[16] = "pre_none";
 
 static unsigned char preop_val = WDOG_PREOP_NONE;
 
-static char *preop = "preop_none";
+static char preop[16] = "preop_none";
 static spinlock_t ipmi_read_lock = SPIN_LOCK_UNLOCKED;
 static char data_to_read = 0;
 static DECLARE_WAIT_QUEUE_HEAD(read_q);
 static struct fasync_struct *fasync_q = NULL;
 static char pretimeout_since_last_heartbeat = 0;
 
-MODULE_PARM(timeout, "i");
-MODULE_PARM(pretimeout, "i");
-MODULE_PARM(action, "s");
-MODULE_PARM(preaction, "s");
-MODULE_PARM(preop, "s");
+/* If true, the driver will start running as soon as it is configured
+   and ready. */
+static int start_now = 0;
+
+module_param(timeout, int, 0);
+MODULE_PARM_DESC(timeout, "Timeout value in seconds.");
+module_param(pretimeout, int, 0);
+MODULE_PARM_DESC(pretimeout, "Pretimeout value in seconds.");
+module_param_string(action, action, sizeof(action), 0);
+MODULE_PARM_DESC(action, "Timeout action. One of: "
+		 "reset, none, power_cycle, power_off.");
+module_param_string(preaction, preaction, sizeof(preaction), 0);
+MODULE_PARM_DESC(preaction, "Pretimeout action.  One of: "
+		 "pre_none, pre_smi, pre_nmi, pre_int.");
+module_param_string(preop, preop, sizeof(preop), 0);
+MODULE_PARM_DESC(preop, "Pretimeout driver operation.  One of: "
+		 "preop_none, preop_panic, preop_give_data.");
+module_param(start_now, int, 0);
+MODULE_PARM_DESC(start_now, "Set to 1 to start the watchdog as "
+		 "soon as the driver is loaded.");
 
 /* Default state of the timer. */
 static unsigned char ipmi_watchdog_state = WDOG_TIMEOUT_NONE;
@@ -167,10 +185,6 @@ static int ipmi_ignore_heartbeat = 0;
 /* Is someone using the watchdog?  Only one user is allowed. */
 static int ipmi_wdog_open = 0;
 
-/* If true, the driver will start running as soon as it is configured
-   and ready. */
-static int start_now = 0;
-
 /* If set to 1, the heartbeat command will set the state to reset and
    start the timer.  The timer doesn't normally run when the driver is
    first opened until the heartbeat is set the first time, this
@@ -260,6 +274,7 @@ static int i_ipmi_set_timeout(struct ipmi_smi_msg  *smi_msg,
 				      (struct ipmi_addr *) &addr,
 				      0,
 				      &msg,
+				      NULL,
 				      smi_msg,
 				      recv_msg,
 				      1);
@@ -435,6 +450,7 @@ static int ipmi_heartbeat(void)
 				      (struct ipmi_addr *) &addr,
 				      0,
 				      &msg,
+				      NULL,
 				      &heartbeat_smi_msg,
 				      &heartbeat_recv_msg,
 				      1);
@@ -483,6 +499,7 @@ static void panic_halt_ipmi_heartbeat(void)
 				 (struct ipmi_addr *) &addr,
 				 0,
 				 &msg,
+				 NULL,
 				 &panic_halt_heartbeat_smi_msg,
 				 &panic_halt_heartbeat_recv_msg,
 				 1);
@@ -903,6 +920,7 @@ static void ipmi_smi_gone(int if_num)
 
 static struct ipmi_smi_watcher smi_watcher =
 {
+	.owner    = THIS_MODULE,
 	.new_smi  = ipmi_new_smi,
 	.smi_gone = ipmi_smi_gone
 };
@@ -911,6 +929,9 @@ static int __init ipmi_wdog_init(void)
 {
 	int rv;
 
+	printk(KERN_INFO "IPMI watchdog driver version "
+	       IPMI_WATCHDOG_VERSION "\n");
+
 	if (strcmp(action, "reset") == 0) {
 		action_val = WDOG_TIMEOUT_RESET;
 	} else if (strcmp(action, "none") == 0) {
@@ -999,14 +1020,10 @@ static int __init ipmi_wdog_init(void)
 	register_reboot_notifier(&wdog_reboot_notifier);
 	notifier_chain_register(&panic_notifier_list, &wdog_panic_notifier);
 
-	printk(KERN_INFO "IPMI watchdog by "
-	       "Corey Minyard (minyard@mvista.com)\n");
-
 	return 0;
 }
 
-#ifdef MODULE
-static void ipmi_unregister_watchdog(void)
+static __exit void ipmi_unregister_watchdog(void)
 {
 	int rv;
 
@@ -1034,6 +1051,7 @@ static void ipmi_unregister_watchdog(void)
 	   pointers to our buffers, we want to make sure they are done before
 	   we release our memory. */
 	while (atomic_read(&set_timeout_tofree)) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(1);
 	}
 
@@ -1056,76 +1074,6 @@ static void __exit ipmi_wdog_exit(void)
 	ipmi_unregister_watchdog();
 }
 module_exit(ipmi_wdog_exit);
-#else
-static int __init ipmi_wdog_setup(char *str)
-{
-	int  val;
-	int  rv;
-	char *option;
-
-	rv = get_option(&str, &val);
-	if (rv == 0)
-		return 1;
-	if (val > 0)
-		timeout = val;
-	if (rv == 1)
-		return 1;
-
-	rv = get_option(&str, &val);
-	if (rv == 0)
-		return 1;
-	if (val >= 0)
-		pretimeout = val;
-	if (rv == 1)
-		return 1;
-
-	while ((option = strsep(&str, ",")) != NULL) {
-		if (strcmp(option, "reset") == 0) {
-			action = "reset";
-		}
-		else if (strcmp(option, "none") == 0) {
-			action = "none";
-		}
-		else if (strcmp(option, "power_cycle") == 0) {
-			action = "power_cycle";
-		}
-		else if (strcmp(option, "power_off") == 0) {
-			action = "power_off";
-		}
-		else if (strcmp(option, "pre_none") == 0) {
-			preaction = "pre_none";
-		}
-		else if (strcmp(option, "pre_smi") == 0) {
-			preaction = "pre_smi";
-		}
-#ifdef HAVE_NMI_HANDLER
-		else if (strcmp(option, "pre_nmi") == 0) {
-			preaction = "pre_nmi";
-		}
-#endif
-		else if (strcmp(option, "pre_int") == 0) {
-			preaction = "pre_int";
-		}
-		else if (strcmp(option, "start_now") == 0) {
-			start_now = 1;
-		}
-		else if (strcmp(option, "preop_none") == 0) {
-			preop = "preop_none";
-		}
-		else if (strcmp(option, "preop_panic") == 0) {
-			preop = "preop_panic";
-		}
-		else if (strcmp(option, "preop_give_data") == 0) {
-			preop = "preop_give_data";
-		} else {
-		    printk("Unknown IPMI watchdog option: '%s'\n", option);
-		}
-	}
-
-	return 1;
-}
-__setup("ipmi_wdog=", ipmi_wdog_setup);
-#endif
 
 EXPORT_SYMBOL(ipmi_delayed_shutdown);
 
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 9479a550b924..75311f205806 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -109,6 +109,35 @@ struct ipmi_ipmb_addr
 	unsigned char lun;
 };
 
+/*
+ * A LAN Address.  This is an address to/from a LAN interface bridged
+ * by the BMC, not an address actually out on the LAN.
+ *
+ * A conscious decision was made here to deviate slightly from the IPMI
+ * spec.  We do not use rqSWID and rsSWID like it shows in the
+ * message.  Instead, we use remote_SWID and local_SWID.  This means
+ * that any message (a request or response) from another device will
+ * always have exactly the same address.  If you didn't do this,
+ * requests and responses from the same device would have different
+ * addresses, and that's not too cool.
+ *
+ * In this address, the remote_SWID is always the SWID the remote
+ * message came from, or the SWID we are sending the message to.
+ * local_SWID is always our SWID.  Note that having our SWID in the
+ * message is a little weird, but this is required.
+ */
+#define IPMI_LAN_ADDR_TYPE		0x04
+struct ipmi_lan_addr
+{
+	int           addr_type;
+	short         channel;
+	unsigned char privilege;
+	unsigned char session_handle;
+	unsigned char remote_SWID;
+	unsigned char local_SWID;
+	unsigned char lun;
+};
+
 
 /*
  * Channel for talking directly with the BMC.  When using this
@@ -145,10 +174,20 @@ struct ipmi_msg
  * Receive types for messages coming from the receive interface.  This
  * is used for the receive in-kernel interface and in the receive
  * IOCTL.
+ *
+ * The "IPMI_RESPONSE_RESPONSE_TYPE" is a little strange sounding, but
+ * it allows you to get the message results when you send a response
+ * message.
  */
 #define IPMI_RESPONSE_RECV_TYPE		1 /* A response to a command */
 #define IPMI_ASYNC_EVENT_RECV_TYPE	2 /* Something from the event queue */
 #define IPMI_CMD_RECV_TYPE		3 /* A command from somewhere else */
+#define IPMI_RESPONSE_RESPONSE_TYPE	4 /* The response for
+					      a sent response, giving any
+					      error status for sending the
+					      response.  When you send a
+					      response message, this will
+					      be returned. */
 /* Note that async events and received commands do not have a completion
    code as the first byte of the incoming data, unlike a response. */
 
@@ -160,6 +199,7 @@ struct ipmi_msg
  * The in-kernel interface.
  */
 #include <linux/list.h>
+#include <linux/module.h>
 
 /* Opaque type for a IPMI message user.  One of these is needed to
    send and receive messages. */
@@ -185,6 +225,12 @@ struct ipmi_recv_msg
 	long             msgid;
 	struct ipmi_msg  msg;
 
+	/* The user_msg_data is the data supplied when a message was
+	   sent, if this is a response to a sent message.  If this is
+	   not a response to a sent message, then user_msg_data will
+	   be NULL. */
+	void             *user_msg_data;
+
 	/* Call this when done with the message.  It will presumably free
 	   the message and do any other necessary cleanup. */
 	void (*done)(struct ipmi_recv_msg *msg);
@@ -206,9 +252,10 @@ struct ipmi_user_hndl
         /* Routine type to call when a message needs to be routed to
 	   the upper layer.  This will be called with some locks held,
 	   the only IPMI routines that can be called are ipmi_request
-	   and the alloc/free operations. */
+	   and the alloc/free operations.  The handler_data is the
+	   variable supplied when the receive handler was registered. */
 	void (*ipmi_recv_hndl)(struct ipmi_recv_msg *msg,
-			       void                 *handler_data);
+			       void                 *user_msg_data);
 
 	/* Called when the interface detects a watchdog pre-timeout.  If
 	   this is NULL, it will be ignored for the user. */
@@ -221,7 +268,12 @@ int ipmi_create_user(unsigned int          if_num,
 		     void                  *handler_data,
 		     ipmi_user_t           *user);
 
-/* Destroy the given user of the IPMI layer. */
+/* Destroy the given user of the IPMI layer.  Note that after this
+   function returns, the system is guaranteed to not call any
+   callbacks for the user.  Thus as long as you destroy all the users
+   before you unload a module, you will be safe.  And if you destroy
+   the users before you destroy the callback structures, it should be
+   safe, too. */
 int ipmi_destroy_user(ipmi_user_t user);
 
 /* Get the IPMI version of the BMC we are talking to. */
@@ -253,13 +305,43 @@ unsigned char ipmi_get_my_LUN(ipmi_user_t user);
  * in the msgid field of the received command.  If the priority is >
  * 0, the message will go into a high-priority queue and be sent
  * first.  Otherwise, it goes into a normal-priority queue.
+ * The user_msg_data field will be returned in any response to this
+ * message.
+ *
+ * Note that if you send a response (with the netfn lower bit set),
+ * you *will* get back a SEND_MSG response telling you what happened
+ * when the response was sent.  You will not get back a response to
+ * the message itself.
  */
 int ipmi_request(ipmi_user_t      user,
 		 struct ipmi_addr *addr,
 		 long             msgid,
 		 struct ipmi_msg  *msg,
+		 void             *user_msg_data,
 		 int              priority);
 
+/*
+ * Like ipmi_request, but lets you specify the number of retries and
+ * the retry time.  The retries is the number of times the message
+ * will be resent if no reply is received.  If set to -1, the default
+ * value will be used.  The retry time is the time in milliseconds
+ * between retries.  If set to zero, the default value will be
+ * used.
+ *
+ * Don't use this unless you *really* have to.  It's primarily for the
+ * IPMI over LAN converter; since the LAN stuff does its own retries,
+ * it makes no sense to do it here.  However, this can be used if you
+ * have unusual requirements.
+ */
+int ipmi_request_settime(ipmi_user_t      user,
+			 struct ipmi_addr *addr,
+			 long             msgid,
+			 struct ipmi_msg  *msg,
+			 void             *user_msg_data,
+			 int              priority,
+			 int              max_retries,
+			 unsigned int     retry_time_ms);
+
 /*
  * Like ipmi_request, but lets you specify the slave return address.
  */
@@ -267,6 +349,7 @@ int ipmi_request_with_source(ipmi_user_t      user,
 			     struct ipmi_addr *addr,
 			     long             msgid,
 			     struct ipmi_msg  *msg,
+			     void             *user_msg_data,
 			     int              priority,
 			     unsigned char    source_address,
 			     unsigned char    source_lun);
@@ -284,6 +367,7 @@ int ipmi_request_supply_msgs(ipmi_user_t          user,
 			     struct ipmi_addr     *addr,
 			     long                 msgid,
 			     struct ipmi_msg      *msg,
+			     void                 *user_msg_data,
 			     void                 *supplied_smi,
 			     struct ipmi_recv_msg *supplied_recv,
 			     int                  priority);
@@ -331,6 +415,10 @@ struct ipmi_smi_watcher
 {
 	struct list_head link;
 
+	/* You must set the owner to the current module, if you are in
+	   a module (generally just set it to "THIS_MODULE"). */
+	struct module *owner;
+
 	/* These two are called with read locks held for the interface
 	   the watcher list.  So you can add and remove users from the
 	   IPMI interface, send messages, etc., but you cannot add
@@ -422,6 +510,29 @@ struct ipmi_req
 #define IPMICTL_SEND_COMMAND		_IOR(IPMI_IOC_MAGIC, 13,	\
 					     struct ipmi_req)
 
+/* Messages sent to the interface with timing parameters are this
+   format. */
+struct ipmi_req_settime
+{
+	struct ipmi_req req;
+
+	/* See ipmi_request_settime() above for details on these
+           values. */
+	int          retries;
+	unsigned int retry_time_ms;
+};
+/*
+ * Send a message to the interfaces with timing parameters.  error values
+ * are:
+ *   - EFAULT - an address supplied was invalid.
+ *   - EINVAL - The address supplied was not valid, or the command
+ *              was not allowed.
+ *   - EMSGSIZE - The message was too large.
+ *   - ENOMEM - Buffers could not be allocated for the command.
+ */
+#define IPMICTL_SEND_COMMAND_SETTIME	_IOR(IPMI_IOC_MAGIC, 21,	\
+					     struct ipmi_req_settime)
+
 /* Messages received from the interface are this format. */
 struct ipmi_recv
 {
@@ -513,4 +624,18 @@ struct ipmi_cmdspec
 #define IPMICTL_SET_MY_LUN_CMD		_IOR(IPMI_IOC_MAGIC, 19, unsigned int)
 #define IPMICTL_GET_MY_LUN_CMD		_IOR(IPMI_IOC_MAGIC, 20, unsigned int)
 
+/*
+ * Get/set the default timing values for an interface.  You shouldn't
+ * generally mess with these.
+ */
+struct ipmi_timing_parms
+{
+	int          retries;
+	unsigned int retry_time_ms;
+};
+#define IPMICTL_SET_TIMING_PARMS_CMD	_IOR(IPMI_IOC_MAGIC, 22, \
+					     struct ipmi_timing_parms)
+#define IPMICTL_GET_TIMING_PARMS_CMD	_IOR(IPMI_IOC_MAGIC, 23, \
+					     struct ipmi_timing_parms)
+
 #endif /* __LINUX_IPMI_H */
diff --git a/include/linux/ipmi_msgdefs.h b/include/linux/ipmi_msgdefs.h
index ccdb9386faed..40ed591fd84b 100644
--- a/include/linux/ipmi_msgdefs.h
+++ b/include/linux/ipmi_msgdefs.h
@@ -53,6 +53,7 @@
 #define IPMI_SET_BMC_GLOBAL_ENABLES_CMD	0x2e
 #define IPMI_GET_BMC_GLOBAL_ENABLES_CMD	0x2f
 #define IPMI_READ_EVENT_MSG_BUFFER_CMD	0x35
+#define IPMI_GET_CHANNEL_INFO_CMD	0x42
 
 #define IPMI_NETFN_STORAGE_REQUEST		0x0a
 #define IPMI_NETFN_STORAGE_RESPONSE		0x0b
@@ -61,8 +62,39 @@
 /* The default slave address */
 #define IPMI_BMC_SLAVE_ADDR	0x20
 
-#define IPMI_MAX_MSG_LENGTH	80
+/* The BT interface on high-end HP systems supports up to 255 bytes in
+ * one transfer.  Its "virtual" BMC supports some commands that are longer
+ * than 128 bytes.  Use the full 256, plus NetFn/LUN, Cmd, cCode, plus
+ * some overhead.  It would be nice to base this on the "BT Capabilities"
+ * but that's too hard to propagate to the rest of the driver. */
+#define IPMI_MAX_MSG_LENGTH	272	/* multiple of 16 */
 
-#define IPMI_CC_NO_ERROR	0
+#define IPMI_CC_NO_ERROR		0x00
+#define IPMI_NODE_BUSY_ERR		0xc0
+#define IPMI_ERR_MSG_TRUNCATED		0xc6
+#define IPMI_LOST_ARBITRATION_ERR	0x81
+#define IPMI_ERR_UNSPECIFIED		0xff
+
+#define IPMI_CHANNEL_PROTOCOL_IPMB	1
+#define IPMI_CHANNEL_PROTOCOL_ICMB	2
+#define IPMI_CHANNEL_PROTOCOL_SMBUS	4
+#define IPMI_CHANNEL_PROTOCOL_KCS	5
+#define IPMI_CHANNEL_PROTOCOL_SMIC	6
+#define IPMI_CHANNEL_PROTOCOL_BT10	7
+#define IPMI_CHANNEL_PROTOCOL_BT15	8
+#define IPMI_CHANNEL_PROTOCOL_TMODE	9
+
+#define IPMI_CHANNEL_MEDIUM_IPMB	1
+#define IPMI_CHANNEL_MEDIUM_ICMB10	2
+#define IPMI_CHANNEL_MEDIUM_ICMB09	3
+#define IPMI_CHANNEL_MEDIUM_8023LAN	4
+#define IPMI_CHANNEL_MEDIUM_ASYNC	5
+#define IPMI_CHANNEL_MEDIUM_OTHER_LAN	6
+#define IPMI_CHANNEL_MEDIUM_PCI_SMBUS	7
+#define IPMI_CHANNEL_MEDIUM_SMBUS1	8
+#define IPMI_CHANNEL_MEDIUM_SMBUS2	9
+#define IPMI_CHANNEL_MEDIUM_USB1	10
+#define IPMI_CHANNEL_MEDIUM_USB2	11
+#define IPMI_CHANNEL_MEDIUM_SYSINTF	12
 
 #endif /* __LINUX_IPMI_MSGDEFS_H */
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index f18187b00c05..6b42943ac3a3 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -35,6 +35,8 @@
 #define __LINUX_IPMI_SMI_H
 
 #include <linux/ipmi_msgdefs.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
 
 /* This files describes the interface for IPMI system management interface
    drivers to bind into the IPMI message handler. */
@@ -48,7 +50,7 @@ typedef struct ipmi_smi *ipmi_smi_t;
  * been received, it will report this same data structure back up to
  * the upper layer.  If an error occurs, it should fill in the
  * response with an error code in the completion code location. When
- * asyncronous data is received, one of these is allocated, the
+ * asynchronous data is received, one of these is allocated, the
  * data_size is set to zero and the response holds the data from the
  * get message or get event command that the interface initiated.
  * Note that it is the interfaces responsibility to detect
@@ -62,9 +64,6 @@ struct ipmi_smi_msg
 	long    msgid;
 	void    *user_data;
 
-	/* If 0, add to the end of the queue.  If 1, add to the beginning. */
-	int     prio;
-
 	int           data_size;
 	unsigned char data[IPMI_MAX_MSG_LENGTH];
 
@@ -134,4 +133,11 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg)
 	msg->done(msg);
 }
 
+/* Allow the lower layer to add things to the proc filesystem
+   directory for this interface.  Note that the entry will
+   automatically be destroyed when the interface is destroyed. */
+int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
+			    read_proc_t *read_proc, write_proc_t *write_proc,
+			    void *data, struct module *owner);
+
 #endif /* __LINUX_IPMI_SMI_H */
-- 
cgit v1.2.3


From 7860b37198b0650f51bfafebac820386b552a071 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:56:46 -0700
Subject: [PATCH] move job control fields from task_struct to signal_struct

From: Roland McGrath <roland@redhat.com>

This patch moves all the fields relating to job control from task_struct to
signal_struct, so that all this info is properly per-process rather than
being per-thread.
---
 arch/ia64/kernel/unaligned.c    |   2 +-
 arch/sparc64/solaris/misc.c     |   2 +-
 drivers/char/n_tty.c            |   3 +-
 drivers/char/rocket.c           |   2 +-
 drivers/char/sx.c               |   2 +-
 drivers/char/tty_io.c           | 116 +++++++++++++++++-----------------------
 drivers/char/vt.c               |   2 +-
 drivers/char/vt_ioctl.c         |   3 +-
 drivers/net/slip.c              |   2 +-
 drivers/s390/char/keyboard.c    |   2 +-
 fs/binfmt_elf.c                 |   4 +-
 fs/compat_ioctl.c               |   2 +-
 fs/dquot.c                      |  10 ++--
 fs/exec.c                       |   5 ++
 fs/open.c                       |   2 +-
 fs/proc/array.c                 |  22 ++++----
 include/linux/sched.h           |  17 +++---
 kernel/acct.c                   |   2 +-
 kernel/exit.c                   |  22 ++++----
 kernel/fork.c                   |  10 ++--
 kernel/pid.c                    |   8 +--
 kernel/signal.c                 |   5 +-
 kernel/sys.c                    |  18 +++----
 net/bridge/netfilter/ebtables.c |   2 +-
 net/ipv4/netfilter/ipt_owner.c  |   2 +-
 net/ipv6/netfilter/ip6t_owner.c |   2 +-
 26 files changed, 133 insertions(+), 136 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index 2247254be7ac..b1a68e4367bc 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -1337,7 +1337,7 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 			 * be holding locks...
 			 */
 			if (user_mode(regs))
-				tty_write_message(current->tty, buf);
+				tty_write_message(current->signal->tty, buf);
 			buf[len-1] = '\0';	/* drop '\r' */
 			printk(KERN_WARNING "%s", buf);	/* watch for command names containing %s */
 		}
diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c
index ea7b2c439653..cea38c0cbb5c 100644
--- a/arch/sparc64/solaris/misc.c
+++ b/arch/sparc64/solaris/misc.c
@@ -402,7 +402,7 @@ asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid)
 			   Solaris setpgrp and setsid? */
 			ret = sys_setpgid(0, 0);
 			if (ret) return ret;
-			current->tty = NULL;
+			current->signal->tty = NULL;
 			return process_group(current);
 		}
 	case 2: /* getsid */
diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index 0c02e2debbb1..08f46259e183 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -999,7 +999,8 @@ do_it_again:
 	/* NOTE: not yet done after every sleep pending a thorough
 	   check of the logic of this change. -- jlc */
 	/* don't stop on /dev/console */
-	if (file->f_op->write != redirected_tty_write && current->tty == tty) {
+	if (file->f_op->write != redirected_tty_write &&
+	    current->signal->tty == tty) {
 		if (tty->pgrp <= 0)
 			printk("read_chan: tty->pgrp <= 0!\n");
 		else if (process_group(current) != tty->pgrp) {
diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index 38544de9fbd9..b0da37eab8e7 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -953,7 +953,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Info->count is now 1; so it's safe to sleep now.
 	 */
-	info->session = current->session;
+	info->session = current->signal->session;
 	info->pgrp = process_group(current);
 
 	if ((info->flags & ROCKET_INITIALIZED) == 0) {
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index 25c95fbc65d3..643163b08a8f 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -1420,7 +1420,7 @@ static int sx_open  (struct tty_struct * tty, struct file * filp)
 
 	line = tty->index;
 	sx_dprintk (SX_DEBUG_OPEN, "%d: opening line %d. tty=%p ctty=%p, np=%d)\n", 
-	            current->pid, line, tty, current->tty, sx_nports);
+	            current->pid, line, tty, current->signal->tty, sx_nports);
 
 	if ((line < 0) || (line >= SX_NPORTS) || (line >= sx_nports))
 		return -ENODEV;
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 0ba52078f637..e4607d86a755 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -321,7 +321,7 @@ struct tty_driver *get_tty_driver(dev_t device, int *index)
  */
 int tty_check_change(struct tty_struct * tty)
 {
-	if (current->tty != tty)
+	if (current->signal->tty != tty)
 		return 0;
 	if (tty->pgrp <= 0) {
 		printk(KERN_WARNING "tty_check_change: tty->pgrp <= 0!\n");
@@ -486,17 +486,14 @@ void do_tty_hangup(void *data)
 	if (tty->session > 0) {
 		struct list_head *l;
 		for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) {
-			task_t *task = p;
-			do {
-				if (task->tty == tty)
-					task->tty = NULL;
-				if (task->leader) {
-					send_group_sig_info(SIGHUP, SEND_SIG_PRIV, task);
-					send_group_sig_info(SIGCONT, SEND_SIG_PRIV, task);
-				}
-			} while_each_thread(p, task);
+			if (p->signal->tty == tty)
+				p->signal->tty = NULL;
+			if (!p->signal->leader)
+				continue;
+			send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p);
+			send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p);
 			if (tty->pgrp > 0)
-				p->tty_old_pgrp = tty->pgrp;
+				p->signal->tty_old_pgrp = tty->pgrp;
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -575,15 +572,15 @@ void disassociate_ctty(int on_exit)
 
 	lock_kernel();
 
-	tty = current->tty;
+	tty = current->signal->tty;
 	if (tty) {
 		tty_pgrp = tty->pgrp;
 		if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY)
 			tty_vhangup(tty);
 	} else {
-		if (current->tty_old_pgrp) {
-			kill_pg(current->tty_old_pgrp, SIGHUP, on_exit);
-			kill_pg(current->tty_old_pgrp, SIGCONT, on_exit);
+		if (current->signal->tty_old_pgrp) {
+			kill_pg(current->signal->tty_old_pgrp, SIGHUP, on_exit);
+			kill_pg(current->signal->tty_old_pgrp, SIGCONT, on_exit);
 		}
 		unlock_kernel();	
 		return;
@@ -594,17 +591,13 @@ void disassociate_ctty(int on_exit)
 			kill_pg(tty_pgrp, SIGCONT, on_exit);
 	}
 
-	current->tty_old_pgrp = 0;
+	current->signal->tty_old_pgrp = 0;
 	tty->session = 0;
 	tty->pgrp = -1;
 
 	read_lock(&tasklist_lock);
-	for_each_task_pid(current->session, PIDTYPE_SID, p, l, pid) {
-		task_t *task = p;
-		do {
-			task->tty = NULL;
-		} while_each_thread(p, task);
-	}
+	for_each_task_pid(current->signal->session, PIDTYPE_SID, p, l, pid)
+		p->signal->tty = NULL;
 	read_unlock(&tasklist_lock);
 	unlock_kernel();
 }
@@ -1257,20 +1250,11 @@ static void release_dev(struct file * filp)
 		struct pid *pid;
 
 		read_lock(&tasklist_lock);
-		for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) {
-			task_t *task = p;
-			do {
-				task->tty = NULL;
-			} while_each_thread(p, task);
-		}
-		if (o_tty) {
-			for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid) {
-				task_t *task = p;
-				do {
-					task->tty = NULL;
-				} while_each_thread(p, task);
-			}
-		}
+		for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid)
+			p->signal->tty = NULL;
+		if (o_tty)
+			for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid)
+				p->signal->tty = NULL;
 		read_unlock(&tasklist_lock);
 	}
 
@@ -1341,10 +1325,10 @@ static int tty_open(struct inode * inode, struct file * filp)
 retry_open:
 	noctty = filp->f_flags & O_NOCTTY;
 	if (device == MKDEV(TTYAUX_MAJOR,0)) {
-		if (!current->tty)
+		if (!current->signal->tty)
 			return -ENXIO;
-		driver = current->tty->driver;
-		index = current->tty->index;
+		driver = current->signal->tty->driver;
+		index = current->signal->tty->index;
 		filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
 		/* noctty = 1; */
 		goto got_driver;
@@ -1445,14 +1429,14 @@ got_driver:
 		goto retry_open;
 	}
 	if (!noctty &&
-	    current->leader &&
-	    !current->tty &&
+	    current->signal->leader &&
+	    !current->signal->tty &&
 	    tty->session == 0) {
 	    	task_lock(current);
-		current->tty = tty;
+		current->signal->tty = tty;
 		task_unlock(current);
-		current->tty_old_pgrp = 0;
-		tty->session = current->session;
+		current->signal->tty_old_pgrp = 0;
+		tty->session = current->signal->session;
 		tty->pgrp = process_group(current);
 	}
 	return 0;
@@ -1510,7 +1494,7 @@ static int tiocsti(struct tty_struct *tty, char * arg)
 {
 	char ch, mbz = 0;
 
-	if ((current->tty != tty) && !capable(CAP_SYS_ADMIN))
+	if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (get_user(ch, arg))
 		return -EFAULT;
@@ -1601,14 +1585,14 @@ static int tiocsctty(struct tty_struct *tty, int arg)
 	struct pid *pid;
 	task_t *p;
 
-	if (current->leader &&
-	    (current->session == tty->session))
+	if (current->signal->leader &&
+	    (current->signal->session == tty->session))
 		return 0;
 	/*
 	 * The process must be a session leader and
 	 * not have a controlling tty already.
 	 */
-	if (!current->leader || current->tty)
+	if (!current->signal->leader || current->signal->tty)
 		return -EPERM;
 	if (tty->session > 0) {
 		/*
@@ -1621,21 +1605,17 @@ static int tiocsctty(struct tty_struct *tty, int arg)
 			 */
 
 			read_lock(&tasklist_lock);
-			for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) {
-				task_t *task = p;
-				do {
-					task->tty = NULL;
-				} while_each_thread(p, task);
-			}
+			for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid)
+				p->signal->tty = NULL;
 			read_unlock(&tasklist_lock);
 		} else
 			return -EPERM;
 	}
 	task_lock(current);
-	current->tty = tty;
+	current->signal->tty = tty;
 	task_unlock(current);
-	current->tty_old_pgrp = 0;
-	tty->session = current->session;
+	current->signal->tty_old_pgrp = 0;
+	tty->session = current->signal->session;
 	tty->pgrp = process_group(current);
 	return 0;
 }
@@ -1646,7 +1626,7 @@ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 	 * (tty == real_tty) is a cheap way of
 	 * testing if the tty is NOT a master pty.
 	 */
-	if (tty == real_tty && current->tty != real_tty)
+	if (tty == real_tty && current->signal->tty != real_tty)
 		return -ENOTTY;
 	return put_user(real_tty->pgrp, arg);
 }
@@ -1660,15 +1640,15 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 		return -ENOTTY;
 	if (retval)
 		return retval;
-	if (!current->tty ||
-	    (current->tty != real_tty) ||
-	    (real_tty->session != current->session))
+	if (!current->signal->tty ||
+	    (current->signal->tty != real_tty) ||
+	    (real_tty->session != current->signal->session))
 		return -ENOTTY;
 	if (get_user(pgrp, (pid_t *) arg))
 		return -EFAULT;
 	if (pgrp < 0)
 		return -EINVAL;
-	if (session_of_pgrp(pgrp) != current->session)
+	if (session_of_pgrp(pgrp) != current->signal->session)
 		return -EPERM;
 	real_tty->pgrp = pgrp;
 	return 0;
@@ -1680,7 +1660,7 @@ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *
 	 * (tty == real_tty) is a cheap way of
 	 * testing if the tty is NOT a master pty.
 	*/
-	if (tty == real_tty && current->tty != real_tty)
+	if (tty == real_tty && current->signal->tty != real_tty)
 		return -ENOTTY;
 	if (real_tty->session <= 0)
 		return -ENOTTY;
@@ -1838,12 +1818,12 @@ int tty_ioctl(struct inode * inode, struct file * file,
 			clear_bit(TTY_EXCLUSIVE, &tty->flags);
 			return 0;
 		case TIOCNOTTY:
-			if (current->tty != tty)
+			if (current->signal->tty != tty)
 				return -ENOTTY;
-			if (current->leader)
+			if (current->signal->leader)
 				disassociate_ctty(0);
 			task_lock(current);
-			current->tty = NULL;
+			current->signal->tty = NULL;
 			task_unlock(current);
 			return 0;
 		case TIOCSCTTY:
@@ -1947,9 +1927,9 @@ static void __do_SAK(void *arg)
 		tty->driver->flush_buffer(tty);
 	read_lock(&tasklist_lock);
 	for_each_task_pid(session, PIDTYPE_SID, p, l, pid) {
-		if (p->tty == tty || session > 0) {
+		if (p->signal->tty == tty || session > 0) {
 			printk(KERN_NOTICE "SAK: killed process %d"
-			    " (%s): p->session==tty->session\n",
+			    " (%s): p->signal->session==tty->session\n",
 			    p->pid, p->comm);
 			send_sig(SIGKILL, p, 1);
 			continue;
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 2febed52e19f..a1a59abc915c 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2278,7 +2278,7 @@ int tioclinux(struct tty_struct *tty, unsigned long arg)
 
 	if (tty->driver->type != TTY_DRIVER_TYPE_CONSOLE)
 		return -EINVAL;
-	if (current->tty != tty && !capable(CAP_SYS_ADMIN))
+	if (current->signal->tty != tty && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (get_user(type, (char *)arg))
 		return -EFAULT;
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index d8c6acc8e62c..0685fe7be2d1 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -382,7 +382,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 	 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
 	 */
 	perm = 0;
-	if (current->tty == tty || capable(CAP_SYS_TTY_CONFIG))
+	if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG))
 		perm = 1;
  
 	kbd = kbd_table + console;
@@ -1221,4 +1221,3 @@ void change_console(unsigned int new_console)
 
 	complete_change_console(new_console);
 }
-
diff --git a/drivers/net/slip.c b/drivers/net/slip.c
index 601df52ebb29..e783ac0fa71e 100644
--- a/drivers/net/slip.c
+++ b/drivers/net/slip.c
@@ -1307,7 +1307,7 @@ static int sl_ioctl(struct net_device *dev,struct ifreq *rq,int cmd)
 		/* Resolve race condition, when ioctl'ing hanged up 
 		   and opened by another process device.
 		 */
-		if (sl->tty != current->tty && sl->pid != current->pid) {
+		if (sl->tty != current->signal->tty && sl->pid != current->pid) {
 			spin_unlock_bh(&sl->lock);
 			return -EPERM;
 		}
diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c
index 892ebc7739b0..b124ebb7fc9b 100644
--- a/drivers/s390/char/keyboard.c
+++ b/drivers/s390/char/keyboard.c
@@ -471,7 +471,7 @@ kbd_ioctl(struct kbd_data *kbd, struct file *file,
 	 * To have permissions to do most of the vt ioctls, we either have
 	 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
 	 */
-	perm = current->tty == kbd->tty || capable(CAP_SYS_TTY_CONFIG);
+	perm = current->signal->tty == kbd->tty || capable(CAP_SYS_TTY_CONFIG);
 	switch (cmd) {
 	case KDGKBTYPE:
 		return put_user(KB_101, (char*) arg);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9cc7cc648b42..e5b79a294c80 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1129,7 +1129,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pid = p->pid;
 	prstatus->pr_ppid = p->parent->pid;
 	prstatus->pr_pgrp = process_group(p);
-	prstatus->pr_sid = p->session;
+	prstatus->pr_sid = p->signal->session;
 	jiffies_to_timeval(p->utime, &prstatus->pr_utime);
 	jiffies_to_timeval(p->stime, &prstatus->pr_stime);
 	jiffies_to_timeval(p->cutime, &prstatus->pr_cutime);
@@ -1157,7 +1157,7 @@ static void fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_pid = p->pid;
 	psinfo->pr_ppid = p->parent->pid;
 	psinfo->pr_pgrp = process_group(p);
-	psinfo->pr_sid = p->session;
+	psinfo->pr_sid = p->signal->session;
 
 	i = p->state ? ffz(~p->state) + 1 : 0;
 	psinfo->pr_state = i;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 23baed6180ff..de45d833d0f4 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1604,7 +1604,7 @@ static int vt_check(struct file *file)
 	 * To have permissions to do most of the vt ioctls, we either have
 	 * to be the owner of the tty, or super-user.
 	 */
-	if (current->tty == tty || capable(CAP_SYS_ADMIN))
+	if (current->signal->tty == tty || capable(CAP_SYS_ADMIN))
 		return 1;
 	return 0;                                                    
 }
diff --git a/fs/dquot.c b/fs/dquot.c
index e6b39e66207a..5749044d028e 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -689,12 +689,12 @@ static void print_warning(struct dquot *dquot, const char warntype)
 
 	if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags)))
 		return;
-	tty_write_message(current->tty, dquot->dq_sb->s_id);
+	tty_write_message(current->signal->tty, dquot->dq_sb->s_id);
 	if (warntype == ISOFTWARN || warntype == BSOFTWARN)
-		tty_write_message(current->tty, ": warning, ");
+		tty_write_message(current->signal->tty, ": warning, ");
 	else
-		tty_write_message(current->tty, ": write failed, ");
-	tty_write_message(current->tty, quotatypes[dquot->dq_type]);
+		tty_write_message(current->signal->tty, ": write failed, ");
+	tty_write_message(current->signal->tty, quotatypes[dquot->dq_type]);
 	switch (warntype) {
 		case IHARDWARN:
 			msg = " file limit reached.\n";
@@ -715,7 +715,7 @@ static void print_warning(struct dquot *dquot, const char warntype)
 			msg = " block quota exceeded.\n";
 			break;
 	}
-	tty_write_message(current->tty, msg);
+	tty_write_message(current->signal->tty, msg);
 }
 
 static inline void flush_warnings(struct dquot **dquots, char *warntype)
diff --git a/fs/exec.c b/fs/exec.c
index 225afb0d94e5..62bf2c537abd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -601,6 +601,11 @@ static inline int de_thread(struct task_struct *tsk)
 		newsig->group_stop_count = 0;
 		newsig->curr_target = NULL;
 		init_sigpending(&newsig->shared_pending);
+
+		newsig->pgrp = oldsig->pgrp;
+		newsig->session = oldsig->session;
+		newsig->leader = oldsig->leader;
+		newsig->tty_old_pgrp = oldsig->tty_old_pgrp;
 	}
 
 	if (thread_group_empty(current))
diff --git a/fs/open.c b/fs/open.c
index 9a9ce5be4dbc..ce11096afcad 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1037,7 +1037,7 @@ EXPORT_SYMBOL(sys_close);
 asmlinkage long sys_vhangup(void)
 {
 	if (capable(CAP_SYS_TTY_CONFIG)) {
-		tty_vhangup(current->tty);
+		tty_vhangup(current->signal->tty);
 		return 0;
 	}
 	return -EPERM;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7af62577287e..ac9ccac5d1ee 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -168,7 +168,7 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 		p->pid && p->ptrace ? p->parent->pid : 0,
 		p->uid, p->euid, p->suid, p->fsuid,
 		p->gid, p->egid, p->sgid, p->fsgid);
-	read_unlock(&tasklist_lock);	
+	read_unlock(&tasklist_lock);
 	task_lock(p);
 	buffer += sprintf(buffer,
 		"FDSize:\t%d\n"
@@ -301,7 +301,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 	sigset_t sigign, sigcatch;
 	char state;
 	int res;
-	pid_t ppid;
+ 	pid_t ppid, pgid = -1, sid = -1;
 	int num_threads = 0;
 	struct mm_struct *mm;
 
@@ -311,10 +311,6 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 	mm = task->mm;
 	if(mm)
 		mm = mmgrab(mm);
-	if (task->tty) {
-		tty_pgrp = task->tty->pgrp;
-		tty_nr = new_encode_dev(tty_devnum(task->tty));
-	}
 	task_unlock(task);
 	if (mm) {
 		down_read(&mm->mmap_sem);
@@ -335,7 +331,15 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 		spin_unlock_irq(&task->sighand->siglock);
 	}
-	read_unlock(&tasklist_lock);		
+	if (task->signal) {
+		if (task->signal->tty) {
+			tty_pgrp = task->signal->tty->pgrp;
+			tty_nr = new_encode_dev(tty_devnum(task->signal->tty));
+		}
+		pgid = process_group(task);
+		sid = task->signal->session;
+	}
+	read_unlock(&tasklist_lock);
 
 	/* scale priority and nice values from timeslices to -20..20 */
 	/* to make it look like a "normal" Unix priority/nice value  */
@@ -352,8 +356,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 		task->comm,
 		state,
 		ppid,
-		process_group(task),
-		task->session,
+		pgid,
+		sid,
 		tty_nr,
 		tty_pgrp,
 		task->flags,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 054b3c0d5962..5a1229121123 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -269,6 +269,15 @@ struct signal_struct {
 
 	/* thread group stop support, overloads group_exit_code too */
 	int			group_stop_count;
+
+	/* job control IDs */
+	pid_t pgrp;
+	pid_t tty_old_pgrp;
+	pid_t session;
+	/* boolean value for session group leader */
+	int leader;
+
+	struct tty_struct *tty; /* NULL if no tty */
 };
 
 /*
@@ -398,12 +407,7 @@ struct task_struct {
 	unsigned long personality;
 	int did_exec:1;
 	pid_t pid;
-	pid_t __pgrp;		/* Accessed via process_group() */
-	pid_t tty_old_pgrp;
-	pid_t session;
 	pid_t tgid;
-	/* boolean value for session group leader */
-	int leader;
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, respectively.  (p->father can be replaced with 
@@ -446,7 +450,6 @@ struct task_struct {
 	char comm[16];
 /* file system info */
 	int link_count, total_link_count;
-	struct tty_struct *tty; /* NULL if no tty */
 /* ipc stuff */
 	struct sysv_sem sysvsem;
 /* CPU-specific state of this task */
@@ -499,7 +502,7 @@ struct task_struct {
 
 static inline pid_t process_group(struct task_struct *tsk)
 {
-	return tsk->group_leader->__pgrp;
+	return tsk->signal->pgrp;
 }
 
 extern void __put_task_struct(struct task_struct *tsk);
diff --git a/kernel/acct.c b/kernel/acct.c
index 9dbab88b2d31..b417066778a7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -347,7 +347,7 @@ static void do_acct_process(long exitcode, struct file *file)
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
-	ac.ac_tty = current->tty ? old_encode_dev(tty_devnum(current->tty)) : 0;
+	ac.ac_tty = current->signal->tty ? old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 
 	ac.ac_flag = 0;
 	if (current->flags & PF_FORKNOEXEC)
diff --git a/kernel/exit.c b/kernel/exit.c
index 308f6959add6..810eebd77559 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -136,13 +136,13 @@ int session_of_pgrp(int pgrp)
 
 	read_lock(&tasklist_lock);
 	for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid)
-		if (p->session > 0) {
-			sid = p->session;
+		if (p->signal->session > 0) {
+			sid = p->signal->session;
 			goto out;
 		}
 	p = find_task_by_pid(pgrp);
 	if (p)
-		sid = p->session;
+		sid = p->signal->session;
 out:
 	read_unlock(&tasklist_lock);
 	
@@ -170,7 +170,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
 				|| p->real_parent->pid == 1)
 			continue;
 		if (process_group(p->real_parent) != pgrp
-			    && p->real_parent->session == p->session) {
+			    && p->real_parent->signal->session == p->signal->session) {
 			ret = 0;
 			break;
 		}
@@ -259,14 +259,14 @@ void __set_special_pids(pid_t session, pid_t pgrp)
 {
 	struct task_struct *curr = current;
 
-	if (curr->session != session) {
+	if (curr->signal->session != session) {
 		detach_pid(curr, PIDTYPE_SID);
-		curr->session = session;
+		curr->signal->session = session;
 		attach_pid(curr, PIDTYPE_SID, session);
 	}
 	if (process_group(curr) != pgrp) {
 		detach_pid(curr, PIDTYPE_PGID);
-		curr->group_leader->__pgrp = pgrp;
+		curr->signal->pgrp = pgrp;
 		attach_pid(curr, PIDTYPE_PGID, pgrp);
 	}
 }
@@ -341,7 +341,7 @@ void daemonize(const char *name, ...)
 	exit_mm(current);
 
 	set_special_pids(1, 1);
-	current->tty = NULL;
+	current->signal->tty = NULL;
 
 	/* Block and flush all signals */
 	sigfillset(&blocked);
@@ -564,7 +564,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 	 * outside, so the child pgrp is now orphaned.
 	 */
 	if ((process_group(p) != process_group(father)) &&
-	    (p->session == father->session)) {
+	    (p->signal->session == father->signal->session)) {
 		int pgrp = process_group(p);
 
 		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
@@ -675,7 +675,7 @@ static void exit_notify(struct task_struct *tsk)
 	t = tsk->real_parent;
 	
 	if ((process_group(t) != process_group(tsk)) &&
-	    (t->session == tsk->session) &&
+	    (t->signal->session == tsk->signal->session) &&
 	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
 	    has_stopped_jobs(process_group(tsk))) {
 		__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
@@ -780,7 +780,7 @@ asmlinkage NORET_TYPE void do_exit(long code)
 	exit_itimers(tsk);
 	exit_thread();
 
-	if (tsk->leader)
+	if (tsk->signal->leader)
 		disassociate_ctty(1);
 
 	module_put(tsk->thread_info->exec_domain->module);
diff --git a/kernel/fork.c b/kernel/fork.c
index a1f20cabbdd3..d2dd97e866bb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -811,6 +811,12 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	sig->curr_target = NULL;
 	init_sigpending(&sig->shared_pending);
 
+	sig->tty = current->signal->tty;
+	sig->pgrp = process_group(current);
+	sig->session = current->signal->session;
+	sig->leader = 0;	/* session leadership doesn't inherit */
+	sig->tty_old_pgrp = 0;
+
 	return 0;
 }
 
@@ -935,8 +941,6 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	init_timer(&p->real_timer);
 	p->real_timer.data = (unsigned long) p;
 
-	p->leader = 0;		/* session leadership doesn't inherit */
-	p->tty_old_pgrp = 0;
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
@@ -1055,7 +1059,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	if (thread_group_leader(p)) {
 		attach_pid(p, PIDTYPE_TGID, p->tgid);
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->session);
+		attach_pid(p, PIDTYPE_SID, p->signal->session);
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	} else
diff --git a/kernel/pid.c b/kernel/pid.c
index 4c85144759c5..6ed44f56ca45 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -253,14 +253,14 @@ void switch_exec_pids(task_t *leader, task_t *thread)
 
 	attach_pid(thread, PIDTYPE_PID, thread->pid);
 	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-	attach_pid(thread, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(thread, PIDTYPE_SID, thread->session);
+	attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
+	attach_pid(thread, PIDTYPE_SID, thread->signal->session);
 	list_add_tail(&thread->tasks, &init_task.tasks);
 
 	attach_pid(leader, PIDTYPE_PID, leader->pid);
 	attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-	attach_pid(leader, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(leader, PIDTYPE_SID, leader->session);
+	attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
+	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
 }
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index e6b7904df68f..7a4b479a6f45 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -588,7 +588,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	error = -EPERM;
 	if ((!info || ((unsigned long)info != 1 &&
 			(unsigned long)info != 2 && SI_FROMUSER(info)))
-	    && ((sig != SIGCONT) || (current->session != t->session))
+	    && ((sig != SIGCONT) ||
+		(current->signal->session != t->signal->session))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
 	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
 	    && !capable(CAP_KILL))
@@ -1103,7 +1104,7 @@ kill_sl_info(int sig, struct siginfo *info, pid_t sid)
 	retval = -ESRCH;
 	read_lock(&tasklist_lock);
 	for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) {
-		if (!p->leader)
+		if (!p->signal->leader)
 			continue;
 		err = group_send_sig_info(sig, info, p);
 		if (retval)
diff --git a/kernel/sys.c b/kernel/sys.c
index 81f9e02f2071..9d57482758f3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -990,7 +990,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 
 	if (p->parent == current || p->real_parent == current) {
 		err = -EPERM;
-		if (p->session != current->session)
+		if (p->signal->session != current->signal->session)
 			goto out;
 		err = -EACCES;
 		if (p->did_exec)
@@ -1002,7 +1002,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 	}
 
 	err = -EPERM;
-	if (p->leader)
+	if (p->signal->leader)
 		goto out;
 
 	if (pgid != pid) {
@@ -1011,7 +1011,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 		struct list_head *l;
 
 		for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid)
-			if (p->session == current->session)
+			if (p->signal->session == current->signal->session)
 				goto ok_pgid;
 		goto out;
 	}
@@ -1023,7 +1023,7 @@ ok_pgid:
 
 	if (process_group(p) != pgid) {
 		detach_pid(p, PIDTYPE_PGID);
-		p->group_leader->__pgrp = pgid;
+		p->signal->pgrp = pgid;
 		attach_pid(p, PIDTYPE_PGID, pgid);
 	}
 
@@ -1065,7 +1065,7 @@ asmlinkage long sys_getpgrp(void)
 asmlinkage long sys_getsid(pid_t pid)
 {
 	if (!pid) {
-		return current->session;
+		return current->signal->session;
 	} else {
 		int retval;
 		struct task_struct *p;
@@ -1077,7 +1077,7 @@ asmlinkage long sys_getsid(pid_t pid)
 		if(p) {
 			retval = security_task_getsid(p);
 			if (!retval)
-				retval = p->session;
+				retval = p->signal->session;
 		}
 		read_unlock(&tasklist_lock);
 		return retval;
@@ -1098,10 +1098,10 @@ asmlinkage long sys_setsid(void)
 	if (pid)
 		goto out;
 
-	current->leader = 1;
+	current->signal->leader = 1;
 	__set_special_pids(current->pid, current->pid);
-	current->tty = NULL;
-	current->tty_old_pgrp = 0;
+	current->signal->tty = NULL;
+	current->signal->tty_old_pgrp = 0;
 	err = process_group(current);
 out:
 	write_unlock_irq(&tasklist_lock);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 33b687d60efe..f76563312ee4 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -46,7 +46,7 @@ static void print_string(char *str)
 	struct tty_struct *my_tty;
 
 	/* The tty for the current task */
-	my_tty = current->tty;
+	my_tty = current->signal->tty;
 	if (my_tty != NULL) {
 		my_tty->driver->write(my_tty, 0, str, strlen(str));
 		my_tty->driver->write(my_tty, 0, "\015\012", 2);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index a1529896ec1b..91c3fd3f1f8f 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -95,7 +95,7 @@ match_sid(const struct sk_buff *skb, pid_t sid)
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		struct files_struct *files;
-		if (p->session != sid)
+		if (p->signal->session != sid)
 			continue;
 
 		task_lock(p);
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index 02e5ee4e7418..0bb9c661b73c 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -61,7 +61,7 @@ match_sid(const struct sk_buff *skb, pid_t sid)
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		struct files_struct *files;
-		if (p->session != sid)
+		if (p->signal->session != sid)
 			continue;
 
 		task_lock(p);
-- 
cgit v1.2.3


From af70f7673155616ffd004d551e1b612002a58bf0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:56:59 -0700
Subject: [PATCH] Fix page allocator lower zone protection for NUMA

From: Martin Hicks <mort@wildopensource.com>

This changes __alloc_pages() so it uses precalculated values for the "min".
This should prevent the problem of min incrementing from zone to zone across
many nodes on a NUMA machine.  The result of falling back to other nodes with
the old incremental min calculations was that the min value became very
large.
---
 include/linux/mmzone.h |  39 ++++++++++---
 kernel/sysctl.c        |   2 +-
 mm/page_alloc.c        | 150 +++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 159 insertions(+), 32 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b5398fa7be88..51b8f3f67741 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -54,6 +54,15 @@ struct per_cpu_pageset {
 	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
 } ____cacheline_aligned_in_smp;
 
+#define ZONE_DMA		0
+#define ZONE_NORMAL		1
+#define ZONE_HIGHMEM		2
+
+#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
+#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
+
+#define GFP_ZONEMASK	0x03
+
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
  * into multiple physical zones. On a PC we have 3 zones:
@@ -70,6 +79,19 @@ struct zone {
 	spinlock_t		lock;
 	unsigned long		free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
+	/*
+	 * protection[] is a pre-calculated number of extra pages that must be
+	 * available in a zone in order for __alloc_pages() to allocate memory
+	 * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
+	 * be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
+	 * for us to choose to allocate the page from that zone.
+	 *
+	 * It uses both min_free_kbytes and sysctl_lower_zone_protection.
+	 * The protection values are recalculated if either of these values
+	 * change.  The array elements are in zonelist order:
+	 *	[0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
+	 */
+	unsigned long		protection[MAX_NR_ZONES];
 
 	ZONE_PADDING(_pad1_)
 
@@ -157,14 +179,6 @@ struct zone {
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
 } ____cacheline_maxaligned_in_smp;
 
-#define ZONE_DMA		0
-#define ZONE_NORMAL		1
-#define ZONE_HIGHMEM		2
-
-#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
-#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
-
-#define GFP_ZONEMASK	0x03
 
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
@@ -228,6 +242,11 @@ void get_zone_counts(unsigned long *active, unsigned long *inactive,
 void build_all_zonelists(void);
 void wakeup_kswapd(struct zone *zone);
 
+/*
+ * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
+ */
+#define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
+
 /**
  * for_each_pgdat - helper macro to iterate over all nodes
  * @pgdat - pointer to a pg_data_t variable
@@ -299,7 +318,9 @@ static inline int is_normal(struct zone *zone)
 struct ctl_table;
 struct file;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
-					  void __user *, size_t *);
+					void __user *, size_t *);
+int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5f3123b0522..f2c8c8ce4926 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -722,7 +722,7 @@ static ctl_table vm_table[] = {
 		.data		= &sysctl_lower_zone_protection,
 		.maxlen		= sizeof(sysctl_lower_zone_protection),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &lower_zone_protection_sysctl_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9764a4e78e45..c87ca3dd2f11 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -552,6 +552,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	struct task_struct *p = current;
 	int i;
 	int cold;
+	int alloc_type;
 	int do_retry;
 
 	might_sleep_if(wait);
@@ -564,28 +565,27 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	if (zones[0] == NULL)     /* no zones in the zonelist */
 		return NULL;
 
+	alloc_type = zone_idx(zones[0]);
+
 	/* Go through the zonelist once, looking for a zone with enough free */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
-		unsigned long local_low;
+
+		min = (1<<order) + z->protection[alloc_type];
 
 		/*
-		 * This is the fabled 'incremental min'. We let real-time tasks
-		 * dip their real-time paws a little deeper into reserves.
+		 * We let real-time tasks dip their real-time paws a little
+		 * deeper into reserves.
 		 */
-		local_low = z->pages_low;
 		if (rt_task(p))
-			local_low >>= 1;
-		min += local_low;
+			min -= z->pages_low >> 1;
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-		       		goto got_pg;
+				goto got_pg;
 		}
-		min += z->pages_low * sysctl_lower_zone_protection;
 	}
 
 	/* we're somewhat low on memory, failed to find what we needed */
@@ -593,24 +593,22 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		wakeup_kswapd(zones[i]);
 
 	/* Go through the zonelist again, taking __GFP_HIGH into account */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
-		unsigned long local_min;
 		struct zone *z = zones[i];
 
-		local_min = z->pages_min;
+		min = (1<<order) + z->protection[alloc_type];
+
 		if (gfp_mask & __GFP_HIGH)
-			local_min >>= 2;
+			min -= z->pages_low >> 2;
 		if (rt_task(p))
-			local_min >>= 1;
-		min += local_min;
+			min -= z->pages_low >> 1;
+
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
 		}
-		min += local_min * sysctl_lower_zone_protection;
 	}
 
 	/* here we're in the low on memory slow path */
@@ -642,18 +640,17 @@ rebalance:
 	p->flags &= ~PF_MEMALLOC;
 
 	/* go through the zonelist yet one more time */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
-		min += z->pages_min;
+		min = (1UL << order) + z->protection[alloc_type];
+
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
 		}
-		min += z->pages_low * sysctl_lower_zone_protection;
 	}
 
 	/*
@@ -1056,6 +1053,8 @@ void show_free_areas(void)
 		ps.nr_page_table_pages);
 
 	for_each_zone(zone) {
+		int i;
+
 		show_node(zone);
 		printk("%s"
 			" free:%lukB"
@@ -1075,6 +1074,10 @@ void show_free_areas(void)
 			K(zone->nr_inactive),
 			K(zone->present_pages)
 			);
+		printk("protections[]:");
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			printk(" %lu", zone->protection[i]);
+		printk("\n");
 	}
 
 	for_each_zone(zone) {
@@ -1272,7 +1275,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
  
 		zonelist->zones[j++] = NULL;
-	} 
+	}
 }
 
 #endif	/* CONFIG_NUMA */
@@ -1744,6 +1747,93 @@ void __init page_alloc_init(void)
 	hotcpu_notifier(page_alloc_cpu_notify, 0);
 }
 
+static unsigned long higherzone_val(struct zone *z, int max_zone,
+					int alloc_type)
+{
+	int z_idx = zone_idx(z);
+	struct zone *higherzone;
+	unsigned long pages;
+
+	/* there is no higher zone to get a contribution from */
+	if (z_idx == MAX_NR_ZONES-1)
+		return 0;
+
+	higherzone = &z->zone_pgdat->node_zones[z_idx+1];
+
+	/* We always start with the higher zone's protection value */
+	pages = higherzone->protection[alloc_type];
+
+	/*
+	 * We get a lower-zone-protection contribution only if there are
+	 * pages in the higher zone and if we're not the highest zone
+	 * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
+	 * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
+	 * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
+	 */
+	if (higherzone->present_pages && z_idx < alloc_type)
+		pages += higherzone->pages_low * sysctl_lower_zone_protection;
+
+	return pages;
+}
+
+/*
+ * setup_per_zone_protection - called whenever min_free_kbytes or
+ *	sysctl_lower_zone_protection changes.  Ensures that each zone
+ *	has correct protection[] values, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
+ *
+ *	This algorithm is way confusing.  It tries to keep the same behavior
+ *	as we had with the incremental min iterative algorithm.
+ */
+static void setup_per_zone_protection(void)
+{
+	struct pglist_data *pgdat;
+	struct zone *zones, *zone;
+	int max_zone;
+	int i, j;
+
+	for_each_pgdat(pgdat) {
+		zones = pgdat->node_zones;
+
+		for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
+			if (zones[i].present_pages)
+				max_zone = i;
+
+		/*
+		 * For each of the different allocation types:
+		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
+		 */
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			/*
+			 * For each of the zones:
+			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
+			 */
+			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
+				zone = &zones[j];
+
+				/*
+				 * We never protect zones that don't have memory
+				 * in them (j>max_zone) or zones that aren't in
+				 * the zonelists for a certain type of
+				 * allocation (j>i).  We have to assign these to
+				 * zero because the lower zones take
+				 * contributions from the higher zones.
+				 */
+				if (j > max_zone || j > i) {
+					zone->protection[i] = 0;
+					continue;
+				}
+				/*
+				 * The contribution of the next higher zone
+				 */
+				zone->protection[i] = higherzone_val(zone,
+								max_zone, i);
+				zone->protection[i] += zone->pages_low;
+			}
+		}
+	}
+}
+
 /*
  * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures 
  *	that the pages_{min,low,high} values for each zone are set correctly 
@@ -1757,9 +1847,10 @@ static void setup_per_zone_pages_min(void)
 	unsigned long flags;
 
 	/* Calculate total number of !ZONE_HIGHMEM pages */
-	for_each_zone(zone)
+	for_each_zone(zone) {
 		if (!is_highmem(zone))
 			lowmem_pages += zone->present_pages;
+	}
 
 	for_each_zone(zone) {
 		spin_lock_irqsave(&zone->lru_lock, flags);
@@ -1827,13 +1918,14 @@ static int __init init_per_zone_pages_min(void)
 	if (min_free_kbytes > 16384)
 		min_free_kbytes = 16384;
 	setup_per_zone_pages_min();
+	setup_per_zone_protection();
 	return 0;
 }
 module_init(init_per_zone_pages_min)
 
 /*
  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
- *	that we can call setup_per_zone_pages_min() whenever min_free_kbytes 
+ *	that we can call two helper functions whenever min_free_kbytes
  *	changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
@@ -1841,5 +1933,19 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 {
 	proc_dointvec(table, write, file, buffer, length);
 	setup_per_zone_pages_min();
+	setup_per_zone_protection();
+	return 0;
+}
+
+/*
+ * lower_zone_protection_sysctl_handler - just a wrapper around
+ *	proc_dointvec_minmax() so that we can call setup_per_zone_protection()
+ *	whenever sysctl_lower_zone_protection changes.
+ */
+int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+		 struct file *file, void __user *buffer, size_t *length)
+{
+	proc_dointvec_minmax(table, write, file, buffer, length);
+	setup_per_zone_protection();
 	return 0;
 }
-- 
cgit v1.2.3


From a1ff5989c622e78d2266237396545876359f5edf Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:57:12 -0700
Subject: [PATCH] ext3 fsync() and fdatasync() speedup

ext3's fsync/fdatasync implementation is currently syncing the inode via a
full journal commit even if it was unaltered.

Fix that up by exporting the core VFS's inode sync function to modules and
calling it if the inode is dirty.  We need to do it this way so that the
inode is moved to the appropriate superblock list and so that the i_state
dirty flags are appropriately updated.

This speeds up ext3 fsync() for file overwrites by a factor of four (disk
non-writeback) to forty (disk in writeback mode).
---
 fs/ext3/fsync.c     | 38 ++++++++++++++++++++++++++++----------
 fs/fs-writeback.c   | 42 ++++++++++++++++++++++++++++++++++--------
 include/linux/fs.h  |  1 +
 mm/page-writeback.c |  2 ++
 4 files changed, 65 insertions(+), 18 deletions(-)

(limited to 'include/linux')

diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 0888abcd9757..0b6a45929030 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -24,6 +24,8 @@
 
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
@@ -38,29 +40,28 @@
  *
  * What we do is just kick off a commit and wait on it.  This will snapshot the
  * inode to disk.
- *
- * Note that there is a serious optimisation we can make here: if the current
- * inode is not part of j_running_transaction or j_committing_transaction
- * then we have nothing to do.  That would require implementation of t_ilist,
- * which isn't too hard.
  */
 
 int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	int ret = 0;
 
 	J_ASSERT(ext3_journal_current_handle() == 0);
 
+	smp_mb();		/* prepare for lockless i_state read */
+	if (!(inode->i_state & I_DIRTY))
+		goto out;
+
 	/*
 	 * data=writeback:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-	 *  ext3_force_commit() will sync the metadata
+	 *  sync_inode() will sync the metadata
 	 *
 	 * data=ordered:
 	 *  The caller's filemap_fdatawrite() will write the data and
-	 *  ext3_force_commit() will wait on the buffers.  Then the caller's
-	 *  filemap_fdatawait() will wait on the pages (but all IO is complete)
-	 *  Not pretty, but it works.
+	 *  sync_inode() will write the inode if it is dirty.  Then the caller's
+	 *  filemap_fdatawait() will wait on the pages.
 	 *
 	 * data=journal:
 	 *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -70,5 +71,22 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	 *  (they were dirtied by commit).  But that's OK - the blocks are
 	 *  safe in-journal, which is all fsync() needs to ensure.
 	 */
-	return ext3_force_commit(inode->i_sb);
+	if (ext3_should_journal_data(inode)) {
+		ret = ext3_force_commit(inode->i_sb);
+		goto out;
+	}
+
+	/*
+	 * The VFS has written the file data.  If the inode is unaltered
+	 * then we need not start a commit.
+	 */
+	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = 0, /* sys_fsync did this */
+		};
+		ret = sync_inode(inode, &wbc);
+	}
+out:
+	return ret;
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 016891bb2b70..aa5f34b85747 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -137,13 +137,14 @@ static void write_inode(struct inode *inode, int sync)
  *
  * Called under inode_lock.
  */
-static void
+static int
 __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	unsigned dirty;
 	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
 	int wait = wbc->sync_mode == WB_SYNC_ALL;
+	int ret;
 
 	BUG_ON(inode->i_state & I_LOCK);
 
@@ -164,14 +165,17 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_unlock(&mapping->page_lock);
 	spin_unlock(&inode_lock);
 
-	do_writepages(mapping, wbc);
+	ret = do_writepages(mapping, wbc);
 
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
 		write_inode(inode, wait);
 
-	if (wait)
-		filemap_fdatawait(mapping);
+	if (wait) {
+		int err = filemap_fdatawait(mapping);
+		if (ret == 0)
+			ret = err;
+	}
 
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_LOCK;
@@ -195,18 +199,19 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 		}
 	}
 	wake_up_inode(inode);
+	return ret;
 }
 
 /*
  * Write out an inode's dirty pages.  Called under inode_lock.
  */
-static void
+static int
 __writeback_single_inode(struct inode *inode,
 			struct writeback_control *wbc)
 {
 	if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
 		list_move(&inode->i_list, &inode->i_sb->s_dirty);
-		return;
+		return 0;
 	}
 
 	/*
@@ -219,7 +224,7 @@ __writeback_single_inode(struct inode *inode,
 		iput(inode);
 		spin_lock(&inode_lock);
 	}
-	__sync_single_inode(inode, wbc);
+	return __sync_single_inode(inode, wbc);
 }
 
 /*
@@ -499,9 +504,30 @@ void write_inode_now(struct inode *inode, int sync)
 	if (sync)
 		wait_on_inode(inode);
 }
-
 EXPORT_SYMBOL(write_inode_now);
 
+/**
+ * sync_inode - write an inode and its pages to disk.
+ * @inode: the inode to sync
+ * @wbc: controls the writeback mode
+ *
+ * sync_inode() will write an inode and its pages to disk.  It will also
+ * correctly update the inode on its superblock's dirty inode lists and will
+ * update inode->i_state.
+ *
+ * The caller must have a ref on the inode.
+ */
+int sync_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int ret;
+
+	spin_lock(&inode_lock);
+	ret = __writeback_single_inode(inode, wbc);
+	spin_unlock(&inode_lock);
+	return ret;
+}
+EXPORT_SYMBOL(sync_inode);
+
 /**
  * generic_osync_inode - flush all dirty data for a given inode to disk
  * @inode: inode to write
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e47f6360f74c..3e71560374c0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -925,6 +925,7 @@ static inline void file_accessed(struct file *file)
 	touch_atime(file->f_vfsmnt, file->f_dentry);
 }
 
+int sync_inode(struct inode *inode, struct writeback_control *wbc);
 
 /**
  * &export_operations - for nfsd to communicate with file systems
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1ad6717ade97..f1ecbd88e846 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -441,6 +441,8 @@ void __init page_writeback_init(void)
 
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
+	if (wbc->nr_to_write <= 0)
+		return 0;
 	if (mapping->a_ops->writepages)
 		return mapping->a_ops->writepages(mapping, wbc);
 	return generic_writepages(mapping, wbc);
-- 
cgit v1.2.3


From 2b38960cbf6a500bb8d2d8afd7e0cd546c72efe6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:57:51 -0700
Subject: [PATCH] JBD: ordered-data commit cleanup

For data=ordered, kjournald at commit time has to write out and wait upon a
long list of buffers.  It does this in a rather awkward way with a single
list.  It causes complexity and long lock hold times, and makes the addition
of rescheduling points quite hard.

So what we do instead (based on Chris Mason's suggestion) is to add a new
buffer list (t_locked_list) to the journal.  It contains buffers which have
been placed under I/O.

So as we walk the t_sync_datalist list we move buffers over to t_locked_list
as they are written out.

When t_sync_datalist is empty we may then walk t_locked_list waiting for the
I/O to complete.

As a side-effect this means that we can remove the nasty synchronous wait in
journal_dirty_data which is there to avoid the kjournald livelock which would
otherwise occur when someone is continuously dirtying a buffer.
---
 fs/jbd/commit.c      | 143 +++++++++++++++++++++++++++------------------------
 fs/jbd/transaction.c |  13 +++--
 include/linux/jbd.h  |   9 +++-
 3 files changed, 95 insertions(+), 70 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index de335c04c962..b3cb6bf406d1 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -78,6 +78,21 @@ nope:
 	__brelse(bh);
 }
 
+/*
+ * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
+ * held.  For ranking reasons we must trylock.  If we lose, schedule away and
+ * return 0.  j_list_lock is dropped in this case.
+ */
+static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+{
+	if (!jbd_trylock_bh_state(bh)) {
+		spin_unlock(&journal->j_list_lock);
+		schedule();
+		return 0;
+	}
+	return 1;
+}
+
 /*
  * journal_commit_transaction
  *
@@ -88,7 +103,6 @@ void journal_commit_transaction(journal_t *journal)
 {
 	transaction_t *commit_transaction;
 	struct journal_head *jh, *new_jh, *descriptor;
-	struct journal_head *next_jh, *last_jh;
 	struct buffer_head *wbuf[64];
 	int bufs;
 	int flags;
@@ -222,113 +236,110 @@ void journal_commit_transaction(journal_t *journal)
 	err = 0;
 	/*
 	 * Whenever we unlock the journal and sleep, things can get added
-	 * onto ->t_datalist, so we have to keep looping back to write_out_data
-	 * until we *know* that the list is empty.
+	 * onto ->t_sync_datalist, so we have to keep looping back to
+	 * write_out_data until we *know* that the list is empty.
 	 */
-write_out_data:
-
+	bufs = 0;
 	/*
 	 * Cleanup any flushed data buffers from the data list.  Even in
 	 * abort mode, we want to flush this out as soon as possible.
-	 *
-	 * We take j_list_lock to protect the lists from
-	 * journal_try_to_free_buffers().
 	 */
+write_out_data:
+	cond_resched();
 	spin_lock(&journal->j_list_lock);
 
-write_out_data_locked:
-	bufs = 0;
-	next_jh = commit_transaction->t_sync_datalist;
-	if (next_jh == NULL)
-		goto sync_datalist_empty;
-	last_jh = next_jh->b_tprev;
-
-	do {
+	while (commit_transaction->t_sync_datalist) {
 		struct buffer_head *bh;
 
-		jh = next_jh;
-		next_jh = jh->b_tnext;
+		jh = commit_transaction->t_sync_datalist;
+		commit_transaction->t_sync_datalist = jh->b_tnext;
 		bh = jh2bh(jh);
-		if (!buffer_locked(bh)) {
+		if (buffer_locked(bh)) {
+			BUFFER_TRACE(bh, "locked");
+			if (!inverted_lock(journal, bh))
+				goto write_out_data;
+			__journal_unfile_buffer(jh);
+			__journal_file_buffer(jh, jh->b_transaction, BJ_Locked);
+			jbd_unlock_bh_state(bh);
+			if (need_resched()) {
+				spin_unlock(&journal->j_list_lock);
+				goto write_out_data;
+			}
+		} else {
 			if (buffer_dirty(bh)) {
 				BUFFER_TRACE(bh, "start journal writeout");
-				atomic_inc(&bh->b_count);
+				get_bh(bh);
 				wbuf[bufs++] = bh;
-			} else {
-				BUFFER_TRACE(bh, "writeout complete: unfile");
-				/*
-				 * We have a lock ranking problem..
-				 */
-				if (!jbd_trylock_bh_state(bh)) {
+				if (bufs == ARRAY_SIZE(wbuf)) {
+					jbd_debug(2, "submit %d writes\n",
+							bufs);
 					spin_unlock(&journal->j_list_lock);
-					schedule();
+					ll_rw_block(WRITE, bufs, wbuf);
+					journal_brelse_array(wbuf, bufs);
+					bufs = 0;
 					goto write_out_data;
 				}
+			} else {
+				BUFFER_TRACE(bh, "writeout complete: unfile");
+				if (!inverted_lock(journal, bh))
+					goto write_out_data;
 				__journal_unfile_buffer(jh);
 				jh->b_transaction = NULL;
 				jbd_unlock_bh_state(bh);
 				journal_remove_journal_head(bh);
-				__brelse(bh);
-				if (need_resched() && commit_transaction->
-							t_sync_datalist) {
-					commit_transaction->t_sync_datalist =
-								next_jh;
-					if (bufs)
-						break;
+				put_bh(bh);
+				if (need_resched()) {
 					spin_unlock(&journal->j_list_lock);
-					cond_resched();
 					goto write_out_data;
 				}
 			}
 		}
-		if (bufs == ARRAY_SIZE(wbuf)) {
-			/*
-			 * Major speedup: start here on the next scan
-			 */
-			J_ASSERT(commit_transaction->t_sync_datalist != 0);
-			commit_transaction->t_sync_datalist = jh;
-			break;
-		}
-	} while (jh != last_jh);
+	}
 
-	if (bufs || need_resched()) {
-		jbd_debug(2, "submit %d writes\n", bufs);
+	if (bufs) {
 		spin_unlock(&journal->j_list_lock);
-		if (bufs)
-			ll_rw_block(WRITE, bufs, wbuf);
-		cond_resched();
+		ll_rw_block(WRITE, bufs, wbuf);
 		journal_brelse_array(wbuf, bufs);
 		spin_lock(&journal->j_list_lock);
-		goto write_out_data_locked;
 	}
 
 	/*
-	 * Wait for all previously submitted IO on the data list to complete.
+	 * Wait for all previously submitted IO to complete.
 	 */
-	jh = commit_transaction->t_sync_datalist;
-	if (jh == NULL)
-		goto sync_datalist_empty;
-
-	do {
+	while (commit_transaction->t_locked_list) {
 		struct buffer_head *bh;
-		jh = jh->b_tprev;	/* Wait on the last written */
+
+		jh = commit_transaction->t_locked_list->b_tprev;
 		bh = jh2bh(jh);
+		get_bh(bh);
 		if (buffer_locked(bh)) {
-			get_bh(bh);
 			spin_unlock(&journal->j_list_lock);
 			wait_on_buffer(bh);
 			if (unlikely(!buffer_uptodate(bh)))
 				err = -EIO;
+			spin_lock(&journal->j_list_lock);
+		}
+		if (!inverted_lock(journal, bh)) {
 			put_bh(bh);
-			/* the journal_head may have been removed now */
-			goto write_out_data;
-		} else if (buffer_dirty(bh)) {
-			goto write_out_data_locked;
+			spin_lock(&journal->j_list_lock);
+			continue;
 		}
-	} while (jh != commit_transaction->t_sync_datalist);
-	goto write_out_data_locked;
-
-sync_datalist_empty:
+		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+			__journal_unfile_buffer(jh);
+			jh->b_transaction = NULL;
+			jbd_unlock_bh_state(bh);
+			journal_remove_journal_head(bh);
+			put_bh(bh);
+		} else {
+			jbd_unlock_bh_state(bh);
+		}
+		put_bh(bh);
+		if (need_resched()) {
+			spin_unlock(&journal->j_list_lock);
+			cond_resched();
+			spin_lock(&journal->j_list_lock);
+		}
+	}
 	spin_unlock(&journal->j_list_lock);
 
 	journal_write_revoke_records(journal, commit_transaction);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 73ef79d97fd0..a052407712a7 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1010,7 +1010,8 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 			 * the write() data.
 			 */
 			if (jh->b_jlist != BJ_None &&
-					jh->b_jlist != BJ_SyncData) {
+					jh->b_jlist != BJ_SyncData &&
+					jh->b_jlist != BJ_Locked) {
 				JBUFFER_TRACE(jh, "Not stealing");
 				goto no_journal;
 			}
@@ -1048,7 +1049,7 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 		 * committing transaction, so might still be left on that
 		 * transaction's metadata lists.
 		 */
-		if (jh->b_jlist != BJ_SyncData) {
+		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
 			JBUFFER_TRACE(jh, "not on correct data list: unfile");
 			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
 			__journal_unfile_buffer(jh);
@@ -1539,6 +1540,9 @@ void __journal_unfile_buffer(struct journal_head *jh)
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
+	case BJ_Locked:
+		list = &transaction->t_locked_list;
+		break;
 	}
 
 	__blist_del_buffer(list, jh);
@@ -1576,7 +1580,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
 
 	spin_lock(&journal->j_list_lock);
 	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
-		if (jh->b_jlist == BJ_SyncData) {
+		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
 			/* A written-back ordered data buffer */
 			JBUFFER_TRACE(jh, "release data");
 			__journal_unfile_buffer(jh);
@@ -1985,6 +1989,9 @@ void __journal_file_buffer(struct journal_head *jh,
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
+	case BJ_Locked:
+		list =  &transaction->t_locked_list;
+		break;
 	}
 
 	__blist_add_buffer(list, jh);
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index eb154bafe1e4..241387b13764 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -486,6 +486,12 @@ struct transaction_s
 	 */
 	struct journal_head	*t_reserved_list;
 
+	/*
+	 * Doubly-linked circular list of all buffers under writeout during
+	 * commit [j_list_lock]
+	 */
+	struct journal_head	*t_locked_list;
+
 	/*
 	 * Doubly-linked circular list of all metadata buffers owned by this
 	 * transaction [j_list_lock]
@@ -1079,7 +1085,8 @@ static inline int jbd_space_needed(journal_t *journal)
 #define BJ_Shadow	5	/* Buffer contents being shadowed to the log */
 #define BJ_LogCtl	6	/* Buffer contains log descriptors */
 #define BJ_Reserved	7	/* Buffer is reserved for access by journal */
-#define BJ_Types	8
+#define BJ_Locked	8	/* Locked for I/O during commit */
+#define BJ_Types	9
  
 extern int jbd_blocks_per_page(struct inode *inode);
 
-- 
cgit v1.2.3


From cea39746deca7ce8b10f21e4a4b3e96c33381e2e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:59:45 -0700
Subject: [PATCH] add file_operations.fcntl

From: Chuck Lever <cel@citi.umich.edu>

O_DIRECT|O_APPEND cannot possibly work on NFS, so NFS needs some way of
preventing the user from setting this combination.  We felt that the best
way of implementing this restriction is to allow the filesystem to implement
its own fcntl() handler.

This patch does that, and provides the appropriate handler for NFS.

Additional details from Chuck:

Forgetting O_DIRECT for a moment, O_APPEND writes on NFS don't work in any
case when multiple clients are writing to a file, since an NFS client can
never guarantee it knows where the true end of file is 100% of the time.
It works as expected iff only one client writes to an O_APPEND file at a
time.

Multi-client O_APPEND writing doesn't seem to be a problem for any
application I'm aware of.  Since it can be made to behave in the
multi-client case with careful application logic or by using file locking,
I don't think we should disallow it.

I want to drop the inode semaphore when doing NFS direct I/O because it is
synchronous; holding the i_sem means we reduce direct I/O concurrency to
one I/O per file at a time.  The important thing sct was worried about was
the case where a single client is writing with O_APPEND and O_DIRECT, and
we don't hold the i_sem during the write.

We must at least hold the i_sem when determining where the end of file is
to do the O_APPEND write.  In 2.6, I believe that is handled correctly in
the VFS layer, so this is not an issue for 2.6, right?
---
 fs/fcntl.c         | 136 ++++++++++++++++++++++++++++-------------------------
 fs/nfs/file.c      |  28 +++++++++++
 include/linux/fs.h |   5 ++
 3 files changed, 105 insertions(+), 64 deletions(-)

(limited to 'include/linux')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3486b799e9e4..abad0aa00d13 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -282,80 +282,88 @@ void f_delown(struct file *filp)
 
 EXPORT_SYMBOL(f_delown);
 
-static long do_fcntl(unsigned int fd, unsigned int cmd,
-		     unsigned long arg, struct file * filp)
+long generic_file_fcntl(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp)
 {
 	long err = -EINVAL;
 
 	switch (cmd) {
-		case F_DUPFD:
-			get_file(filp);
-			err = dupfd(filp, arg);
-			break;
-		case F_GETFD:
-			err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
-			break;
-		case F_SETFD:
-			err = 0;
-			set_close_on_exec(fd, arg & FD_CLOEXEC);
-			break;
-		case F_GETFL:
-			err = filp->f_flags;
-			break;
-		case F_SETFL:
-			err = setfl(fd, filp, arg);
-			break;
-		case F_GETLK:
-			err = fcntl_getlk(filp, (struct flock __user *) arg);
-			break;
-		case F_SETLK:
-		case F_SETLKW:
-			err = fcntl_setlk(filp, cmd, (struct flock __user *) arg);
-			break;
-		case F_GETOWN:
-			/*
-			 * XXX If f_owner is a process group, the
-			 * negative return value will get converted
-			 * into an error.  Oops.  If we keep the
-			 * current syscall conventions, the only way
-			 * to fix this will be in libc.
-			 */
-			err = filp->f_owner.pid;
-			force_successful_syscall_return();
-			break;
-		case F_SETOWN:
-			err = f_setown(filp, arg, 1);
-			break;
-		case F_GETSIG:
-			err = filp->f_owner.signum;
-			break;
-		case F_SETSIG:
-			/* arg == 0 restores default behaviour. */
-			if (arg < 0 || arg > _NSIG) {
-				break;
-			}
-			err = 0;
-			filp->f_owner.signum = arg;
-			break;
-		case F_GETLEASE:
-			err = fcntl_getlease(filp);
-			break;
-		case F_SETLEASE:
-			err = fcntl_setlease(fd, filp, arg);
-			break;
-		case F_NOTIFY:
-			err = fcntl_dirnotify(fd, filp, arg);
-			break;
-		default:
+	case F_DUPFD:
+		get_file(filp);
+		err = dupfd(filp, arg);
+		break;
+	case F_GETFD:
+		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
+		break;
+	case F_SETFD:
+		err = 0;
+		set_close_on_exec(fd, arg & FD_CLOEXEC);
+		break;
+	case F_GETFL:
+		err = filp->f_flags;
+		break;
+	case F_SETFL:
+		err = setfl(fd, filp, arg);
+		break;
+	case F_GETLK:
+		err = fcntl_getlk(filp, (struct flock __user *) arg);
+		break;
+	case F_SETLK:
+	case F_SETLKW:
+		err = fcntl_setlk(filp, cmd, (struct flock __user *) arg);
+		break;
+	case F_GETOWN:
+		/*
+		 * XXX If f_owner is a process group, the
+		 * negative return value will get converted
+		 * into an error.  Oops.  If we keep the
+		 * current syscall conventions, the only way
+		 * to fix this will be in libc.
+		 */
+		err = filp->f_owner.pid;
+		force_successful_syscall_return();
+		break;
+	case F_SETOWN:
+		err = f_setown(filp, arg, 1);
+		break;
+	case F_GETSIG:
+		err = filp->f_owner.signum;
+		break;
+	case F_SETSIG:
+		/* arg == 0 restores default behaviour. */
+		if (arg < 0 || arg > _NSIG) {
 			break;
+		}
+		err = 0;
+		filp->f_owner.signum = arg;
+		break;
+	case F_GETLEASE:
+		err = fcntl_getlease(filp);
+		break;
+	case F_SETLEASE:
+		err = fcntl_setlease(fd, filp, arg);
+		break;
+	case F_NOTIFY:
+		err = fcntl_dirnotify(fd, filp, arg);
+		break;
+	default:
+		break;
 	}
-
 	return err;
 }
+EXPORT_SYMBOL(generic_file_fcntl);
 
-asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
+static long do_fcntl(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp)
+{
+	if (filp->f_op && filp->f_op->fcntl)
+		return filp->f_op->fcntl(fd, cmd, arg, filp);
+	return generic_file_fcntl(fd, cmd, arg, filp);
+}
+
+asmlinkage long sys_fcntl(int fd, unsigned int cmd, unsigned long arg)
 {	
-	struct file * filp;
+	struct file *filp;
 	long err = -EBADF;
 
 	filp = fget(fd);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e1203ef2275e..df23d4de5b89 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -33,6 +33,8 @@
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
 
+static long nfs_file_fcntl(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp);
 static int nfs_file_open(struct inode *, struct file *);
 static int nfs_file_release(struct inode *, struct file *);
 static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
@@ -55,6 +57,7 @@ struct file_operations nfs_file_operations = {
 	.fsync		= nfs_fsync,
 	.lock		= nfs_lock,
 	.sendfile	= nfs_file_sendfile,
+	.fcntl		= nfs_file_fcntl,
 };
 
 struct inode_operations nfs_file_inode_operations = {
@@ -68,6 +71,28 @@ struct inode_operations nfs_file_inode_operations = {
 # define IS_SWAPFILE(inode)	(0)
 #endif
 
+#define nfs_invalid_flags	(O_APPEND | O_DIRECT)
+
+/*
+ * Check for special cases that NFS doesn't support, and
+ * pass the rest to the generic fcntl function.
+ */
+static long
+nfs_file_fcntl(int fd, unsigned int cmd,
+		unsigned long arg, struct file *filp)
+{
+	switch (cmd) {
+	case F_SETFL:
+		if ((filp->f_flags & nfs_invalid_flags) == nfs_invalid_flags)
+			return -EINVAL;
+		break;
+	default:
+		break;
+	}
+
+	return generic_file_fcntl(fd, cmd, arg, filp);
+}
+
 /*
  * Open file
  */
@@ -78,6 +103,9 @@ nfs_file_open(struct inode *inode, struct file *filp)
 	int (*open)(struct inode *, struct file *);
 	int res = 0;
 
+	if ((filp->f_flags & nfs_invalid_flags) == nfs_invalid_flags)
+		return -EINVAL;
+
 	lock_kernel();
 	/* Do NFSv4 open() call */
 	if ((open = server->rpc_ops->file_open) != NULL)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e71560374c0..02976f7c9f47 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -621,6 +621,9 @@ extern struct list_head file_lock_list;
 
 #include <linux/fcntl.h>
 
+extern long generic_file_fcntl(int fd, unsigned int cmd,
+				unsigned long arg, struct file *filp);
+
 extern int fcntl_getlk(struct file *, struct flock __user *);
 extern int fcntl_setlk(struct file *, unsigned int, struct flock __user *);
 
@@ -830,6 +833,8 @@ struct file_operations {
 	ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void __user *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+	long (*fcntl)(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp);
 };
 
 struct inode_operations {
-- 
cgit v1.2.3


From 3a2d85eabe55ed976c74a4fc2c7dd079a0ba8bcc Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:02:53 -0700
Subject: [PATCH] Kill duplicate #include <linux_ioport.h>

From: Rusty Russell <rusty@rustcorp.com.au>

include/linux/device.h includes include/linux/ioport.h twice.
---
 include/linux/device.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/device.h b/include/linux/device.h
index 28e101a77070..9bc07b556eea 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -17,7 +17,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
-#include <linux/ioport.h>
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <asm/semaphore.h>
-- 
cgit v1.2.3


From 47b54fbff358a1d5ee4738cec8a53a08bead72e4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:04:22 -0700
Subject: [PATCH] /dev/urandom scalability improvement

From: David Mosberger <davidm@napali.hpl.hp.com>

Somebody recently pointed out a performance-anomaly to me where an unusual
amount of time was being spent reading from /dev/urandom.  The problem
isn't really surprising as it happened only on >= 4-way machines and the
random driver isn't terribly scalable the way it is written today.  If
scalability _really_ mattered, I suppose per-CPU data structures would be
the way to go.  However, I found that at least for 4-way machines,
performance can be improved considerably with the attached patch.  In
particular, I saw the following performance on a 4-way ia64 machine:

Test: 3 tasks running "dd if=/dev/urandom of=/dev/null bs=1024":

			throughput:
---
 drivers/char/random.c    | 51 +++++++++++++++++++++++++++++++++---------------
 include/linux/prefetch.h | 12 ++++++++++++
 2 files changed, 47 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 117f195029a1..6941fdeb6a4b 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -490,12 +490,15 @@ static inline __u32 int_ln_12bits(__u32 word)
  **********************************************************************/
 
 struct entropy_store {
+	/* mostly-read data: */
+	struct poolinfo poolinfo;
+	__u32		*pool;
+
+	/* read-write data: */
+	spinlock_t lock ____cacheline_aligned_in_smp;
 	unsigned	add_ptr;
 	int		entropy_count;
 	int		input_rotate;
-	struct poolinfo poolinfo;
-	__u32		*pool;
-	spinlock_t lock;
 };
 
 /*
@@ -571,38 +574,54 @@ static void add_entropy_words(struct entropy_store *r, const __u32 *in,
 	static __u32 const twist_table[8] = {
 		         0, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
 		0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
-	unsigned i;
-	int new_rotate;
+	unsigned long i, add_ptr, tap1, tap2, tap3, tap4, tap5;
+	int new_rotate, input_rotate;
 	int wordmask = r->poolinfo.poolwords - 1;
-	__u32 w;
+	__u32 w, next_w;
 	unsigned long flags;
 
+	/* Taps are constant, so we can load them without holding r->lock.  */
+	tap1 = r->poolinfo.tap1;
+	tap2 = r->poolinfo.tap2;
+	tap3 = r->poolinfo.tap3;
+	tap4 = r->poolinfo.tap4;
+	tap5 = r->poolinfo.tap5;
+	next_w = *in++;
+
 	spin_lock_irqsave(&r->lock, flags);
+	prefetch_range(r->pool, wordmask);
+	input_rotate = r->input_rotate;
+	add_ptr = r->add_ptr;
 
 	while (nwords--) {
-		w = rotate_left(r->input_rotate, *in++);
-		i = r->add_ptr = (r->add_ptr - 1) & wordmask;
+		w = rotate_left(input_rotate, next_w);
+		if (nwords > 0)
+			next_w = *in++;
+		i = add_ptr = (add_ptr - 1) & wordmask;
 		/*
 		 * Normally, we add 7 bits of rotation to the pool.
 		 * At the beginning of the pool, add an extra 7 bits
 		 * rotation, so that successive passes spread the
 		 * input bits across the pool evenly.
 		 */
-		new_rotate = r->input_rotate + 14;
+		new_rotate = input_rotate + 14;
 		if (i)
-			new_rotate = r->input_rotate + 7;
-		r->input_rotate = new_rotate & 31;
+			new_rotate = input_rotate + 7;
+		input_rotate = new_rotate & 31;
 
 		/* XOR in the various taps */
-		w ^= r->pool[(i + r->poolinfo.tap1) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap2) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap3) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap4) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap5) & wordmask];
+		w ^= r->pool[(i + tap1) & wordmask];
+		w ^= r->pool[(i + tap2) & wordmask];
+		w ^= r->pool[(i + tap3) & wordmask];
+		w ^= r->pool[(i + tap4) & wordmask];
+		w ^= r->pool[(i + tap5) & wordmask];
 		w ^= r->pool[i];
 		r->pool[i] = (w >> 3) ^ twist_table[w & 7];
 	}
 
+	r->input_rotate = input_rotate;
+	r->add_ptr = add_ptr;
+
 	spin_unlock_irqrestore(&r->lock, flags);
 }
 
diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h
index 73c4a344156b..fc86f274147f 100644
--- a/include/linux/prefetch.h
+++ b/include/linux/prefetch.h
@@ -10,6 +10,7 @@
 #ifndef _LINUX_PREFETCH_H
 #define _LINUX_PREFETCH_H
 
+#include <linux/types.h>
 #include <asm/processor.h>
 #include <asm/cache.h>
 
@@ -54,4 +55,15 @@ static inline void prefetchw(const void *x) {;}
 #define PREFETCH_STRIDE (4*L1_CACHE_BYTES)
 #endif
 
+static inline void prefetch_range(void *addr, size_t len)
+{
+#ifdef ARCH_HAS_PREFETCH
+	char *cp;
+	char *end = addr + len;
+
+	for (cp = addr; cp < end; cp += PREFETCH_STRIDE)
+		prefetch(cp);
+#endif
+}
+
 #endif
-- 
cgit v1.2.3


From 7ee168c0b7a988210cc8024d105dfd1cb3e956e6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:05:02 -0700
Subject: [PATCH] Move __this_module to modpost

From: Brian Gerst <bgerst@didntduck.org>

Move the __this_module structure to the modpost code where it really
belongs.
---
 include/linux/module.h   | 16 +---------------
 scripts/Makefile.modpost |  2 ++
 scripts/modpost.c        | 12 ++++++++++++
 3 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/module.h b/include/linux/module.h
index 4c9b53d5d51c..0a86652fb1cb 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -70,6 +70,7 @@ static const char __module_cat(name,__LINE__)[]				  \
 extern const struct gtype##_id __mod_##gtype##_table		\
   __attribute__ ((unused, alias(__stringify(name))))
 
+extern struct module __this_module;
 #define THIS_MODULE (&__this_module)
 
 #else  /* !MODULE */
@@ -481,21 +482,6 @@ static inline int unregister_module_notifier(struct notifier_block * nb)
 
 #endif /* CONFIG_MODULES */
 
-#ifdef MODULE
-extern struct module __this_module;
-#ifdef KBUILD_MODNAME
-/* We make the linker do some of the work. */
-struct module __this_module
-__attribute__((section(".gnu.linkonce.this_module"))) = {
-	.name = __stringify(KBUILD_MODNAME),
-	.init = init_module,
-#ifdef CONFIG_MODULE_UNLOAD
-	.exit = cleanup_module,
-#endif
-};
-#endif /* KBUILD_MODNAME */
-#endif /* MODULE */
-
 #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x)
 
 /* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index cd716ece8976..d349dda5edf7 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -35,6 +35,8 @@ targets += $(modules)
 
 # Compile version info for unresolved symbols
 
+modname = $(*F)
+
 quiet_cmd_cc_o_c = CC      $@
       cmd_cc_o_c = $(CC) $(c_flags) $(CFLAGS_MODULE)	\
 		   -c -o $@ $<
diff --git a/scripts/modpost.c b/scripts/modpost.c
index a0976fcb9982..69168e18d5bf 100644
--- a/scripts/modpost.c
+++ b/scripts/modpost.c
@@ -343,6 +343,9 @@ handle_modversions(struct module *mod, struct elf_info *info,
 		/* ignore global offset table */
 		if (strcmp(symname, "_GLOBAL_OFFSET_TABLE_") == 0)
 			break;
+		/* ignore __this_module, it will be resolved shortly */
+		if (strcmp(symname, MODULE_SYMBOL_PREFIX "__this_module") == 0)
+			break;
 #ifdef STT_REGISTER
 		if (info->hdr->e_machine == EM_SPARC ||
 		    info->hdr->e_machine == EM_SPARCV9) {
@@ -468,6 +471,15 @@ add_header(struct buffer *b)
 	buf_printf(b, "#include <linux/compiler.h>\n");
 	buf_printf(b, "\n");
 	buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n");
+	buf_printf(b, "\n");
+	buf_printf(b, "struct module __this_module\n");
+	buf_printf(b, "__attribute__((section(\".gnu.linkonce.this_module\"))) = {\n");
+	buf_printf(b, " .name = __stringify(KBUILD_MODNAME),\n");
+	buf_printf(b, " .init = init_module,\n");
+	buf_printf(b, "#ifdef CONFIG_MODULE_UNLOAD\n");
+	buf_printf(b, " .exit = cleanup_module,\n");
+	buf_printf(b, "#endif\n");
+	buf_printf(b, "};\n");
 }
 
 /* Record CRCs for unresolved symbols */
-- 
cgit v1.2.3


From 387f7c83eb26b4f45e6d843f2ef703aafbe6c80f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:05:28 -0700
Subject: [PATCH] Support for floppies whose sectors are numbered from zero
 instead of one

From: Marcelo Tosatti <marcelo.tosatti@cyclades.com>

From: Alain Knaff <alain.knaff@lll.lu>

This patch adds support for floppy disks whose sectors are numbered
starting at 0 rather than 1 as usual disks would be.  This format is used
for some CP/M disks, and also for certain music samplers (such as the Ensoniq
EPS 16plus).

In order to use it, you need an fdutils with the current patch from
http://fdutils.linux.lu as well, and then do setfdprm /dev/fd0 dd zerobased
sect=10 or setfdprm /dev/fd0 hd zerobased sect.

In addition, the patch also fixes my email addresses.  I no longer use
pobox.com.
---
 drivers/block/floppy.c | 19 +++++++++++++++----
 include/linux/fd.h     |  1 +
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 8e05b64feeb2..494acc9fcfcc 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2258,6 +2258,10 @@ static void setup_format_params(int track)
 			}
 		}
 	}
+	if(_floppy->stretch & FD_ZEROBASED) {
+	    for(count = 0; count < F_SECT_PER_TRACK; count++)
+		here[count].sect--;
+	}
 }
 
 static void redo_format(void)
@@ -2679,7 +2683,8 @@ static int make_raw_rw_request(void)
 	}
 	HEAD = fsector_t / _floppy->sect;
 
-	if (((_floppy->stretch & FD_SWAPSIDES) || TESTF(FD_NEED_TWADDLE)) &&
+	if (((_floppy->stretch & (FD_SWAPSIDES | FD_ZEROBASED)) ||
+	     TESTF(FD_NEED_TWADDLE)) &&
 	    fsector_t < _floppy->sect)
 		max_sector = _floppy->sect;
 
@@ -2709,7 +2714,8 @@ static int make_raw_rw_request(void)
 	GAP = _floppy->gap;
 	CODE2SIZE;
 	SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE;
-	SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) + 1;
+	SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) +
+	    ((_floppy->stretch & FD_ZEROBASED) ? 0 : 1);
 
 	/* tracksize describes the size which can be filled up with sectors
 	 * of size ssize.
@@ -3346,7 +3352,7 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 	    g->track <= 0 ||
 	    g->track > UDP->tracks>>STRETCH(g) ||
 	    /* check if reserved bits are set */
-	    (g->stretch&~(FD_STRETCH|FD_SWAPSIDES)) != 0)
+	    (g->stretch&~(FD_STRETCH|FD_SWAPSIDES|FD_ZEROBASED)) != 0)
 		return -EINVAL;
 	if (type){
 		if (!capable(CAP_SYS_ADMIN))
@@ -3367,11 +3373,13 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 		}
 		up(&open_lock);
 	} else {
+		int oldStretch;
 		LOCK_FDC(drive,1);
 		if (cmd != FDDEFPRM)
 			/* notice a disk change immediately, else
 			 * we lose our settings immediately*/
 			CALL(poll_drive(1, FD_RAW_NEED_DISK));
+		oldStretch = g->stretch;
 		user_params[drive] = *g;
 		if (buffer_drive == drive)
 			SUPBOUND(buffer_max, user_params[drive].sect);
@@ -3386,7 +3394,10 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 		 * whose number will change. This is useful, because
 		 * mtools often changes the geometry of the disk after
 		 * looking at the boot block */
-		if (DRS->maxblock > user_params[drive].sect || DRS->maxtrack)
+		if (DRS->maxblock > user_params[drive].sect ||
+		    DRS->maxtrack ||
+		    ((user_params[drive].sect ^ oldStretch) &
+		     (FD_SWAPSIDES | FD_ZEROBASED)))
 			invalidate_drive(bdev);
 		else
 			process_fd_request();
diff --git a/include/linux/fd.h b/include/linux/fd.h
index 187785b83958..cbbd0f876585 100644
--- a/include/linux/fd.h
+++ b/include/linux/fd.h
@@ -17,6 +17,7 @@ struct floppy_struct {
 			stretch;	/* !=0 means double track steps */
 #define FD_STRETCH 1
 #define FD_SWAPSIDES 2
+#define FD_ZEROBASED 4
 
 	unsigned char	gap,		/* gap1 size */
 
-- 
cgit v1.2.3


From 8398bcc6b3eb950a1242f6dc4cfb151b6b9238c3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:08:32 -0700
Subject: [PATCH] eliminate nswap and cnswap

From: Matt Mackall <mpm@selenic.com>

The nswap and cnswap counters have never been incremented as
Linux doesn't do task swapping.
---
 arch/alpha/kernel/osf_sys.c | 3 ---
 fs/proc/array.c             | 4 ++--
 include/linux/sched.h       | 2 +-
 kernel/acct.c               | 2 +-
 kernel/exit.c               | 1 -
 kernel/fork.c               | 1 -
 kernel/sys.c                | 3 ---
 7 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'include/linux')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 34adfc76dd92..f725059fe47f 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1095,14 +1095,12 @@ osf_getrusage(int who, struct rusage32 *ru)
 		jiffies_to_timeval32(current->stime, &r.ru_stime);
 		r.ru_minflt = current->min_flt;
 		r.ru_majflt = current->maj_flt;
-		r.ru_nswap = current->nswap;
 		break;
 	case RUSAGE_CHILDREN:
 		jiffies_to_timeval32(current->cutime, &r.ru_utime);
 		jiffies_to_timeval32(current->cstime, &r.ru_stime);
 		r.ru_minflt = current->cmin_flt;
 		r.ru_majflt = current->cmaj_flt;
-		r.ru_nswap = current->cnswap;
 		break;
 	default:
 		jiffies_to_timeval32(current->utime + current->cutime,
@@ -1111,7 +1109,6 @@ osf_getrusage(int who, struct rusage32 *ru)
 				   &r.ru_stime);
 		r.ru_minflt = current->min_flt + current->cmin_flt;
 		r.ru_majflt = current->maj_flt + current->cmaj_flt;
-		r.ru_nswap = current->nswap + current->cnswap;
 		break;
 	}
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ac9ccac5d1ee..ae90151e45ae 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -392,8 +392,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 		sigign      .sig[0] & 0x7fffffffUL,
 		sigcatch    .sig[0] & 0x7fffffffUL,
 		wchan,
-		task->nswap,
-		task->cnswap,
+		0UL,
+		0UL,
 		task->exit_signal,
 		task_cpu(task),
 		task->rt_priority,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a1229121123..22080f919266 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -436,7 +436,7 @@ struct task_struct {
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
 	u64 start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
-	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
diff --git a/kernel/acct.c b/kernel/acct.c
index b417066778a7..8e32413c41f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -376,7 +376,7 @@ static void do_acct_process(long exitcode, struct file *file)
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 	ac.ac_minflt = encode_comp_t(current->min_flt);
 	ac.ac_majflt = encode_comp_t(current->maj_flt);
-	ac.ac_swaps = encode_comp_t(current->nswap);
+	ac.ac_swaps = encode_comp_t(0);
 	ac.ac_exitcode = exitcode;
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 810eebd77559..8157dbc037d6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -92,7 +92,6 @@ repeat:
 	p->parent->cstime += p->stime + p->cstime;
 	p->parent->cmin_flt += p->min_flt + p->cmin_flt;
 	p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt;
-	p->parent->cnswap += p->nswap + p->cnswap;
 	p->parent->cnvcsw += p->nvcsw + p->cnvcsw;
 	p->parent->cnivcsw += p->nivcsw + p->cnivcsw;
 	sched_exit(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 315a06125e65..da5213611496 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -513,7 +513,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 
 	tsk->min_flt = tsk->maj_flt = 0;
 	tsk->cmin_flt = tsk->cmaj_flt = 0;
-	tsk->nswap = tsk->cnswap = 0;
 	tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0;
 
 	tsk->mm = NULL;
diff --git a/kernel/sys.c b/kernel/sys.c
index 9d57482758f3..4d414d925889 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1532,7 +1532,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw;
 			r.ru_minflt = p->min_flt;
 			r.ru_majflt = p->maj_flt;
-			r.ru_nswap = p->nswap;
 			break;
 		case RUSAGE_CHILDREN:
 			jiffies_to_timeval(p->cutime, &r.ru_utime);
@@ -1541,7 +1540,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->cnivcsw;
 			r.ru_minflt = p->cmin_flt;
 			r.ru_majflt = p->cmaj_flt;
-			r.ru_nswap = p->cnswap;
 			break;
 		default:
 			jiffies_to_timeval(p->utime + p->cutime, &r.ru_utime);
@@ -1550,7 +1548,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw + p->cnivcsw;
 			r.ru_minflt = p->min_flt + p->cmin_flt;
 			r.ru_majflt = p->maj_flt + p->cmaj_flt;
-			r.ru_nswap = p->nswap + p->cnswap;
 			break;
 	}
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
-- 
cgit v1.2.3


From 87217f471c7d293548938c4d396fbafde664dde4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:08:45 -0700
Subject: [PATCH] shrink inode when quota is disabled

From: Matt Mackall <mpm@selenic.com>

drop quota array in inode struct if no quota support
---
 fs/inode.c         | 2 ++
 include/linux/fs.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index e1b51a3f3ebc..d192c238c5a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -126,7 +126,9 @@ static struct inode *alloc_inode(struct super_block *sb)
 		inode->i_blocks = 0;
 		inode->i_bytes = 0;
 		inode->i_generation = 0;
+#ifdef CONFIG_QUOTA
 		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+#endif
 		inode->i_pipe = NULL;
 		inode->i_bdev = NULL;
 		inode->i_cdev = NULL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 02976f7c9f47..4abf3ff1fe1c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -403,7 +403,9 @@ struct inode {
 	struct file_lock	*i_flock;
 	struct address_space	*i_mapping;
 	struct address_space	i_data;
+#ifdef CONFIG_QUOTA
 	struct dquot		*i_dquot[MAXQUOTAS];
+#endif
 	/* These three should probably be a union */
 	struct list_head	i_devices;
 	struct pipe_inode_info	*i_pipe;
-- 
cgit v1.2.3


From bc0e2bbf8f0c8e77501677116798e8d7c6a8f49f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:09:10 -0700
Subject: [PATCH] O_DIRECT data exposure fixes

From: Badari Pulavarty, Suparna Bhattacharya, Andrew Morton

Forward port of Stephen Tweedie's DIO fixes from 2.4, to fix various DIO vs
buffered IO exposures involving races causing:

(a) stale data from uninstantiated blocks to be read, e.g.

    - O_DIRECT reads against buffered writes to a sparse region

    - O_DIRECT writes to a sparse region against buffered reads

(b) potential data corruption with

    - O_DIRECT IOs against truncate

    due to writes to truncated blocks (which may have been reallocated to
    another file).

Summary of fixes:

1) All the changes affect only regular files.  RAW/O_DIRECT on block are
   unaffected.

2) The DIO code will not fill in sparse regions on a write.  Instead
   -ENOTBLK is returned and the generic file write code would fall through to
   buffered IO in this case followed by writing through the pages to disk
   using filemap_fdatawrite/wait.

3) i_sem is held during both DIO reads and writes.  For reads, and writes
   to already allocated blocks, it is released right after IO is issued,
   while for writes to newly allocated blocks (e.g file extending writes and
   hole overwrites) it is held all the way through until IO completes (and
   data is committed to disk).

4) filemap_fdatawrite/wait are called under i_sem to synchronize buffered
   pages to disk blocks before issuing DIO.

5) A new rwsem (i_alloc_sem) is held in shared mode all the while a DIO
   (read or write) is in progress, and in exclusive mode by truncate to guard
   against deallocation of data blocks during DIO.

6) All this new locking has been pushed down into blockdev_direct_IO to
   avoid interfering with NFS direct IO.  The locks are taken in the order
   i_sem followed by i_alloc_sem.  While i_sem may be released after IO
   submission in some cases, i_alloc_sem is held through until dio_complete
   (in the case of AIO-DIO this happens through the IO completion callback).

7) i_sem and i_alloc_sem are not held for the _nolock versions of write
   routines, as used by blockdev and XFS.  Filesystems can specify the
   needs_special_locking parameter to __blockdev_direct_IO from their direct
   IO address space op accordingly.

Note from Badari:
Here is the locking (when needs_special_locking is true):

(1) generic_file_*_write() holds i_sem (as before) and calls
    ->direct_IO().  blockdev_direct_IO gets i_alloc_sem and calls
    direct_io_worker().

(2) generic_file_*_read() does not hold any locks.  blockdev_direct_IO()
    gets i_sem and then i_alloc_sem and calls direct_io_worker() to do the
    work

(3) direct_io_worker() does the work and drops i_sem after submitting IOs
    if appropriate and drops i_alloc_sem after completing IOs.
---
 fs/direct-io.c          | 93 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/inode.c              |  1 +
 fs/open.c               |  2 ++
 fs/xfs/linux/xfs_aops.c |  3 +-
 include/linux/fs.h      | 31 +++++++++++++++--
 mm/filemap.c            | 53 +++++++++++++++++++++-------
 6 files changed, 154 insertions(+), 29 deletions(-)

(limited to 'include/linux')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 4711d134cfd9..72309514e112 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -52,6 +52,10 @@
  *
  * If blkfactor is zero then the user's request was aligned to the filesystem's
  * blocksize.
+ *
+ * needs_locking is set for regular files on direct-IO-naive filesystems.  It
+ * determines whether we need to do the fancy locking which prevents direct-IO
+ * from being able to read uninitialised disk blocks.
  */
 
 struct dio {
@@ -59,6 +63,7 @@ struct dio {
 	struct bio *bio;		/* bio under assembly */
 	struct inode *inode;
 	int rw;
+	int needs_locking;		/* doesn't change */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -206,6 +211,8 @@ static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
 {
 	if (dio->end_io)
 		dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
+	if (dio->needs_locking)
+		up_read(&dio->inode->i_alloc_sem);
 }
 
 /*
@@ -449,6 +456,7 @@ static int get_more_blocks(struct dio *dio)
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	unsigned long dio_count;/* Number of dio_block-sized blocks */
 	unsigned long blkmask;
+	int beyond_eof = 0;
 
 	/*
 	 * If there was a memory error and we've overwritten all the
@@ -466,8 +474,19 @@ static int get_more_blocks(struct dio *dio)
 		if (dio_count & blkmask)	
 			fs_count++;
 
+		if (dio->needs_locking) {
+			if (dio->block_in_file >= (i_size_read(dio->inode) >>
+							dio->blkbits))
+				beyond_eof = 1;
+		}
+		/*
+		 * For writes inside i_size we forbid block creations: only
+		 * overwrites are permitted.  We fall back to buffered writes
+		 * at a higher level for inside-i_size block-instantiating
+		 * writes.
+		 */
 		ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
-				map_bh, dio->rw == WRITE);
+				map_bh, (dio->rw == WRITE) && beyond_eof);
 	}
 	return ret;
 }
@@ -774,6 +793,10 @@ do_holes:
 			if (!buffer_mapped(map_bh)) {
 				char *kaddr;
 
+				/* AKPM: eargh, -ENOTBLK is a hack */
+				if (dio->rw == WRITE)
+					return -ENOTBLK;
+
 				if (dio->block_in_file >=
 					i_size_read(dio->inode)>>blkbits) {
 					/* We hit eof */
@@ -839,21 +862,21 @@ out:
 	return ret;
 }
 
+/*
+ * Releases both i_sem and i_alloc_sem
+ */
 static int
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
-	unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io)
+	unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io,
+	struct dio *dio)
 {
 	unsigned long user_addr; 
 	int seg;
 	int ret = 0;
 	int ret2;
-	struct dio *dio;
 	size_t bytes;
 
-	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
-	if (!dio)
-		return -ENOMEM;
 	dio->is_async = !is_sync_kiocb(iocb);
 
 	dio->bio = NULL;
@@ -864,7 +887,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->start_zero_done = 0;
 	dio->block_in_file = offset >> blkbits;
 	dio->blocks_available = 0;
-
 	dio->cur_page = NULL;
 
 	dio->boundary = 0;
@@ -952,6 +974,13 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	dio_cleanup(dio);
 
+	/*
+	 * All new block allocations have been performed.  We can let i_sem
+	 * go now.
+	 */
+	if (dio->needs_locking)
+		up(&dio->inode->i_sem);
+
 	/*
 	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
 	 * reflect the number of to-be-processed BIOs.
@@ -987,11 +1016,17 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 /*
  * This is a library function for use by filesystem drivers.
+ *
+ * For writes to S_ISREG files, we are called under i_sem and return with i_sem
+ * held, even though it is internally dropped.
+ *
+ * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken.
  */
 int
-blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
-	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io)
+	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
+	int needs_special_locking)
 {
 	int seg;
 	size_t size;
@@ -1000,6 +1035,8 @@ blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	unsigned bdev_blkbits = 0;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
+	struct dio *dio;
+	int needs_locking;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1025,10 +1062,40 @@ blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	}
 
-	retval = direct_io_worker(rw, iocb, inode, iov, offset, 
-				nr_segs, blkbits, get_blocks, end_io);
+	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	retval = -ENOMEM;
+	if (!dio)
+		goto out;
+
+	/*
+	 * For regular files,
+	 *	readers need to grab i_sem and i_alloc_sem
+	 *	writers need to grab i_alloc_sem only (i_sem is already held)
+	 */
+	needs_locking = 0;
+	if (S_ISREG(inode->i_mode) && needs_special_locking) {
+		needs_locking = 1;
+		if (rw == READ) {
+			struct address_space *mapping;
+
+			mapping = iocb->ki_filp->f_mapping;
+			down(&inode->i_sem);
+			retval = filemap_write_and_wait(mapping);
+			if (retval) {
+				up(&inode->i_sem);
+				kfree(dio);
+				goto out;
+			}
+		}
+		down_read(&inode->i_alloc_sem);
+	}
+	dio->needs_locking = needs_locking;
+
+	retval = direct_io_worker(rw, iocb, inode, iov, offset,
+				nr_segs, blkbits, get_blocks, end_io, dio);
+	if (needs_locking && rw == WRITE)
+		down(&inode->i_sem);
 out:
 	return retval;
 }
-
-EXPORT_SYMBOL(blockdev_direct_IO);
+EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/inode.c b/fs/inode.c
index d192c238c5a9..b7f80405c076 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
+	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
diff --git a/fs/open.c b/fs/open.c
index ce11096afcad..e0d546e01561 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -192,7 +192,9 @@ int do_truncate(struct dentry *dentry, loff_t length)
 	newattrs.ia_size = length;
 	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 	down(&dentry->d_inode->i_sem);
+	down_write(&dentry->d_inode->i_alloc_sem);
 	err = notify_change(dentry, &newattrs);
+	up_write(&dentry->d_inode->i_alloc_sem);
 	up(&dentry->d_inode->i_sem);
 	return err;
 }
diff --git a/fs/xfs/linux/xfs_aops.c b/fs/xfs/linux/xfs_aops.c
index 75ab8d29cd2f..dd446266d33f 100644
--- a/fs/xfs/linux/xfs_aops.c
+++ b/fs/xfs/linux/xfs_aops.c
@@ -1032,7 +1032,8 @@ linvfs_direct_IO(
 	if (error)
 		return -error;
 
-	return blockdev_direct_IO(rw, iocb, inode, iomap.iomap_target->pbr_bdev,
+	return blockdev_direct_IO_no_locking(rw, iocb, inode,
+		iomap.iomap_target->pbr_bdev,
 		iov, offset, nr_segs,
 		linvfs_get_blocks_direct,
 		linvfs_unwritten_convert_direct);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4abf3ff1fe1c..91ff9225ba86 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -397,6 +397,7 @@ struct inode {
 	unsigned short          i_bytes;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	struct semaphore	i_sem;
+	struct rw_semaphore	i_alloc_sem;
 	struct inode_operations	*i_op;
 	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
 	struct super_block	*i_sb;
@@ -1235,6 +1236,7 @@ extern void write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
+extern int filemap_write_and_wait(struct address_space *mapping);
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
@@ -1347,9 +1349,6 @@ extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb,
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs);
-extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
-	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
-	unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
 	unsigned long nr_segs, loff_t *ppos);
 ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, 
@@ -1371,6 +1370,32 @@ static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
 				actor);
 }
 
+int __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
+	int needs_special_locking);
+
+/*
+ * For filesystems which need locking between buffered and direct access
+ */
+static inline int blockdev_direct_IO(int rw, struct kiocb *iocb,
+	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
+	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
+	dio_iodone_t end_io)
+{
+	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				nr_segs, get_blocks, end_io, 1);
+}
+
+static inline int blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
+	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
+	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
+	dio_iodone_t end_io)
+{
+	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				nr_segs, get_blocks, end_io, 0);
+}
+
 extern struct file_operations generic_ro_fops;
 
 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
diff --git a/mm/filemap.c b/mm/filemap.c
index 6fbd980c25e5..ad234dc52cbf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -73,6 +73,9 @@
  *  ->mmap_sem
  *    ->i_sem			(msync)
  *
+ *  ->i_sem
+ *    ->i_alloc_sem             (various)
+ *
  *  ->inode_lock
  *    ->sb_lock			(fs/fs-writeback.c)
  *    ->mapping->page_lock	(__sync_single_inode)
@@ -228,6 +231,18 @@ restart:
 
 EXPORT_SYMBOL(filemap_fdatawait);
 
+int filemap_write_and_wait(struct address_space *mapping)
+{
+	int retval = 0;
+
+	if (mapping->nrpages) {
+		retval = filemap_fdatawrite(mapping);
+		if (retval == 0)
+			retval = filemap_fdatawait(mapping);
+	}
+	return retval;
+}
+
 /*
  * This adds a page to the page cache, starting out as locked, unreferenced,
  * not uptodate and with no errors.
@@ -1716,6 +1731,7 @@ EXPORT_SYMBOL(generic_write_checks);
 
 /*
  * Write to a file through the page cache. 
+ * Called under i_sem for S_ISREG files.
  *
  * We put everything into the page cache prior to writing it. This is not a
  * problem when writing full pages. With partial pages, however, we first have
@@ -1806,12 +1822,19 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 		/*
 		 * Sync the fs metadata but not the minor inode changes and
 		 * of course not the data as we did direct DMA for the IO.
+		 * i_sem is held, which protects generic_osync_inode() from
+		 * livelocking.
 		 */
 		if (written >= 0 && file->f_flags & O_SYNC)
 			status = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		if (written >= 0 && !is_sync_kiocb(iocb))
 			written = -EIOCBQUEUED;
-		goto out_status;
+		if (written != -ENOTBLK)
+			goto out_status;
+		/*
+		 * direct-io write to a hole: fall through to buffered I/O
+		 */
+		written = 0;
 	}
 
 	buf = iov->iov_base;
@@ -1900,6 +1923,14 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 					OSYNC_METADATA|OSYNC_DATA);
 	}
 	
+	/*
+	 * If we get here for O_DIRECT writes then we must have fallen through
+	 * to buffered writes (block instantiation inside i_size).  So we sync
+	 * the file data here, to try to honour O_DIRECT expectations.
+	 */
+	if (unlikely(file->f_flags & O_DIRECT) && written)
+		status = filemap_write_and_wait(mapping);
+
 out_status:	
 	err = written ? written : status;
 out:
@@ -1991,6 +2022,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
 
 EXPORT_SYMBOL(generic_file_writev);
 
+/*
+ * Called under i_sem for writes to S_ISREG files
+ */
 ssize_t
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	loff_t offset, unsigned long nr_segs)
@@ -1999,18 +2033,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct address_space *mapping = file->f_mapping;
 	ssize_t retval;
 
-	if (mapping->nrpages) {
-		retval = filemap_fdatawrite(mapping);
-		if (retval == 0)
-			retval = filemap_fdatawait(mapping);
-		if (retval)
-			goto out;
+	retval = filemap_write_and_wait(mapping);
+	if (retval == 0) {
+		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
+						offset, nr_segs);
+		if (rw == WRITE && mapping->nrpages)
+			invalidate_inode_pages2(mapping);
 	}
-
-	retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
-	if (rw == WRITE && mapping->nrpages)
-		invalidate_inode_pages2(mapping);
-out:
 	return retval;
 }
 
-- 
cgit v1.2.3


From 8691fb836b268c622c61281238219fc166f0eee5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:10:27 -0700
Subject: [PATCH] radix-tree tags for selective lookup

Add radix-tree tagging so we can look up dirty or writeback pages in
O(log64(n)) time.

Each radix-tree node gains two bits for each slot: one for page dirtiness and
one for page writebackness.

If a tag bit is set on a leaf node, it indicates that the item at the
corresponding slot is tagged (say, a dirty page).

If a tag bit is set in a non-leaf node it indicates that the same tag bit is
set in the subtree which lies under the corresponding slot.  ie: "there is a
dirty page under here somewhere, but you need to search down further to find
it".

A gang lookup function is provided which can walk the radix tree in
logarithmic time looking for items which are tagged, starting from a
specified offset.  We use this for in-order searches for dirty or writeback
pages.

There is a userspace test harness for this code at

http://www.zip.com.au/~akpm/linux/patches/stuff/rtth.tar.gz
---
 include/linux/radix-tree.h |  38 ++--
 lib/radix-tree.c           | 444 ++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 426 insertions(+), 56 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c32a45fd1f0d..8081a281fa5e 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -20,8 +20,7 @@
 #define _LINUX_RADIX_TREE_H
 
 #include <linux/preempt.h>
-
-struct radix_tree_node;
+#include <linux/types.h>
 
 struct radix_tree_root {
 	unsigned int		height;
@@ -29,25 +28,40 @@ struct radix_tree_root {
 	struct radix_tree_node	*rnode;
 };
 
-#define RADIX_TREE_INIT(mask)	{0, (mask), NULL}
+#define RADIX_TREE_INIT(mask)	{					\
+	.height = 0,							\
+	.gfp_mask = (mask),						\
+	.rnode = NULL,							\
+}
 
 #define RADIX_TREE(name, mask) \
 	struct radix_tree_root name = RADIX_TREE_INIT(mask)
 
-#define INIT_RADIX_TREE(root, mask)	\
-do {					\
-	(root)->height = 0;		\
-	(root)->gfp_mask = (mask);	\
-	(root)->rnode = NULL;		\
+#define INIT_RADIX_TREE(root, mask)					\
+do {									\
+	(root)->height = 0;						\
+	(root)->gfp_mask = (mask);					\
+	(root)->rnode = NULL;						\
 } while (0)
 
-extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
-extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
-extern void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-extern unsigned int
+int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
 int radix_tree_preload(int gfp_mask);
+void radix_tree_init(void);
+void *radix_tree_tag_set(struct radix_tree_root *root,
+			unsigned long index, int tag);
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+			unsigned long index, int tag);
+int radix_tree_tag_get(struct radix_tree_root *root,
+			unsigned long index, int tag);
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items, int tag);
+int radix_tree_tagged(struct radix_tree_root *root, int tag);
 
 static inline void radix_tree_preload_end(void)
 {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 70ad32ff37ca..5fb59f715eab 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -6,12 +6,12 @@
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation; either version 2, or (at
  * your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
@@ -28,21 +28,36 @@
 #include <linux/cpu.h>
 #include <linux/gfp.h>
 #include <linux/string.h>
+#include <linux/bitops.h>
 
 /*
  * Radix tree node definition.
+ *
+ * RADIX_TREE_MAP_SHIFT must be >= log2(BITS_PER_LONG).  Otherwise the tags
+ * array will have zero size and the set_tag() arithmetic will go wrong.
  */
-#define RADIX_TREE_MAP_SHIFT  6
-#define RADIX_TREE_MAP_SIZE  (1UL << RADIX_TREE_MAP_SHIFT)
-#define RADIX_TREE_MAP_MASK  (RADIX_TREE_MAP_SIZE-1)
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT	6
+#else
+#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
+#endif
+#define RADIX_TREE_TAGS		2
+
+#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
+
+#define RADIX_TREE_TAG_LONGS	\
+	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
 struct radix_tree_node {
 	unsigned int	count;
 	void		*slots[RADIX_TREE_MAP_SIZE];
+	unsigned long	tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
 struct radix_tree_path {
 	struct radix_tree_node *node, **slot;
+	int offset;
 };
 
 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
@@ -124,6 +139,22 @@ out:
 	return ret;
 }
 
+static inline void tag_set(struct radix_tree_node *node, int tag, int offset)
+{
+	if (!test_bit(offset, &node->tags[tag][0]))
+		__set_bit(offset, &node->tags[tag][0]);
+}
+
+static inline void tag_clear(struct radix_tree_node *node, int tag, int offset)
+{
+	__clear_bit(offset, &node->tags[tag][0]);
+}
+
+static inline int tag_get(struct radix_tree_node *node, int tag, int offset)
+{
+	return test_bit(offset, &node->tags[tag][0]);
+}
+
 /*
  *	Return the maximum key which can be store into a
  *	radix tree with height HEIGHT.
@@ -140,26 +171,53 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 {
 	struct radix_tree_node *node;
 	unsigned int height;
+	char tags[RADIX_TREE_TAGS];
+	int tag;
 
 	/* Figure out what the height should be.  */
 	height = root->height + 1;
 	while (index > radix_tree_maxindex(height))
 		height++;
 
-	if (root->rnode) {
-		do {
-			if (!(node = radix_tree_node_alloc(root)))
-				return -ENOMEM;
-
-			/* Increase the height.  */
-			node->slots[0] = root->rnode;
-			node->count = 1;
-			root->rnode = node;
-			root->height++;
-		} while (height > root->height);
-	} else 
+	if (root->rnode == NULL) {
 		root->height = height;
+		goto out;
+	}
+
+	/*
+	 * Prepare the tag status of the top-level node for propagation
+	 * into the newly-pushed top-level node(s)
+	 */
+	for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+		int idx;
+
+		tags[tag] = 0;
+		for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+			if (root->rnode->tags[tag][idx]) {
+				tags[tag] = 1;
+				break;
+			}
+		}
+	}
+
+	do {
+		if (!(node = radix_tree_node_alloc(root)))
+			return -ENOMEM;
+
+		/* Increase the height.  */
+		node->slots[0] = root->rnode;
 
+		/* Propagate the aggregated tag info into the new root */
+		for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+			if (tags[tag])
+				tag_set(node, tag, 0);
+		}
+
+		node->count = 1;
+		root->rnode = node;
+		root->height++;
+	} while (height > root->height);
+out:
 	return 0;
 }
 
@@ -171,23 +229,27 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
  *
  *	Insert an item into the radix tree at position @index.
  */
-int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
+int radix_tree_insert(struct radix_tree_root *root,
+			unsigned long index, void *item)
 {
 	struct radix_tree_node *node = NULL, *tmp, **slot;
 	unsigned int height, shift;
+	int offset;
 	int error;
 
 	/* Make sure the tree is high enough.  */
-	if (index > radix_tree_maxindex(root->height)) {
+	if ((!index && !root->rnode) ||
+			index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
 		if (error)
 			return error;
 	}
-    
+
 	slot = &root->rnode;
 	height = root->height;
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
+	offset = 0;			/* uninitialised var warning */
 	while (height > 0) {
 		if (*slot == NULL) {
 			/* Have to add a child node.  */
@@ -198,18 +260,21 @@ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *i
 				node->count++;
 		}
 
-		/* Go a level down.  */
+		/* Go a level down */
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		node = *slot;
-		slot = (struct radix_tree_node **)
-			(node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));
+		slot = (struct radix_tree_node **)(node->slots + offset);
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
 	}
 
 	if (*slot != NULL)
 		return -EEXIST;
-	if (node)
+	if (node) {
 		node->count++;
+		BUG_ON(tag_get(node, 0, offset));
+		BUG_ON(tag_get(node, 1, offset));
+	}
 
 	*slot = item;
 	return 0;
@@ -221,7 +286,7 @@ EXPORT_SYMBOL(radix_tree_insert);
  *	@root:		radix tree root
  *	@index:		index key
  *
- *	Lookup them item at the position @index in the radix tree @root.
+ *	Lookup the item at the position @index in the radix tree @root.
  */
 void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 {
@@ -240,16 +305,174 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 			return NULL;
 
 		slot = (struct radix_tree_node **)
-			((*slot)->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));
+			((*slot)->slots +
+				((index >> shift) & RADIX_TREE_MAP_MASK));
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
 	}
 
-	return (void *) *slot;
+	return *slot;
 }
 EXPORT_SYMBOL(radix_tree_lookup);
 
-static /* inline */ unsigned int
+/**
+ *	radix_tree_tag_set - set a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Set the search tag corresponding to @index in the radix tree.  From
+ *	the root all the way down to the leaf node.
+ *
+ *	Returns the address of the tagged item.   Setting a tag on a not-present
+ *	item is a bug.
+ */
+void *radix_tree_tag_set(struct radix_tree_root *root,
+			unsigned long index, int tag)
+{
+	unsigned int height, shift;
+	struct radix_tree_node **slot;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		return NULL;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = &root->rnode;
+
+	while (height > 0) {
+		int offset;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		tag_set(*slot, tag, offset);
+		slot = (struct radix_tree_node **)((*slot)->slots + offset);
+		BUG_ON(*slot == NULL);
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	return *slot;
+}
+EXPORT_SYMBOL(radix_tree_tag_set);
+
+/**
+ *	radix_tree_tag_clear - clear a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Clear the search tag corresponding to @index in the radix tree.  If
+ *	this causes the leaf node to have no tags set then clear the tag in the
+ *	next-to-leaf node, etc.
+ *
+ *	Returns the address of the tagged item on success, else NULL.  ie:
+ *	has the same return value and semantics as radix_tree_lookup().
+ */
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+			unsigned long index, int tag)
+{
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	unsigned int height, shift;
+	void *ret = NULL;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		goto out;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	pathp->node = NULL;
+	pathp->slot = &root->rnode;
+
+	while (height > 0) {
+		int offset;
+
+		if (*pathp->slot == NULL)
+			goto out;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp[1].offset = offset;
+		pathp[1].node = *pathp[0].slot;
+		pathp[1].slot = (struct radix_tree_node **)
+				(pathp[1].node->slots + offset);
+		pathp++;
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	ret = *pathp[0].slot;
+	if (ret == NULL)
+		goto out;
+
+	do {
+		int idx;
+
+		tag_clear(pathp[0].node, tag, pathp[0].offset);
+		for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+			if (pathp[0].node->tags[tag][idx])
+				goto out;
+		}
+		pathp--;
+	} while (pathp[0].node);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(radix_tree_tag_clear);
+
+#ifndef __KERNEL__	/* Only the test harness uses this at present */
+/**
+ *	radix_tree_tag_get - get a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Return the search tag corresponding to @index in the radix tree.
+ *
+ *	Returns zero if the tag is unset, or if there is no corresponding item
+ *	in the tree.
+ */
+int radix_tree_tag_get(struct radix_tree_root *root,
+			unsigned long index, int tag)
+{
+	unsigned int height, shift;
+	struct radix_tree_node **slot;
+	int saw_unset_tag = 0;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		return 0;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = &root->rnode;
+
+	for ( ; ; ) {
+		int offset;
+
+		if (*slot == NULL)
+			return 0;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+		/*
+		 * This is just a debug check.  Later, we can bale as soon as
+		 * we see an unset tag.
+		 */
+		if (!tag_get(*slot, tag, offset))
+			saw_unset_tag = 1;
+		if (height == 1) {
+			int ret = tag_get(*slot, tag, offset);
+
+			BUG_ON(ret && saw_unset_tag);
+			return ret;
+		}
+		slot = (struct radix_tree_node **)((*slot)->slots + offset);
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+}
+EXPORT_SYMBOL(radix_tree_tag_get);
+#endif
+
+static unsigned int
 __lookup(struct radix_tree_root *root, void **results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index)
 {
@@ -316,17 +539,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 	unsigned long cur_index = first_index;
 	unsigned int ret = 0;
 
-	if (root->rnode == NULL)
-		goto out;
-	if (max_index == 0) {			/* Bah.  Special case */
-		if (first_index == 0) {
-			if (max_items > 0) {
-				*results = root->rnode;
-				ret = 1;
-			}
-		}
-		goto out;
-	}
 	while (ret < max_items) {
 		unsigned int nr_found;
 		unsigned long next_index;	/* Index of next search */
@@ -340,11 +552,101 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			break;
 		cur_index = next_index;
 	}
-out:
 	return ret;
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup);
 
+/*
+ * FIXME: the two tag_get()s here should use find_next_bit() instead of
+ * open-coding the search.
+ */
+static unsigned int
+__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
+	unsigned int max_items, unsigned long *next_index, int tag)
+{
+	unsigned int nr_found = 0;
+	unsigned int shift;
+	unsigned int height = root->height;
+	struct radix_tree_node *slot;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = root->rnode;
+
+	while (height > 0) {
+		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
+			if (tag_get(slot, tag, i)) {
+				BUG_ON(slot->slots[i] == NULL);
+				break;
+			}
+			index &= ~((1 << shift) - 1);
+			index += 1 << shift;
+			if (index == 0)
+				goto out;	/* 32-bit wraparound */
+		}
+		if (i == RADIX_TREE_MAP_SIZE)
+			goto out;
+		height--;
+		if (height == 0) {	/* Bottom level: grab some items */
+			unsigned long j = index & RADIX_TREE_MAP_MASK;
+
+			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
+				index++;
+				if (tag_get(slot, tag, j)) {
+					BUG_ON(slot->slots[j] == NULL);
+					results[nr_found++] = slot->slots[j];
+					if (nr_found == max_items)
+						goto out;
+				}
+			}
+		}
+		shift -= RADIX_TREE_MAP_SHIFT;
+		slot = slot->slots[i];
+	}
+out:
+	*next_index = index;
+	return nr_found;
+}
+
+/**
+ *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
+ *	                             based on a tag
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *	@tag:		the tag index
+ *
+ *	Performs an index-ascending scan of the tree for present items which
+ *	have the tag indexed by @tag set.  Places the items at *@results and
+ *	returns the number of items which were placed at *@results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items, int tag)
+{
+	const unsigned long max_index = radix_tree_maxindex(root->height);
+	unsigned long cur_index = first_index;
+	unsigned int ret = 0;
+
+	while (ret < max_items) {
+		unsigned int nr_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		nr_found = __lookup_tag(root, results + ret, cur_index,
+					max_items - ret, &next_index, tag);
+		ret += nr_found;
+		if (next_index == 0)
+			break;
+		cur_index = next_index;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
+
 /**
  *	radix_tree_delete    -    delete an item from a radix tree
  *	@root:		radix tree root
@@ -357,24 +659,31 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 {
 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	struct radix_tree_path *orig_pathp;
 	unsigned int height, shift;
 	void *ret = NULL;
+	char tags[RADIX_TREE_TAGS];
+	int nr_cleared_tags;
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
 		goto out;
 
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	pathp->node = NULL;
 	pathp->slot = &root->rnode;
 
 	while (height > 0) {
+		int offset;
+
 		if (*pathp->slot == NULL)
 			goto out;
 
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp[1].offset = offset;
 		pathp[1].node = *pathp[0].slot;
 		pathp[1].slot = (struct radix_tree_node **)
-		    (pathp[1].node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));
+				(pathp[1].node->slots + offset);
 		pathp++;
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
@@ -384,20 +693,67 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 	if (ret == NULL)
 		goto out;
 
+	orig_pathp = pathp;
+
+	/*
+	 * Clear all tags associated with the just-deleted item
+	 */
+	memset(tags, 0, sizeof(tags));
+	do {
+		int tag;
+
+		nr_cleared_tags = RADIX_TREE_TAGS;
+		for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+			int idx;
+
+			if (!tags[tag])
+				tag_clear(pathp[0].node, tag, pathp[0].offset);
+
+			for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+				if (pathp[0].node->tags[tag][idx]) {
+					tags[tag] = 1;
+					nr_cleared_tags--;
+					break;
+				}
+			}
+		}
+		pathp--;
+	} while (pathp[0].node && nr_cleared_tags);
+
+	pathp = orig_pathp;
 	*pathp[0].slot = NULL;
 	while (pathp[0].node && --pathp[0].node->count == 0) {
 		pathp--;
+		BUG_ON(*pathp[0].slot == NULL);
 		*pathp[0].slot = NULL;
 		radix_tree_node_free(pathp[1].node);
 	}
-
 	if (root->rnode == NULL)
-		root->height = 0;  /* Empty tree, we can reset the height */
+		root->height = 0;
 out:
 	return ret;
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
+/**
+ *	radix_tree_tagged - test whether the tree's root node has @tag set
+ *	@root:		radix tree root
+ *	@tag:		index of the tag to test
+ *
+ *	Returns 1 if the root node has any bit set for @tag, otherwise 0.
+ */
+int radix_tree_tagged(struct radix_tree_root *root, int tag)
+{
+	struct radix_tree_node *top = root->rnode;
+	int word;
+
+	for (word = 0; top && word < RADIX_TREE_TAG_LONGS; word++)
+		if (top->tags[tag][word])
+			return 1;
+	return 0;
+}
+EXPORT_SYMBOL(radix_tree_tagged);
+
 static void
 radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
 {
-- 
cgit v1.2.3


From 89261aab0c7064ca9766bc79e7867b6104274f56 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:10:41 -0700
Subject: [PATCH] make the pagecache lock irq-safe.

Intro to these patches:

- Major surgery against the pagecache, radix-tree and writeback code.  This
  work is to address the O_DIRECT-vs-buffered data exposure horrors which
  we've been struggling with for months.

  As a side-effect, 32 bytes are saved from struct inode and eight bytes
  are removed from struct page.  At a cost of approximately 2.5 bits per page
  in the radix tree nodes on 4k pagesize, assuming the pagecache is densely
  populated.  Not all pages are pagecache; other pages gain the full 8 byte
  saving.

  This change will break any arch code which is using page->list and will
  also break any arch code which is using page->lru of memory which was
  obtained from slab.

  The basic problem which we (mainly Daniel McNeil) have been struggling
  with is in getting a really reliable fsync() across the page lists while
  other processes are performing writeback against the same file.  It's like
  juggling four bars of wet soap with your eyes shut while someone is
  whacking you with a baseball bat.  Daniel pretty much has the problem
  plugged but I suspect that's just because we don't have testcases to
  trigger the remaining problems.  The complexity and additional locking
  which those patches add is worrisome.

  So the approach taken here is to remove the page lists altogether and
  replace the list-based writeback and wait operations with in-order
  radix-tree walks.

  The radix-tree code has been enhanced to support "tagging" of pages, for
  later searches for pages which have a particular tag set.  This means that
  we can ask the radix tree code "find me the next 16 dirty pages starting at
  pagecache index N" and it will do that in O(log64(N)) time.

  This affects I/O scheduling potentially quite significantly.  It is no
  longer the case that the kernel will submit pages for I/O in the order in
  which the application dirtied them.  We instead submit them in file-offset
  order all the time.

  This is likely to be advantageous when applications are seeking all over
  a large file randomly writing small amounts of data.  I haven't performed
  much benchmarking, but tiobench random write throughput seems to be
  increased by 30%.  Other tests appear to be unaltered.  dbench may have got
  10-20% quicker, but it's variable.

  There is one large file which everyone seeks all over randomly writing
  small amounts of data: the blockdev mapping which caches filesystem
  metadata.  The kernel's IO submission patterns for this are now ideal.


  Because writeback and wait-for-writeback use a tree walk instead of a
  list walk they are no longer livelockable.  This probably means that we no
  longer need to hold i_sem across O_SYNC writes and perhaps fsync() and
  fdatasync().  This may be beneficial for databases: multiple processes
  writing and syncing different parts of the same file at the same time can
  now all submit and wait upon writes to just their own little bit of the
  file, so we can get a lot more data into the queues.

  It is trivial to implement a part-file-fdatasync() as well, so
  applications can say "sync the file from byte N to byte M", and multiple
  applications can do this concurrently.  This is easy for ext2 filesystems,
  but probably needs lots of work for data-journalled filesystems and XFS and
  it probably doesn't offer much benefit over an i_semless O_SYNC write.


  These patches can end up making ext3 (even) slower:

	for i in 1 2 3 4
	do
		dd if=/dev/zero of=$i bs=1M count=2000 &
	done

  runs awfully slow on SMP.  This is, yet again, because all the file
  blocks are jumbled up and the per-file linear writeout causes tons of
  seeking.  The above test runs sweetly on UP because on UP we don't
  allocate blocks to different files in parallel.

  Mingming and Badari are working on getting block reservation working for
  ext3 (preallocation on steroids).  That should fix ext3 up.


This patch:

- Later, we'll need to access the radix trees from inside disk I/O
  completion handlers.  So make mapping->page_lock irq-safe.  And rename it
  to tree_lock to reliably break any missed conversions.
---
 fs/buffer.c         |  8 ++++----
 fs/cifs/file.c      | 10 +---------
 fs/fs-writeback.c   |  4 ++--
 fs/inode.c          |  2 +-
 fs/mpage.c          | 10 +++++-----
 include/linux/fs.h  |  2 +-
 ipc/shm.c           |  2 --
 mm/filemap.c        | 50 +++++++++++++++++++++++++-------------------------
 mm/page-writeback.c | 10 +++++-----
 mm/readahead.c      |  8 ++++----
 mm/swap_state.c     | 22 +++++++++++-----------
 mm/swapfile.c       |  8 ++++----
 mm/truncate.c       |  8 ++++----
 mm/vmscan.c         | 13 ++++---------
 14 files changed, 71 insertions(+), 86 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 2cbe21bccb0b..81d0bb842ec9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -396,7 +396,7 @@ out:
  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
  * may be quite high.  This code could TryLock the page, and if that
  * succeeds, there is no need to take private_lock. (But if
- * private_lock is contended then so is mapping->page_lock).
+ * private_lock is contended then so is mapping->tree_lock).
  */
 static struct buffer_head *
 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
@@ -867,14 +867,14 @@ int __set_page_dirty_buffers(struct page *page)
 	spin_unlock(&mapping->private_lock);
 
 	if (!TestSetPageDirty(page)) {
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		if (page->mapping) {	/* Race with truncate? */
 			if (!mapping->backing_dev_info->memory_backed)
 				inc_page_state(nr_dirty);
 			list_del(&page->list);
 			list_add(&page->list, &mapping->dirty_pages);
 		}
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 	
@@ -1254,7 +1254,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * inode to its superblock's dirty inode list.
  *
  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
- * mapping->page_lock and the global inode_lock.
+ * mapping->tree_lock and the global inode_lock.
  */
 void fastcall mark_buffer_dirty(struct buffer_head *bh)
 {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d991eef801ac..f120f126eab5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -898,11 +898,9 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		if(list_empty(pages))
 			break;
 
-		spin_lock(&mapping->page_lock);
 		page = list_entry(pages->prev, struct page, list);
 
 		list_del(&page->list);
-		spin_unlock(&mapping->page_lock);
 
 		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
 			page_cache_release(page);
@@ -962,14 +960,10 @@ cifs_readpages(struct file *file, struct address_space *mapping,
 	pagevec_init(&lru_pvec, 0);
 
 	for(i = 0;i<num_pages;) {
-		spin_lock(&mapping->page_lock);
-		if(list_empty(page_list)) {
-			spin_unlock(&mapping->page_lock);
+		if(list_empty(page_list))
 			break;
-		}
 		page = list_entry(page_list->prev, struct page, list);
 		offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	        spin_unlock(&mapping->page_lock);
 
 		/* for reads over a certain size could initiate async read ahead */
 
@@ -989,12 +983,10 @@ cifs_readpages(struct file *file, struct address_space *mapping,
 			cFYI(1,("Read error in readpages: %d",rc));
 			/* clean up remaing pages off list */
             
-			spin_lock(&mapping->page_lock);
 			while (!list_empty(page_list) && (i < num_pages)) {
 				page = list_entry(page_list->prev, struct page, list);
 				list_del(&page->list);
 			}
-			spin_unlock(&mapping->page_lock);
 			break;
 		} else if (bytes_read > 0) {
 			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index aa5f34b85747..f8b6182cb152 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -159,10 +159,10 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * read speculatively by this cpu before &= ~I_DIRTY  -- mikulas
 	 */
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages))
 		list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
diff --git a/fs/inode.c b/fs/inode.c
index b7f80405c076..b5d43d858e0b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -187,7 +187,7 @@ void inode_init_once(struct inode *inode)
 	sema_init(&inode->i_sem, 1);
 	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	spin_lock_init(&inode->i_data.page_lock);
+	spin_lock_init(&inode->i_data.tree_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
 	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
diff --git a/fs/mpage.c b/fs/mpage.c
index 630d6a0f7e7b..c3e781cb4906 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -635,7 +635,7 @@ mpage_writepages(struct address_space *mapping,
 	if (get_block == NULL)
 		writepage = mapping->a_ops->writepage;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	while (!list_empty(&mapping->io_pages) && !done) {
 		struct page *page = list_entry(mapping->io_pages.prev,
 					struct page, list);
@@ -655,10 +655,10 @@ mpage_writepages(struct address_space *mapping,
 		list_add(&page->list, &mapping->locked_pages);
 
 		page_cache_get(page);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 
 		/*
-		 * At this point we hold neither mapping->page_lock nor
+		 * At this point we hold neither mapping->tree_lock nor
 		 * lock on the page itself: the page may be truncated or
 		 * invalidated (changing page->mapping to NULL), or even
 		 * swizzled back from swapper_space to tmpfs file mapping.
@@ -695,12 +695,12 @@ mpage_writepages(struct address_space *mapping,
 			unlock_page(page);
 		}
 		page_cache_release(page);
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 	}
 	/*
 	 * Leave any remaining dirty pages on ->io_pages
 	 */
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 91ff9225ba86..f64f8fb2f819 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -322,7 +322,7 @@ struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
-	spinlock_t		page_lock;	/* and spinlock protecting it */
+	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
 	struct list_head	dirty_pages;	/* list of dirty pages */
 	struct list_head	locked_pages;	/* list of locked pages */
diff --git a/ipc/shm.c b/ipc/shm.c
index 4897cfe16f27..714933b144fa 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -380,9 +380,7 @@ static void shm_get_stat(unsigned long *rss, unsigned long *swp)
 
 		if (is_file_hugepages(shp->shm_file)) {
 			struct address_space *mapping = inode->i_mapping;
-			spin_lock(&mapping->page_lock);
 			*rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages;
-			spin_unlock(&mapping->page_lock);
 		} else {
 			struct shmem_inode_info *info = SHMEM_I(inode);
 			spin_lock(&info->lock);
diff --git a/mm/filemap.c b/mm/filemap.c
index f992d76831e4..360c5feec975 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -59,7 +59,7 @@
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_list_lock
  *        ->swap_device_lock	(exclusive_swap_page, others)
- *          ->mapping->page_lock
+ *          ->mapping->tree_lock
  *
  *  ->i_sem
  *    ->i_shared_sem		(truncate->invalidate_mmap_range)
@@ -78,12 +78,12 @@
  *
  *  ->inode_lock
  *    ->sb_lock			(fs/fs-writeback.c)
- *    ->mapping->page_lock	(__sync_single_inode)
+ *    ->mapping->tree_lock	(__sync_single_inode)
  *
  *  ->page_table_lock
  *    ->swap_device_lock	(try_to_unmap_one)
  *    ->private_lock		(try_to_unmap_one)
- *    ->page_lock		(try_to_unmap_one)
+ *    ->tree_lock		(try_to_unmap_one)
  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
  *
  *  ->task->proc_lock
@@ -93,7 +93,7 @@
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold a write_lock on the mapping's page_lock.
+ * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
  */
 void __remove_from_page_cache(struct page *page)
 {
@@ -114,9 +114,9 @@ void remove_from_page_cache(struct page *page)
 	if (unlikely(!PageLocked(page)))
 		PAGE_BUG(page);
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 
 static inline int sync_page(struct page *page)
@@ -148,9 +148,9 @@ static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode)
 	if (mapping->backing_dev_info->memory_backed)
 		return 0;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	ret = do_writepages(mapping, &wbc);
 	return ret;
 }
@@ -185,7 +185,7 @@ int filemap_fdatawait(struct address_space * mapping)
 
 restart:
 	progress = 0;
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
         while (!list_empty(&mapping->locked_pages)) {
 		struct page *page;
 
@@ -199,7 +199,7 @@ restart:
 		if (!PageWriteback(page)) {
 			if (++progress > 32) {
 				if (need_resched()) {
-					spin_unlock(&mapping->page_lock);
+					spin_unlock_irq(&mapping->tree_lock);
 					__cond_resched();
 					goto restart;
 				}
@@ -209,16 +209,16 @@ restart:
 
 		progress = 0;
 		page_cache_get(page);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 
 		wait_on_page_writeback(page);
 		if (PageError(page))
 			ret = -EIO;
 
 		page_cache_release(page);
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 	}
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 
 	/* Check for outstanding write errors */
 	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
@@ -267,7 +267,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 
 	if (error == 0) {
 		page_cache_get(page);
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
 			SetPageLocked(page);
@@ -275,7 +275,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 		} else {
 			page_cache_release(page);
 		}
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	}
 	return error;
@@ -411,11 +411,11 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 	 * We scan the hash list read-only. Addition to and removal from
 	 * the hash-list needs a held write-lock.
 	 */
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page)
 		page_cache_get(page);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 
@@ -428,11 +428,11 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs
 {
 	struct page *page;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page && TestSetPageLocked(page))
 		page = NULL;
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 
@@ -454,15 +454,15 @@ struct page *find_lock_page(struct address_space *mapping,
 {
 	struct page *page;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 repeat:
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page) {
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			lock_page(page);
-			spin_lock(&mapping->page_lock);
+			spin_lock_irq(&mapping->tree_lock);
 
 			/* Has the page been truncated while we slept? */
 			if (page->mapping != mapping || page->index != offset) {
@@ -472,7 +472,7 @@ repeat:
 			}
 		}
 	}
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 
@@ -546,12 +546,12 @@ unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
 	unsigned int i;
 	unsigned int ret;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	ret = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, start, nr_pages);
 	for (i = 0; i < ret; i++)
 		page_cache_get(pages[i]);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return ret;
 }
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f1ecbd88e846..044becdff304 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -472,12 +472,12 @@ int write_one_page(struct page *page, int wait)
 	if (wait)
 		wait_on_page_writeback(page);
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	list_del(&page->list);
 	if (test_clear_page_dirty(page)) {
 		list_add(&page->list, &mapping->locked_pages);
 		page_cache_get(page);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
 			wait_on_page_writeback(page);
@@ -487,7 +487,7 @@ int write_one_page(struct page *page, int wait)
 		page_cache_release(page);
 	} else {
 		list_add(&page->list, &mapping->clean_pages);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		unlock_page(page);
 	}
 	return ret;
@@ -515,7 +515,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 		struct address_space *mapping = page->mapping;
 
 		if (mapping) {
-			spin_lock(&mapping->page_lock);
+			spin_lock_irq(&mapping->tree_lock);
 			if (page->mapping) {	/* Race with truncate? */
 				BUG_ON(page->mapping != mapping);
 				if (!mapping->backing_dev_info->memory_backed)
@@ -523,7 +523,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 				list_del(&page->list);
 				list_add(&page->list, &mapping->dirty_pages);
 			}
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			if (!PageSwapCache(page))
 				__mark_inode_dirty(mapping->host,
 							I_DIRTY_PAGES);
diff --git a/mm/readahead.c b/mm/readahead.c
index e1d25a8b528c..6135e1484ffc 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -230,7 +230,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	/*
 	 * Preallocate as many pages as we will need.
 	 */
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 		unsigned long page_offset = offset + page_idx;
 		
@@ -241,16 +241,16 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 		if (page)
 			continue;
 
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		page = page_cache_alloc_cold(mapping);
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		if (!page)
 			break;
 		page->index = page_offset;
 		list_add(&page->list, &page_pool);
 		ret++;
 	}
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 
 	/*
 	 * Now start the IO.  We ignore I/O errors - if the page is not
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 88cfd4403a4c..d670c5846b45 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -25,7 +25,7 @@ extern struct address_space_operations swap_aops;
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
-	.page_lock	= SPIN_LOCK_UNLOCKED,
+	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
 	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
 	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
@@ -182,9 +182,9 @@ void delete_from_swap_cache(struct page *page)
   
 	entry.val = page->index;
 
-	spin_lock(&swapper_space.page_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
-	spin_unlock(&swapper_space.page_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	swap_free(entry);
 	page_cache_release(page);
@@ -195,8 +195,8 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
 	struct address_space *mapping = page->mapping;
 	int err;
 
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
+	spin_lock(&mapping->tree_lock);
 
 	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
 	if (!err) {
@@ -204,8 +204,8 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
 		___add_to_page_cache(page, &swapper_space, entry.val);
 	}
 
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
+	spin_unlock(&mapping->tree_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	if (!err) {
 		if (!swap_duplicate(entry))
@@ -231,8 +231,8 @@ int move_from_swap_cache(struct page *page, unsigned long index,
 
 	entry.val = page->index;
 
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
+	spin_lock(&mapping->tree_lock);
 
 	err = radix_tree_insert(&mapping->page_tree, index, page);
 	if (!err) {
@@ -240,8 +240,8 @@ int move_from_swap_cache(struct page *page, unsigned long index,
 		___add_to_page_cache(page, mapping, index);
 	}
 
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
+	spin_unlock(&mapping->tree_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	if (!err) {
 		swap_free(entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 58bf083a96b5..e5cebb1800b9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -253,10 +253,10 @@ static int exclusive_swap_page(struct page *page)
 		/* Is the only swap cache user the cache itself? */
 		if (p->swap_map[swp_offset(entry)] == 1) {
 			/* Recheck the page count with the pagecache lock held.. */
-			spin_lock(&swapper_space.page_lock);
+			spin_lock_irq(&swapper_space.tree_lock);
 			if (page_count(page) - !!PagePrivate(page) == 2)
 				retval = 1;
-			spin_unlock(&swapper_space.page_lock);
+			spin_unlock_irq(&swapper_space.tree_lock);
 		}
 		swap_info_put(p);
 	}
@@ -324,13 +324,13 @@ int remove_exclusive_swap_page(struct page *page)
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the pagecache lock held.. */
-		spin_lock(&swapper_space.page_lock);
+		spin_lock_irq(&swapper_space.tree_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		spin_unlock(&swapper_space.page_lock);
+		spin_unlock_irq(&swapper_space.tree_lock);
 	}
 	swap_info_put(p);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index d94d6bf56d80..1dd32a204dfc 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -62,7 +62,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
  * This is for invalidate_inode_pages().  That function can be called at
  * any time, and is not supposed to throw away dirty pages.  But pages can
  * be marked dirty at any time too.  So we re-check the dirtiness inside
- * ->page_lock.  That provides exclusion against the __set_page_dirty
+ * ->tree_lock.  That provides exclusion against the __set_page_dirty
  * functions.
  */
 static int
@@ -74,13 +74,13 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	if (PageDirty(page)) {
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		return 0;
 	}
 	__remove_from_page_cache(page);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0b290c82c1f4..df658dd6c743 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -354,7 +354,6 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
-			spin_lock(&mapping->page_lock);
 			if (test_clear_page_dirty(page)) {
 				int res;
 				struct writeback_control wbc = {
@@ -364,9 +363,6 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 					.for_reclaim = 1,
 				};
 
-				list_move(&page->list, &mapping->locked_pages);
-				spin_unlock(&mapping->page_lock);
-
 				SetPageReclaim(page);
 				res = mapping->a_ops->writepage(page, &wbc);
 				if (res < 0)
@@ -381,7 +377,6 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				}
 				goto keep;
 			}
-			spin_unlock(&mapping->page_lock);
 		}
 
 		/*
@@ -415,7 +410,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 		if (!mapping)
 			goto keep_locked;	/* truncate got there first */
 
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 
 		/*
 		 * The non-racy check for busy page.  It is critical to check
@@ -423,7 +418,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 		 * not in use by anybody. 	(pagecache + us == 2)
 		 */
 		if (page_count(page) != 2 || PageDirty(page)) {
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			goto keep_locked;
 		}
 
@@ -431,7 +426,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page->index };
 			__delete_from_swap_cache(page);
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
 			__put_page(page);	/* The pagecache ref */
 			goto free_it;
@@ -439,7 +434,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 #endif /* CONFIG_SWAP */
 
 		__remove_from_page_cache(page);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		__put_page(page);
 
 free_it:
-- 
cgit v1.2.3


From 8ece6262c5fef1b935a944f5d16965ff7dd5d1cc Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:10:54 -0700
Subject: [PATCH] tag dirty pages as such in the radix tree

Arrange for all dirty pagecache pages to be tagged as dirty within their
radix tree.
---
 fs/buffer.c                |  2 ++
 include/linux/fs.h         |  7 +++++++
 include/linux/page-flags.h |  2 ++
 mm/page-writeback.c        | 48 +++++++++++++++++++++++++++++++++++++++-------
 mm/swap_state.c            |  4 ++--
 5 files changed, 54 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 81d0bb842ec9..59f4508a472f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -873,6 +873,8 @@ int __set_page_dirty_buffers(struct page *page)
 				inc_page_state(nr_dirty);
 			list_del(&page->list);
 			list_add(&page->list, &mapping->dirty_pages);
+			radix_tree_tag_set(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
 		}
 		spin_unlock_irq(&mapping->tree_lock);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f64f8fb2f819..857e797b0ad2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -364,6 +364,13 @@ struct block_device {
 	unsigned long		bd_private;
 };
 
+/*
+ * Radix-tree tags, for tagging dirty and writeback pages within the pagecache
+ * radix trees
+ */
+#define PAGECACHE_TAG_DIRTY	0
+#define PAGECACHE_TAG_WRITEBACK	1
+
 /*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f58c9e68d3d8..9f4fb3da00d9 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -318,4 +318,6 @@ static inline void clear_page_dirty(struct page *page)
 	test_clear_page_dirty(page);
 }
 
+int __clear_page_dirty(struct page *page);
+
 #endif	/* PAGE_FLAGS_H */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 044becdff304..23da9ce262ca 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -522,6 +522,8 @@ int __set_page_dirty_nobuffers(struct page *page)
 					inc_page_state(nr_dirty);
 				list_del(&page->list);
 				list_add(&page->list, &mapping->dirty_pages);
+				radix_tree_tag_set(&mapping->page_tree,
+					page->index, PAGECACHE_TAG_DIRTY);
 			}
 			spin_unlock_irq(&mapping->tree_lock);
 			if (!PageSwapCache(page))
@@ -560,13 +562,45 @@ EXPORT_SYMBOL(set_page_dirty_lock);
  */
 int test_clear_page_dirty(struct page *page)
 {
-	if (TestClearPageDirty(page)) {
-		struct address_space *mapping = page->mapping;
-
-		if (mapping && !mapping->backing_dev_info->memory_backed)
-			dec_page_state(nr_dirty);
-		return 1;
+	struct address_space *mapping = page->mapping;
+	unsigned long flags;
+
+	if (mapping) {
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (TestClearPageDirty(page)) {
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
+			spin_unlock_irqrestore(&mapping->tree_lock, flags);
+			if (!mapping->backing_dev_info->memory_backed)
+				dec_page_state(nr_dirty);
+			return 1;
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		return 0;
 	}
-	return 0;
+	return TestClearPageDirty(page);
 }
 EXPORT_SYMBOL(test_clear_page_dirty);
+
+/*
+ * Clear a page's dirty flag while ignoring dirty memory accounting.
+ * For a page with a mapping, the PAGECACHE_TAG_DIRTY radix-tree tag is
+ * cleared too, under mapping->tree_lock, and the result is 1 or 0.
+ */
+int __clear_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	unsigned long flags;
+	int was_dirty;
+
+	if (!mapping)
+		return TestClearPageDirty(page);
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	was_dirty = TestClearPageDirty(page) ? 1 : 0;
+	if (was_dirty)
+		radix_tree_tag_clear(&mapping->page_tree, page->index,
+					PAGECACHE_TAG_DIRTY);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	return was_dirty;
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d670c5846b45..736fd2b82300 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -149,7 +149,7 @@ int add_to_swap(struct page * page)
 		switch (err) {
 		case 0:				/* Success */
 			SetPageUptodate(page);
-			ClearPageDirty(page);
+			__clear_page_dirty(page);
 			set_page_dirty(page);
 			INC_CACHE_INFO(add_total);
 			return 1;
@@ -246,7 +246,7 @@ int move_from_swap_cache(struct page *page, unsigned long index,
 	if (!err) {
 		swap_free(entry);
 		/* shift page from clean_pages to dirty_pages list */
-		ClearPageDirty(page);
+		__clear_page_dirty(page);
 		set_page_dirty(page);
 	}
 	return err;
-- 
cgit v1.2.3


From 40c8348ec03fa2c525e13ca6ee54279735563ee4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:11:08 -0700
Subject: [PATCH] tag writeback pages as such in their radix tree

Arrange for under-writeback pages to be marked thus in their pagecache radix
tree.
---
 fs/buffer.c                |  4 ++--
 fs/mpage.c                 |  2 +-
 fs/nfs/write.c             |  2 +-
 fs/ntfs/aops.c             |  4 ++--
 fs/reiserfs/inode.c        |  4 ++--
 fs/xfs/linux/xfs_aops.c    |  2 +-
 include/linux/page-flags.h |  8 +++++++-
 mm/filemap.c               |  3 +--
 mm/page-writeback.c        | 42 ++++++++++++++++++++++++++++++++++++++++++
 mm/page_io.c               |  2 +-
 mm/swap.c                  |  2 +-
 11 files changed, 61 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 59f4508a472f..56b0df6bf752 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1829,7 +1829,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	} while ((bh = bh->b_this_page) != head);
 
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);		/* Keeps try_to_free_buffers() away */
+	set_page_writeback(page);	/* Keeps try_to_free_buffers() away */
 	unlock_page(page);
 
 	/*
@@ -1892,7 +1892,7 @@ recover:
 	} while ((bh = bh->b_this_page) != head);
 	SetPageError(page);
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	unlock_page(page);
 	do {
 		struct buffer_head *next = bh->b_this_page;
diff --git a/fs/mpage.c b/fs/mpage.c
index c3e781cb4906..5f5f5e63fca2 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -546,7 +546,7 @@ alloc_new:
 	}
 
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	unlock_page(page);
 	if (boundary || (first_unmapped != blocks_per_page)) {
 		bio = mpage_bio_submit(WRITE, bio);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 83bc0b498a01..53bff1a2a731 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -768,7 +768,7 @@ nfs_write_rpcsetup(struct list_head *head, struct nfs_write_data *data, int how)
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, &data->pages);
-		SetPageWriteback(req->wb_page);
+		set_page_writeback(req->wb_page);
 		*pages++ = req->wb_page;
 		count += req->wb_bytes;
 	}
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index e3b1c227cb7b..bb048a75318d 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -743,7 +743,7 @@ lock_retry_remap:
 	}
 
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);		/* Keeps try_to_free_buffers() away. */
+	set_page_writeback(page);	/* Keeps try_to_free_buffers() away. */
 	unlock_page(page);
 
 	/*
@@ -885,7 +885,7 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
 	// FIXME: Make sure it is ok to SetPageError() on unlocked page under
 	// writeback before doing the change!
 #if 0
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	unlock_page(page);
 #endif
 
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 31016572683e..c01847228d2c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2134,7 +2134,7 @@ static int reiserfs_write_full_page(struct page *page, struct writeback_control
     } while(bh != head) ;
 
     BUG_ON(PageWriteback(page));
-    SetPageWriteback(page);
+    set_page_writeback(page);
     unlock_page(page);
 
     /*
@@ -2198,7 +2198,7 @@ fail:
     } while(bh != head);
     SetPageError(page);
     BUG_ON(PageWriteback(page));
-    SetPageWriteback(page);
+    set_page_writeback(page);
     unlock_page(page);
     do {
         struct buffer_head *next = bh->b_this_page;
diff --git a/fs/xfs/linux/xfs_aops.c b/fs/xfs/linux/xfs_aops.c
index dd446266d33f..52a8c40d7f71 100644
--- a/fs/xfs/linux/xfs_aops.c
+++ b/fs/xfs/linux/xfs_aops.c
@@ -566,7 +566,7 @@ xfs_submit_page(
 	int			i;
 
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	clear_page_dirty(page);
 	unlock_page(page);
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 9f4fb3da00d9..bd6ddb279c55 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -312,12 +312,18 @@ extern struct address_space swapper_space;
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
+int __clear_page_dirty(struct page *page);
+int test_clear_page_writeback(struct page *page);
+int test_set_page_writeback(struct page *page);
 
 static inline void clear_page_dirty(struct page *page)
 {
 	test_clear_page_dirty(page);
 }
 
-int __clear_page_dirty(struct page *page);
+static inline void set_page_writeback(struct page *page)
+{
+	test_set_page_writeback(page);
+}
 
 #endif	/* PAGE_FLAGS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 360c5feec975..4d5e76ceaf29 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -363,8 +363,7 @@ void end_page_writeback(struct page *page)
 	wait_queue_head_t *waitqueue = page_waitqueue(page);
 
 	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
-		smp_mb__before_clear_bit();
-		if (!TestClearPageWriteback(page))
+		if (!test_clear_page_writeback(page))
 			BUG();
 		smp_mb__after_clear_bit();
 	}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 23da9ce262ca..bc4f3258daf2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -604,3 +604,45 @@ int __clear_page_dirty(struct page *page)
 	}
 	return TestClearPageDirty(page);
 }
+
+int test_clear_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	int ret;
+
+	if (mapping) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		ret = TestClearPageWriteback(page);
+		if (ret)
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_WRITEBACK);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	} else {
+		ret = TestClearPageWriteback(page);
+	}
+	return ret;
+}
+
+int test_set_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	int ret;
+
+	if (mapping) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		ret = TestSetPageWriteback(page);
+		if (!ret)
+			radix_tree_tag_set(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_WRITEBACK);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	} else {
+		ret = TestSetPageWriteback(page);
+	}
+	return ret;
+
+}
+EXPORT_SYMBOL(test_set_page_writeback);
diff --git a/mm/page_io.c b/mm/page_io.c
index 421f77d2c39c..dde9d23f99bd 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -104,7 +104,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		goto out;
 	}
 	inc_page_state(pswpout);
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	unlock_page(page);
 	submit_bio(WRITE, bio);
 out:
diff --git a/mm/swap.c b/mm/swap.c
index a5352c98751a..90a9ac490a3c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -70,7 +70,7 @@ int rotate_reclaimable_page(struct page *page)
 		list_add_tail(&page->lru, &zone->inactive_list);
 		inc_page_state(pgrotated);
 	}
-	if (!TestClearPageWriteback(page))
+	if (!test_clear_page_writeback(page))
 		BUG();
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 	return 0;
-- 
cgit v1.2.3


From 1d7d3304e9845f61cab6b6091e8952f6fb05009a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:11:21 -0700
Subject: [PATCH] stop using the address_space dirty_pages list

Move everything over to walking the radix tree via the PAGECACHE_TAG_DIRTY
tag.  Remove address_space.dirty_pages.
---
 fs/buffer.c              |  18 ++-----
 fs/fs-writeback.c        |  15 +-----
 fs/inode.c               |   1 -
 fs/mpage.c               | 127 +++++++++++++++++++----------------------------
 fs/xfs/linux/xfs_vnode.h |   3 +-
 include/linux/fs.h       |   3 +-
 include/linux/pagemap.h  |   7 +--
 include/linux/pagevec.h  |   7 ++-
 mm/filemap.c             |  35 ++++++++-----
 mm/page-writeback.c      |  29 +++++++----
 mm/page_alloc.c          |   2 +
 mm/swap.c                |  12 ++++-
 mm/swap_state.c          |   3 --
 13 files changed, 122 insertions(+), 140 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 56b0df6bf752..baae58828510 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -825,12 +825,6 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
  * page on the dirty page list.
  *
- * There is also a small window where the page is dirty, and not on dirty_pages.
- * Also a possibility that by the time the page is added to dirty_pages, it has
- * been set clean.  The page lists are somewhat approximate in this regard.
- * It's better to have clean pages accidentally attached to dirty_pages than to
- * leave dirty pages attached to clean_pages.
- *
  * We use private_lock to lock against try_to_free_buffers while using the
  * page's buffer list.  Also use this to protect against clean buffers being
  * added to the page after it was set dirty.
@@ -871,8 +865,6 @@ int __set_page_dirty_buffers(struct page *page)
 		if (page->mapping) {	/* Race with truncate? */
 			if (!mapping->backing_dev_info->memory_backed)
 				inc_page_state(nr_dirty);
-			list_del(&page->list);
-			list_add(&page->list, &mapping->dirty_pages);
 			radix_tree_tag_set(&mapping->page_tree, page->index,
 						PAGECACHE_TAG_DIRTY);
 		}
@@ -1228,7 +1220,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * The relationship between dirty buffers and dirty pages:
  *
  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page appears on its address_space.dirty_pages list.
+ * the page is tagged dirty in its radix tree.
  *
  * At all times, the dirtiness of the buffers represents the dirtiness of
  * subsections of the page.  If the page has buffers, the page dirty bit is
@@ -1250,10 +1242,10 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 /**
  * mark_buffer_dirty - mark a buffer_head as needing writeout
  *
- * mark_buffer_dirty() will set the dirty bit against the buffer,
- * then set its backing page dirty, then attach the page to its
- * address_space's dirty_pages list and then attach the address_space's
- * inode to its superblock's dirty inode list.
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
+ * backing page dirty, then tag the page as dirty in its address_space's radix
+ * tree and then attach the address_space's inode to its superblock's dirty
+ * inode list.
  *
  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
  * mapping->tree_lock and the global inode_lock.
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f8b6182cb152..0a75c690f142 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -129,12 +129,6 @@ static void write_inode(struct inode *inode, int sync)
  * starvation of particular inodes when others are being redirtied, prevent
  * livelocks, etc.
  *
- * So what we do is to move all pages which are to be written from dirty_pages
- * onto io_pages.  And keep on writing io_pages until it's empty.  Refusing to
- * move more pages onto io_pages until io_pages is empty.  Once that point has
- * been reached, we are ready to take another pass across the inode's dirty
- * pages.
- *
  * Called under inode_lock.
  */
 static int
@@ -159,10 +153,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * read speculatively by this cpu before &= ~I_DIRTY  -- mikulas
 	 */
 
-	spin_lock_irq(&mapping->tree_lock);
-	if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages))
-		list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
-	spin_unlock_irq(&mapping->tree_lock);
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
@@ -180,10 +170,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_LOCK;
 	if (!(inode->i_state & I_FREEING)) {
-		if (!list_empty(&mapping->io_pages)) {
-		 	/* Needs more writeback */
-			inode->i_state |= I_DIRTY_PAGES;
-		} else if (!list_empty(&mapping->dirty_pages)) {
+		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/* Redirtied */
 			inode->i_state |= I_DIRTY_PAGES;
 			inode->dirtied_when = jiffies;
diff --git a/fs/inode.c b/fs/inode.c
index b5d43d858e0b..3ffd4e4fc522 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -179,7 +179,6 @@ void inode_init_once(struct inode *inode)
 	memset(inode, 0, sizeof(*inode));
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_data.clean_pages);
-	INIT_LIST_HEAD(&inode->i_data.dirty_pages);
 	INIT_LIST_HEAD(&inode->i_data.locked_pages);
 	INIT_LIST_HEAD(&inode->i_data.io_pages);
 	INIT_LIST_HEAD(&inode->i_dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index 5f5f5e63fca2..964a06035da8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -592,28 +592,12 @@ out:
  * (The next two paragraphs refer to code which isn't here yet, but they
  *  explain the presence of address_space.io_pages)
  *
- * Pages can be moved from clean_pages or locked_pages onto dirty_pages
- * at any time - it's not possible to lock against that.  So pages which
- * have already been added to a BIO may magically reappear on the dirty_pages
- * list.  And mpage_writepages() will again try to lock those pages.
- * But I/O has not yet been started against the page.  Thus deadlock.
- *
- * To avoid this, mpage_writepages() will only write pages from io_pages. The
- * caller must place them there.  We walk io_pages, locking the pages and
- * submitting them for I/O, moving them to locked_pages.
- *
- * This has the added benefit of preventing a livelock which would otherwise
- * occur if pages are being dirtied faster than we can write them out.
- *
  * If a page is already under I/O, generic_writepages() skips it, even
  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
  * and msync() need to guarantee that all the data which was dirty at the time
  * the call was made get new I/O started against them.  So if called_for_sync()
  * is true, we must wait for existing IO to complete.
- *
- * It's fairly rare for PageWriteback pages to be on ->dirty_pages.  It
- * means that someone redirtied the page while it was under I/O.
  */
 int
 mpage_writepages(struct address_space *mapping,
@@ -625,6 +609,9 @@ mpage_writepages(struct address_space *mapping,
 	int ret = 0;
 	int done = 0;
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -635,72 +622,58 @@ mpage_writepages(struct address_space *mapping,
 	if (get_block == NULL)
 		writepage = mapping->a_ops->writepage;
 
-	spin_lock_irq(&mapping->tree_lock);
-	while (!list_empty(&mapping->io_pages) && !done) {
-		struct page *page = list_entry(mapping->io_pages.prev,
-					struct page, list);
-		list_del(&page->list);
-		if (PageWriteback(page) && wbc->sync_mode == WB_SYNC_NONE) {
-			if (PageDirty(page)) {
-				list_add(&page->list, &mapping->dirty_pages);
-				continue;
-			}
-			list_add(&page->list, &mapping->locked_pages);
-			continue;
-		}
-		if (!PageDirty(page)) {
-			list_add(&page->list, &mapping->clean_pages);
-			continue;
-		}
-		list_add(&page->list, &mapping->locked_pages);
-
-		page_cache_get(page);
-		spin_unlock_irq(&mapping->tree_lock);
-
-		/*
-		 * At this point we hold neither mapping->tree_lock nor
-		 * lock on the page itself: the page may be truncated or
-		 * invalidated (changing page->mapping to NULL), or even
-		 * swizzled back from swapper_space to tmpfs file mapping.
-		 */
-
-		lock_page(page);
-
-		if (wbc->sync_mode != WB_SYNC_NONE)
-			wait_on_page_writeback(page);
-
-		if (page->mapping == mapping && !PageWriteback(page) &&
-					test_clear_page_dirty(page)) {
-			if (writepage) {
-				ret = (*writepage)(page, wbc);
-				if (ret) {
-					if (ret == -ENOSPC)
-						set_bit(AS_ENOSPC,
-							&mapping->flags);
-					else
-						set_bit(AS_EIO,
-							&mapping->flags);
+	pagevec_init(&pvec, 0);
+	index = 0;
+	while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
+		unsigned i;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point we hold neither mapping->tree_lock nor
+			 * lock on the page itself: the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or even
+			 * swizzled back from swapper_space to tmpfs file
+			 * mapping
+			 */
+
+			lock_page(page);
+
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+
+			if (page->mapping == mapping && !PageWriteback(page) &&
+						test_clear_page_dirty(page)) {
+				if (writepage) {
+					ret = (*writepage)(page, wbc);
+					if (ret) {
+						if (ret == -ENOSPC)
+							set_bit(AS_ENOSPC,
+							  &mapping->flags);
+						else
+							set_bit(AS_EIO,
+							  &mapping->flags);
+					}
+				} else {
+					bio = mpage_writepage(bio, page,
+						get_block, &last_block_in_bio,
+						&ret, wbc);
+				}
+				if (ret || (--(wbc->nr_to_write) <= 0))
+					done = 1;
+				if (wbc->nonblocking &&
+						bdi_write_congested(bdi)) {
+					wbc->encountered_congestion = 1;
+					done = 1;
 				}
 			} else {
-				bio = mpage_writepage(bio, page, get_block,
-					&last_block_in_bio, &ret, wbc);
+				unlock_page(page);
 			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
-				done = 1;
-			if (wbc->nonblocking && bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			}
-		} else {
-			unlock_page(page);
 		}
-		page_cache_release(page);
-		spin_lock_irq(&mapping->tree_lock);
+		pagevec_release(&pvec);
 	}
-	/*
-	 * Leave any remaining dirty pages on ->io_pages
-	 */
-	spin_unlock_irq(&mapping->tree_lock);
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
 	return ret;
diff --git a/fs/xfs/linux/xfs_vnode.h b/fs/xfs/linux/xfs_vnode.h
index 514bc9cde057..6736f7aa2b97 100644
--- a/fs/xfs/linux/xfs_vnode.h
+++ b/fs/xfs/linux/xfs_vnode.h
@@ -600,7 +600,8 @@ static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
 	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
 	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
 #define VN_CACHED(vp)	(LINVFS_GET_IP(vp)->i_mapping->nrpages)
-#define VN_DIRTY(vp)	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages)))
+#define VN_DIRTY(vp)	mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
+					PAGECACHE_TAG_DIRTY)
 #define VMODIFY(vp)	VN_FLAGSET(vp, VMODIFIED)
 #define VUNMODIFY(vp)	VN_FLAGCLR(vp, VMODIFIED)
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 857e797b0ad2..f8954889d336 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -324,7 +324,6 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
-	struct list_head	dirty_pages;	/* list of dirty pages */
 	struct list_head	locked_pages;	/* list of locked pages */
 	struct list_head	io_pages;	/* being prepared for I/O */
 	unsigned long		nrpages;	/* number of total pages */
@@ -371,6 +370,8 @@ struct block_device {
 #define PAGECACHE_TAG_DIRTY	0
 #define PAGECACHE_TAG_WRITEBACK	1
 
+int mapping_tagged(struct address_space *mapping, int tag);
+
 /*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e552cb04a0ed..70d07dbfcd02 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -69,9 +69,10 @@ extern struct page * find_trylock_page(struct address_space *mapping,
 				unsigned long index);
 extern struct page * find_or_create_page(struct address_space *mapping,
 				unsigned long index, unsigned int gfp_mask);
-extern unsigned int find_get_pages(struct address_space *mapping,
-				pgoff_t start, unsigned int nr_pages,
-				struct page **pages);
+unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
+			unsigned int nr_pages, struct page **pages);
+unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
+			int tag, unsigned int nr_pages, struct page **pages);
 
 /*
  * Returns locked page at given index in given cache, creating it if needed.
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 2a332eed3d82..e6e43ce82b55 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -22,8 +22,11 @@ void __pagevec_free(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
 void __pagevec_lru_add_active(struct pagevec *pvec);
 void pagevec_strip(struct pagevec *pvec);
-unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t start, unsigned int nr_pages);
+unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned nr_pages);
+unsigned pagevec_lookup_tag(struct pagevec *pvec,
+		struct address_space *mapping, pgoff_t *index, int tag,
+		unsigned nr_pages);
 
 static inline void pagevec_init(struct pagevec *pvec, int cold)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index 4d5e76ceaf29..cac8da0dd773 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -100,9 +100,7 @@ void __remove_from_page_cache(struct page *page)
 	struct address_space *mapping = page->mapping;
 
 	radix_tree_delete(&mapping->page_tree, page->index);
-	list_del(&page->list);
 	page->mapping = NULL;
-
 	mapping->nrpages--;
 	pagecache_acct(-1);
 }
@@ -148,9 +146,6 @@ static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode)
 	if (mapping->backing_dev_info->memory_backed)
 		return 0;
 
-	spin_lock_irq(&mapping->tree_lock);
-	list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
-	spin_unlock_irq(&mapping->tree_lock);
 	ret = do_writepages(mapping, &wbc);
 	return ret;
 }
@@ -190,11 +185,7 @@ restart:
 		struct page *page;
 
 		page = list_entry(mapping->locked_pages.next,struct page,list);
-		list_del(&page->list);
-		if (PageDirty(page))
-			list_add(&page->list, &mapping->dirty_pages);
-		else
-			list_add(&page->list, &mapping->clean_pages);
+		list_del_init(&page->list);
 
 		if (!PageWriteback(page)) {
 			if (++progress > 32) {
@@ -228,7 +219,6 @@ restart:
 
 	return ret;
 }
-
 EXPORT_SYMBOL(filemap_fdatawait);
 
 int filemap_write_and_wait(struct address_space *mapping)
@@ -539,7 +529,7 @@ EXPORT_SYMBOL(find_or_create_page);
  *
  * find_get_pages() returns the number of pages which were found.
  */
-unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
+unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			    unsigned int nr_pages, struct page **pages)
 {
 	unsigned int i;
@@ -554,6 +544,27 @@ unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
 	return ret;
 }
 
+/*
+ * Like find_get_pages, except we only return pages which are tagged with
+ * `tag'.   We update *index to index the next page for the traversal.
+ */
+unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
+			int tag, unsigned int nr_pages, struct page **pages)
+{
+	unsigned int i;
+	unsigned int ret;
+
+	spin_lock_irq(&mapping->tree_lock);
+	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
+				(void **)pages, *index, nr_pages, tag);
+	for (i = 0; i < ret; i++)
+		page_cache_get(pages[i]);
+	if (ret)
+		*index = pages[ret - 1]->index + 1;
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
+
 /*
  * Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bc4f3258daf2..fa5eeca766cf 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -472,12 +472,8 @@ int write_one_page(struct page *page, int wait)
 	if (wait)
 		wait_on_page_writeback(page);
 
-	spin_lock_irq(&mapping->tree_lock);
-	list_del(&page->list);
 	if (test_clear_page_dirty(page)) {
-		list_add(&page->list, &mapping->locked_pages);
 		page_cache_get(page);
-		spin_unlock_irq(&mapping->tree_lock);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
 			wait_on_page_writeback(page);
@@ -486,8 +482,6 @@ int write_one_page(struct page *page, int wait)
 		}
 		page_cache_release(page);
 	} else {
-		list_add(&page->list, &mapping->clean_pages);
-		spin_unlock_irq(&mapping->tree_lock);
 		unlock_page(page);
 	}
 	return ret;
@@ -495,9 +489,8 @@ int write_one_page(struct page *page, int wait)
 EXPORT_SYMBOL(write_one_page);
 
 /*
- * For address_spaces which do not use buffers.  Just set the page's dirty bit
- * and move it to the dirty_pages list.  Also perform space reservation if
- * required.
+ * For address_spaces which do not use buffers.  Just tag the page as dirty in
+ * its radix tree.
  *
  * __set_page_dirty_nobuffers() may return -ENOSPC.  But if it does, the page
  * is still safe, as long as it actually manages to find some blocks at
@@ -520,8 +513,6 @@ int __set_page_dirty_nobuffers(struct page *page)
 				BUG_ON(page->mapping != mapping);
 				if (!mapping->backing_dev_info->memory_backed)
 					inc_page_state(nr_dirty);
-				list_del(&page->list);
-				list_add(&page->list, &mapping->dirty_pages);
 				radix_tree_tag_set(&mapping->page_tree,
 					page->index, PAGECACHE_TAG_DIRTY);
 			}
@@ -646,3 +637,19 @@ int test_set_page_writeback(struct page *page)
 
 }
 EXPORT_SYMBOL(test_set_page_writeback);
+
+/*
+ * Return true if any of the pages in the mapping are marked with the
+ * passed tag.
+ */
+int mapping_tagged(struct address_space *mapping, int tag)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	ret = radix_tree_tagged(&mapping->page_tree, tag);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c87ca3dd2f11..ae1636c3a422 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -682,6 +682,8 @@ nopage:
 	return NULL;
 got_pg:
 	kernel_map_pages(page, 1 << order, 1);
+	INIT_LIST_HEAD(&page->list);
+	INIT_LIST_HEAD(&page->lru);
 	return page;
 }
 
diff --git a/mm/swap.c b/mm/swap.c
index 90a9ac490a3c..c20d079a0729 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -353,13 +353,21 @@ void pagevec_strip(struct pagevec *pvec)
  *
  * pagevec_lookup() returns the number of pages which were found.
  */
-unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t start, unsigned int nr_pages)
+unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned nr_pages)
 {
 	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 	return pagevec_count(pvec);
 }
 
+unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t *index, int tag, unsigned nr_pages)
+{
+	pvec->nr = find_get_pages_tag(mapping, index, tag,
+					nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
 
 #ifdef CONFIG_SMP
 /*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 736fd2b82300..77424e877e62 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,7 +27,6 @@ struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
-	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
 	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
 	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
 	.a_ops		= &swap_aops,
@@ -210,7 +209,6 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
 	if (!err) {
 		if (!swap_duplicate(entry))
 			BUG();
-		/* shift page from clean_pages to dirty_pages list */
 		BUG_ON(PageDirty(page));
 		set_page_dirty(page);
 		INC_CACHE_INFO(add_total);
@@ -245,7 +243,6 @@ int move_from_swap_cache(struct page *page, unsigned long index,
 
 	if (!err) {
 		swap_free(entry);
-		/* shift page from clean_pages to dirty_pages list */
 		__clear_page_dirty(page);
 		set_page_dirty(page);
 	}
-- 
cgit v1.2.3


From 3c1ed9b2ce95145ba1c0434a7a7b63261fd7c15d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:11:47 -0700
Subject: [PATCH] remove address_space.io_pages

Now remove address_space.io_pages.
---
 fs/inode.c         | 1 -
 fs/mpage.c         | 8 +++-----
 include/linux/fs.h | 1 -
 mm/swap_state.c    | 1 -
 4 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 3ffd4e4fc522..ac8d22413404 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -180,7 +180,6 @@ void inode_init_once(struct inode *inode)
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_data.clean_pages);
 	INIT_LIST_HEAD(&inode->i_data.locked_pages);
-	INIT_LIST_HEAD(&inode->i_data.io_pages);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
diff --git a/fs/mpage.c b/fs/mpage.c
index 964a06035da8..6226bfe0a254 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -589,15 +589,13 @@ out:
  * This is a library function, which implements the writepages()
  * address_space_operation.
  *
- * (The next two paragraphs refer to code which isn't here yet, but they
- *  explain the presence of address_space.io_pages)
- *
  * If a page is already under I/O, generic_writepages() skips it, even
  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
  * and msync() need to guarantee that all the data which was dirty at the time
- * the call was made get new I/O started against them.  So if called_for_sync()
- * is true, we must wait for existing IO to complete.
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
  */
 int
 mpage_writepages(struct address_space *mapping,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f8954889d336..7270490162a9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -325,7 +325,6 @@ struct address_space {
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
 	struct list_head	locked_pages;	/* list of locked pages */
-	struct list_head	io_pages;	/* being prepared for I/O */
 	unsigned long		nrpages;	/* number of total pages */
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 77424e877e62..e0396e7ada38 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,7 +27,6 @@ struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
-	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
 	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
-- 
cgit v1.2.3


From a15133091ee83b0a97913cd48d6131188af093e1 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:12:01 -0700
Subject: [PATCH] Stop using address_space.locked_pages

Instead, use a radix-tree walk of the pages which are tagged as being under
writeback.

The new function wait_on_page_writeback_range() was generalised out of
filemap_fdatawait().  We can later use this to provide concurrent fsync of
just a section of a file.
---
 fs/inode.c         |  1 -
 include/linux/fs.h |  1 -
 include/linux/mm.h |  2 +-
 mm/filemap.c       | 72 +++++++++++++++++++++++++++---------------------------
 mm/swap_state.c    |  1 -
 5 files changed, 37 insertions(+), 40 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index ac8d22413404..0c122d4a6529 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -179,7 +179,6 @@ void inode_init_once(struct inode *inode)
 	memset(inode, 0, sizeof(*inode));
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_data.clean_pages);
-	INIT_LIST_HEAD(&inode->i_data.locked_pages);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7270490162a9..5194a645baf2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -324,7 +324,6 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
-	struct list_head	locked_pages;	/* list of locked pages */
 	unsigned long		nrpages;	/* number of total pages */
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fbd569b35b4f..af18e1da3bd5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -182,7 +182,7 @@ struct page {
 	atomic_t count;			/* Usage count, see below. */
 	struct list_head list;		/* ->mapping has some page lists. */
 	struct address_space *mapping;	/* The inode (or ...) we belong to. */
-	unsigned long index;		/* Our offset within mapping. */
+	pgoff_t index;			/* Our offset within mapping. */
 	struct list_head lru;		/* Pageout list, eg. active_list;
 					   protected by zone->lru_lock !! */
 	union {
diff --git a/mm/filemap.c b/mm/filemap.c
index cac8da0dd773..692c9a837e61 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -154,7 +154,6 @@ int filemap_fdatawrite(struct address_space *mapping)
 {
 	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
 }
-
 EXPORT_SYMBOL(filemap_fdatawrite);
 
 /*
@@ -165,51 +164,40 @@ int filemap_flush(struct address_space *mapping)
 {
 	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
 }
-
 EXPORT_SYMBOL(filemap_flush);
 
-/**
- * filemap_fdatawait - walk the list of locked pages of the given address
- *                     space and wait for all of them.
- * @mapping: address space structure to wait for
+/*
+ * Wait for writeback to complete against pages indexed by start->end
+ * inclusive
  */
-int filemap_fdatawait(struct address_space * mapping)
+static int wait_on_page_writeback_range(struct address_space *mapping,
+				pgoff_t start, pgoff_t end)
 {
+	struct pagevec pvec;
+	int nr_pages;
 	int ret = 0;
-	int progress;
-
-restart:
-	progress = 0;
-	spin_lock_irq(&mapping->tree_lock);
-        while (!list_empty(&mapping->locked_pages)) {
-		struct page *page;
-
-		page = list_entry(mapping->locked_pages.next,struct page,list);
-		list_del_init(&page->list);
+	pgoff_t index;
 
-		if (!PageWriteback(page)) {
-			if (++progress > 32) {
-				if (need_resched()) {
-					spin_unlock_irq(&mapping->tree_lock);
-					__cond_resched();
-					goto restart;
-				}
-			}
-			continue;
-		}
+	if (end < start)
+		return 0;
 
-		progress = 0;
-		page_cache_get(page);
-		spin_unlock_irq(&mapping->tree_lock);
+	pagevec_init(&pvec, 0);
+	index = start;
+	while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			PAGECACHE_TAG_WRITEBACK,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
 
-		wait_on_page_writeback(page);
-		if (PageError(page))
-			ret = -EIO;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
 
-		page_cache_release(page);
-		spin_lock_irq(&mapping->tree_lock);
+			wait_on_page_writeback(page);
+			if (PageError(page))
+				ret = -EIO;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
 	}
-	spin_unlock_irq(&mapping->tree_lock);
 
 	/* Check for outstanding write errors */
 	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
@@ -219,6 +207,18 @@ restart:
 
 	return ret;
 }
+
+/**
+ * filemap_fdatawait - walk the list of under-writeback pages of the given
+ *     address space and wait for all of them.
+ *
+ * @mapping: address space structure to wait for
+ */
+int filemap_fdatawait(struct address_space *mapping)
+{
+	return wait_on_page_writeback_range(mapping, 0, -1);
+}
+
 EXPORT_SYMBOL(filemap_fdatawait);
 
 int filemap_write_and_wait(struct address_space *mapping)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e0396e7ada38..8e3c3ca4ae4f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,7 +27,6 @@ struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
-	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
-- 
cgit v1.2.3


From d672c382411ffafbf2b8ed608dfdb8bd8e67307d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:12:13 -0700
Subject: [PATCH] stop using address_space.clean_pages

Remove remaining references to address_space.clean_pages.
---
 fs/inode.c              | 1 -
 include/linux/fs.h      | 1 -
 include/linux/pagemap.h | 1 -
 mm/swap_state.c         | 1 -
 4 files changed, 4 deletions(-)

(limited to 'include/linux')

diff --git a/fs/inode.c b/fs/inode.c
index 0c122d4a6529..282d86aed622 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -178,7 +178,6 @@ void inode_init_once(struct inode *inode)
 {
 	memset(inode, 0, sizeof(*inode));
 	INIT_HLIST_NODE(&inode->i_hash);
-	INIT_LIST_HEAD(&inode->i_data.clean_pages);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5194a645baf2..dc8c46fb4b69 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -323,7 +323,6 @@ struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
-	struct list_head	clean_pages;	/* list of clean pages */
 	unsigned long		nrpages;	/* number of total pages */
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 70d07dbfcd02..5585675ab842 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -142,7 +142,6 @@ static inline unsigned long get_page_cache_size(void)
 static inline void ___add_to_page_cache(struct page *page,
 		struct address_space *mapping, unsigned long index)
 {
-	list_add(&page->list, &mapping->clean_pages);
 	page->mapping = mapping;
 	page->index = index;
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8e3c3ca4ae4f..22946f0d9ecf 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -26,7 +26,6 @@ extern struct address_space_operations swap_aops;
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
-	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
-- 
cgit v1.2.3


From 0fcb51fd7ee151a03aab2d07493bbadf176a1457 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:13:47 -0700
Subject: [PATCH] stop using page->lru in compound pages

The compound page logic is using page->lru, and these will get scribbled on
in various places, so switch the compound page logic over to using ->mapping
and ->private.
---
 arch/i386/mm/hugetlbpage.c    |  1 -
 arch/ia64/mm/hugetlbpage.c    |  1 -
 arch/ppc64/mm/hugetlbpage.c   |  1 -
 arch/sparc64/mm/hugetlbpage.c |  1 -
 include/linux/mm.h            | 10 +++++-----
 mm/page_alloc.c               | 31 ++++++++++++++++---------------
 6 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 1777966f0186..0c73f414b5b1 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -278,7 +278,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 static void free_huge_page(struct page *page)
 {
 	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
 
 	INIT_LIST_HEAD(&page->lru);
 
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index d75ec2bfdb41..aa2a1945d2c2 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -246,7 +246,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri
 void free_huge_page(struct page *page)
 {
 	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
 
 	INIT_LIST_HEAD(&page->lru);
 
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index 3b67759defe1..032a1c9c5766 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -450,7 +450,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 static void free_huge_page(struct page *page)
 {
 	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
 
 	INIT_LIST_HEAD(&page->lru);
 
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 867d8b788e6b..dd2a7549caef 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -248,7 +248,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 static void free_huge_page(struct page *page)
 {
 	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
 
 	INIT_LIST_HEAD(&page->lru);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index af18e1da3bd5..fa7beaefd038 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -242,24 +242,24 @@ extern void FASTCALL(__page_cache_release(struct page *));
 static inline int page_count(struct page *p)
 {
 	if (PageCompound(p))
-		p = (struct page *)p->lru.next;
+		p = (struct page *)p->private;
 	return atomic_read(&(p)->count);
 }
 
 static inline void get_page(struct page *page)
 {
 	if (PageCompound(page))
-		page = (struct page *)page->lru.next;
+		page = (struct page *)page->private;
 	atomic_inc(&page->count);
 }
 
 static inline void put_page(struct page *page)
 {
 	if (PageCompound(page)) {
-		page = (struct page *)page->lru.next;
+		page = (struct page *)page->private;
 		if (put_page_testzero(page)) {
-			if (page->lru.prev) {	/* destructor? */
-				(*(void (*)(struct page *))page->lru.prev)(page);
+			if (page[1].mapping) {	/* destructor? */
+				(*(void (*)(struct page *))page[1].mapping)(page);
 			} else {
 				__page_cache_release(page);
 			}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b79b7907e734..6cb630fec60e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,13 +71,14 @@ static int bad_range(struct zone *zone, struct page *page)
 
 static void bad_page(const char *function, struct page *page)
 {
-	printk("Bad page state at %s (in process '%s', page %p)\n", function, current->comm, page);
-	printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n",
+	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
+		function, current->comm, page);
+	printk(KERN_EMERG "flags:0x%08lx mapping:%p mapped:%d count:%d\n",
 		(unsigned long)page->flags, page->mapping,
 		page_mapped(page), page_count(page));
-	printk("Backtrace:\n");
+	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
-	printk("Trying to fix it up, but a reboot is needed\n");
+	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
 	page->flags &= ~(1 << PG_private	|
 			1 << PG_locked	|
 			1 << PG_lru	|
@@ -99,13 +100,13 @@ static void bad_page(const char *function, struct page *page)
  *
  * The remaining PAGE_SIZE pages are called "tail pages".
  *
- * All pages have PG_compound set.  All pages have their lru.next pointing at
+ * All pages have PG_compound set.  All pages have their ->private pointing at
  * the head page (even the head page has this).
  *
- * The head page's lru.prev, if non-zero, holds the address of the compound
- * page's put_page() function.
+ * The first tail page's ->mapping, if non-zero, holds the address of the
+ * compound page's put_page() function.
  *
- * The order of the allocation is stored in the first tail page's lru.prev.
+ * The order of the allocation is stored in the first tail page's ->index
  * This is only for debug at present.  This usage means that zero-order pages
  * may not be compound.
  */
@@ -114,13 +115,13 @@ static void prep_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	page->lru.prev = NULL;
-	page[1].lru.prev = (void *)order;
+	page[1].mapping = 0;
+	page[1].index = order;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
 		SetPageCompound(p);
-		p->lru.next = (void *)page;
+		p->private = (unsigned long)page;
 	}
 }
 
@@ -129,7 +130,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	if (page[1].lru.prev != (void *)order)
+	if (page[1].index != order)
 		bad_page(__FUNCTION__, page);
 
 	for (i = 0; i < nr_pages; i++) {
@@ -137,7 +138,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 
 		if (!PageCompound(p))
 			bad_page(__FUNCTION__, page);
-		if (p->lru.next != (void *)page)
+		if (p->private != (unsigned long)page)
 			bad_page(__FUNCTION__, page);
 		ClearPageCompound(p);
 	}
@@ -512,14 +513,14 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
-		if (order && page)
-			prep_compound_page(page, order);
 	}
 
 	if (page != NULL) {
 		BUG_ON(bad_range(zone, page));
 		mod_page_state_zone(zone, pgalloc, 1 << order);
 		prep_new_page(page, order);
+		if (order)
+			prep_compound_page(page, order);
 	}
 	return page;
 }
-- 
cgit v1.2.3


From be5ceb401d4853c6b31f3f0c79d6b14ef5847288 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:14:26 -0700
Subject: [PATCH] remove page.list

Remove the now-unneeded page.list field.
---
 include/linux/mm.h | 1 -
 mm/page_alloc.c    | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa7beaefd038..94b0326d120f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -180,7 +180,6 @@ struct page {
 	page_flags_t flags;		/* atomic flags, some possibly
 					   updated asynchronously */
 	atomic_t count;			/* Usage count, see below. */
-	struct list_head list;		/* ->mapping has some page lists. */
 	struct address_space *mapping;	/* The inode (or ...) we belong to. */
 	pgoff_t index;			/* Our offset within mapping. */
 	struct list_head lru;		/* Pageout list, eg. active_list;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6cb630fec60e..96fb97866a28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -683,8 +683,6 @@ nopage:
 	return NULL;
 got_pg:
 	kernel_map_pages(page, 1 << order, 1);
-	INIT_LIST_HEAD(&page->list);
-	INIT_LIST_HEAD(&page->lru);
 	return page;
 }
 
-- 
cgit v1.2.3


From d3eb546e203ab717237566e5762d97796e58f41f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:14:39 -0700
Subject: [PATCH] fdatasync integrity fix

fdatasync can fail to wait on some pages due to a race.

If some task (eg pdflush) is flushing the same mapping it can remove a page's
dirty tag but not then mark that page as being under writeback, because
pdflush hit a locked buffer in __block_write_full_page().  This will happen
because kjournald is writing the buffer.  In this situation
__block_write_full_page() will redirty the page so that fsync notices it, but
there is a window where the page eludes the radix tree dirty page walk.

Consequently a concurrent fsync will fail to notice the page when walking the
radix tree's dirty pages.

The approach taken by this patch is to leave the page marked as dirty in the
radix tree while ->writepage is working out what to do with it.  This ensures
that a concurrent write-for-sync will successfully locate the page and will
then block in lock_page() until the non-write-for-sync code has finished
altering the page state.
---
 fs/mpage.c          |  2 +-
 include/linux/mm.h  |  1 +
 mm/page-writeback.c | 35 ++++++++++++++++++++++++++++++++++-
 mm/vmscan.c         |  2 +-
 4 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/fs/mpage.c b/fs/mpage.c
index 9edb2d6042b2..fecfe9307a7e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -643,7 +643,7 @@ mpage_writepages(struct address_space *mapping,
 				wait_on_page_writeback(page);
 
 			if (page->mapping == mapping && !PageWriteback(page) &&
-						test_clear_page_dirty(page)) {
+						clear_page_dirty_for_io(page)) {
 				if (writepage) {
 					ret = (*writepage)(page, wbc);
 					if (ret) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 94b0326d120f..2ba5ab34cbdd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -472,6 +472,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
 int set_page_dirty_lock(struct page *page);
+int clear_page_dirty_for_io(struct page *page);
 
 /*
  * Prototype to add a shrinker callback for ageable caches.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fa5eeca766cf..113c4f67bb02 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -472,7 +472,7 @@ int write_one_page(struct page *page, int wait)
 	if (wait)
 		wait_on_page_writeback(page);
 
-	if (test_clear_page_dirty(page)) {
+	if (clear_page_dirty_for_io(page)) {
 		page_cache_get(page);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
@@ -573,6 +573,36 @@ int test_clear_page_dirty(struct page *page)
 }
 EXPORT_SYMBOL(test_clear_page_dirty);
 
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout.  We leave the page
+ * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and radix-tree tag is
+ * unfortunate, but it only exists while the page is locked.
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (mapping) {
+		if (TestClearPageDirty(page)) {
+			if (!mapping->backing_dev_info->memory_backed)
+				dec_page_state(nr_dirty);
+			return 1;
+		}
+		return 0;
+	}
+	return TestClearPageDirty(page);
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
 /*
  * Clear a page's dirty flag while ignoring dirty memory accounting
  */
@@ -629,6 +659,9 @@ int test_set_page_writeback(struct page *page)
 		if (!ret)
 			radix_tree_tag_set(&mapping->page_tree, page->index,
 						PAGECACHE_TAG_WRITEBACK);
+		if (!PageDirty(page))
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index df658dd6c743..372ef182c478 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -354,7 +354,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
-			if (test_clear_page_dirty(page)) {
+			if (clear_page_dirty_for_io(page)) {
 				int res;
 				struct writeback_control wbc = {
 					.sync_mode = WB_SYNC_NONE,
-- 
cgit v1.2.3


From bd134f2720aa6fe1544a76360999d8e18e5f3e02 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:14:52 -0700
Subject: [PATCH] don't allow background writes to hide dirty buffers

If pdflush hits a locked-and-clean buffer in __block_write_full_page() it
will just pass over the buffer.  Typically the buffer is an ext3 data=ordered
buffer which is being written by kjournald, but a similar thing can happen
with blockdev buffers and ll_rw_block().

This is bad because the buffer is still under I/O and a subsequent fsync's
fdatawait() needs to know about it.

It is not practical to tag the page for writeback - only the submitter of the
I/O can do that, because the submitter has control of the end_io handler.

So instead, redirty the page so a subsequent fsync's fdatawrite() will wait on
the underway I/O.

There is a risk that pdflush::background_writeout() will lock up, repeatedly
trying and failing to write the same page.  This is prevented by ensuring
that background_writeout() always throttles when it made no progress.
---
 fs/buffer.c               | 19 ++++++++++++-------
 fs/fs-writeback.c         |  9 +++++++++
 include/linux/writeback.h |  1 +
 mm/page-writeback.c       |  8 ++++----
 4 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index baae58828510..42b61de10bf3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1802,14 +1802,18 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		get_bh(bh);
 		if (!buffer_mapped(bh))
 			continue;
-		if (wbc->sync_mode != WB_SYNC_NONE) {
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from pdflush and kswapd
+		 * activity, but those code paths have their own higher-level
+		 * throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
 			lock_buffer(bh);
-		} else {
-			if (test_set_buffer_locked(bh)) {
-				if (buffer_dirty(bh))
-					__set_page_dirty_nobuffers(page);
-				continue;
-			}
+		} else if (test_set_buffer_locked(bh)) {
+			__set_page_dirty_nobuffers(page);
+			continue;
 		}
 		if (test_clear_buffer_dirty(bh)) {
 			if (!buffer_uptodate(bh))
@@ -1857,6 +1861,7 @@ done:
 		if (uptodate)
 			SetPageUptodate(page);
 		end_page_writeback(page);
+		wbc->pages_skipped++;	/* We didn't write this page */
 	}
 	return err;
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index bd6e0588066e..591c5eb79ba3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -279,6 +279,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 						struct inode, i_list);
 		struct address_space *mapping = inode->i_mapping;
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
+		long pages_skipped;
 
 		if (bdi->memory_backed) {
 			if (sb == blockdev_superblock) {
@@ -326,6 +327,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 
 		BUG_ON(inode->i_state & I_FREEING);
 		__iget(inode);
+		pages_skipped = wbc->pages_skipped;
 		__writeback_single_inode(inode, wbc);
 		if (wbc->sync_mode == WB_SYNC_HOLD) {
 			inode->dirtied_when = jiffies;
@@ -333,6 +335,13 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 		}
 		if (current_is_pdflush())
 			writeback_release(bdi);
+		if (wbc->pages_skipped != pages_skipped) {
+			/*
+			 * writeback is not making progress due to locked
+			 * buffers.  Skip this inode for now.
+			 */
+			list_move(&inode->i_list, &sb->s_dirty);
+		}
 		spin_unlock(&inode_lock);
 		iput(inode);
 		spin_lock(&inode_lock);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 1424811e1eab..7380d2cefb16 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -39,6 +39,7 @@ struct writeback_control {
 					   older than this */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
+	long pages_skipped;		/* Pages which were not written */
 	int nonblocking;		/* Don't get stuck on request queues */
 	int encountered_congestion;	/* An output: a queue is full */
 	int for_kupdate;		/* A kupdate writeback */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 113c4f67bb02..1981309fa9c5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -261,13 +261,13 @@ static void background_writeout(unsigned long _min_pages)
 			break;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
 		writeback_inodes(&wbc);
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		if (wbc.nr_to_write > 0) {
+		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
-			if (wbc.encountered_congestion)
-				blk_congestion_wait(WRITE, HZ/10);
-			else
+			blk_congestion_wait(WRITE, HZ/10);
+			if (!wbc.encountered_congestion)
 				break;
 		}
 	}
-- 
cgit v1.2.3


From 9672a337305358ecc81dc17700e58ce3f42c11f6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:15:07 -0700
Subject: [PATCH] writeback efficiency and QoS improvements

The radix-tree walk for writeback has a couple of problems:

a) It always scans a file from its first dirty page, so if someone
   is repeatedly dirtying the front part of a file, pages near the end
   may be starved of writeout.  (Well, not completely: the `kupdate'
   function will write an entire file once the file's dirty timestamp
   has expired).

b) When the disk queues are huge (10000 requests), there can be a
   very large number of locked pages.  Scanning past these in writeback
   consumes quite some CPU time.

So in each address_space we record the index at which the last batch of
writeout terminated and start the next batch of writeback from that
point.
---
 fs/mpage.c         | 20 +++++++++++++++++++-
 include/linux/fs.h |  1 +
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/mpage.c b/fs/mpage.c
index fecfe9307a7e..25fd33b53444 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -610,6 +610,7 @@ mpage_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t index;
+	int scanned = 0;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -621,11 +622,18 @@ mpage_writepages(struct address_space *mapping,
 		writepage = mapping->a_ops->writepage;
 
 	pagevec_init(&pvec, 0);
-	index = 0;
+	if (wbc->sync_mode == WB_SYNC_NONE) {
+		index = mapping->writeback_index; /* Start from prev offset */
+	} else {
+		index = 0;			  /* whole-file sweep */
+		scanned = 1;
+	}
+retry:
 	while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 					PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
 		unsigned i;
 
+		scanned = 1;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
@@ -672,6 +680,16 @@ mpage_writepages(struct address_space *mapping,
 		}
 		pagevec_release(&pvec);
 	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	mapping->writeback_index = index;
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dc8c46fb4b69..bacf6bcbc7b7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -324,6 +324,7 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	unsigned long		nrpages;	/* number of total pages */
+	pgoff_t			writeback_index;/* writeback starts here */
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
-- 
cgit v1.2.3


From 3c7011b3e90508f2f3adb895d712d36b1cfdcfd2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:15:46 -0700
Subject: [PATCH] use compound pages for hugetlb pages only

The compound page logic is a little fragile - it relies on additional
metadata in the pageframes which some other kernel code likes to stomp on
(xfs was doing this).

Also, because we're treating all higher-order pages as compound pages it is
no longer possible to free individual lower-order pages from the middle of
higher-order pages.  At least one ARM driver insists on doing this.

We only really need the compound page logic for higher-order pages which can
be mapped into user pagetables and placed under direct-io.  This covers
hugetlb pages and, conceivably, soundcard DMA buffers which were allocated
with a higher-order allocation but which weren't marked PageReserved.

The patch arranges for the hugetlb implementations to allocate their pages with
compound page metadata, and all other higher-order allocations go back to the
old way.

(Andrea supplied the GFP_LEVEL_MASK fix)
---
 arch/i386/mm/hugetlbpage.c    |  3 ++-
 arch/ia64/mm/hugetlbpage.c    |  3 ++-
 arch/ppc64/mm/hugetlbpage.c   |  3 ++-
 arch/sh/mm/hugetlbpage.c      |  3 ++-
 arch/sparc64/mm/hugetlbpage.c |  3 ++-
 include/linux/gfp.h           |  6 ++++++
 include/linux/mm.h            |  4 ++--
 include/linux/slab.h          |  4 +---
 mm/page_alloc.c               | 22 +++++++++++-----------
 9 files changed, 30 insertions(+), 21 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 0c73f414b5b1..7224ddcb6a11 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -54,7 +54,8 @@ static struct page *alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+				HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % numnodes;
 	return page;
 }
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index aa2a1945d2c2..3dec8e2f4056 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -58,7 +58,8 @@ static struct page *alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % numnodes;
 	return page;
 }
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index 032a1c9c5766..a7b2c63c700f 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -78,7 +78,8 @@ static struct page *alloc_fresh_huge_page(void)
 	static int nid = 0;
 	struct page *page;
 
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
 	if (!page)
 		return NULL;
 
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index f458eb2d0e6e..6f72d865e8d2 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -60,7 +60,8 @@ static struct page *alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % numnodes;
 	return page;
 }
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index dd2a7549caef..5a674bbd5796 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -56,7 +56,8 @@ static struct page *alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % numnodes;
 	return page;
 }
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c9695427a435..679fc963f842 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -32,10 +32,16 @@
 #define __GFP_NOFAIL	0x800	/* Retry for ever.  Cannot fail */
 #define __GFP_NORETRY	0x1000	/* Do not retry.  Might fail */
 #define __GFP_NO_GROW	0x2000	/* Slab internal usage */
+#define __GFP_COMP	0x4000	/* Add compound page metadata */
 
 #define __GFP_BITS_SHIFT 16	/* Room for 16 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
 
+/* if you forget to add the bitmask here kernel will crash, period */
+#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
+			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
+			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP)
+
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
 #define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2ba5ab34cbdd..f827be900157 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -247,14 +247,14 @@ static inline int page_count(struct page *p)
 
 static inline void get_page(struct page *page)
 {
-	if (PageCompound(page))
+	if (unlikely(PageCompound(page)))
 		page = (struct page *)page->private;
 	atomic_inc(&page->count);
 }
 
 static inline void put_page(struct page *page)
 {
-	if (PageCompound(page)) {
+	if (unlikely(PageCompound(page))) {
 		page = (struct page *)page->private;
 		if (put_page_testzero(page)) {
 			if (page[1].mapping) {	/* destructor? */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 69be5b308a11..806cc52abd3a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -25,9 +25,7 @@ typedef struct kmem_cache_s kmem_cache_t;
 #define	SLAB_KERNEL		GFP_KERNEL
 #define	SLAB_DMA		GFP_DMA
 
-#define SLAB_LEVEL_MASK		(__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
-				__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT|\
-				__GFP_NOFAIL|__GFP_NORETRY)
+#define SLAB_LEVEL_MASK		GFP_LEVEL_MASK
 
 #define	SLAB_NO_GROW		__GFP_NO_GROW	/* don't grow a cache */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 96fb97866a28..4148e94eee13 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -130,6 +130,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
+	if (!PageCompound(page))
+		return;
+
 	if (page[1].index != order)
 		bad_page(__FUNCTION__, page);
 
@@ -487,10 +490,12 @@ void fastcall free_cold_page(struct page *page)
  * or two.
  */
 
-static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
+static struct page *
+buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
 {
 	unsigned long flags;
 	struct page *page = NULL;
+	int cold = !!(gfp_flags & __GFP_COLD);
 
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
@@ -519,7 +524,7 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
 		BUG_ON(bad_range(zone, page));
 		mod_page_state_zone(zone, pgalloc, 1 << order);
 		prep_new_page(page, order);
-		if (order)
+		if (order && (gfp_flags & __GFP_COMP))
 			prep_compound_page(page, order);
 	}
 	return page;
@@ -552,16 +557,11 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
-	int cold;
 	int alloc_type;
 	int do_retry;
 
 	might_sleep_if(wait);
 
-	cold = 0;
-	if (gfp_mask & __GFP_COLD)
-		cold = 1;
-
 	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 	if (zones[0] == NULL)     /* no zones in the zonelist */
 		return NULL;
@@ -583,7 +583,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, cold);
+			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
 				goto got_pg;
 		}
@@ -606,7 +606,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, cold);
+			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
 				goto got_pg;
 		}
@@ -620,7 +620,7 @@ rebalance:
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *z = zones[i];
 
-			page = buffered_rmqueue(z, order, cold);
+			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
 				goto got_pg;
 		}
@@ -648,7 +648,7 @@ rebalance:
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, cold);
+			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
 				goto got_pg;
 		}
-- 
cgit v1.2.3


From e2ea83742133d581a0422f1b2d276e690a81f043 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:16:32 -0700
Subject: [PATCH] mremap: move_vma fixes and cleanup

From: Hugh Dickins <hugh@veritas.com>

Partial rewrite of mremap's move_vma.  Rajesh Venkatasubramanian has pointed
out that vmtruncate could miss ptes, leaving orphaned pages, because move_vma
only made the new vma visible after filling it.  We see no good reason for
that, and time to make move_vma more robust.

Removed all its vma merging decisions, leave them to mmap.c's vma_merge, with
copy_vma added.  Removed duplicated is_mergeable_vma test from vma_merge, and
duplicated validate_mm from insert_vm_struct.

move_vma move from old to new then unmap old; but on error move back from new
to old and unmap new.  Don't unwind within move_page_tables, let move_vma
call it explicitly to unwind, with the right source vma.  Get the
VM_ACCOUNTing right even when the final do_munmap fails.
---
 include/linux/mm.h |   2 +
 mm/mmap.c          |  49 ++++++++++++---
 mm/mremap.c        | 172 +++++++++++++++++------------------------------------
 3 files changed, 97 insertions(+), 126 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f827be900157..43335c61e0da 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -541,6 +541,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid);
 extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
+extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
+	unsigned long addr, unsigned long len, unsigned long pgoff);
 extern void exit_mmap(struct mm_struct *);
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/mm/mmap.c b/mm/mmap.c
index 000e377d4888..08e65e8ba699 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -385,7 +385,8 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  * whether that can be merged with its predecessor or its successor.  Or
  * both (it neatly fills a hole).
  */
-static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
+static struct vm_area_struct *vma_merge(struct mm_struct *mm,
+			struct vm_area_struct *prev,
 			struct rb_node *rb_parent, unsigned long addr, 
 			unsigned long end, unsigned long vm_flags,
 			struct file *file, unsigned long pgoff)
@@ -399,7 +400,7 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 	 * vma->vm_flags & VM_SPECIAL, too.
 	 */
 	if (vm_flags & VM_SPECIAL)
-		return 0;
+		return NULL;
 
 	i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
 
@@ -412,7 +413,6 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 	 * Can it merge with the predecessor?
 	 */
 	if (prev->vm_end == addr &&
-			is_mergeable_vma(prev, file, vm_flags) &&
 			can_vma_merge_after(prev, vm_flags, file, pgoff)) {
 		struct vm_area_struct *next;
 		int need_up = 0;
@@ -443,12 +443,12 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 
 			mm->map_count--;
 			kmem_cache_free(vm_area_cachep, next);
-			return 1;
+			return prev;
 		}
 		spin_unlock(lock);
 		if (need_up)
 			up(i_shared_sem);
-		return 1;
+		return prev;
 	}
 
 	/*
@@ -459,7 +459,7 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
  merge_next:
 		if (!can_vma_merge_before(prev, vm_flags, file,
 				pgoff, (end - addr) >> PAGE_SHIFT))
-			return 0;
+			return NULL;
 		if (end == prev->vm_start) {
 			if (file)
 				down(i_shared_sem);
@@ -469,11 +469,11 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 			spin_unlock(lock);
 			if (file)
 				up(i_shared_sem);
-			return 1;
+			return prev;
 		}
 	}
 
-	return 0;
+	return NULL;
 }
 
 /*
@@ -1492,5 +1492,36 @@ void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 	if (__vma && __vma->vm_start < vma->vm_end)
 		BUG();
 	vma_link(mm, vma, prev, rb_link, rb_parent);
-	validate_mm(mm);
+}
+
+/*
+ * Copy the vma structure to a new location in the same mm,
+ * prior to moving page table entries, to effect an mremap move.
+ */
+struct vm_area_struct *copy_vma(struct vm_area_struct *vma,
+	unsigned long addr, unsigned long len, unsigned long pgoff)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *new_vma, *prev;
+	struct rb_node **rb_link, *rb_parent;
+
+	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len,
+			vma->vm_flags, vma->vm_file, pgoff);
+	if (!new_vma) {
+		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		if (new_vma) {
+			*new_vma = *vma;
+			INIT_LIST_HEAD(&new_vma->shared);
+			new_vma->vm_start = addr;
+			new_vma->vm_end = addr + len;
+			new_vma->vm_pgoff = pgoff;
+			if (new_vma->vm_file)
+				get_file(new_vma->vm_file);
+			if (new_vma->vm_ops && new_vma->vm_ops->open)
+				new_vma->vm_ops->open(new_vma);
+			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+		}
+	}
+	return new_vma;
 }
diff --git a/mm/mremap.c b/mm/mremap.c
index b685f21c2d21..57c52111ea0b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -148,7 +148,7 @@ out:
 static int move_page_tables(struct vm_area_struct *vma,
 	unsigned long new_addr, unsigned long old_addr, unsigned long len)
 {
-	unsigned long offset = len;
+	unsigned long offset;
 
 	flush_cache_range(vma, old_addr, old_addr + len);
 
@@ -157,137 +157,75 @@ static int move_page_tables(struct vm_area_struct *vma,
 	 * easy way out on the assumption that most remappings will be
 	 * only a few pages.. This also makes error recovery easier.
 	 */
-	while (offset) {
-		offset -= PAGE_SIZE;
-		if (move_one_page(vma, old_addr + offset, new_addr + offset))
-			goto oops_we_failed;
+	for (offset = 0; offset < len; offset += PAGE_SIZE) {
+		if (move_one_page(vma, old_addr+offset, new_addr+offset) < 0)
+			break;
 	}
-	return 0;
-
-	/*
-	 * Ok, the move failed because we didn't have enough pages for
-	 * the new page table tree. This is unlikely, but we have to
-	 * take the possibility into account. In that case we just move
-	 * all the pages back (this will work, because we still have
-	 * the old page tables)
-	 */
-oops_we_failed:
-	flush_cache_range(vma, new_addr, new_addr + len);
-	while ((offset += PAGE_SIZE) < len)
-		move_one_page(vma, new_addr + offset, old_addr + offset);
-	zap_page_range(vma, new_addr, len);
-	return -1;
+	return offset;
 }
 
 static unsigned long move_vma(struct vm_area_struct *vma,
-	unsigned long addr, unsigned long old_len, unsigned long new_len,
-	unsigned long new_addr)
+		unsigned long old_addr, unsigned long old_len,
+		unsigned long new_len, unsigned long new_addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct vm_area_struct *new_vma, *next, *prev;
-	int allocated_vma;
+	struct vm_area_struct *new_vma;
+	unsigned long vm_flags = vma->vm_flags;
+	unsigned long new_pgoff;
+	unsigned long moved_len;
+	unsigned long excess = 0;
 	int split = 0;
 
-	new_vma = NULL;
-	next = find_vma_prev(mm, new_addr, &prev);
-	if (next) {
-		if (prev && prev->vm_end == new_addr &&
-		    can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
-					!(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			prev->vm_end = new_addr + new_len;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = prev;
-			if (next != prev->vm_next)
-				BUG();
-			if (prev->vm_end == next->vm_start &&
-					can_vma_merge(next, prev->vm_flags)) {
-				spin_lock(&mm->page_table_lock);
-				prev->vm_end = next->vm_end;
-				__vma_unlink(mm, next, prev);
-				spin_unlock(&mm->page_table_lock);
-				if (vma == next)
-					vma = prev;
-				mm->map_count--;
-				kmem_cache_free(vm_area_cachep, next);
-			}
-		} else if (next->vm_start == new_addr + new_len &&
-			  	can_vma_merge(next, vma->vm_flags) &&
-				!vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			next->vm_start = new_addr;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = next;
-		}
-	} else {
-		prev = find_vma(mm, new_addr-1);
-		if (prev && prev->vm_end == new_addr &&
-		    can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
-				!(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			prev->vm_end = new_addr + new_len;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = prev;
-		}
+	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+	new_vma = copy_vma(vma, new_addr, new_len, new_pgoff);
+	if (!new_vma)
+		return -ENOMEM;
+
+	moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
+	if (moved_len < old_len) {
+		/*
+		 * On error, move entries back from new area to old,
+		 * which will succeed since page tables still there,
+		 * and then proceed to unmap new area instead of old.
+		 */
+		move_page_tables(new_vma, old_addr, new_addr, moved_len);
+		vma = new_vma;
+		old_len = new_len;
+		old_addr = new_addr;
+		new_addr = -ENOMEM;
 	}
 
-	allocated_vma = 0;
-	if (!new_vma) {
-		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-		if (!new_vma)
-			goto out;
-		allocated_vma = 1;
+	/* Conceal VM_ACCOUNT so old reservation is not undone */
+	if (vm_flags & VM_ACCOUNT) {
+		vma->vm_flags &= ~VM_ACCOUNT;
+		excess = vma->vm_end - vma->vm_start - old_len;
+		if (old_addr > vma->vm_start &&
+		    old_addr + old_len < vma->vm_end)
+			split = 1;
 	}
 
-	if (!move_page_tables(vma, new_addr, addr, old_len)) {
-		unsigned long vm_locked = vma->vm_flags & VM_LOCKED;
-
-		if (allocated_vma) {
-			*new_vma = *vma;
-			INIT_LIST_HEAD(&new_vma->shared);
-			new_vma->vm_start = new_addr;
-			new_vma->vm_end = new_addr+new_len;
-			new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT;
-			if (new_vma->vm_file)
-				get_file(new_vma->vm_file);
-			if (new_vma->vm_ops && new_vma->vm_ops->open)
-				new_vma->vm_ops->open(new_vma);
-			insert_vm_struct(current->mm, new_vma);
-		}
+	if (do_munmap(mm, old_addr, old_len) < 0) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_unacct_memory(excess >> PAGE_SHIFT);
+		excess = 0;
+	}
 
-		/* Conceal VM_ACCOUNT so old reservation is not undone */
-		if (vma->vm_flags & VM_ACCOUNT) {
-			vma->vm_flags &= ~VM_ACCOUNT;
-			if (addr > vma->vm_start) {
-				if (addr + old_len < vma->vm_end)
-					split = 1;
-			} else if (addr + old_len == vma->vm_end)
-				vma = NULL;	/* it will be removed */
-		} else
-			vma = NULL;		/* nothing more to do */
-
-		do_munmap(current->mm, addr, old_len);
-
-		/* Restore VM_ACCOUNT if one or two pieces of vma left */
-		if (vma) {
-			vma->vm_flags |= VM_ACCOUNT;
-			if (split)
-				vma->vm_next->vm_flags |= VM_ACCOUNT;
-		}
+	/* Restore VM_ACCOUNT if one or two pieces of vma left */
+	if (excess) {
+		vma->vm_flags |= VM_ACCOUNT;
+		if (split)
+			vma->vm_next->vm_flags |= VM_ACCOUNT;
+	}
 
-		current->mm->total_vm += new_len >> PAGE_SHIFT;
-		if (vm_locked) {
-			current->mm->locked_vm += new_len >> PAGE_SHIFT;
-			if (new_len > old_len)
-				make_pages_present(new_addr + old_len,
-						   new_addr + new_len);
-		}
-		return new_addr;
+	mm->total_vm += new_len >> PAGE_SHIFT;
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		if (new_len > old_len)
+			make_pages_present(new_addr + old_len,
+					   new_addr + new_len);
 	}
-	if (allocated_vma)
-		kmem_cache_free(vm_area_cachep, new_vma);
- out:
-	return -ENOMEM;
+
+	return new_addr;
 }
 
 /*
-- 
cgit v1.2.3


From 2039e7b519e24f743a708a5f10bdc95273d1e077 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:16:45 -0700
Subject: [PATCH] mremap: vma_relink_file race fix

From: Hugh Dickins <hugh@veritas.com>

Subtle point from Rajesh Venkatasubramanian: when mremap's move_vma fails and
so rewinds, before moving the file-based ptes back, we must move new_vma
before old vma in the i_mmap or i_mmap_shared list, so that when racing
against vmtruncate we cannot propagate pages to be truncated back from
new_vma into the just cleaned old_vma.
---
 include/linux/mm.h |  1 +
 mm/mmap.c          | 21 +++++++++++++++++++++
 mm/mremap.c        |  7 +++++++
 3 files changed, 29 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43335c61e0da..6d6abe8c656e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -543,6 +543,7 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
 	unsigned long addr, unsigned long len, unsigned long pgoff);
+extern void vma_relink_file(struct vm_area_struct *, struct vm_area_struct *);
 extern void exit_mmap(struct mm_struct *);
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/mm/mmap.c b/mm/mmap.c
index 08e65e8ba699..eed4e083bca1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1525,3 +1525,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct *vma,
 	}
 	return new_vma;
 }
+
+/*
+ * Position vma after prev in shared file list:
+ * for mremap move error recovery racing against vmtruncate.
+ */
+void vma_relink_file(struct vm_area_struct *vma, struct vm_area_struct *prev)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct address_space *mapping;
+
+	if (vma->vm_file) {
+		mapping = vma->vm_file->f_mapping;
+		if (mapping) {
+			down(&mapping->i_shared_sem);
+			spin_lock(&mm->page_table_lock);
+			list_move(&vma->shared, &prev->shared);
+			spin_unlock(&mm->page_table_lock);
+			up(&mapping->i_shared_sem);
+		}
+	}
+}
diff --git a/mm/mremap.c b/mm/mremap.c
index 57c52111ea0b..c3502aa241f2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -187,7 +187,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
+		 *
+		 * Subtle point from Rajesh Venkatasubramanian: before
+		 * moving file-based ptes, move new_vma before old vma
+		 * in the i_mmap or i_mmap_shared list, so when racing
+		 * against vmtruncate we cannot propagate pages to be
+		 * truncated back from new_vma into just cleaned old.
 		 */
+		vma_relink_file(vma, new_vma);
 		move_page_tables(new_vma, old_addr, new_addr, moved_len);
 		vma = new_vma;
 		old_len = new_len;
-- 
cgit v1.2.3


From 93d33a4885a483c708ccb7d24b56e0d5fef7bcab Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:17:38 -0700
Subject: [PATCH] laptop mode

From: Bart Samwel <bart@samwel.tk>

Adds /proc/sys/vm/laptop_mode: a special knob which says "this is a laptop".
In this mode the kernel will attempt to avoid spinning disks up.

Algorithm: the idea is to hold dirty data in memory for a long time, but to
flush everything which has been accumulated if the disk happens to spin up
for other reasons.

- Whenever a disk request completes (read or write), schedule a timer a few
  seconds hence.  If the timer was already pending, reset it to a few seconds
  hence.

- When the timer expires, write back the whole world.  We use
  sync_filesystems() for this because it will force ext3 journal commits as
  well.

- In balance_dirty_pages(), kick off background writeback when we hit the
  high threshold (dirty_ratio), not when we hit the low threshold.  This has
  the effect of causing "lumpy" writeback which is something I spent a year
  fixing, but in laptop mode, it is desirable.

- In try_to_free_pages(), only kick pdflush if the VM is getting into
  distress: we want to keep scanning for clean pages, deferring writeback.

- In page reclaim, avoid writing back the odd random dirty page off the
  LRU: only start I/O if the scanning is working harder.

The effect is to perform a sync() a few seconds after all I/O has ceased.

The value which was written into /proc/sys/vm/laptop_mode determines, in
seconds, the delay between the final I/O and the flush.

Additionally, the patch adds tools which help answer the question "why the
heck does my disk spin up all the time?".  The user may set
/proc/sys/vm/block_dump to a non-zero value and the kernel will print out
information which will identify the process which is performing disk reads or
which is dirtying pagecache.

The user should probably disable syslogd before setting block_dump.
---
 Documentation/laptop-mode.txt | 665 ++++++++++++++++++++++++++++++++++++++++++
 drivers/block/ll_rw_blk.c     |  14 +
 fs/buffer.c                   |   2 +
 fs/fs-writeback.c             |   3 +
 include/linux/sysctl.h        |   2 +
 include/linux/writeback.h     |   6 +-
 kernel/sysctl.c               |  20 ++
 mm/page-writeback.c           |  69 ++++-
 mm/vmscan.c                   |  61 ++--
 9 files changed, 815 insertions(+), 27 deletions(-)
 create mode 100644 Documentation/laptop-mode.txt

(limited to 'include/linux')

diff --git a/Documentation/laptop-mode.txt b/Documentation/laptop-mode.txt
new file mode 100644
index 000000000000..9df8d2677bef
--- /dev/null
+++ b/Documentation/laptop-mode.txt
@@ -0,0 +1,665 @@
+How to conserve battery power using laptop-mode
+-----------------------------------------------
+
+Document Author: Bart Samwel (bart@samwel.tk)
+Date created: January 2, 2004
+Last modified: April 3, 2004
+
+Introduction
+------------
+
+Laptopmode is used to minimize the time that the hard disk needs to be spun up,
+to conserve battery power on laptops. It has been reported to cause significant
+power savings.
+
+Contents
+--------
+
+* Introduction
+* The short story
+* Caveats
+* The details
+* Tips & Tricks
+* Control script
+* ACPI integration
+* Monitoring tool
+
+
+The short story
+---------------
+
+If you just want to use it, run the laptop_mode control script (which is included
+at the end of this document) as follows:
+
+# laptop_mode start
+
+Then set your harddisk spindown time to a relatively low value with hdparm:
+
+hdparm -S 4 /dev/hda
+
+The value -S 4 means 20 seconds idle time before spindown. Your harddisk will
+now only spin up when a disk cache miss occurs, or at least once every 10
+minutes to write back any pending changes.
+
+To stop laptop_mode, run "laptop_mode stop".
+
+
+Caveats
+-------
+
+* The downside of laptop mode is that you have a chance of losing up
+  to 10 minutes of work. If you cannot afford this, don't use it! It's
+  wise to turn OFF laptop mode when you're almost out of battery --
+  although this will make the battery run out faster, at least you'll
+  lose less work when it actually runs out. I'm still looking for someone
+  to submit instructions on how to turn off laptop mode when battery is low,
+  e.g., using ACPI events. I don't have a laptop myself, so if you do and
+  you care to contribute such instructions, please do.
+
+* Most desktop hard drives have a very limited lifetime measured in spindown
+  cycles, typically about 50,000 times (it's usually listed on the spec sheet).
+  Check your drive's rating, and don't wear down your drive's lifetime if you
+  don't need to.
+
+* If you mount some of your ext3/reiserfs filesystems with the -n option, then
+  the control script will not be able to remount them correctly. You must set
+  DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
+  wrong options -- or it will fail because it cannot write to /etc/mtab.
+
+* If you have your filesystems listed as type "auto" in fstab, like I did, then
+  the control script will not recognize them as filesystems that need remounting.
+
+* If you have XFS, make SURE that you set the XFS_HZ value in the control script
+  correctly, to the value of HZ of your running kernel. Laptop mode will not
+  work correctly if it is set too low, and you may lose data if it is set too
+  high. The reason for this problem is that XFS does not export its sysctl
+  variables in centisecs (like most other subsystems do) but in "jiffies",
+  which is an internal kernel measure. Once this is fixed things will get better.
+
+
+The details
+-----------
+
+Laptop-mode is controlled by the flag /proc/sys/vm/laptop_mode. When this
+flag is set, any physical disk read operation (that might have caused the
+hard disk to spin up) causes Linux to flush all dirty blocks. The result
+of this is that after a disk has spun down, it will not be spun up anymore
+to write dirty blocks, because those blocks had already been written
+immediately after the most recent read operation.
+
+To increase the effectiveness of the laptop_mode strategy, the laptop_mode
+control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
+/proc/sys/vm to about 10 minutes (by default), which means that pages that are
+dirtied are not forced to be written to disk as often. The control script also
+changes the dirty background ratio, so that background writeback of dirty pages
+is not done anymore. Combined with a higher commit value (also 10 minutes) for
+ext3 or ReiserFS filesystems (also done automatically by the control script),
+this results in concentration of disk activity in a small time interval which
+occurs only once every 10 minutes, or whenever the disk is forced to spin up by
+a cache miss. The disk can then be spun down in the periods of inactivity.
+
+If you want to find out which process caused the disk to spin up, you can
+gather information by setting the flag /proc/sys/vm/block_dump. When this flag
+is set, Linux reports all disk read and write operations that take place, and
+all block dirtyings done to files. This makes it possible to debug why a disk
+needs to spin up, and to increase battery life even more. The output of
+block_dump is written to the kernel output, and it can be retrieved using
+"dmesg". When you use block_dump, you may want to turn off klogd, otherwise
+the output of block_dump will be logged, causing disk activity that is not
+normally there.
+
+If 10 minutes is too much or too little downtime for you, you can configure
+this downtime as follows. In the control script, set the MAX_AGE value to the
+maximum number of seconds of disk downtime that you would like. You should
+then set your filesystem's commit interval to the same value. The dirty ratio
+is also configurable from the control script.
+
+If you don't like the idea of the control script remounting your filesystems
+for you, you can change DO_REMOUNTS to 0 in the script.
+
+Thanks to Kiko Piris, the control script can be used to enable laptop mode on
+both the Linux 2.4 and 2.6 series.
+
+
+Tips & Tricks
+-------------
+
+* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
+  of his regular 3 to 3.5 hours) using very aggressive power management (hdparm
+  -B1) and a spindown time of 5 seconds (hdparm -S1).
+
+* You can spin down the disk while playing MP3, by setting the disk readahead
+  to 8MB (hdparm -a 16384). Effectively, the disk will read a complete MP3 at
+  once, and will then spin down while the MP3 is playing. (Thanks to Bartek
+  Kania.)
+
+* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
+  of colours that my display uses it consumes less battery power. I've seen
+  this on powerbooks too. I hope that this is a piece of information that
+  might be useful to the Laptop Mode patch or it's users."
+
+* One thing which will cause disks to spin up is not-present application
+  and dynamic library text pages.  The kernel will load program text off disk
+  on-demand, so each time you invoke an application feature for the first
+  time, the kernel needs to spin the disk up to go and fetch that part of the
+  application.
+
+  So it is useful to increase the disk readahead parameter greatly, so that
+  the kernel will pull all of the executable's pages into memory on the first
+  pagefault.
+
+  The supplied script does this.
+
+* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the
+  file after every logging. When you're using laptop-mode and your disk doesn't
+  spin down, this is a likely culprit.
+
+* Richard Atterer observed that laptop mode does not work well with noflushd
+  (http://noflushd.sourceforge.net/), it seems that noflushd prevents laptop-mode
+  from doing its thing.
+
+
+Control script
+--------------
+
+Please note that this control script works for the Linux 2.4 and 2.6 series.
+
+--------------------CONTROL SCRIPT BEGIN------------------------------------------
+#! /bin/sh
+
+# start or stop laptop_mode, best run by a power management daemon when
+# ac gets connected/disconnected from a laptop
+#
+# install as /sbin/laptop_mode
+#
+# Contributors to this script:   Kiko Piris
+#				 Bart Samwel
+#				 Micha Feigin
+#				 Andrew Morton
+#				 Dax Kelson
+#
+# Original Linux 2.4 version by: Jens Axboe
+
+# Remove an option (the first parameter) of the form option=<number> from
+# a mount options string (the rest of the parameters).
+parse_mount_opts () {
+	OPT="$1"
+	shift
+	echo "$*"			| \
+	sed 's/.*/,&,/'			| \
+	sed 's/,'"$OPT"'=[0-9]*,/,/g'	| \
+	sed 's/,,*/,/g'			| \
+	sed 's/^,//'			| \
+	sed 's/,$//'			| \
+	cat -
+}
+
+# Remove an option (the first parameter) without any arguments from
+# a mount option string (the rest of the parameters).
+parse_nonumber_mount_opts () {
+	OPT="$1"
+	shift
+	echo "$*" 			| \
+	sed 's/.*/,&,/'			| \
+	sed 's/,'"$OPT"',/,/g'		| \
+	sed 's/,,*/,/g'			| \
+	sed 's/^,//'			| \
+	sed 's/,$//'			| \
+	cat -
+}
+
+# Find out the state of a yes/no option (e.g. "atime"/"noatime") in
+# fstab for a given filesystem, and use this state to replace the
+# value of the option in another mount options string. The device
+# is the first argument, the option name the second, and the default
+# value the third. The remainder is the mount options string.
+#
+# Example:
+# parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
+#
+# If fstab contains, say, "rw" for this filesystem, then the result
+# will be "defaults,atime".
+parse_yesno_opts_wfstab () {
+	L_DEV=$1
+	shift
+	OPT=$1
+	shift
+	DEF_OPT=$1
+	shift
+	L_OPTS="$*"
+	PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
+	PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
+	# Fetch the fstab options; squeeze tab/space runs so "$L_DEV " can match
+	FSTAB_OPTS="$(cat /etc/fstab | sed 's/[[:space:]][[:space:]]*/ /g' | grep ^\ *"$L_DEV " | awk '{ print $4 }')"
+	if [ -z "$(echo "$FSTAB_OPTS" | grep "$OPT")" ] ; then
+		# option not specified in fstab -- choose the default.
+		echo "$PARSEDOPTS1,$DEF_OPT"
+	else
+		# option specified in fstab: extract the value and use it
+		if [ -z "$(echo "$FSTAB_OPTS" | grep "no$OPT")" ] ; then
+			# no$OPT not found -- so we must have $OPT.
+			echo "$PARSEDOPTS1,$OPT"
+		else
+			echo "$PARSEDOPTS1,no$OPT"
+		fi
+	fi
+}
+
+# Find out the state of a numbered option (e.g. "commit=NNN") in
+# fstab for a given filesystem, and use this state to replace the
+# value of the option in another mount options string. The device
+# is the first argument, and the option name the second. The
+# remainder is the mount options string in which the replacement
+# must be done.
+#
+# Example:
+# parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
+#
+# If fstab contains, say, "commit=3,rw" for this filesystem, then the
+# result will be "defaults,commit=3".
+parse_mount_opts_wfstab () {
+	L_DEV=$1
+	shift
+	OPT=$1
+	shift
+	L_OPTS="$*"
+
+	PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
+	# Watch for a default commit in fstab
+	FSTAB_OPTS="$(cat /etc/fstab | sed 's/	/ /g' | grep ^\ *"$L_DEV " | awk '{ print $4 }')"
+	if [ -z "$(echo "$FSTAB_OPTS" | grep "$OPT=")" ] ; then
+		# option not specified in fstab: set it to 0
+		echo "$PARSEDOPTS1,$OPT=0"
+	else
+		# option specified in fstab: extract the value, and use it
+		echo -n "$PARSEDOPTS1,$OPT="
+		echo "$FSTAB_OPTS"	| \
+		sed 's/.*/,&,/'		| \
+		sed 's/.*,'"$OPT"'=//'	| \
+		sed 's/,.*//'		| \
+		cat -
+	fi
+}
+
+KLEVEL="$(uname -r | cut -c1-3)"
+case "$KLEVEL" in
+	"2.4"|"2.6")
+		true
+		;;
+	*)
+		echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')"
+		exit 1
+		;;
+esac
+
+# Shall we remount journaled fs. with appropriate commit interval? (1=yes)
+DO_REMOUNTS=1
+
+# age time, in seconds. should be put into a sysconfig file
+MAX_AGE=600
+
+# Dirty synchronous ratio.  At this percentage of dirty pages the process which
+# calls write() does its own writeback
+DIRTY_RATIO=40
+
+#
+# Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
+# exceeded, the kernel will wake pdflush which will then reduce the amount
+# of dirty memory to dirty_background_ratio.  Set this nice and low, so once
+# some writeout has commenced, we do a lot of it.
+#
+DIRTY_BACKGROUND_RATIO=5
+
+READAHEAD=4096		# kilobytes
+
+# kernel default dirty buffer age
+DEF_AGE=30
+DEF_UPDATE=5
+DEF_DIRTY_BACKGROUND_RATIO=10
+DEF_DIRTY_RATIO=40
+DEF_XFS_AGE_BUFFER=15
+DEF_XFS_SYNC_INTERVAL=30
+
+# This must be adjusted manually to the value of HZ in the running kernel,
+# until the XFS people change their external interfaces to work in centisecs
+# like the rest of the external world. Unfortunately this cannot be automated. :(
+XFS_HZ=1000
+
+if [ ! -e /proc/sys/vm/laptop_mode ]; then
+	echo "Kernel is not patched with laptop_mode patch."
+	exit 1
+fi
+
+if [ ! -w /proc/sys/vm/laptop_mode ]; then
+	echo "You do not have enough privileges to enable laptop_mode."
+	exit 1
+fi
+
+case "$1" in
+	start)
+		AGE=$((100*$MAX_AGE))
+		XFS_AGE=$(($XFS_HZ*$MAX_AGE))
+		echo -n "Starting laptop_mode"
+
+		if [ -d /proc/sys/vm/pagebuf ] ; then
+			# This only needs to be set, not reset -- it is only used when
+			# laptop mode is enabled.
+			echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+		elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+			# The same goes for these.
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+		elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
+			# But not for these -- they are also used in normal
+			# operation.
+			echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
+			echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
+		fi
+
+		case "$KLEVEL" in
+			"2.4")
+				echo "1"				> /proc/sys/vm/laptop_mode
+				echo "30 500 0 0 $AGE $AGE 60 20 0"	> /proc/sys/vm/bdflush
+				;;
+			"2.6")
+				echo "5"				> /proc/sys/vm/laptop_mode
+				echo "$AGE"				> /proc/sys/vm/dirty_writeback_centisecs
+				echo "$AGE"				> /proc/sys/vm/dirty_expire_centisecs
+				echo "$DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
+				echo "$DIRTY_BACKGROUND_RATIO"		> /proc/sys/vm/dirty_background_ratio
+				;;
+		esac
+		if [ $DO_REMOUNTS -eq 1 ]; then
+			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+				PARSEDOPTS="$(parse_mount_opts "$OPTS")"
+				case "$FST" in
+					"ext3"|"reiserfs")
+						PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE,noatime
+						;;
+					"xfs")
+						mount $DEV -t $FST $MP -o remount,$OPTS,noatime
+						;;
+				esac
+				if [ -b $DEV ] ; then
+					blockdev --setra $(($READAHEAD * 2)) $DEV
+				fi
+			done
+		fi
+		echo "."
+		;;
+	stop)
+		U_AGE=$((100*$DEF_UPDATE))
+		B_AGE=$((100*$DEF_AGE))
+		echo -n "Stopping laptop_mode"
+		echo "0" > /proc/sys/vm/laptop_mode
+		if [ -f /proc/sys/fs/xfs/age_buffer ] && [ ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+			# These need to be restored though, if there are no lm_*.
+			echo "$(($XFS_HZ*$DEF_XFS_AGE_BUFFER))" 	> /proc/sys/fs/xfs/age_buffer
+			echo "$(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL))" 	> /proc/sys/fs/xfs/sync_interval
+		fi
+		case "$KLEVEL" in
+			"2.4")
+				echo "30 500 0 0 $U_AGE $B_AGE 60 20 0"	> /proc/sys/vm/bdflush
+				;;
+			"2.6")
+				echo "$U_AGE"				> /proc/sys/vm/dirty_writeback_centisecs
+				echo "$B_AGE"				> /proc/sys/vm/dirty_expire_centisecs
+				echo "$DEF_DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
+				echo "$DEF_DIRTY_BACKGROUND_RATIO"	> /proc/sys/vm/dirty_background_ratio
+				;;
+		esac
+		if [ $DO_REMOUNTS -eq 1 ]; then
+			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+				# Reset commit and atime options to defaults.
+				case "$FST" in
+					"ext3"|"reiserfs")
+						PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
+						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+						;;
+					"xfs")
+						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+						;;
+				esac
+				if [ -b $DEV ] ; then
+					blockdev --setra 256 $DEV
+				fi
+			done
+		fi
+		echo "."
+		;;
+	*)
+		echo "Usage: $0 {start|stop}"
+		;;
+
+esac
+
+exit 0
+
+--------------------CONTROL SCRIPT END--------------------------------------------
+
+
+ACPI integration
+----------------
+
+Dax Kelson submitted this so that the ACPI acpid daemon will
+kick off the laptop_mode script and run hdparm.
+
+---------------------------/etc/acpi/events/ac_adapter BEGIN-------------------------------------------
+event=ac_adapter
+action=/etc/acpi/actions/battery.sh
+---------------------------/etc/acpi/events/ac_adapter END-------------------------------------------
+
+---------------------------/etc/acpi/actions/battery.sh BEGIN-------------------------------------------
+#!/bin/sh
+
+# cpu throttling
+# cat /proc/acpi/processor/CPU0/throttling for more info
+ACAD_THR=0
+BATT_THR=2
+
+# spindown time for HD (man hdparm for valid values)
+# I prefer 2 hours for acad and 20 seconds for batt
+ACAD_HD=244
+BATT_HD=4
+
+# ac/battery event handler: toggles laptop mode and the disk spindown time
+
+status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/AC/state`
+
+case $status in
+        "on-line")
+                echo "Setting HD spindown to 2 hours"
+                /sbin/laptop_mode stop
+                /sbin/hdparm -S $ACAD_HD /dev/hda > /dev/null 2>&1
+                /sbin/hdparm -B 255 /dev/hda > /dev/null 2>&1
+                #echo -n $ACAD_CPU:$ACAD_THR > /proc/acpi/processor/CPU0/limit
+                exit 0
+        ;;
+        "off-line")
+                echo "Setting HD spindown to 20 seconds"
+                /sbin/laptop_mode start
+                /sbin/hdparm -S $BATT_HD /dev/hda > /dev/null 2>&1
+                /sbin/hdparm -B 1 /dev/hda > /dev/null 2>&1
+                #echo -n $BATT_CPU:$BATT_THR > /proc/acpi/processor/CPU0/limit
+                exit 0
+        ;;
+esac
+---------------------------/etc/acpi/actions/battery.sh END-------------------------------------------
+
+Monitoring tool
+---------------
+
+Bartek Kania submitted this; it can be used to measure how much time your disk
+spends spun up/down.
+
+---------------------------dslm.c BEGIN-------------------------------------------
+/*
+ * Simple Disk Sleep Monitor
+ *  by Bartek Kania
+ * Licenced under the GPL
+ */
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <linux/hdreg.h>
+
+#ifdef DEBUG
+#define D(x) x
+#else
+#define D(x)
+#endif
+
+int endit = 0;
+
+/* Check if the disk is in powersave-mode
+ * Most of the code is stolen from hdparm.
+ * 1 = active, 0 = standby/sleep, -1 = unknown */
+int check_powermode(int fd)
+{
+    unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0}; /* buffer handed to HDIO_DRIVE_CMD */
+    int state;
+
+    if (ioctl(fd, HDIO_DRIVE_CMD, &args) /* non-zero return: first attempt failed */
+	&& (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */
+	&& ioctl(fd, HDIO_DRIVE_CMD, &args)) { /* ... and the retry failed as well */
+	if (errno != EIO || args[0] != 0 || args[1] != 0) {
+	    state = -1; /* "unknown"; */
+	} else
+	    state = 0; /* "sleeping"; */
+    } else {
+	state = (args[2] == 255) ? 1 : 0; /* an ioctl succeeded: 255 in args[2] is reported as active */
+    }
+    D(printf(" drive state is:  %d\n", state)); /* D() expands to nothing unless DEBUG is defined */
+
+    return state;
+}
+
+char *state_name(int i)
+{
+    if (i == -1) return "unknown";
+    if (i == 0) return "sleeping";
+    if (i == 1) return "active";
+
+    return "internal error";
+}
+
+char *myctime(time_t time)
+{
+    char *ts = ctime(&time);
+    ts[strlen(ts) - 1] = 0;
+
+    return ts;
+}
+
+void measure(int fd)
+{
+    time_t start_time;
+    int last_state;
+    time_t last_time;
+    int curr_state;
+    time_t curr_time = 0;
+    time_t time_diff;
+    time_t active_time = 0;
+    time_t sleep_time = 0;
+    time_t unknown_time = 0;
+    time_t total_time = 0;
+    int changes = 0;
+    float tmp;
+
+    printf("Starting measurements\n");
+
+    last_state = check_powermode(fd);
+    start_time = last_time = time(0);
+    printf("  System is in state %s\n\n", state_name(last_state));
+
+    while(!endit) {
+	sleep(1);
+	curr_state = check_powermode(fd);
+
+	if (curr_state != last_state || endit) {
+	    changes++;
+	    curr_time = time(0);
+	    time_diff = curr_time - last_time;
+
+	    if (last_state == 1) active_time += time_diff;
+	    else if (last_state == 0) sleep_time += time_diff;
+	    else unknown_time += time_diff;
+
+	    last_state = curr_state;
+	    last_time = curr_time;
+
+	    printf("%s: State-change to %s\n", myctime(curr_time),
+		   state_name(curr_state));
+	}
+    }
+    changes--; /* Compensate for SIGINT */
+    /* Report the same total the percentages use; curr_time may be stale or 0 */
+    total_time = time(0) - start_time;
+    printf("\nTotal running time:  %lus\n", (unsigned long)total_time);
+    printf(" State changed %d times\n", changes);
+
+    tmp = total_time ? (float)sleep_time / (float)total_time * 100 : 0; /* avoid 0/0 */
+    printf(" Time in sleep state:   %lus (%.2f%%)\n", (unsigned long)sleep_time, tmp);
+    tmp = total_time ? (float)active_time / (float)total_time * 100 : 0;
+    printf(" Time in active state:  %lus (%.2f%%)\n", (unsigned long)active_time, tmp);
+    tmp = total_time ? (float)unknown_time / (float)total_time * 100 : 0;
+    printf(" Time in unknown state: %lus (%.2f%%)\n", (unsigned long)unknown_time, tmp);
+}
+
+void ender(int s)
+{
+    endit = 1;
+}
+
+void usage()
+{
+    puts("usage: dslm [-w <time>] <disk>");
+    exit(0);
+}
+
+int main(int ac, char **av)
+{
+    int fd;
+    char *disk = 0;
+    int settle_time = 60;
+
+    /* Parse the simple command-line: dslm [-w <time>] <disk> */
+    if (ac == 2)
+	disk = av[1];
+    else if (ac == 4 && strcmp(av[1], "-w") == 0) {
+	settle_time = atoi(av[2]);
+	disk = av[3];
+    } else
+	usage();
+    /* open() signals failure with -1; 0 is a valid descriptor, so test < 0 */
+    if ((fd = open(disk, O_RDONLY|O_NONBLOCK)) < 0) {
+	printf("Can't open %s, because: %s\n", disk, strerror(errno));
+	exit(-1);
+    }
+
+    if (settle_time) {
+	printf("Waiting %d seconds for the system to settle down to "
+	       "'normal'\n", settle_time);
+	sleep(settle_time);
+    } else
+	puts("Not waiting for system to settle down");
+
+    signal(SIGINT, ender);
+
+    measure(fd);
+
+    close(fd);
+
+    return 0;
+}
+---------------------------dslm.c END---------------------------------------------
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 7b9f0c75bffd..fc4b6c698fcf 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -27,6 +27,7 @@
 #include <linux/completion.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
+#include <linux/writeback.h>
 
 /*
  * for max sense size
@@ -2471,6 +2472,16 @@ int submit_bio(int rw, struct bio *bio)
 		mod_page_state(pgpgout, count);
 	else
 		mod_page_state(pgpgin, count);
+
+	if (unlikely(block_dump)) {
+		char b[BDEVNAME_SIZE];
+		printk("%s(%d): %s block %Lu on %s\n",
+			current->comm, current->pid,
+			(rw & WRITE) ? "WRITE" : "READ",
+			(unsigned long long)bio->bi_sector,
+			bdevname(bio->bi_bdev,b));
+	}
+
 	generic_make_request(bio);
 	return 1;
 }
@@ -2754,6 +2765,9 @@ void end_that_request_last(struct request *req)
 	struct gendisk *disk = req->rq_disk;
 	struct completion *waiting = req->waiting;
 
+	if (unlikely(laptop_mode))
+		laptop_io_completion();
+
 	if (disk && blk_fs_request(req)) {
 		unsigned long duration = jiffies - req->start_time;
 		switch (rq_data_dir(req)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 42b61de10bf3..605ce2099aa5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -274,6 +274,8 @@ static void do_sync(unsigned long wait)
 	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
 	if (!wait)
 		printk("Emergency Sync complete\n");
+	if (unlikely(laptop_mode))
+		laptop_sync_completion();
 }
 
 asmlinkage long sys_sync(void)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 591c5eb79ba3..23e367ed22f7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -75,6 +75,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) == flags)
 		return;
 
+	if (unlikely(block_dump))
+		printk("%s(%d): dirtied file\n", current->comm, current->pid);
+
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 3767428df94d..d2224f6617f9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -159,6 +159,8 @@ enum
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
+	VM_LAPTOP_MODE=23,	/* vm laptop mode */
+	VM_BLOCK_DUMP=24,	/* block dump mode */
 };
 
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 7380d2cefb16..f557b55e8b0a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -72,12 +72,16 @@ static inline void wait_on_inode(struct inode *inode)
  * mm/page-writeback.c
  */
 int wakeup_bdflush(long nr_pages);
+void laptop_io_completion(void);
+void laptop_sync_completion(void);
 
-/* These 5 are exported to sysctl. */
+/* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
 extern int dirty_writeback_centisecs;
 extern int dirty_expire_centisecs;
+extern int block_dump;
+extern int laptop_mode;
 
 struct ctl_table;
 struct file;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f2c8c8ce4926..05ea59ae4276 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -744,6 +744,26 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= VM_LAPTOP_MODE,
+		.procname	= "laptop_mode",
+		.data		= &laptop_mode,
+		.maxlen		= sizeof(laptop_mode),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= VM_BLOCK_DUMP,
+		.procname	= "block_dump",
+		.data		= &block_dump,
+		.maxlen		= sizeof(block_dump),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1981309fa9c5..9cf47af10ccc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -28,6 +28,7 @@
 #include <linux/smp.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
+#include <linux/syscalls.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -81,6 +82,16 @@ int dirty_writeback_centisecs = 5 * 100;
  */
 int dirty_expire_centisecs = 30 * 100;
 
+/*
+ * Flag that makes the machine dump writes/reads and block dirtyings.
+ */
+int block_dump;
+
+/*
+ * Flag that puts the machine in "laptop mode".
+ */
+int laptop_mode;
+
 /* End of sysctl-exported parameters */
 
 
@@ -195,7 +206,19 @@ static void balance_dirty_pages(struct address_space *mapping)
 	if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 		dirty_exceeded = 0;
 
-	if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
+	if (writeback_in_progress(bdi))
+		return;		/* pdflush is already working this queue */
+
+	/*
+	 * In laptop mode, we wait until hitting the higher threshold before
+	 * starting background writeout, and then write out all the way down
+	 * to the lower threshold.  So slow writers cause minimal disk activity.
+	 *
+	 * In normal mode, we start background writeout at the lower
+	 * background_thresh, to keep the amount of dirty memory low.
+	 */
+	if ((laptop_mode && pages_written) ||
+	     (!laptop_mode && (nr_reclaimable > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -289,7 +312,13 @@ int wakeup_bdflush(long nr_pages)
 	return pdflush_operation(background_writeout, nr_pages);
 }
 
-static struct timer_list wb_timer;
+static void wb_timer_fn(unsigned long unused);
+static void laptop_timer_fn(unsigned long unused);
+
+static struct timer_list wb_timer =
+			TIMER_INITIALIZER(wb_timer_fn, 0, 0);
+static struct timer_list laptop_mode_wb_timer =
+			TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
 
 /*
  * Periodic writeback of "old" data.
@@ -368,7 +397,36 @@ static void wb_timer_fn(unsigned long unused)
 {
 	if (pdflush_operation(wb_kupdate, 0) < 0)
 		mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
+}
+
+static void laptop_flush(unsigned long unused)
+{
+	sys_sync();
+}
+
+static void laptop_timer_fn(unsigned long unused)
+{
+	pdflush_operation(laptop_flush, 0);
+}
 
+/*
+ * We've spun up the disk and we're in laptop mode: schedule writeback
+ * of all dirty data a few seconds from now.  If the flush is already scheduled
+ * then push it back - the user is still using the disk.
+ */
+void laptop_io_completion(void)
+{
+	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+}
+
+/*
+ * We're in laptop mode and we've just synced. The sync's writes will have
+ * caused another writeback to be scheduled by laptop_io_completion.
+ * Nothing needs to be written back anymore, so we unschedule the writeback.
+ */
+void laptop_sync_completion(void)
+{
+	del_timer(&laptop_mode_wb_timer);
 }
 
 /*
@@ -429,12 +487,7 @@ void __init page_writeback_init(void)
 		vm_dirty_ratio *= correction;
 		vm_dirty_ratio /= 100;
 	}
-
-	init_timer(&wb_timer);
-	wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
-	wb_timer.data = 0;
-	wb_timer.function = wb_timer_fn;
-	add_timer(&wb_timer);
+	mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
 	set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0728eadc0eb7..39e8ed0fcdd6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -246,7 +246,8 @@ static void handle_write_error(struct address_space *mapping,
  * shrink_list returns the number of reclaimed pages
  */
 static int
-shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
+shrink_list(struct list_head *page_list, unsigned int gfp_mask,
+		int *nr_scanned, int do_writepage)
 {
 	struct address_space *mapping;
 	LIST_HEAD(ret_pages);
@@ -354,6 +355,8 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
+			if (laptop_mode && !do_writepage)
+				goto keep_locked;
 			if (clear_page_dirty_for_io(page)) {
 				int res;
 				struct writeback_control wbc = {
@@ -473,7 +476,7 @@ keep:
  */
 static int
 shrink_cache(struct zone *zone, unsigned int gfp_mask,
-		int max_scan, int *total_scanned)
+		int max_scan, int *total_scanned, int do_writepage)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
@@ -521,7 +524,8 @@ shrink_cache(struct zone *zone, unsigned int gfp_mask,
 			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
 		else
 			mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		nr_freed = shrink_list(&page_list, gfp_mask, total_scanned);
+		nr_freed = shrink_list(&page_list, gfp_mask,
+					total_scanned, do_writepage);
 		*total_scanned += nr_taken;
 		if (current_is_kswapd())
 			mod_page_state(kswapd_steal, nr_freed);
@@ -735,7 +739,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
  */
 static int
 shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
-		int *total_scanned, struct page_state *ps)
+		int *total_scanned, struct page_state *ps, int do_writepage)
 {
 	unsigned long ratio;
 	int count;
@@ -764,7 +768,8 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
 	count = atomic_read(&zone->nr_scan_inactive);
 	if (count >= SWAP_CLUSTER_MAX) {
 		atomic_set(&zone->nr_scan_inactive, 0);
-		return shrink_cache(zone, gfp_mask, count, total_scanned);
+		return shrink_cache(zone, gfp_mask, count,
+					total_scanned, do_writepage);
 	}
 	return 0;
 }
@@ -787,7 +792,7 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
  */
 static int
 shrink_caches(struct zone **zones, int priority, int *total_scanned,
-		int gfp_mask, struct page_state *ps)
+		int gfp_mask, struct page_state *ps, int do_writepage)
 {
 	int ret = 0;
 	int i;
@@ -803,7 +808,8 @@ shrink_caches(struct zone **zones, int priority, int *total_scanned,
 			continue;	/* Let kswapd poll it */
 
 		max_scan = zone->nr_inactive >> priority;
-		ret += shrink_zone(zone, max_scan, gfp_mask, total_scanned, ps);
+		ret += shrink_zone(zone, max_scan, gfp_mask,
+					total_scanned, ps, do_writepage);
 	}
 	return ret;
 }
@@ -833,6 +839,8 @@ int try_to_free_pages(struct zone **zones,
 	int nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	int i;
+	unsigned long total_scanned = 0;
+	int do_writepage = 0;
 
 	inc_page_state(allocstall);
 
@@ -840,13 +848,13 @@ int try_to_free_pages(struct zone **zones,
 		zones[i]->temp_priority = DEF_PRIORITY;
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-		int total_scanned = 0;
+		int scanned = 0;
 		struct page_state ps;
 
 		get_page_state(&ps);
-		nr_reclaimed += shrink_caches(zones, priority, &total_scanned,
-						gfp_mask, &ps);
-		shrink_slab(total_scanned, gfp_mask);
+		nr_reclaimed += shrink_caches(zones, priority, &scanned,
+						gfp_mask, &ps, do_writepage);
+		shrink_slab(scanned, gfp_mask);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
@@ -858,14 +866,20 @@ int try_to_free_pages(struct zone **zones,
 		if (!(gfp_mask & __GFP_FS))
 			break;		/* Let the caller handle it */
 		/*
-		 * Try to write back as many pages as we just scanned.  Not
-		 * sure if that makes sense, but it's an attempt to avoid
-		 * creating IO storms unnecessarily
+		 * Try to write back as many pages as we just scanned.  This
+		 * tends to cause slow streaming writers to write data to the
+		 * disk smoothly, at the dirtying rate, which is nice.   But
+		 * that's undesirable in laptop mode, where we *want* lumpy
+		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		wakeup_bdflush(total_scanned);
+		total_scanned += scanned;
+		if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
+			wakeup_bdflush(laptop_mode ? 0 : total_scanned);
+			do_writepage = 1;
+		}
 
 		/* Take a nap, wait for some writeback to complete */
-		if (total_scanned && priority < DEF_PRIORITY - 2)
+		if (scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
 	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
@@ -908,6 +922,8 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 	int i;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long total_scanned = 0;
+	unsigned long total_reclaimed = 0;
+	int do_writepage = 0;
 
 	inc_page_state(pageoutrun);
 
@@ -969,16 +985,25 @@ scan:
 			zone->temp_priority = priority;
 			max_scan = zone->nr_inactive >> priority;
 			reclaimed = shrink_zone(zone, max_scan, GFP_KERNEL,
-					&scanned, ps);
+					&scanned, ps, do_writepage);
 			total_scanned += scanned;
 			reclaim_state->reclaimed_slab = 0;
 			shrink_slab(scanned, GFP_KERNEL);
 			reclaimed += reclaim_state->reclaimed_slab;
+			total_reclaimed += reclaimed;
 			to_free -= reclaimed;
 			if (zone->all_unreclaimable)
 				continue;
 			if (zone->pages_scanned > zone->present_pages * 2)
 				zone->all_unreclaimable = 1;
+			/*
+			 * If we've done a decent amount of scanning and
+			 * the reclaim ratio is low, start doing writepage
+			 * even in laptop mode
+			 */
+			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+			    total_scanned > total_reclaimed+total_reclaimed/2)
+				do_writepage = 1;
 		}
 		if (nr_pages && to_free > 0)
 			continue;	/* swsusp: need to do more work */
@@ -997,7 +1022,7 @@ out:
 
 		zone->prev_priority = zone->temp_priority;
 	}
-	return nr_pages - to_free;
+	return total_reclaimed;
 }
 
 /*
-- 
cgit v1.2.3


From 26f14a5727b9447f87ba4cdd8e83fb9e8af79631 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:17:51 -0700
Subject: [PATCH] Add commit=0 to ext3, meaning "set commit to default".

From: Bart Samwel <bart@samwel.tk>

Add support for the value "0" to ext3's "commit" option.  When this value
is given, ext3 substitutes it by the default commit interval.  Introduce a
constant JBD_DEFAULT_MAX_COMMIT_AGE for this.
---
 fs/ext3/super.c     | 4 ++++
 fs/jbd/journal.c    | 2 +-
 include/linux/jbd.h | 5 +++++
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e6ae6c9e0f46..14d9f3c72055 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -735,6 +735,10 @@ static int parse_options (char * options, struct ext3_sb_info *sbi,
 		case Opt_commit:
 			if (match_int(&args[0], &option))
 				return 0;
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = JBD_DEFAULT_MAX_COMMIT_AGE;
 			sbi->s_commit_interval = HZ * option;
 			break;
 		case Opt_data_journal:
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7550f31d0249..897a8c886259 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -636,7 +636,7 @@ static journal_t * journal_init_common (void)
 	spin_lock_init(&journal->j_list_lock);
 	spin_lock_init(&journal->j_state_lock);
 
-	journal->j_commit_interval = (HZ * 5);
+	journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
 
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JFS_ABORT;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 241387b13764..62c7f363ae74 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -42,6 +42,11 @@
  */
 #undef JBD_PARANOID_IOFAIL
 
+/*
+ * The default maximum commit age, in seconds.
+ */
+#define JBD_DEFAULT_MAX_COMMIT_AGE 5
+
 #ifdef CONFIG_JBD_DEBUG
 /*
  * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
-- 
cgit v1.2.3


From 95f238eac82907c4ccbc301cd5788e67db0715ce Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:18:43 -0700
Subject: [PATCH] ia32: 4Kb stacks (and irqstacks) patch

From: Arjan van de Ven <arjanv@redhat.com>

Below is a patch to enable 4Kb stacks for x86. The goal of this is to

1) Reduce footprint per thread so that systems can run many more threads
   (for the java people)

2) Reduce the pressure on the VM for order > 0 allocations. We see real life
   workloads (granted with 2.4 but the fundamental fragmentation issue isn't
   solved in 2.6 and isn't solvable in theory) where this can be a problem.
   In addition order > 0 allocations can make the VM "stutter" and give more
   latency due to having to do much much more work trying to defragment

The first 2 bits of the patch actually affect compiler options in a generic
way: I propose to disable the -funit-at-a-time feature from gcc.  With this
enabled (and it's default with -O2), gcc will very aggressively inline
functions, which is nice and all for userspace, but for the kernel this makes
us suffer a gcc deficiency more: gcc is extremely bad at sharing stackslots,
for example a situation like this:

if (some_condition)
	function_A();
else
	function_B();

with -funit-at-a-time, both function_A() and _B() might get inlined, however
the stack usage of the parent function then grows by the stack
usage of both functions COMBINED instead of the maximum of the two.  Even
with the normal 8Kb stacks this is a danger since we see some functions grow
3Kb to 4Kb of stack use this way.  With 4Kb stacks, 4Kb of stack usage growth
obviously is deadly ;-( but even with 8Kb stacks it's pure lottery.
Disabling -funit-at-a-time also exposes another thing in the -mm tree; the
attribute always_inline is considered harmful by gcc folks in that when gcc
makes a decision to NOT inline a function marked this way, it throws an
error.  Disabling -funit-at-a-time disables some of the aggressive inlining
(eg of large functions that come later in the .c file) so this would make
your tree not compile.

The 4k stackness of the kernel is included in modversions, so people don't
load 4k-stack modules into 8k-stack kernels.

At present 4k stacks are selectable in config.  When the feature has settled
in we should remove the 8k option.  This will break the nvidia modules.  But
Fedora uses 4k stacks so a new nvidia driver is expected soon.
---
 arch/i386/Kconfig              |   9 +++
 arch/i386/Makefile             |   6 +-
 arch/i386/kernel/i8259.c       |   3 +
 arch/i386/kernel/irq.c         | 146 ++++++++++++++++++++++++++++++++++++++++-
 arch/i386/kernel/smpboot.c     |   2 +
 arch/i386/kernel/traps.c       |  18 +++--
 include/asm-alpha/irq.h        |   3 +
 include/asm-arm/irq.h          |   4 ++
 include/asm-arm26/irq.h        |   2 +
 include/asm-cris/irq.h         |   4 ++
 include/asm-h8300/irq.h        |   4 ++
 include/asm-i386/irq.h         |  25 +++++++
 include/asm-i386/module.h      |   8 ++-
 include/asm-i386/thread_info.h |  24 ++++++-
 include/asm-ia64/irq.h         |   4 ++
 include/asm-m68k/irq.h         |   4 ++
 include/asm-m68knommu/irq.h    |   4 ++
 include/asm-mips/irq.h         |   3 +
 include/asm-parisc/irq.h       |   3 +
 include/asm-ppc/irq.h          |   4 ++
 include/asm-ppc64/irq.h        |   4 ++
 include/asm-s390/irq.h         |   4 ++
 include/asm-sh/irq.h           |   4 ++
 include/asm-sparc/irq.h        |   4 ++
 include/asm-sparc64/irq.h      |   4 ++
 include/asm-um/irq.h           |   5 ++
 include/asm-v850/irq.h         |   4 ++
 include/asm-x86_64/irq.h       |   4 ++
 include/linux/compiler-gcc3.h  |   2 +-
 include/linux/irq.h            |   1 -
 kernel/softirq.c               |  70 ++++++++++++--------
 31 files changed, 342 insertions(+), 44 deletions(-)

(limited to 'include/linux')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 03620021bd6b..c6439f846171 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1294,6 +1294,15 @@ config FRAME_POINTER
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame pointers.
 
+config 4KSTACKS
+	bool "Use 4Kb for kernel stacks instead of 8Kb"
+	help
+	  If you say Y here the kernel will use a 4Kb stacksize for the
+	  kernel stack attached to each process/thread. This facilitates
+	  running more threads on a system and also reduces the pressure
+	  on the VM subsystem for higher order allocations. This option
+	  will also use IRQ stacks to compensate for the reduced stackspace.
+
 config X86_FIND_SMP_CONFIG
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 4c8f1c06f572..019544e08f1e 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -56,9 +56,9 @@ cflags-$(CONFIG_X86_ELAN)	+= -march=i486
 GCC_VERSION			:= $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
 cflags-$(CONFIG_REGPARM) 	+= $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;)
 
-# Enable unit-at-a-time mode when possible. It shrinks the
-# kernel considerably.
-CFLAGS += $(call check_gcc,-funit-at-a-time,)
+# Disable unit-at-a-time mode, it makes gcc use a lot more stack
+# due to the lack of sharing of stack slots.
+CFLAGS += $(call check_gcc,-fno-unit-at-a-time,)
 
 CFLAGS += $(cflags-y)
 
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index f093e29b69a2..48fbaf5cee34 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -444,4 +444,7 @@ void __init init_IRQ(void)
 	 */
 	if (boot_cpu_data.hard_math && !cpu_has_fpu)
 		setup_irq(FPU_IRQ, &fpu_irq);
+
+	current_thread_info()->cpu = 0;
+	irq_ctx_init(0);
 }
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index c1385d668bf4..ea69f21994f6 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -74,6 +74,14 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 
 static void register_irq_proc (unsigned int irq);
 
+/*
+ * per-CPU IRQ handling stacks
+ */
+#ifdef CONFIG_4KSTACKS
+union irq_ctx *hardirq_ctx[NR_CPUS];
+union irq_ctx *softirq_ctx[NR_CPUS];
+#endif
+
 /*
  * Special irq handlers.
  */
@@ -209,7 +217,7 @@ inline void synchronize_irq(unsigned int irq)
  * waste of time and is not what some drivers would
  * prefer.
  */
-int handle_IRQ_event(unsigned int irq,
+asmlinkage int handle_IRQ_event(unsigned int irq,
 		struct pt_regs *regs, struct irqaction *action)
 {
 	int status = 1;	/* Force the "do bottom halves" bit */
@@ -432,7 +440,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 
 		__asm__ __volatile__("andl %%esp,%0" :
 					"=r" (esp) : "0" (THREAD_SIZE - 1));
-		if (unlikely(esp < (sizeof(struct thread_info) + 1024))) {
+		if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
 			printk("do_IRQ: stack overflow: %ld\n",
 				esp - sizeof(struct thread_info));
 			dump_stack();
@@ -480,11 +488,68 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 	 * useful for irq hardware that does not mask cleanly in an
 	 * SMP environment.
 	 */
+#ifdef CONFIG_4KSTACKS
+
 	for (;;) {
 		irqreturn_t action_ret;
+		u32 *isp;
+		union irq_ctx * curctx;
+		union irq_ctx * irqctx;
+
+		curctx = (union irq_ctx *) current_thread_info();
+		irqctx = hardirq_ctx[smp_processor_id()];
 
 		spin_unlock(&desc->lock);
+
+		/*
+		 * this is where we switch to the IRQ stack. However, if we are already using
+		 * the IRQ stack (because we interrupted a hardirq handler) we can't do that
+		 * and just have to keep using the current stack (which is the irq stack already
+		 * after all)
+		 */
+
+		if (curctx == irqctx)
+			action_ret = handle_IRQ_event(irq, &regs, action);
+		else {
+			/* build the stack frame on the IRQ stack */
+			isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+			irqctx->tinfo.task = curctx->tinfo.task;
+			irqctx->tinfo.previous_esp = current_stack_pointer();
+
+			*--isp = (u32) action;
+			*--isp = (u32) &regs;
+			*--isp = (u32) irq;
+
+			asm volatile(
+				"       xchgl   %%ebx,%%esp     \n"
+				"       call    handle_IRQ_event \n"
+				"       xchgl   %%ebx,%%esp     \n"
+				: "=a"(action_ret)
+				: "b"(isp)
+				: "memory", "cc", "edx", "ecx"
+			);
+
+
+		}
+		spin_lock(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
+		if (curctx != irqctx)
+			irqctx->tinfo.task = NULL;
+		if (likely(!(desc->status & IRQ_PENDING)))
+			break;
+		desc->status &= ~IRQ_PENDING;
+	}
+
+#else
+
+	for (;;) {
+		irqreturn_t action_ret;
+
+		spin_unlock(&desc->lock);
+
 		action_ret = handle_IRQ_event(irq, &regs, action);
+
 		spin_lock(&desc->lock);
 		if (!noirqdebug)
 			note_interrupt(irq, desc, action_ret);
@@ -492,6 +557,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 			break;
 		desc->status &= ~IRQ_PENDING;
 	}
+#endif
 	desc->status &= ~IRQ_INPROGRESS;
 
 out:
@@ -1049,3 +1115,79 @@ void init_irq_proc (void)
 		register_irq_proc(i);
 }
 
+
+#ifdef CONFIG_4KSTACKS
+static char softirq_stack[NR_CPUS * THREAD_SIZE]  __attribute__((__aligned__(THREAD_SIZE)));
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]  __attribute__((__aligned__(THREAD_SIZE)));
+
+/*
+ * allocate per-cpu stacks for hardirq and for softirq processing
+ */
+void irq_ctx_init(int cpu)
+{
+	union irq_ctx *irqctx;
+
+	if (hardirq_ctx[cpu])
+		return;
+
+	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+	irqctx->tinfo.task              = NULL;
+	irqctx->tinfo.exec_domain       = NULL;
+	irqctx->tinfo.cpu               = cpu;
+	irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
+	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+
+	hardirq_ctx[cpu] = irqctx;
+
+	irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
+	irqctx->tinfo.task              = NULL;
+	irqctx->tinfo.exec_domain       = NULL;
+	irqctx->tinfo.cpu               = cpu;
+	irqctx->tinfo.preempt_count     = SOFTIRQ_OFFSET;
+	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+
+	softirq_ctx[cpu] = irqctx;
+
+	printk("CPU %u irqstacks, hard=%p soft=%p\n",
+		cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+}
+
+extern asmlinkage void __do_softirq(void);
+
+asmlinkage void do_softirq(void)
+{
+	unsigned long flags;
+	struct thread_info *curctx;
+	union irq_ctx *irqctx;
+	u32 *isp;
+
+	if (in_interrupt())
+		return;
+
+	local_irq_save(flags);
+
+	if (local_softirq_pending()) {
+		curctx = current_thread_info();
+		irqctx = softirq_ctx[smp_processor_id()];
+		irqctx->tinfo.task = curctx->task;
+		irqctx->tinfo.previous_esp = current_stack_pointer();
+
+		/* build the stack frame on the softirq stack */
+		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+
+
+		asm volatile(
+			"       xchgl   %%ebx,%%esp     \n"
+			"       call    __do_softirq    \n"
+			"       movl    %%ebx,%%esp     \n"
+			: "=b"(isp)
+			: "0"(isp)
+			: "memory", "cc", "edx", "ecx", "eax"
+		);
+	}
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(do_softirq);
+#endif
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index a15be84152c4..7baa4d420b73 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -815,6 +815,8 @@ static int __init do_boot_cpu(int apicid)
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	stack_start.esp = (void *) idle->thread.esp;
 
+	irq_ctx_init(cpu);
+
 	/*
 	 * This grunge runs the startup process for
 	 * the targeted processor.
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe976abcdea9..cf8da7ba4cdb 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -105,12 +105,20 @@ void show_trace(struct task_struct *task, unsigned long * stack)
 #ifdef CONFIG_KALLSYMS
 	printk("\n");
 #endif
-	while (!kstack_end(stack)) {
-		addr = *stack++;
-		if (kernel_text_address(addr)) {
-			printk(" [<%08lx>] ", addr);
-			print_symbol("%s\n", addr);
+	while (1) {
+		struct thread_info *context;
+		context = (struct thread_info*) ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+		while (!kstack_end(stack)) {
+			addr = *stack++;
+			if (kernel_text_address(addr)) {
+				printk(" [<%08lx>] ", addr);
+				print_symbol("%s\n", addr);
+			}
 		}
+		stack = (unsigned long*)context->previous_esp;
+		if (!stack)
+			break;
+		printk(" =======================\n");
 	}
 	printk("\n");
 }
diff --git a/include/asm-alpha/irq.h b/include/asm-alpha/irq.h
index 551c7308c642..566db720000a 100644
--- a/include/asm-alpha/irq.h
+++ b/include/asm-alpha/irq.h
@@ -93,5 +93,8 @@ extern void enable_irq(unsigned int);
 struct pt_regs;
 extern void (*perf_irq)(unsigned long, struct pt_regs *);
 
+struct irqaction;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 
 #endif /* _ALPHA_IRQ_H */
diff --git a/include/asm-arm/irq.h b/include/asm-arm/irq.h
index a89f7345ed39..286be7cf7c63 100644
--- a/include/asm-arm/irq.h
+++ b/include/asm-arm/irq.h
@@ -44,5 +44,9 @@ void disable_irq_wake(unsigned int irq);
 void enable_irq_wake(unsigned int irq);
 int setup_irq(unsigned int, struct irqaction *);
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
 
diff --git a/include/asm-arm26/irq.h b/include/asm-arm26/irq.h
index 68712e576c6f..06bd5a543d13 100644
--- a/include/asm-arm26/irq.h
+++ b/include/asm-arm26/irq.h
@@ -45,6 +45,8 @@ extern void enable_irq(unsigned int);
 int set_irq_type(unsigned int irq, unsigned int type);
 
 int setup_irq(unsigned int, struct irqaction *);
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
 
 #endif
 
diff --git a/include/asm-cris/irq.h b/include/asm-cris/irq.h
index caa45facb1b2..87f342517bb1 100644
--- a/include/asm-cris/irq.h
+++ b/include/asm-cris/irq.h
@@ -14,6 +14,10 @@ extern void enable_irq(unsigned int);
 #define disable_irq_nosync      disable_irq
 #define enable_irq_nosync       enable_irq
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif  /* _ASM_IRQ_H */
 
 
diff --git a/include/asm-h8300/irq.h b/include/asm-h8300/irq.h
index fabde1dd34a1..5027181ed067 100644
--- a/include/asm-h8300/irq.h
+++ b/include/asm-h8300/irq.h
@@ -68,4 +68,8 @@ extern void disable_irq(unsigned int);
 #define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _H8300_IRQ_H_ */
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 69cb661b012a..5649b4a79bb2 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -14,6 +14,7 @@
 #include <linux/sched.h>
 /* include comes from machine specific directory */
 #include "irq_vectors.h"
+#include <asm/thread_info.h>
 
 static __inline__ int irq_canonicalize(int irq)
 {
@@ -30,4 +31,28 @@ extern int can_request_irq(unsigned int, unsigned long flags);
 #define ARCH_HAS_NMI_WATCHDOG		/* See include/linux/nmi.h */
 #endif
 
+#ifdef CONFIG_4KSTACKS
+/*
+ * per-CPU IRQ handling contexts (thread information and stack)
+ */
+union irq_ctx {
+	struct thread_info      tinfo;
+	u32                     stack[THREAD_SIZE/sizeof(u32)];
+};
+
+extern union irq_ctx *hardirq_ctx[NR_CPUS];
+extern union irq_ctx *softirq_ctx[NR_CPUS];
+
+extern void irq_ctx_init(int cpu);
+
+#define __ARCH_HAS_DO_SOFTIRQ
+#else
+#define irq_ctx_init(cpu) do { ; } while (0)
+#endif
+
+struct irqaction;
+struct pt_regs;
+asmlinkage int handle_IRQ_event(unsigned int, struct pt_regs *,
+				struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h
index 76fc36f60ebe..8ec1dae638cb 100644
--- a/include/asm-i386/module.h
+++ b/include/asm-i386/module.h
@@ -60,6 +60,12 @@ struct mod_arch_specific
 #define MODULE_REGPARM ""
 #endif
 
-#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM
+#ifdef CONFIG_4KSTACKS
+#define MODULE_STACKSIZE "4KSTACKS "
+#else
+#define MODULE_STACKSIZE ""
+#endif
+
+#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE
 
 #endif /* _ASM_I386_MODULE_H */
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 75f940011daa..da5c780f2c5c 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -9,6 +9,9 @@
 
 #ifdef __KERNEL__
 
+#include <linux/config.h>
+#include <asm/page.h>
+
 #ifndef __ASSEMBLY__
 #include <asm/processor.h>
 #endif
@@ -29,12 +32,16 @@ struct thread_info {
 	__u32			cpu;		/* current CPU */
 	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
 
+
 	mm_segment_t		addr_limit;	/* thread address space:
 					 	   0-0xBFFFFFFF for user-thead
 						   0-0xFFFFFFFF for kernel-thread
 						*/
 	struct restart_block    restart_block;
 
+	unsigned long           previous_esp;   /* ESP of the previous stack in case
+						   of nested (IRQ) stacks
+						*/
 	__u8			supervisor_stack[0];
 };
 
@@ -53,7 +60,13 @@ struct thread_info {
 #endif
 
 #define PREEMPT_ACTIVE		0x4000000
+#ifdef CONFIG_4KSTACKS
+#define THREAD_SIZE            (4096)
+#else
+#define THREAD_SIZE		(8192)
+#endif
 
+#define STACK_WARN             (THREAD_SIZE/8)
 /*
  * macros/functions for gaining access to the thread information structure
  *
@@ -77,7 +90,6 @@ struct thread_info {
 #define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
 
-#define THREAD_SIZE (2*PAGE_SIZE)
 
 /* how to get the thread information struct from C */
 static inline struct thread_info *current_thread_info(void)
@@ -87,6 +99,14 @@ static inline struct thread_info *current_thread_info(void)
 	return ti;
 }
 
+/* how to get the current stack pointer from C */
+static inline unsigned long current_stack_pointer(void)
+{
+	unsigned long ti;
+	__asm__("movl %%esp,%0; ":"=r" (ti) : );
+	return ti;
+}
+
 /* thread information allocation */
 #ifdef CONFIG_DEBUG_STACK_USAGE
 #define alloc_thread_info(tsk)					\
@@ -108,8 +128,6 @@ static inline struct thread_info *current_thread_info(void)
 
 #else /* !__ASSEMBLY__ */
 
-#define THREAD_SIZE	8192
-
 /* how to get the thread information struct from ASM */
 #define GET_THREAD_INFO(reg) \
 	movl $-THREAD_SIZE, reg; \
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index 79479e2c6966..5d930fdc0bea 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -30,4 +30,8 @@ extern void disable_irq_nosync (unsigned int);
 extern void enable_irq (unsigned int);
 extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IA64_IRQ_H */
diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h
index 02855ca536b0..5889bc919e80 100644
--- a/include/asm-m68k/irq.h
+++ b/include/asm-m68k/irq.h
@@ -124,4 +124,8 @@ extern volatile unsigned int num_spurious;
  */
 extern irq_node_t *new_irq_node(void);
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _M68K_IRQ_H_ */
diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h
index 4c66ba93201a..208ccd969e4b 100644
--- a/include/asm-m68knommu/irq.h
+++ b/include/asm-m68knommu/irq.h
@@ -121,4 +121,8 @@ extern irq_node_t *new_irq_node(void);
 #define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _M68K_IRQ_H_ */
diff --git a/include/asm-mips/irq.h b/include/asm-mips/irq.h
index 90b4ae1258a8..d9667a8fbbfb 100644
--- a/include/asm-mips/irq.h
+++ b/include/asm-mips/irq.h
@@ -31,4 +31,7 @@ extern asmlinkage unsigned int do_IRQ(int irq, struct pt_regs *regs);
 
 extern void init_generic_irq(void);
 
+struct irqaction;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/asm-parisc/irq.h b/include/asm-parisc/irq.h
index 39db70230740..b7acca7de670 100644
--- a/include/asm-parisc/irq.h
+++ b/include/asm-parisc/irq.h
@@ -96,4 +96,7 @@ extern unsigned long txn_alloc_addr(int);
 /* soft power switch support (power.c) */
 extern struct tasklet_struct power_tasklet;
 
+struct irqaction;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif	/* _ASM_PARISC_IRQ_H */
diff --git a/include/asm-ppc/irq.h b/include/asm-ppc/irq.h
index bfa3de404d27..df5b76306f7a 100644
--- a/include/asm-ppc/irq.h
+++ b/include/asm-ppc/irq.h
@@ -211,5 +211,9 @@ extern unsigned long ppc_cached_irq_mask[NR_MASK_WORDS];
 extern unsigned long ppc_lost_interrupts[NR_MASK_WORDS];
 extern atomic_t ppc_n_lost_interrupts;
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
 #endif /* __KERNEL__ */
diff --git a/include/asm-ppc64/irq.h b/include/asm-ppc64/irq.h
index 949e19f96be1..2cd77b4935fb 100644
--- a/include/asm-ppc64/irq.h
+++ b/include/asm-ppc64/irq.h
@@ -75,5 +75,9 @@ static __inline__ int irq_canonicalize(int irq)
 	return irq;
 }
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
 #endif /* __KERNEL__ */
diff --git a/include/asm-s390/irq.h b/include/asm-s390/irq.h
index 25f1808531cc..cac6b3080725 100644
--- a/include/asm-s390/irq.h
+++ b/include/asm-s390/irq.h
@@ -21,6 +21,10 @@ enum interruption_class {
 
 #define touch_nmi_watchdog() do { } while(0)
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* __KERNEL__ */
 #endif
 
diff --git a/include/asm-sh/irq.h b/include/asm-sh/irq.h
index f470f758057a..7dd2a5ae10b5 100644
--- a/include/asm-sh/irq.h
+++ b/include/asm-sh/irq.h
@@ -329,4 +329,8 @@ static inline int generic_irq_demux(int irq)
 #define irq_canonicalize(irq)	(irq)
 #define irq_demux(irq)		__irq_demux(sh_mv.mv_irq_demux(irq))
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* __ASM_SH_IRQ_H */
diff --git a/include/asm-sparc/irq.h b/include/asm-sparc/irq.h
index 5423905ffb40..cee356b0dae3 100644
--- a/include/asm-sparc/irq.h
+++ b/include/asm-sparc/irq.h
@@ -184,4 +184,8 @@ extern struct sun4m_intregs *sun4m_interrupts;
 #define SUN4M_INT_SBUS(x)	(1 << (x+7))
 #define SUN4M_INT_VME(x)	(1 << (x))
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
diff --git a/include/asm-sparc64/irq.h b/include/asm-sparc64/irq.h
index e3ba6bc2cc3e..3aef0ca67750 100644
--- a/include/asm-sparc64/irq.h
+++ b/include/asm-sparc64/irq.h
@@ -150,4 +150,8 @@ static __inline__ unsigned long get_softint(void)
 	return retval;
 }
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
diff --git a/include/asm-um/irq.h b/include/asm-um/irq.h
index cd580acadc71..8300c209a1bc 100644
--- a/include/asm-um/irq.h
+++ b/include/asm-um/irq.h
@@ -32,4 +32,9 @@ extern int um_request_irq(unsigned int irq, int fd, int type,
 			  void (*handler)(int, void *, struct pt_regs *),
 			  unsigned long irqflags,  const char * devname,
 			  void *dev_id);
+
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
diff --git a/include/asm-v850/irq.h b/include/asm-v850/irq.h
index 63e682d70de1..90c83aa053c8 100644
--- a/include/asm-v850/irq.h
+++ b/include/asm-v850/irq.h
@@ -65,4 +65,8 @@ extern void disable_irq_nosync (unsigned int irq);
 
 #endif /* !__ASSEMBLY__ */
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* __V850_IRQ_H__ */
diff --git a/include/asm-x86_64/irq.h b/include/asm-x86_64/irq.h
index ad5445ee7460..37c9fd65c97f 100644
--- a/include/asm-x86_64/irq.h
+++ b/include/asm-x86_64/irq.h
@@ -53,4 +53,8 @@ extern int can_request_irq(unsigned int, unsigned long flags);
 #define ARCH_HAS_NMI_WATCHDOG		/* See include/linux/nmi.h */
 #endif
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index c472cac3029d..265dad4c3cb4 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -3,7 +3,7 @@
 /* These definitions are for GCC v3.x.  */
 #include <linux/compiler-gcc.h>
 
-#if __GNUC_MINOR__ >= 1
+#if __GNUC_MINOR__ >= 1  && __GNUC_MINOR__ < 4
 # define inline		__inline__ __attribute__((always_inline))
 # define __inline__	__inline__ __attribute__((always_inline))
 # define __inline	__inline__ __attribute__((always_inline))
diff --git a/include/linux/irq.h b/include/linux/irq.h
index fa03b836c29a..5bc740d9bc47 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,7 +71,6 @@ extern irq_desc_t irq_desc [NR_IRQS];
 
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
-extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
 extern int setup_irq(unsigned int , struct irqaction * );
 
 extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 81c79736ff9e..58c915c202ff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>
 
+#include <asm/irq.h>
 /*
    - No shared variables, all the data are CPU local.
    - If a softirq needs serialization, let it serialize itself
@@ -69,53 +70,66 @@ static inline void wakeup_softirqd(void)
  */
 #define MAX_SOFTIRQ_RESTART 10
 
-asmlinkage void do_softirq(void)
+asmlinkage void __do_softirq(void)
 {
-	int max_restart = MAX_SOFTIRQ_RESTART;
+	struct softirq_action *h;
 	__u32 pending;
-	unsigned long flags;
+	int max_restart = MAX_SOFTIRQ_RESTART;
 
-	if (in_interrupt())
-		return;
+	pending = local_softirq_pending();
 
-	local_irq_save(flags);
+	local_bh_disable();
+restart:
+	/* Reset the pending bitmask before enabling irqs */
+	local_softirq_pending() = 0;
+
+	local_irq_enable();
+
+	h = softirq_vec;
+
+	do {
+		if (pending & 1)
+			h->action(h);
+		h++;
+		pending >>= 1;
+	} while (pending);
+
+	local_irq_disable();
 
 	pending = local_softirq_pending();
+	if (pending && --max_restart)
+		goto restart;
 
-	if (pending) {
-		struct softirq_action *h;
+	if (pending)
+		wakeup_softirqd();
 
-		local_bh_disable();
-restart:
-		/* Reset the pending bitmask before enabling irqs */
-		local_softirq_pending() = 0;
+	__local_bh_enable();
+}
 
-		local_irq_enable();
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+asmlinkage void do_softirq(void)
+{
+	__u32 pending;
+	unsigned long flags;
 
-		h = softirq_vec;
+	if (in_interrupt())
+		return;
 
-		do {
-			if (pending & 1)
-				h->action(h);
-			h++;
-			pending >>= 1;
-		} while (pending);
+	local_irq_save(flags);
 
-		local_irq_disable();
+	pending = local_softirq_pending();
 
-		pending = local_softirq_pending();
-		if (pending && --max_restart)
-			goto restart;
-		if (pending)
-			wakeup_softirqd();
-		__local_bh_enable();
-	}
+	if (pending)
+		__do_softirq();
 
 	local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(do_softirq);
 
+#endif
+
 void local_bh_enable(void)
 {
 	__local_bh_enable();
-- 
cgit v1.2.3


From 2e061730cce0ec9d6157ee2f548625336647b7db Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:20:27 -0700
Subject: [PATCH] v4l: cropcap ioctl fix

From: Gerd Knorr <kraxel@bytesex.org>

The VIDIOC_CROPCAP ioctl had the wrong R/W bits; this patch fixes it.
---
 drivers/media/video/videodev.c | 3 +++
 include/linux/videodev2.h      | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/videodev.c b/drivers/media/video/videodev.c
index 526fec24ad6c..532c1b4fa3f3 100644
--- a/drivers/media/video/videodev.c
+++ b/drivers/media/video/videodev.c
@@ -160,6 +160,9 @@ video_fix_command(unsigned int cmd)
 	case VIDIOC_G_AUDOUT_OLD:
 		cmd = VIDIOC_G_AUDOUT;
 		break;
+	case VIDIOC_CROPCAP_OLD:
+		cmd = VIDIOC_CROPCAP;
+		break;
 	}
 	return cmd;
 }
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 1bfc45a4a430..a4ab8e826bbe 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -869,7 +869,7 @@ struct v4l2_streamparm
 #define VIDIOC_S_MODULATOR	_IOW  ('V', 55, struct v4l2_modulator)
 #define VIDIOC_G_FREQUENCY	_IOWR ('V', 56, struct v4l2_frequency)
 #define VIDIOC_S_FREQUENCY	_IOW  ('V', 57, struct v4l2_frequency)
-#define VIDIOC_CROPCAP		_IOR  ('V', 58, struct v4l2_cropcap)
+#define VIDIOC_CROPCAP		_IOWR ('V', 58, struct v4l2_cropcap)
 #define VIDIOC_G_CROP		_IOWR ('V', 59, struct v4l2_crop)
 #define VIDIOC_S_CROP		_IOW  ('V', 60, struct v4l2_crop)
 #define VIDIOC_G_JPEGCOMP	_IOR  ('V', 61, struct v4l2_jpegcompression)
@@ -887,6 +887,7 @@ struct v4l2_streamparm
 #define VIDIOC_S_CTRL_OLD      	_IOW  ('V', 28, struct v4l2_control)
 #define VIDIOC_G_AUDIO_OLD     	_IOWR ('V', 33, struct v4l2_audio)
 #define VIDIOC_G_AUDOUT_OLD    	_IOWR ('V', 49, struct v4l2_audioout)
+#define VIDIOC_CROPCAP_OLD     	_IOR  ('V', 58, struct v4l2_cropcap)
 
 #define BASE_VIDIOC_PRIVATE	192		/* 192-255 are private */
 
-- 
cgit v1.2.3


From 831434861116756312a982d2082d91d20fed1de0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:20:39 -0700
Subject: [PATCH] v4l: v4l1-compat fix

From: Gerd Knorr <kraxel@bytesex.org>

Minor tweak in the v4l1 compatibility layer: Make sure that capture actually
is active before going to wait for a frame so we don't block forever.
---
 drivers/media/video/v4l1-compat.c | 15 +++++++++++----
 include/linux/videodev.h          |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/media/video/v4l1-compat.c b/drivers/media/video/v4l1-compat.c
index 0f15efb6de56..441733d0b1b8 100644
--- a/drivers/media/video/v4l1-compat.c
+++ b/drivers/media/video/v4l1-compat.c
@@ -289,6 +289,7 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 {
 	struct v4l2_capability  *cap2 = NULL;
 	struct v4l2_format	*fmt2 = NULL;
+	enum v4l2_buf_type      captype = V4L2_BUF_TYPE_VIDEO_CAPTURE;
 
 	struct v4l2_framebuffer fbuf2;
 	struct v4l2_input	input2;
@@ -465,6 +466,7 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 		fmt2 = kmalloc(sizeof(*fmt2),GFP_KERNEL);
 		memset(fmt2,0,sizeof(*fmt2));
 		fmt2->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+		drv(inode, file, VIDIOC_STREAMOFF, &fmt2->type);
 		err1 = drv(inode, file, VIDIOC_G_FMT, fmt2);
 		if (err1 < 0)
 			dprintk("VIDIOCSWIN / VIDIOC_G_FMT: %d\n",err);
@@ -503,11 +505,10 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 		int *on = arg;
 
 		if (0 == *on) {
-			enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
 			/* dirty hack time.  But v4l1 has no STREAMOFF
 			 * equivalent in the API, and this one at
 			 * least comes close ... */
-			drv(inode, file, VIDIOC_STREAMOFF, &type);
+			drv(inode, file, VIDIOC_STREAMOFF, &captype);
 		}
 		err = drv(inode, file, VIDIOC_OVERLAY, arg);
 		if (err < 0)
@@ -858,7 +859,6 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 	case VIDIOCMCAPTURE: /*  capture a frame  */
 	{
 		struct video_mmap	*mm = arg;
-		enum v4l2_buf_type	type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
 
 		fmt2 = kmalloc(sizeof(*fmt2),GFP_KERNEL);
 		memset(&buf2,0,sizeof(buf2));
@@ -899,7 +899,7 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 			dprintk("VIDIOCMCAPTURE / VIDIOC_QBUF: %d\n",err);
 			break;
 		}
-		err = drv(inode, file, VIDIOC_STREAMON, &type);
+		err = drv(inode, file, VIDIOC_STREAMON, &captype);
 		if (err < 0)
 			dprintk("VIDIOCMCAPTURE / VIDIOC_STREAMON: %d\n",err);
 		break;
@@ -922,6 +922,13 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 			break;
 		}
 
+		/* make sure capture actually runs so we don't block forever */
+		err = drv(inode, file, VIDIOC_STREAMON, &captype);
+		if (err < 0) {
+			dprintk("VIDIOCSYNC / VIDIOC_STREAMON: %d\n",err);
+			break;
+		}
+
 		/*  Loop as long as the buffer is queued, but not done  */
 		while ((buf2.flags &
 			(V4L2_BUF_FLAG_QUEUED | V4L2_BUF_FLAG_DONE))
diff --git a/include/linux/videodev.h b/include/linux/videodev.h
index 4e421d3d25ed..cfcf6f1cd0e2 100644
--- a/include/linux/videodev.h
+++ b/include/linux/videodev.h
@@ -430,6 +430,7 @@ struct video_code
 #define VID_HARDWARE_VICAM      34
 #define VID_HARDWARE_SF16FMR2	35
 #define VID_HARDWARE_W9968CF    36
+#define VID_HARDWARE_SAA7114H   37
 #endif /* __LINUX_VIDEODEV_H */
 
 /*
-- 
cgit v1.2.3


From 0109dc6d2d874ea81bc1e9e34bed612959d70365 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:24:32 -0700
Subject: [PATCH] Improve list.h documentation for _rcu() primitives

From: "Paul E. McKenney" <paulmck@us.ibm.com>

The attached patch improves the documentation of the _rcu list primitives.
---
 include/linux/list.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 5388098449cc..1269d0c64d33 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -104,6 +104,14 @@ static __inline__ void __list_add_rcu(struct list_head * new,
  *
  * Insert a new entry after the specified head.
  * This is good for implementing stacks.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_add_rcu()
+ * or list_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
  */
 static __inline__ void list_add_rcu(struct list_head *new, struct list_head *head)
 {
@@ -117,6 +125,14 @@ static __inline__ void list_add_rcu(struct list_head *new, struct list_head *hea
  *
  * Insert a new entry before the specified head.
  * This is useful for implementing queues.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_add_tail_rcu()
+ * or list_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
  */
 static __inline__ void list_add_tail_rcu(struct list_head *new, struct list_head *head)
 {
@@ -159,6 +175,19 @@ static inline void list_del(struct list_head *entry)
  *
  * In particular, it means that we can not poison the forward 
  * pointers that may still be used for walking the list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_del_rcu()
+ * or list_add_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
+ *
+ * Note that the caller is not permitted to immediately free
+ * the newly deleted entry.  Instead, either synchronize_kernel()
+ * or call_rcu() must be used to defer freeing until an RCU
+ * grace period has elapsed.
  */
 static inline void list_del_rcu(struct list_head *entry)
 {
@@ -384,6 +413,10 @@ static inline void list_splice_init(struct list_head *list,
  * list_for_each_rcu	-	iterate over an rcu-protected list
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_rcu(pos, head) \
 	for (pos = (head)->next, prefetch(pos->next); pos != (head); \
@@ -399,6 +432,10 @@ static inline void list_splice_init(struct list_head *list,
  * @pos:	the &struct list_head to use as a loop counter.
  * @n:		another &struct list_head to use as temporary storage
  * @head:	the head for your list.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_safe_rcu(pos, n, head) \
 	for (pos = (head)->next, n = pos->next; pos != (head); \
@@ -409,6 +446,10 @@ static inline void list_splice_init(struct list_head *list,
  * @pos:	the type * to use as a loop counter.
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_entry_rcu(pos, head, member)			\
 	for (pos = list_entry((head)->next, typeof(*pos), member),	\
@@ -424,6 +465,10 @@ static inline void list_splice_init(struct list_head *list,
  *			continuing after existing point.
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_continue_rcu(pos, head) \
 	for ((pos) = (pos)->next, prefetch((pos)->next); (pos) != (head); \
@@ -485,6 +530,14 @@ static __inline__ void hlist_del(struct hlist_node *n)
  *
  * In particular, it means that we can not poison the forward
  * pointers that may still be used for walking the hash list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_entry().
  */
 static inline void hlist_del_rcu(struct hlist_node *n)
 {
@@ -512,6 +565,26 @@ static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h
 	n->pprev = &h->first; 
 } 
 
+
+/**
+ * hlist_add_head_rcu - adds the specified element to the specified hlist,
+ * while permitting racing traversals.
+ * @n: the element to add to the hash list.
+ * @h: the list to add to.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_entry(), but only if smp_read_barrier_depends()
+ * is used to prevent memory-consistency problems on Alpha CPUs.
+ * Regardless of the type of CPU, the list-traversal primitive
+ * must be guarded by rcu_read_lock().
+ *
+ * OK, so why don't we have an hlist_for_each_entry_rcu()???
+ */
 static __inline__ void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) 
 { 
 	struct hlist_node *first = h->first;
-- 
cgit v1.2.3


From 492361a6d915137590a8eba787dd878d71137358 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:24:44 -0700
Subject: [PATCH] list.h cleanup

- s/__inline__/inline/

- Remove lots of extraneous andi-was-here trailing whitespace
---
 include/linux/list.h | 120 ++++++++++++++++++++++++++-------------------------
 1 file changed, 61 insertions(+), 59 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/list.h b/include/linux/list.h
index 1269d0c64d33..34fd74e050df 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -39,7 +39,7 @@ struct list_head {
 } while (0)
 
 /*
- * Insert a new entry between two known consecutive entries. 
+ * Insert a new entry between two known consecutive entries.
  *
  * This is only for internal list manipulation where we know
  * the prev/next entries already!
@@ -81,14 +81,13 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head)
 }
 
 /*
- * Insert a new entry between two known consecutive entries. 
+ * Insert a new entry between two known consecutive entries.
  *
  * This is only for internal list manipulation where we know
  * the prev/next entries already!
  */
-static __inline__ void __list_add_rcu(struct list_head * new,
-	struct list_head * prev,
-	struct list_head * next)
+static inline void __list_add_rcu(struct list_head * new,
+		struct list_head * prev, struct list_head * next)
 {
 	new->next = next;
 	new->prev = prev;
@@ -113,7 +112,7 @@ static __inline__ void __list_add_rcu(struct list_head * new,
  * the _rcu list-traversal primitives, such as
  * list_for_each_entry_rcu().
  */
-static __inline__ void list_add_rcu(struct list_head *new, struct list_head *head)
+static inline void list_add_rcu(struct list_head *new, struct list_head *head)
 {
 	__list_add_rcu(new, head, head->next);
 }
@@ -134,7 +133,8 @@ static __inline__ void list_add_rcu(struct list_head *new, struct list_head *hea
  * the _rcu list-traversal primitives, such as
  * list_for_each_entry_rcu().
  */
-static __inline__ void list_add_tail_rcu(struct list_head *new, struct list_head *head)
+static inline void list_add_tail_rcu(struct list_head *new,
+					struct list_head *head)
 {
 	__list_add_rcu(new, head->prev, head);
 }
@@ -169,11 +169,11 @@ static inline void list_del(struct list_head *entry)
  * list_del_rcu - deletes entry from list without re-initialization
  * @entry: the element to delete from the list.
  *
- * Note: list_empty on entry does not return true after this, 
+ * Note: list_empty on entry does not return true after this,
  * the entry is in an undefined state. It is useful for RCU based
  * lockfree traversal.
  *
- * In particular, it means that we can not poison the forward 
+ * In particular, it means that we can not poison the forward
  * pointers that may still be used for walking the list.
  *
  * The caller must take whatever precautions are necessary
@@ -202,7 +202,7 @@ static inline void list_del_rcu(struct list_head *entry)
 static inline void list_del_init(struct list_head *entry)
 {
 	__list_del(entry->prev, entry->next);
-	INIT_LIST_HEAD(entry); 
+	INIT_LIST_HEAD(entry);
 }
 
 /**
@@ -335,7 +335,7 @@ static inline void list_splice_init(struct list_head *list,
 #define list_for_each_prev(pos, head) \
 	for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
         	pos = pos->prev, prefetch(pos->prev))
-        	
+
 /**
  * list_for_each_safe	-	iterate over a list safe against removal of list entry
  * @pos:	the &struct list_head to use as a loop counter.
@@ -421,11 +421,11 @@ static inline void list_splice_init(struct list_head *list,
 #define list_for_each_rcu(pos, head) \
 	for (pos = (head)->next, prefetch(pos->next); pos != (head); \
         	pos = pos->next, ({ smp_read_barrier_depends(); 0;}), prefetch(pos->next))
-        	
+
 #define __list_for_each_rcu(pos, head) \
 	for (pos = (head)->next; pos != (head); \
         	pos = pos->next, ({ smp_read_barrier_depends(); 0;}))
-        	
+
 /**
  * list_for_each_safe_rcu	-	iterate over an rcu-protected list safe
  *					against removal of list entry
@@ -461,7 +461,7 @@ static inline void list_splice_init(struct list_head *list,
 
 
 /**
- * list_for_each_continue_rcu	-	iterate over an rcu-protected list 
+ * list_for_each_continue_rcu	-	iterate over an rcu-protected list
  *			continuing after existing point.
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
@@ -474,46 +474,46 @@ static inline void list_splice_init(struct list_head *list,
 	for ((pos) = (pos)->next, prefetch((pos)->next); (pos) != (head); \
         	(pos) = (pos)->next, ({ smp_read_barrier_depends(); 0;}), prefetch((pos)->next))
 
-/* 
- * Double linked lists with a single pointer list head. 
- * Mostly useful for hash tables where the two pointer list head is 
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
  * too wasteful.
  * You lose the ability to access the tail in O(1).
- */ 
+ */
 
-struct hlist_head { 
-	struct hlist_node *first; 
-}; 
+struct hlist_head {
+	struct hlist_node *first;
+};
 
-struct hlist_node { 
-	struct hlist_node *next, **pprev; 
-}; 
+struct hlist_node {
+	struct hlist_node *next, **pprev;
+};
 
-#define HLIST_HEAD_INIT { .first = NULL } 
+#define HLIST_HEAD_INIT { .first = NULL }
 #define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
-#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) 
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
 #define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL)
 
-static __inline__ int hlist_unhashed(const struct hlist_node *h) 
-{ 
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
 	return !h->pprev;
-} 
+}
 
-static __inline__ int hlist_empty(const struct hlist_head *h) 
-{ 
+static inline int hlist_empty(const struct hlist_head *h)
+{
 	return !h->first;
-} 
+}
 
-static __inline__ void __hlist_del(struct hlist_node *n) 
+static inline void __hlist_del(struct hlist_node *n)
 {
 	struct hlist_node *next = n->next;
 	struct hlist_node **pprev = n->pprev;
-	*pprev = next;  
-	if (next) 
+	*pprev = next;
+	if (next)
 		next->pprev = pprev;
-}  
+}
 
-static __inline__ void hlist_del(struct hlist_node *n)
+static inline void hlist_del(struct hlist_node *n)
 {
 	__hlist_del(n);
 	n->next = LIST_POISON1;
@@ -524,7 +524,7 @@ static __inline__ void hlist_del(struct hlist_node *n)
  * hlist_del_rcu - deletes entry from hash list without re-initialization
  * @n: the element to delete from the hash list.
  *
- * Note: list_unhashed() on entry does not return true after this, 
+ * Note: list_unhashed() on entry does not return true after this,
  * the entry is in an undefined state. It is useful for RCU based
  * lockfree traversal.
  *
@@ -545,25 +545,25 @@ static inline void hlist_del_rcu(struct hlist_node *n)
 	n->pprev = LIST_POISON2;
 }
 
-static __inline__ void hlist_del_init(struct hlist_node *n) 
+static inline void hlist_del_init(struct hlist_node *n)
 {
 	if (n->pprev)  {
 		__hlist_del(n);
 		INIT_HLIST_NODE(n);
 	}
-}  
+}
 
 #define hlist_del_rcu_init hlist_del_init
 
-static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h) 
-{ 
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
 	struct hlist_node *first = h->first;
-	n->next = first; 
-	if (first) 
+	n->next = first;
+	if (first)
 		first->pprev = &n->next;
-	h->first = n; 
-	n->pprev = &h->first; 
-} 
+	h->first = n;
+	n->pprev = &h->first;
+}
 
 
 /**
@@ -585,28 +585,30 @@ static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h
  *
  * OK, so why don't we have an hlist_for_each_entry_rcu()???
  */
-static __inline__ void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) 
-{ 
+static inline void hlist_add_head_rcu(struct hlist_node *n,
+					struct hlist_head *h)
+{
 	struct hlist_node *first = h->first;
 	n->next = first;
-	n->pprev = &h->first; 
+	n->pprev = &h->first;
 	smp_wmb();
-	if (first) 
+	if (first)
 		first->pprev = &n->next;
-	h->first = n; 
-} 
+	h->first = n;
+}
 
 /* next must be != NULL */
-static __inline__ void hlist_add_before(struct hlist_node *n, struct hlist_node *next)
+static inline void hlist_add_before(struct hlist_node *n,
+					struct hlist_node *next)
 {
 	n->pprev = next->pprev;
-	n->next = next; 
-	next->pprev = &n->next; 
+	n->next = next;
+	next->pprev = &n->next;
 	*(n->pprev) = n;
 }
 
-static __inline__ void hlist_add_after(struct hlist_node *n,
-				       struct hlist_node *next)
+static inline void hlist_add_after(struct hlist_node *n,
+					struct hlist_node *next)
 {
 	next->next	= n->next;
 	*(next->pprev)	= n;
@@ -618,7 +620,7 @@ static __inline__ void hlist_add_after(struct hlist_node *n,
 /* Cannot easily do prefetch unfortunately */
 #define hlist_for_each(pos, head) \
 	for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \
-	     pos = pos->next) 
+	     pos = pos->next)
 
 #define hlist_for_each_safe(pos, n, head) \
 	for (pos = (head)->first; n = pos ? pos->next : 0, pos; \
-- 
cgit v1.2.3


From 01cc53b25e1883ff537d19adc87097e1833deeaa Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:24:57 -0700
Subject: [PATCH] Non-Exec stack support

From: Kurt Garloff <garloff@suse.de>

A patch to parse the elf binaries for a PT_GNU_STACK section to set the stack
non-executable if possible.  Most parts have been shamelessly stolen from
Ingo Molnar's more ambitious stackshield
http://people.redhat.com/mingo/exec-shield/exec-shield-2.6.4-C9

The toolchain meanwhile has support for marking binaries with a
PT_GNU_STACK section without the x bit as needed.

If no such section is found, we leave the stack to whatever the arch defaults
to.  If there is one, we explicitly disable the VM_EXEC bit if no x bit is
found, and otherwise explicitly enable it.
---
 arch/ia64/ia32/binfmt_elf32.c       | 16 +++++++++++-----
 arch/ia64/ia32/ia32priv.h           |  2 +-
 arch/mips/kernel/irixelf.c          |  2 +-
 arch/s390/kernel/binfmt_elf32.c     |  4 ++--
 arch/s390/kernel/compat_exec.c      |  3 ++-
 arch/sparc64/kernel/binfmt_aout32.c |  2 +-
 arch/x86_64/ia32/ia32_aout.c        |  4 ++--
 arch/x86_64/ia32/ia32_binfmt.c      | 15 ++++++++++-----
 fs/binfmt_aout.c                    |  2 +-
 fs/binfmt_elf.c                     | 12 +++++++++++-
 fs/binfmt_som.c                     |  2 +-
 fs/exec.c                           | 14 +++++++++++---
 include/asm-ia64/pgtable.h          |  3 ++-
 include/linux/binfmts.h             |  8 +++++++-
 include/linux/elf.h                 |  2 ++
 15 files changed, 65 insertions(+), 26 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c
index 7d8624cef402..679e68afd653 100644
--- a/arch/ia64/ia32/binfmt_elf32.c
+++ b/arch/ia64/ia32/binfmt_elf32.c
@@ -35,7 +35,7 @@ extern void ia64_elf32_init (struct pt_regs *regs);
 
 static void elf32_set_personality (void);
 
-#define setup_arg_pages(bprm)		ia32_setup_arg_pages(bprm)
+#define setup_arg_pages(bprm,exec)		ia32_setup_arg_pages(bprm,exec)
 #define elf_map				elf32_map
 
 #undef SET_PERSONALITY
@@ -149,7 +149,7 @@ ia64_elf32_init (struct pt_regs *regs)
 }
 
 int
-ia32_setup_arg_pages (struct linux_binprm *bprm)
+ia32_setup_arg_pages (struct linux_binprm *bprm, int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -178,8 +178,14 @@ ia32_setup_arg_pages (struct linux_binprm *bprm)
 		mpnt->vm_mm = current->mm;
 		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
 		mpnt->vm_end = IA32_STACK_TOP;
-		mpnt->vm_page_prot = PAGE_COPY;
-		mpnt->vm_flags = VM_STACK_FLAGS;
+		if (executable_stack == EXSTACK_ENABLE_X)
+			mpnt->vm_flags = VM_STACK_FLAGS |  VM_EXEC;
+		else if (executable_stack == EXSTACK_DISABLE_X)
+			mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
+		else
+			mpnt->vm_flags = VM_STACK_FLAGS;
+		mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)?
+					PAGE_COPY_EXEC: PAGE_COPY;
 		mpnt->vm_ops = NULL;
 		mpnt->vm_pgoff = 0;
 		mpnt->vm_file = NULL;
@@ -192,7 +198,7 @@ ia32_setup_arg_pages (struct linux_binprm *bprm)
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
-			put_dirty_page(current, page, stack_base, PAGE_COPY);
+			put_dirty_page(current, page, stack_base, mpnt->vm_page_prot);
 		}
 		stack_base += PAGE_SIZE;
 	}
diff --git a/arch/ia64/ia32/ia32priv.h b/arch/ia64/ia32/ia32priv.h
index a445206a8553..e6f95af15972 100644
--- a/arch/ia64/ia32/ia32priv.h
+++ b/arch/ia64/ia32/ia32priv.h
@@ -494,7 +494,7 @@ struct ia32_user_desc {
 struct linux_binprm;
 
 extern void ia32_init_addr_space (struct pt_regs *regs);
-extern int ia32_setup_arg_pages (struct linux_binprm *bprm);
+extern int ia32_setup_arg_pages (struct linux_binprm *bprm, int exec_stack);
 extern unsigned long ia32_do_mmap (struct file *, unsigned long, unsigned long, int, int, loff_t);
 extern void ia32_load_segment_descriptors (struct task_struct *task);
 
diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c
index f925c6514aad..eac9e07f5d0f 100644
--- a/arch/mips/kernel/irixelf.c
+++ b/arch/mips/kernel/irixelf.c
@@ -688,7 +688,7 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	 * change some of these later.
 	 */
 	current->mm->rss = 0;
-	setup_arg_pages(bprm);
+	setup_arg_pages(bprm, EXSTACK_DEFAULT);
 	current->mm->start_stack = bprm->p;
 
 	/* At this point, we assume that the image should be loaded at
diff --git a/arch/s390/kernel/binfmt_elf32.c b/arch/s390/kernel/binfmt_elf32.c
index 8f80960cdda8..0cc8e5be48ba 100644
--- a/arch/s390/kernel/binfmt_elf32.c
+++ b/arch/s390/kernel/binfmt_elf32.c
@@ -115,7 +115,7 @@ static inline int dump_regs32(struct pt_regs *ptregs, elf_gregset_t *regs)
 #include <linux/binfmts.h>
 #include <linux/compat.h>
 
-int setup_arg_pages32(struct linux_binprm *bprm);
+int setup_arg_pages32(struct linux_binprm *bprm, int executable_stack);
 
 #define elf_prstatus elf_prstatus32
 struct elf_prstatus32
@@ -166,7 +166,7 @@ struct elf_prpsinfo32
 
 #undef start_thread
 #define start_thread                    start_thread31 
-#define setup_arg_pages(bprm)           setup_arg_pages32(bprm)
+#define setup_arg_pages(bprm, exec)     setup_arg_pages32(bprm, exec)
 #define elf_map				elf_map32
 
 MODULE_DESCRIPTION("Binary format loader for compatibility with 32bit Linux for S390 binaries,"
diff --git a/arch/s390/kernel/compat_exec.c b/arch/s390/kernel/compat_exec.c
index 33832846833f..162deb2bb007 100644
--- a/arch/s390/kernel/compat_exec.c
+++ b/arch/s390/kernel/compat_exec.c
@@ -37,7 +37,7 @@
 #undef STACK_TOP
 #define STACK_TOP TASK31_SIZE
 
-int setup_arg_pages32(struct linux_binprm *bprm)
+int setup_arg_pages32(struct linux_binprm *bprm, int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -66,6 +66,7 @@ int setup_arg_pages32(struct linux_binprm *bprm)
 		mpnt->vm_mm = mm;
 		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
 		mpnt->vm_end = STACK_TOP;
+		/* executable stack setting would be applied here */
 		mpnt->vm_page_prot = PAGE_COPY;
 		mpnt->vm_flags = VM_STACK_FLAGS;
 		mpnt->vm_ops = NULL;
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c
index dcae86a7228b..4ba5d4801bae 100644
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ b/arch/sparc64/kernel/binfmt_aout32.c
@@ -310,7 +310,7 @@ beyond_if:
 	orig_thr_flags = current_thread_info()->flags;
 	current_thread_info()->flags |= _TIF_32BIT;
 
-	retval = setup_arg_pages(bprm);
+	retval = setup_arg_pages(bprm, EXSTACK_DEFAULT);
 	if (retval < 0) { 
 		current_thread_info()->flags = orig_thr_flags;
 
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 4c2d1100d2d7..040adf6991a2 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -35,7 +35,7 @@
 #undef WARN_OLD
 #undef CORE_DUMP /* probably broken */
 
-extern int ia32_setup_arg_pages(struct linux_binprm *bprm);
+extern int ia32_setup_arg_pages(struct linux_binprm *bprm, int exec_stack);
 
 static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout_library(struct file*);
@@ -395,7 +395,7 @@ beyond_if:
 
 	set_brk(current->mm->start_brk, current->mm->brk);
 
-	retval = ia32_setup_arg_pages(bprm); 
+	retval = ia32_setup_arg_pages(bprm, EXSTACK_DEFAULT);
 	if (retval < 0) { 
 		/* Someone check-me: is this error path enough? */ 
 		send_sig(SIGKILL, current, 0); 
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index 82e9bc2ddc3f..92817f18e39b 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -272,8 +272,8 @@ do {							\
 #define load_elf_binary load_elf32_binary
 
 #define ELF_PLAT_INIT(r, load_addr)	elf32_init(r)
-#define setup_arg_pages(bprm)		ia32_setup_arg_pages(bprm)
-int ia32_setup_arg_pages(struct linux_binprm *bprm);
+#define setup_arg_pages(bprm, exec_stack)	ia32_setup_arg_pages(bprm, exec_stack)
+int ia32_setup_arg_pages(struct linux_binprm *bprm, int executable_stack);
 
 #undef start_thread
 #define start_thread(regs,new_rip,new_rsp) do { \
@@ -325,7 +325,7 @@ static void elf32_init(struct pt_regs *regs)
 	me->thread.es = __USER_DS;
 }
 
-int setup_arg_pages(struct linux_binprm *bprm)
+int setup_arg_pages(struct linux_binprm *bprm, int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -354,7 +354,12 @@ int setup_arg_pages(struct linux_binprm *bprm)
 		mpnt->vm_mm = mm;
 		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
 		mpnt->vm_end = IA32_STACK_TOP;
-		mpnt->vm_flags = vm_stack_flags32; 
+		if (executable_stack == EXSTACK_ENABLE_X)
+			mpnt->vm_flags = vm_stack_flags32 |  VM_EXEC;
+		else if (executable_stack == EXSTACK_DISABLE_X)
+			mpnt->vm_flags = vm_stack_flags32 & ~VM_EXEC;
+		else
+			mpnt->vm_flags = vm_stack_flags32;
  		mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? 
  			PAGE_COPY_EXEC : PAGE_COPY;
 		mpnt->vm_ops = NULL;
@@ -370,7 +375,7 @@ int setup_arg_pages(struct linux_binprm *bprm)
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
-			put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC);
+			put_dirty_page(current,page,stack_base,mpnt->vm_page_prot);
 		}
 		stack_base += PAGE_SIZE;
 	}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 849324bbf3e3..7827c1255848 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -413,7 +413,7 @@ beyond_if:
 
 	set_brk(current->mm->start_brk, current->mm->brk);
 
-	retval = setup_arg_pages(bprm); 
+	retval = setup_arg_pages(bprm, EXSTACK_DEFAULT);
 	if (retval < 0) { 
 		/* Someone check-me: is this error path enough? */ 
 		send_sig(SIGKILL, current, 0); 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e5b79a294c80..c420a261e5f3 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -476,6 +476,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
   	struct exec interp_ex;
 	char passed_fileno[6];
 	struct files_struct *files;
+	int executable_stack = EXSTACK_DEFAULT;
 	
 	/* Get the exec-header */
 	elf_ex = *((struct elfhdr *) bprm->buf);
@@ -599,6 +600,15 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		elf_ppnt++;
 	}
 
+	elf_ppnt = elf_phdata;
+	for (i = 0; i < elf_ex.e_phnum; i++, elf_ppnt++)
+		if (elf_ppnt->p_type == PT_GNU_STACK) {
+			if (elf_ppnt->p_flags & PF_X)
+				executable_stack = EXSTACK_ENABLE_X;
+			else
+				executable_stack = EXSTACK_DISABLE_X;
+		}
+
 	/* Some simple consistency checks for the interpreter */
 	if (elf_interpreter) {
 		interpreter_type = INTERPRETER_ELF | INTERPRETER_AOUT;
@@ -674,7 +684,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	   change some of these later */
 	current->mm->rss = 0;
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
-	retval = setup_arg_pages(bprm);
+	retval = setup_arg_pages(bprm, executable_stack);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
 		goto out_free_dentry;
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index a4e7b03ff836..cabf3ccc09e2 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -254,7 +254,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	set_binfmt(&som_format);
 	compute_creds(bprm);
-	setup_arg_pages(bprm);
+	setup_arg_pages(bprm, EXSTACK_DEFAULT);
 
 	create_som_tables(bprm);
 
diff --git a/fs/exec.c b/fs/exec.c
index 62bf2c537abd..26e3392b6369 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -342,7 +342,7 @@ out_sig:
 	return;
 }
 
-int setup_arg_pages(struct linux_binprm *bprm)
+int setup_arg_pages(struct linux_binprm *bprm, int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -425,8 +425,16 @@ int setup_arg_pages(struct linux_binprm *bprm)
 		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
 		mpnt->vm_end = STACK_TOP;
 #endif
-		mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7];
-		mpnt->vm_flags = VM_STACK_FLAGS;
+		/* Adjust stack execute permissions; explicitly enable
+		 * for EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X
+		 * and leave alone (arch default) otherwise. */
+		if (unlikely(executable_stack == EXSTACK_ENABLE_X))
+			mpnt->vm_flags = VM_STACK_FLAGS |  VM_EXEC;
+		else if (executable_stack == EXSTACK_DISABLE_X)
+			mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
+		else
+			mpnt->vm_flags = VM_STACK_FLAGS;
+		mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7];
 		mpnt->vm_ops = NULL;
 		mpnt->vm_pgoff = 0;
 		mpnt->vm_file = NULL;
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index 86d7f7f91bfb..bec5c8cd0079 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -119,7 +119,8 @@
 #define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_A)
 #define PAGE_SHARED	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RW)
 #define PAGE_READONLY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R)
-#define PAGE_COPY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX)
+#define PAGE_COPY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R)
+#define PAGE_COPY_EXEC	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX)
 #define PAGE_GATE	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX)
 #define PAGE_KERNEL	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX)
 #define PAGE_KERNELRX	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 3d9a86eff6ab..60726b29603c 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -58,7 +58,13 @@ extern int prepare_binprm(struct linux_binprm *);
 extern void remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *,struct pt_regs *);
 extern int flush_old_exec(struct linux_binprm * bprm);
-extern int setup_arg_pages(struct linux_binprm * bprm);
+
+/* Stack area protections */
+#define EXSTACK_DEFAULT   0	/* Whatever the arch defaults to */
+#define EXSTACK_DISABLE_X 1	/* Disable executable stacks */
+#define EXSTACK_ENABLE_X  2	/* Enable executable stacks */
+
+extern int setup_arg_pages(struct linux_binprm * bprm, int executable_stack);
 extern int copy_strings(int argc,char __user * __user * argv,struct linux_binprm *bprm); 
 extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
 extern void compute_creds(struct linux_binprm *binprm);
diff --git a/include/linux/elf.h b/include/linux/elf.h
index 2f8005729fb6..7f21bfaa2e71 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -35,6 +35,8 @@ typedef __s64	Elf64_Sxword;
 #define PT_HIPROC  0x7fffffff
 #define PT_GNU_EH_FRAME		0x6474e550
 
+#define PT_GNU_STACK	(PT_LOOS + 0x474e551)
+
 /* These constants define the different elf file types */
 #define ET_NONE   0
 #define ET_REL    1
-- 
cgit v1.2.3


From 7a10b433f0911d25b4fd1d1b033cbd119be8fc5f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:25:23 -0700
Subject: [PATCH] reiserfs: support for nested transactions

From: Chris Mason <mason@suse.com>

reiserfs support for nested transactions.  This originally came from Peter
Braam for 2.4.x and was ported forward by Jeff Mahoney.
---
 fs/reiserfs/inode.c            |  4 +++
 fs/reiserfs/journal.c          | 78 +++++++++++++++++++++++++++++++++++++++---
 fs/reiserfs/namei.c            |  1 -
 include/linux/reiserfs_fs_sb.h | 12 ++++---
 4 files changed, 86 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c01847228d2c..d1c8a83a7d66 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -206,6 +206,10 @@ static int file_capable (struct inode * inode, long block)
   struct super_block *s = th->t_super ;
   int len = th->t_blocks_allocated ;
 
+  /* we cannot restart while nested */
+  if (th->t_refcount > 1) {
+      return  ;
+  }
   pathrelse(path) ;
   reiserfs_update_sd(th, inode) ;
   journal_end(th, s, len) ;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 02351f60808a..53d425fd8ea5 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2157,6 +2157,9 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int n
   time_t now = get_seconds() ;
   if (reiserfs_dont_log(th->t_super)) 
     return 0 ;
+  /* cannot restart while nested */
+  if (th->t_refcount > 1)
+    return 0 ;
   if ( SB_JOURNAL(th->t_super)->j_must_wait > 0 ||
        (SB_JOURNAL(th->t_super)->j_len_alloc + new_alloc) >= SB_JOURNAL_MAX_BATCH(th->t_super) || 
        atomic_read(&(SB_JOURNAL(th->t_super)->j_jlock)) ||
@@ -2212,6 +2215,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct sup
     return 0 ;
   }
   PROC_INFO_INC( p_s_sb, journal.journal_being );
+  /* set here for journal_join */
+  th->t_refcount = 1;
+  th->t_super = p_s_sb ;
 
 relock:
   lock_journal(p_s_sb) ;
@@ -2268,9 +2274,7 @@ relock:
   SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ;
   th->t_blocks_logged = 0 ;
   th->t_blocks_allocated = nblocks ;
-  th->t_super = p_s_sb ;
   th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
-  th->t_caller = "Unknown" ;
   unlock_journal(p_s_sb) ;
   p_s_sb->s_dirt = 1; 
   return 0 ;
@@ -2278,11 +2282,47 @@ relock:
 
 
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
+  struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+  /* this keeps do_journal_end from NULLing out the current->journal_info
+  ** pointer
+  */
+  th->t_handle_save = cur_th ;
+  if (cur_th && cur_th->t_refcount > 1) {
+      BUG() ;
+  }
   return do_journal_begin_r(th, p_s_sb, nblocks, 1) ;
 }
 
 int journal_begin(struct reiserfs_transaction_handle *th, struct super_block  * p_s_sb, unsigned long nblocks) {
-  return do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
+    struct reiserfs_transaction_handle *cur_th = current->journal_info ;
+    int ret ;
+
+    th->t_handle_save = NULL ;
+    if (cur_th) {
+	/* we are nesting into the current transaction */
+	if (cur_th->t_super == p_s_sb) {
+	      cur_th->t_refcount++ ;
+	      memcpy(th, cur_th, sizeof(*th));
+	      if (th->t_refcount <= 1)
+		      printk("BAD: refcount <= 1, but journal_info != 0\n");
+	      return 0;
+	} else {
+	    /* we've ended up with a handle from a different filesystem.
+	    ** save it and restore on journal_end.  This should never
+	    ** really happen...
+	    */
+	    reiserfs_warning("clm-2100: nesting info a different FS\n") ;
+	    th->t_handle_save = current->journal_info ;
+	    current->journal_info = th;
+	}
+    } else {
+	current->journal_info = th;
+    }
+    ret = do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
+    if (current->journal_info != th)
+        BUG() ;
+    return ret ;
 }
 
 /* not used at all */
@@ -2422,7 +2462,26 @@ int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct supe
 }
 
 int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
-  return do_journal_end(th, p_s_sb, nblocks, 0) ;
+  if (!current->journal_info && th->t_refcount > 1)
+    printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount);
+  if (th->t_refcount > 1) {
+    struct reiserfs_transaction_handle *cur_th = current->journal_info ;
+
+    /* we aren't allowed to close a nested transaction on a different
+    ** filesystem from the one in the task struct
+    */
+    if (cur_th->t_super != th->t_super)
+      BUG() ;
+
+    th->t_refcount--;
+    if (th != cur_th) {
+      memcpy(current->journal_info, th, sizeof(*th));
+      th->t_trans_id = 0;
+    }
+    return 0;
+  } else {
+    return do_journal_end(th, p_s_sb, nblocks, 0) ;
+  }
 }
 
 /* removes from the current transaction, relsing and descrementing any counters.  
@@ -2520,6 +2579,10 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) {
 */
 int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
 
+  /* you can't sync while nested; doing so is very, very bad */
+  if (th->t_refcount > 1) {
+    BUG() ;
+  }
   if (SB_JOURNAL(p_s_sb)->j_len == 0) {
     reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
     journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
@@ -2901,6 +2964,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   struct reiserfs_super_block *rs ; 
   int trans_half ;
 
+  if (th->t_refcount > 1)
+    BUG() ;
+
+  current->journal_info = th->t_handle_save;
   if (reiserfs_dont_log(th->t_super)) {
     return 0 ;
   }
@@ -2938,8 +3005,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   }
 
 #ifdef REISERFS_PREALLOCATE
+  /* quota ops might need to nest, setup the journal_info pointer for them */
+  current->journal_info = th ;
   reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
 				      * the transaction */
+  current->journal_info = th->t_handle_save ;
 #endif
   
   rs = SB_DISK_SUPER_BLOCK(p_s_sb) ;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index bdbe89bf99f1..70dec0317a1f 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -575,7 +575,6 @@ static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode,
 
     reiserfs_write_lock(dir->i_sb);
     journal_begin(&th, dir->i_sb, jbegin_count) ;
-    th.t_caller = "create" ;
     retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode);
     if (retval) {
         goto out_failed;
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index 4c675f30a8ae..e6d9fefce42c 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -157,13 +157,17 @@ struct reiserfs_list_bitmap {
 ** transaction handle which is passed around for all journal calls
 */
 struct reiserfs_transaction_handle {
-				/* ifdef it. -Hans */
-  char *t_caller ;              /* debugging use */
+  struct super_block *t_super ; /* super for this FS when journal_begin was
+				   called; saves calls to reiserfs_get_super.
+				   Also used by nested transactions to make
+				   sure they are nesting on the right FS.
+				   _must_ be first in the handle
+				*/
+  int t_refcount;
   int t_blocks_logged ;         /* number of blocks this writer has logged */
   int t_blocks_allocated ;      /* number of blocks this writer allocated */
   unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
-  struct super_block *t_super ; /* super for this FS when journal_begin was 
-                                   called. saves calls to reiserfs_get_super */
+  void *t_handle_save ;		/* save existing current->journal_info */
   int displace_new_blocks:1;	/* if new block allocation occurres, that block
 				   should be displaced from others */
 
-- 
cgit v1.2.3


From 8f57688237995959aee38d48a0c92e203dbec676 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:25:37 -0700
Subject: [PATCH] reiserfs: cleanups

From: Chris Mason <mason@suse.com>

reiserfs cleanup, get rid of old debugging code.
---
 fs/reiserfs/do_balan.c         |  2 --
 fs/reiserfs/file.c             |  3 ---
 fs/reiserfs/fix_node.c         |  5 ----
 fs/reiserfs/inode.c            | 12 ---------
 fs/reiserfs/journal.c          | 55 ------------------------------------------
 fs/reiserfs/namei.c            | 18 --------------
 fs/reiserfs/prints.c           |  1 -
 include/linux/reiserfs_fs.h    |  2 --
 include/linux/reiserfs_fs_sb.h |  1 -
 9 files changed, 99 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index eb23eb4ceda7..c90f0edbc167 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -54,9 +54,7 @@ inline void do_balance_mark_leaf_dirty (struct tree_balance * tb,
 	    tb->need_balance_dirty = 1;
 	}
     } else {
-	int windex = push_journal_writer("do_balance") ;
 	journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ;
-	pop_journal_writer(windex) ;
     }
 }
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 47503bb3a3b3..99321f2fcdf6 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -29,7 +29,6 @@ static int reiserfs_file_release (struct inode * inode, struct file * filp)
 {
 
     struct reiserfs_transaction_handle th ;
-    int windex ;
 
     if (!S_ISREG (inode->i_mode))
 	BUG ();
@@ -59,9 +58,7 @@ static int reiserfs_file_release (struct inode * inode, struct file * filp)
 	   appended (we append by unformatted node only) or its direct
 	   item(s) had to be converted, then it may have to be
 	   indirect2direct converted */
-	windex = push_journal_writer("file_release") ;
 	reiserfs_truncate_file(inode, 0) ;
-	pop_journal_writer(windex) ;
     }
     up (&inode->i_sem); 
     reiserfs_write_unlock(inode->i_sb);
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index e16c276dc82a..95a429ab77d6 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2280,7 +2280,6 @@ int fix_nodes (int n_op_mode,
     ** during wait_tb_buffers_run
     */
     int wait_tb_buffers_run = 0 ; 
-    int windex ;
     struct buffer_head  * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
 
     ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes;
@@ -2407,10 +2406,7 @@ int fix_nodes (int n_op_mode,
 		p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
     }
 
-    
-    windex = push_journal_writer("fix_nodes") ;
     if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
-	pop_journal_writer(windex) ;
 	if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
 	    wait_tb_buffers_run = 1 ;
 	    n_ret_value = REPEAT_SEARCH ;
@@ -2420,7 +2416,6 @@ int fix_nodes (int n_op_mode,
 	}
     } else {
 	wait_tb_buffers_run = 1 ;
-	pop_journal_writer(windex) ;
 	goto repeat; 
     }
 
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d1c8a83a7d66..e1c7928d0633 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -29,9 +29,7 @@ static int reiserfs_get_block (struct inode * inode, sector_t block,
 void reiserfs_delete_inode (struct inode * inode)
 {
     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2; 
-    int windex ;
     struct reiserfs_transaction_handle th ;
-
   
     reiserfs_write_lock(inode->i_sb);
 
@@ -41,10 +39,8 @@ void reiserfs_delete_inode (struct inode * inode)
 
 	journal_begin(&th, inode->i_sb, jbegin_count) ;
 	reiserfs_update_inode_transaction(inode) ;
-	windex = push_journal_writer("delete_inode") ;
 
 	reiserfs_delete_object (&th, inode);
-	pop_journal_writer(windex) ;
 
 	journal_end(&th, inode->i_sb, jbegin_count) ;
 
@@ -561,7 +557,6 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
     __u32 * item;
     int done;
     int fs_gen;
-    int windex ;
     struct reiserfs_transaction_handle th ;
     /* space reserved in transaction batch: 
         . 3 balancings in direct->indirect conversion
@@ -607,8 +602,6 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	 (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
 	REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
 
-    windex = push_journal_writer("reiserfs_get_block") ;
-  
     /* set the key of the first byte in the 'block'-th block of file */
     make_cpu_key (&key, inode, new_offset,
 		  TYPE_ANY, 3/*key length*/);
@@ -687,7 +680,6 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	}
 	set_block_dev_mapped(bh_result, unfm_ptr, inode);
 	pathrelse (&path);
-	pop_journal_writer(windex) ;
 	if (transaction_started)
 	    journal_end(&th, inode->i_sb, jbegin_count) ;
 
@@ -933,7 +925,6 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
       reiserfs_update_sd(&th, inode) ;
       journal_end(&th, inode->i_sb, jbegin_count) ;
     }
-    pop_journal_writer(windex) ;
     reiserfs_write_unlock(inode->i_sb);
     reiserfs_check_path(&path) ;
     return retval;
@@ -1836,7 +1827,6 @@ unlock:
 */
 void reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
     struct reiserfs_transaction_handle th ;
-    int windex ;
     /* we want the offset for the first byte after the end of the file */
     unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ;
     unsigned blocksize = p_s_inode->i_sb->s_blocksize ;
@@ -1871,14 +1861,12 @@ void reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
        cut_from_item. 1 is for update_sd */
     journal_begin(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
     reiserfs_update_inode_transaction(p_s_inode) ;
-    windex = push_journal_writer("reiserfs_vfs_truncate_file") ;
     if (update_timestamps)
 	    /* we are doing real truncate: if the system crashes before the last
 	       transaction of truncating gets committed - on reboot the file
 	       either appears truncated properly or not truncated at all */
 	add_save_link (&th, p_s_inode, 1);
     reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
-    pop_journal_writer(windex) ;
     journal_end(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
 
     if (update_timestamps)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 53d425fd8ea5..95cf46212d68 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -408,7 +408,6 @@ void reiserfs_check_lock_depth(char *caller) {
 #ifdef CONFIG_SMP
   if (current->lock_depth < 0) {
     printk("%s called without kernel lock held\n", caller) ;
-    show_reiserfs_locks() ;
     BUG() ;
   }
 #else
@@ -444,52 +443,6 @@ static inline struct reiserfs_journal_cnode *get_journal_hash(struct super_block
   return cn ;
 }
 
-/* once upon a time, the journal would deadlock.  a lot.  Now, when
-** CONFIG_REISERFS_CHECK is defined, anytime someone enters a
-** transaction, it pushes itself into this ugly static list, and pops
-** itself off before calling journal_end.  I made a SysRq key to dump
-** the list, and tell me what the writers are when I'm deadlocked.  */
-
-				/* are you depending on the compiler
-                                   to optimize this function away
-                                   everywhere it is called? It is not
-                                   obvious how this works, but I
-                                   suppose debugging code need not be
-                                   clear.  -Hans */
-static char *journal_writers[512] ;
-int push_journal_writer(char *s) {
-#ifdef CONFIG_REISERFS_CHECK
-  int i ;
-  for (i = 0 ; i < 512 ; i++) {
-    if (!journal_writers[i]) {
-      journal_writers[i] = s ;
-      return i ;
-    }
-  }
-  return -1 ;
-#else
-  return 0 ;
-#endif
-}
-int pop_journal_writer(int index) {
-#ifdef CONFIG_REISERFS_CHECK
-  if (index >= 0) {
-    journal_writers[index] = NULL ;
-  }
-#endif
-  return 0 ;
-}
-
-int dump_journal_writers(void) {
-  int i ;
-  for (i = 0 ; i < 512 ; i++) {
-    if (journal_writers[i]) {
-      printk("%d: %s\n", i, journal_writers[i]) ;
-    }
-  }
-  return 0 ;
-}
-
 /*
 ** this actually means 'can this block be reallocated yet?'.  If you set search_all, a block can only be allocated
 ** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever
@@ -2095,7 +2048,6 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   journal_list_init(p_s_sb) ;
 
   memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
-  memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */
 
   INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_dirty_buffers) ;
   spin_lock_init(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock) ;
@@ -2372,7 +2324,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
                             buffer_locked(bh) ? ' ' : '!',
 			    buffer_dirty(bh) ? ' ' : '!',
 			    buffer_journal_dirty(bh) ? ' ' : '!') ;
-    show_reiserfs_locks() ;
   }
   count_already_incd = clear_prepared_bits(bh) ;
 
@@ -2590,12 +2541,6 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block
   return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ;
 }
 
-int show_reiserfs_locks(void) {
-
-  dump_journal_writers() ;
-  return 0 ;
-}
-
 /*
 ** used to get memory back from async commits that are floating around
 ** and to reclaim any blocks deleted but unusable because their commits
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 70dec0317a1f..5dae18f5b8e9 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -737,7 +737,6 @@ static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
 {
     int retval;
     struct inode * inode;
-    int windex ;
     struct reiserfs_transaction_handle th ;
     int jbegin_count; 
     INITIALIZE_PATH (path);
@@ -749,7 +748,6 @@ static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
 
     reiserfs_write_lock(dir->i_sb);
     journal_begin(&th, dir->i_sb, jbegin_count) ;
-    windex = push_journal_writer("reiserfs_rmdir") ;
 
     de.de_gen_number_bit_string = 0;
     if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) {
@@ -798,7 +796,6 @@ static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
     /* prevent empty directory from getting lost */
     add_save_link (&th, inode, 0/* not truncate */);
 
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_check_path(&path) ;
     reiserfs_write_unlock(dir->i_sb);
@@ -809,7 +806,6 @@ static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
        reiserfs_cut_from_item, or reiserfs_cut_from_item does not
        release path if operation was not complete */
     pathrelse (&path);
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_write_unlock(dir->i_sb);
     return retval;	
@@ -821,7 +817,6 @@ static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
     struct inode * inode;
     struct reiserfs_dir_entry de;
     INITIALIZE_PATH (path);
-    int windex ;
     struct reiserfs_transaction_handle th ;
     int jbegin_count;
     unsigned long savelink;
@@ -834,7 +829,6 @@ static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
 
     reiserfs_write_lock(dir->i_sb);
     journal_begin(&th, dir->i_sb, jbegin_count) ;
-    windex = push_journal_writer("reiserfs_unlink") ;
 	
     de.de_gen_number_bit_string = 0;
     if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) {
@@ -887,7 +881,6 @@ static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
        /* prevent file from getting lost */
        add_save_link (&th, inode, 0/* not truncate */);
 
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_check_path(&path) ;
     reiserfs_write_unlock(dir->i_sb);
@@ -895,7 +888,6 @@ static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
 
  end_unlink:
     pathrelse (&path);
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_check_path(&path) ;
     reiserfs_write_unlock(dir->i_sb);
@@ -978,7 +970,6 @@ static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct
 {
     int retval;
     struct inode *inode = old_dentry->d_inode;
-    int windex ;
     struct reiserfs_transaction_handle th ;
     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; 
 
@@ -996,7 +987,6 @@ static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct
     inode->i_nlink++;
 
     journal_begin(&th, dir->i_sb, jbegin_count) ;
-    windex = push_journal_writer("reiserfs_link") ;
 
     /* create new entry */
     retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len,
@@ -1007,7 +997,6 @@ static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct
 
     if (retval) {
 	inode->i_nlink--;
-	pop_journal_writer(windex) ;
 	journal_end(&th, dir->i_sb, jbegin_count) ;
 	reiserfs_write_unlock(dir->i_sb);
 	return retval;
@@ -1018,7 +1007,6 @@ static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct
 
     atomic_inc(&inode->i_count) ;
     d_instantiate(dentry, inode);
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_write_unlock(dir->i_sb);
     return 0;
@@ -1082,7 +1070,6 @@ static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
     struct item_head new_entry_ih, old_entry_ih, dot_dot_ih ;
     struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
     struct inode * old_inode, * new_dentry_inode;
-    int windex ;
     struct reiserfs_transaction_handle th ;
     int jbegin_count ; 
     umode_t old_inode_mode;
@@ -1150,7 +1137,6 @@ static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
     }
 
     journal_begin(&th, old_dir->i_sb, jbegin_count) ;
-    windex = push_journal_writer("reiserfs_rename") ;
 
     /* add new entry (or find the existing one) */
     retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len, 
@@ -1161,7 +1147,6 @@ static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
 			    "vs-7050: new entry is found, new inode == 0\n");
 	}
     } else if (retval) {
-	pop_journal_writer(windex) ;
 	journal_end(&th, old_dir->i_sb, jbegin_count) ;
 	reiserfs_write_unlock(old_dir->i_sb);
 	return retval;
@@ -1314,14 +1299,11 @@ static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
 	reiserfs_update_sd (&th, new_dentry_inode);
     }
 
-    pop_journal_writer(windex) ;
     journal_end(&th, old_dir->i_sb, jbegin_count) ;
     reiserfs_write_unlock(old_dir->i_sb);
     return 0;
 }
 
-
-
 /*
  * directories can handle most operations...
  */
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 862311f0365e..ac20f2dc94af 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -333,7 +333,6 @@ extern struct tree_balance * cur_tb;
 
 void reiserfs_panic (struct super_block * sb, const char * fmt, ...)
 {
-  show_reiserfs_locks() ;
   do_reiserfs_warning(fmt);
   printk ( KERN_EMERG "%s", error_buf);
   BUG ();
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 4e2b898a8d98..e4695e7b7ba3 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1727,8 +1727,6 @@ int journal_end(struct reiserfs_transaction_handle *, struct super_block *, unsi
 int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
 int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
 int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr) ;
-int push_journal_writer(char *w) ;
-int pop_journal_writer(int windex) ;
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index e6d9fefce42c..b848ccd7ed41 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -475,7 +475,6 @@ void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
 int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
 int flush_old_commits(struct super_block *s, int) ;
-int show_reiserfs_locks(void) ;
 int reiserfs_resize(struct super_block *, unsigned long) ;
 
 #define CARRY_ON                0
-- 
cgit v1.2.3


From 7c563ced265e3134a5c5c5b7ca2b31218993a204 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:25:50 -0700
Subject: [PATCH] reiserfs: logging rework

From: Chris Mason <mason@suse.com>

reiserfs logging rework, making things much faster for small transactions.
Metadata buffers are dirtied when they are safe to write, so normal kernel
mechanisms can contribute to log cleaning.
---
 fs/reiserfs/do_balan.c         |   25 +-
 fs/reiserfs/fix_node.c         |   34 +-
 fs/reiserfs/ibalance.c         |    2 -
 fs/reiserfs/inode.c            |    4 +-
 fs/reiserfs/journal.c          | 1606 ++++++++++++++++++++++------------------
 fs/reiserfs/objectid.c         |    3 -
 fs/reiserfs/procfs.c           |    5 +-
 fs/reiserfs/super.c            |   31 +-
 include/linux/reiserfs_fs.h    |   29 +-
 include/linux/reiserfs_fs_i.h  |    4 +-
 include/linux/reiserfs_fs_sb.h |   70 +-
 11 files changed, 967 insertions(+), 846 deletions(-)

(limited to 'include/linux')

diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index c90f0edbc167..60baf14b580b 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -30,32 +30,11 @@ struct tree_balance * cur_tb = NULL; /* detects whether more than one
                                         is interrupting do_balance */
 #endif
 
-/*
- * AKPM: The __mark_buffer_dirty() call here will not
- * put the buffer on the dirty buffer LRU because we've just
- * set BH_Dirty.  That's a thinko in reiserfs.
- *
- * I'm reluctant to "fix" this bug because that would change
- * behaviour.  Using mark_buffer_dirty() here would make the
- * buffer eligible for VM and periodic writeback, which may
- * violate ordering constraints.  I'll just leave the code
- * as-is by removing the __mark_buffer_dirty call altogether.
- *
- * Chris says this code has "probably never been run" anyway.
- * It is due to go away.
- */
-
 inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, 
 					struct buffer_head * bh, int flag)
 {
-    if (reiserfs_dont_log(tb->tb_sb)) {
-	if (!test_set_buffer_dirty(bh)) {
-//	    __mark_buffer_dirty(bh) ;
-	    tb->need_balance_dirty = 1;
-	}
-    } else {
-	journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ;
-    }
+    journal_mark_dirty(tb->transaction_handle,
+                       tb->transaction_handle->t_super, bh) ;
 }
 
 #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 95a429ab77d6..b40c7de1c96f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2106,9 +2106,9 @@ static void tb_buffer_sanity_check (struct super_block * p_s_sb,
 {;}
 #endif
 
-static void clear_all_dirty_bits(struct super_block *s, 
+static int clear_all_dirty_bits(struct super_block *s,
                                  struct buffer_head *bh) {
-  reiserfs_prepare_for_journal(s, bh, 0) ;
+  return reiserfs_prepare_for_journal(s, bh, 0) ;
 }
 
 static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
@@ -2137,11 +2137,11 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 					    p_s_tb->tb_path->path_length - i);
 		}
 #endif
-		clear_all_dirty_bits(p_s_tb->tb_sb, 
-				     PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ;
-
-		if ( buffer_locked (PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) )
+		if (!clear_all_dirty_bits(p_s_tb->tb_sb,
+				     PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)))
+		{
 		    locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
+		}
 	    }
 	}
 
@@ -2151,22 +2151,19 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 
 		if ( p_s_tb->L[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]) ;
-		    if ( buffer_locked (p_s_tb->L[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]))
 			locked = p_s_tb->L[i];
 		}
 
 		if ( !locked && p_s_tb->FL[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]) ;
-		    if ( buffer_locked (p_s_tb->FL[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]))
 			locked = p_s_tb->FL[i];
 		}
 
 		if ( !locked && p_s_tb->CFL[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]) ;
-		    if ( buffer_locked (p_s_tb->CFL[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]))
 			locked = p_s_tb->CFL[i];
 		}
 
@@ -2176,23 +2173,20 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 
 		if ( p_s_tb->R[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]) ;
-		    if ( buffer_locked (p_s_tb->R[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]))
 			locked = p_s_tb->R[i];
 		}
 
        
 		if ( !locked && p_s_tb->FR[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]) ;
-		    if ( buffer_locked (p_s_tb->FR[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]))
 			locked = p_s_tb->FR[i];
 		}
 
 		if ( !locked && p_s_tb->CFR[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]) ;
-		    if ( buffer_locked (p_s_tb->CFR[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]))
 			locked = p_s_tb->CFR[i];
 		}
 	    }
@@ -2207,10 +2201,8 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 	*/
 	for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { 
 	    if ( p_s_tb->FEB[i] ) {
-		clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]) ;
-		if (buffer_locked(p_s_tb->FEB[i])) {
+		if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]))
 		    locked = p_s_tb->FEB[i] ;
-		}
 	    }
 	}
 
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index 78fc3b301c22..3df6dda7d776 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -633,7 +633,6 @@ static void balance_internal_when_delete (struct tree_balance * tb,
 		/* use check_internal if new root is an internal node */
 		check_internal (new_root);
 	    /*&&&&&&&&&&&&&&&&&&&&&&*/
-	    tb->tb_sb->s_dirt = 1;
 
 	    /* do what is needed for buffer thrown from tree */
 	    reiserfs_invalidate_buffer(tb, tbSh);
@@ -951,7 +950,6 @@ int balance_internal (struct tree_balance * tb,			/* tree_balance structure 		*/
         PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr );
         PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 );
 	do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
-	tb->tb_sb->s_dirt = 1;
     }
 	
     if ( tb->blknum[h] == 2 ) {
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index e1c7928d0633..06635c7f18a9 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -964,7 +964,7 @@ static void init_inode (struct inode * inode, struct path * path)
     REISERFS_I(inode)->i_prealloc_block = 0;
     REISERFS_I(inode)->i_prealloc_count = 0;
     REISERFS_I(inode)->i_trans_id = 0;
-    REISERFS_I(inode)->i_trans_index = 0;
+    REISERFS_I(inode)->i_jl = NULL;
 
     if (stat_data_v1 (ih)) {
 	struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
@@ -1621,7 +1621,7 @@ int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
     REISERFS_I(inode)->i_prealloc_block = 0;
     REISERFS_I(inode)->i_prealloc_count = 0;
     REISERFS_I(inode)->i_trans_id = 0;
-    REISERFS_I(inode)->i_trans_index = 0;
+    REISERFS_I(inode)->i_jl = 0;
     REISERFS_I(inode)->i_attrs =
 	REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
     sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 95cf46212d68..cfff6ec0871f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -32,13 +32,6 @@
 **                      around too long.
 **		     -- Note, if you call this as an immediate flush from 
 **		        from within kupdate, it will ignore the immediate flag
-**
-** The commit thread -- a writer process for async commits.  It allows a 
-**                      a process to request a log flush on a task queue.
-**                      the commit will happen once the commit thread wakes up.
-**                      The benefit here is the writer (with whatever
-**                      related locks it has) doesn't have to wait for the
-**                      log blocks to hit disk if it doesn't want to.
 */
 
 #include <linux/config.h>
@@ -60,6 +53,14 @@
 #include <linux/suspend.h>
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
+#include <linux/writeback.h>
+
+
+/* gets a struct reiserfs_journal_list * from a list head */
+#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_list))
+#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_working_list))
 
 /* the number of mounted filesystems.  This is used to decide when to
 ** start and kill the commit workqueue
@@ -78,6 +79,12 @@ static struct workqueue_struct *commit_wq;
 #define BLOCK_FREED_HOLDER 3    /* this block was freed during this transaction, and can't be written */
 
 #define BLOCK_NEEDS_FLUSH 4	/* used in flush_journal_list */
+#define BLOCK_DIRTIED 5
+
+
+/* journal list state bits */
+#define LIST_TOUCHED 1
+#define LIST_DIRTY   2
 
 /* flags for do_journal_end */
 #define FLUSH_ALL   1		/* flush commit and real blocks */
@@ -86,6 +93,9 @@ static struct workqueue_struct *commit_wq;
 
 /* state bits for the journal */
 #define WRITERS_BLOCKED 1      /* set when new writers not allowed */
+#define WRITERS_QUEUED 2       /* set when log is full due to too many
+				* writers
+				*/
 
 static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ;
 static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
@@ -94,6 +104,9 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) ;
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks);
 static int release_journal_dev( struct super_block *super,
 				struct reiserfs_journal *journal );
+static int dirty_one_transaction(struct super_block *s,
+                                 struct reiserfs_journal_list *jl);
+static void flush_async_commits(void *p);
 
 static void init_journal_hash(struct super_block *p_s_sb) {
   memset(SB_JOURNAL(p_s_sb)->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
@@ -105,8 +118,10 @@ static void init_journal_hash(struct super_block *p_s_sb) {
 ** more details.
 */
 static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
-  if (bh)
+  if (bh) {
     clear_buffer_dirty(bh);
+    clear_bit(BH_JTest, &bh->b_state);
+  }
   return 0 ;
 }
 
@@ -367,6 +382,7 @@ static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode
 
 static int clear_prepared_bits(struct buffer_head *bh) {
   clear_bit(BH_JPrepared, &bh->b_state) ;
+  clear_bit(BH_JRestore_dirty, &bh->b_state) ;
   return 0 ;
 }
 
@@ -471,11 +487,6 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
 
   *next_zero_bit = 0 ; /* always start this at zero. */
 
-  /* we aren't logging all blocks are safe for reuse */
-  if (reiserfs_dont_log(p_s_sb)) {
-    return 0 ;
-  }
-
   PROC_INFO_INC( p_s_sb, journal.in_journal );
   /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
   ** if we crash before the transaction that freed it commits,  this transaction won't
@@ -503,6 +514,7 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
 
   /* is it in the current transaction.  This should never happen */
   if ((cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, bl))) {
+    BUG();
     return 1; 
   }
 
@@ -527,18 +539,30 @@ inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct re
 
 /* lock the current transaction */
 inline static void lock_journal(struct super_block *p_s_sb) {
-  PROC_INFO_INC( p_s_sb, journal.lock_journal );
-  while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) {
-    PROC_INFO_INC( p_s_sb, journal.lock_journal_wait );
-    sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
-  }
-  atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ;
+    PROC_INFO_INC( p_s_sb, journal.lock_journal );
+    down(&SB_JOURNAL(p_s_sb)->j_lock);
 }
 
 /* unlock the current transaction */
 inline static void unlock_journal(struct super_block *p_s_sb) {
-  atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ;
-  wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
+    up(&SB_JOURNAL(p_s_sb)->j_lock);
+}
+
+static inline void get_journal_list(struct reiserfs_journal_list *jl)
+{
+    jl->j_refcount++;
+}
+
+static inline void put_journal_list(struct super_block *s,
+                                   struct reiserfs_journal_list *jl)
+{
+    if (jl->j_refcount < 1) {
+        printk("trans id %lu, refcount at %d\n", jl->j_trans_id,
+	                                         jl->j_refcount);
+        BUG();
+    }
+    if (--jl->j_refcount == 0)
+        reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
 }
 
 /*
@@ -556,6 +580,83 @@ static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct re
   jl->j_list_bitmap = NULL ;
 }
 
+static int journal_list_still_alive(struct super_block *s,
+                                    unsigned long trans_id)
+{
+    struct list_head *entry = &SB_JOURNAL(s)->j_journal_list;
+    struct reiserfs_journal_list *jl;
+
+    if (!list_empty(entry)) {
+        jl = JOURNAL_LIST_ENTRY(entry->next);
+	if (jl->j_trans_id <= trans_id) {
+	    return 1;
+	}
+    }
+    return 0;
+}
+
+static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
+    struct reiserfs_journal_list *other_jl;
+    struct reiserfs_journal_list *first_jl;
+    struct list_head *entry;
+    unsigned long trans_id = jl->j_trans_id;
+    unsigned long other_trans_id;
+    unsigned long first_trans_id;
+
+find_first:
+    /*
+     * first we walk backwards to find the oldest uncommitted transaction
+     */
+    first_jl = jl;
+    entry = jl->j_list.prev;
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	if (entry == &SB_JOURNAL(s)->j_journal_list ||
+	    atomic_read(&other_jl->j_older_commits_done))
+	    break;
+
+        first_jl = other_jl;
+	entry = other_jl->j_list.prev;
+    }
+
+    /* if we didn't find any older uncommitted transactions, return now */
+    if (first_jl == jl) {
+        return 0;
+    }
+
+    first_trans_id = first_jl->j_trans_id;
+
+    entry = &first_jl->j_list;
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	other_trans_id = other_jl->j_trans_id;
+
+	if (other_trans_id < trans_id) {
+	    if (atomic_read(&other_jl->j_commit_left) != 0) {
+		flush_commit_list(s, other_jl, 0);
+
+		/* list we were called with is gone, return */
+		if (!journal_list_still_alive(s, trans_id))
+		    return 1;
+
+		/* the one we just flushed is gone, this means all
+		 * older lists are also gone, so first_jl is no longer
+		 * valid either.  Go back to the beginning.
+		 */
+		if (!journal_list_still_alive(s, other_trans_id)) {
+		    goto find_first;
+		}
+	    }
+	    entry = entry->next;
+	    if (entry == &SB_JOURNAL(s)->j_journal_list)
+		return 0;
+	} else {
+	    return 0;
+	}
+    }
+    return 0;
+}
+
 /*
 ** if this journal list still has commit blocks unflushed, send them to disk.
 **
@@ -564,13 +665,10 @@ static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct re
 **
 */
 static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
-  int i, count ;
-  int index = 0 ;
+  int i;
   int bn ;
-  int retry_count = 0 ;
-  int orig_commit_left = 0 ;
   struct buffer_head *tbh = NULL ;
-  struct reiserfs_journal_list *other_jl ;
+  unsigned long trans_id = jl->j_trans_id;
 
   reiserfs_check_lock_depth("flush_commit_list") ;
 
@@ -581,133 +679,100 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
   /* before we can put our commit blocks on disk, we have to make sure everyone older than
   ** us is on disk too
   */
-  if (jl->j_len <= 0) {
-    return 0 ;
-  }
+  if (jl->j_len <= 0)
+    BUG();
+  if (trans_id == SB_JOURNAL(s)->j_trans_id)
+    BUG();
+
+  get_journal_list(jl);
   if (flushall) {
-    /* we _must_ make sure the transactions are committed in order.  Start with the
-    ** index after this one, wrap all the way around 
-    */
-    index = (jl - SB_JOURNAL_LIST(s)) + 1 ;
-    for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-      other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ;
-      if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 && 
-          other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) {
-        flush_commit_list(s, other_jl, 0) ;
-      }
+    if (flush_older_commits(s, jl) == 1) {
+      /* list disappeared during flush_older_commits.  return */
+      goto put_jl;
     }
   }
 
-  count = 0 ;
-  /* don't flush the commit list for the current transactoin */
-  if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) {
-    return 0 ;
-  }
-
   /* make sure nobody is trying to flush this one at the same time */
-  if (atomic_read(&(jl->j_commit_flushing))) {
-    sleep_on(&(jl->j_commit_wait)) ;
-    if (flushall) {
-      atomic_set(&(jl->j_older_commits_done), 1) ;
-    }
-    return 0 ;
+  down(&jl->j_commit_lock);
+  if (!journal_list_still_alive(s, trans_id)) {
+    up(&jl->j_commit_lock);
+    goto put_jl;
   }
-  
+  if (jl->j_trans_id == 0)
+    BUG();
+
   /* this commit is done, exit */
   if (atomic_read(&(jl->j_commit_left)) <= 0) {
     if (flushall) {
       atomic_set(&(jl->j_older_commits_done), 1) ;
     }
-    return 0 ;
+    up(&jl->j_commit_lock);
+    goto put_jl;
   }
-  /* keeps others from flushing while we are flushing */
-  atomic_set(&(jl->j_commit_flushing), 1) ; 
 
-
-  if (jl->j_len > SB_JOURNAL_TRANS_MAX(s)) {
-    reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ;
-    return 0 ;
+  /*
+   * for the description block and all the log blocks, submit any buffers
+   * that haven't already reached the disk
+   */
+  for (i = 0 ; i < (jl->j_len + 1) ; i++) {
+    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %
+         SB_ONDISK_JOURNAL_SIZE(s);
+    tbh = journal_find_get_block(s, bn) ;
+    wait_on_buffer(tbh) ;
+    ll_rw_block(WRITE, 1, &tbh) ;
+    put_bh(tbh) ;
   }
 
-  orig_commit_left = atomic_read(&(jl->j_commit_left)) ; 
-
-  /* start by checking all the commit blocks in this transaction.  
-  ** Add anyone not on disk into tbh.  Stop checking once commit_left <= 1, because that means we
-  ** only have the commit block left 
-  */
-retry:
-  count = 0 ;
-  for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) {  /* everything but commit_bh */
-    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %  SB_ONDISK_JOURNAL_SIZE(s);
+  /* wait on everything written so far before writing the commit */
+  for (i = 0 ;  i < (jl->j_len + 1) ; i++) {
+    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
+	 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
     tbh = journal_find_get_block(s, bn) ;
 
-/* kill this sanity check */
-if (count > (orig_commit_left + 2)) {
-reiserfs_panic(s, "journal-539: flush_commit_list: BAD count(%d) > orig_commit_left(%d)!\n", count, orig_commit_left) ;
-}
-    if (tbh) {
-      if (buffer_locked(tbh)) { /* wait on it, redo it just to make sure */
-	wait_on_buffer(tbh) ;
-	if (!buffer_uptodate(tbh)) {
-	  reiserfs_panic(s, "journal-584, buffer write failed\n") ;
-	}
-      } 
-      if (buffer_dirty(tbh)) {
-	printk("journal-569: flush_commit_list, block already dirty!\n") ;
-      } else {				
-	mark_buffer_dirty(tbh) ;
-      }
-      ll_rw_block(WRITE, 1, &tbh) ;
-      count++ ;
-      put_bh(tbh) ; /* once for our get_hash */
-    } 
-  }
-
-  /* wait on everyone in tbh before writing commit block*/
-  if (count > 0) {
-    for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && 
-                 i < (jl->j_len + 1) ; i++) {  /* everything but commit_bh */
-      bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
-      tbh = journal_find_get_block(s, bn) ;
-
-      wait_on_buffer(tbh) ;
-      if (!buffer_uptodate(tbh)) {
-	reiserfs_panic(s, "journal-601, buffer write failed\n") ;
-      }
-      put_bh(tbh) ; /* once for our get_hash */
-      bforget(tbh) ;    /* once due to original getblk in do_journal_end */
-      atomic_dec(&(jl->j_commit_left)) ;
+    wait_on_buffer(tbh) ;
+    if (buffer_dirty(tbh))
+      BUG();
+    if (!buffer_uptodate(tbh)) {
+      reiserfs_panic(s, "journal-601, buffer write failed\n") ;
     }
+    put_bh(tbh) ; /* once for journal_find_get_block */
+    put_bh(tbh) ;    /* once due to original getblk in do_journal_end */
+    atomic_dec(&(jl->j_commit_left)) ;
   }
 
-  if (atomic_read(&(jl->j_commit_left)) != 1) { /* just the commit_bh left, flush it without calling getblk for everyone */
-    if (retry_count < 2) {
-      printk("journal-582: flush_commit_list, not all log blocks on disk yet, trying again\n") ;
-      retry_count++ ;
-      goto retry;
-    }
-    reiserfs_panic(s, "journal-563: flush_commit_list: BAD, j_commit_left is %u, should be 1\n", 
-		   atomic_read(&(jl->j_commit_left)));
-  }
+  if (atomic_read(&(jl->j_commit_left)) != 1)
+    BUG();
 
+  if (buffer_dirty(jl->j_commit_bh))
+    BUG();
   mark_buffer_dirty(jl->j_commit_bh) ;
   sync_dirty_buffer(jl->j_commit_bh) ;
   if (!buffer_uptodate(jl->j_commit_bh)) {
     reiserfs_panic(s, "journal-615: buffer write failed\n") ;
   }
-  atomic_dec(&(jl->j_commit_left)) ;
   bforget(jl->j_commit_bh) ;
+  if (SB_JOURNAL(s)->j_last_commit_id != 0 &&
+     (jl->j_trans_id - SB_JOURNAL(s)->j_last_commit_id) != 1) {
+      reiserfs_warning("clm-2200: last commit %lu, current %lu\n",
+                       SB_JOURNAL(s)->j_last_commit_id,
+		       jl->j_trans_id);
+  }
+  SB_JOURNAL(s)->j_last_commit_id = jl->j_trans_id;
 
   /* now, every commit block is on the disk.  It is safe to allow blocks freed during this transaction to be reallocated */
   cleanup_freed_for_journal_list(s, jl) ;
 
+  /* mark the metadata dirty */
+  dirty_one_transaction(s, jl);
+  atomic_dec(&(jl->j_commit_left)) ;
+
   if (flushall) {
     atomic_set(&(jl->j_older_commits_done), 1) ;
   }
-  atomic_set(&(jl->j_commit_flushing), 0) ;
-  wake_up(&(jl->j_commit_wait)) ;
+  up(&jl->j_commit_lock);
+put_jl:
+  put_journal_list(s, jl);
 
-  s->s_dirt = 1 ;
   return 0 ;
 }
 
@@ -804,22 +869,27 @@ static int update_journal_header_block(struct super_block *p_s_sb,
 ** flush any and all journal lists older than you are 
 ** can only be called from flush_journal_list
 */
-static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) {
-  int i, index ;
-  struct reiserfs_journal_list *other_jl ;
-
-  index = jl - SB_JOURNAL_LIST(p_s_sb) ;
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ;
-    if (other_jl && other_jl->j_len > 0 && 
-        other_jl->j_trans_id > 0 && 
-	other_jl->j_trans_id < trans_id && 
-        other_jl != jl) {
-      /* do not flush all */
-      flush_journal_list(p_s_sb, other_jl, 0) ; 
+static int flush_older_journal_lists(struct super_block *p_s_sb,
+                                     struct reiserfs_journal_list *jl)
+{
+    struct list_head *entry;
+    struct reiserfs_journal_list *other_jl ;
+    unsigned long trans_id = jl->j_trans_id;
+
+    /* we know we are the only ones flushing things, no extra race
+     * protection is required.
+     */
+restart:
+    entry = SB_JOURNAL(p_s_sb)->j_journal_list.next;
+    other_jl = JOURNAL_LIST_ENTRY(entry);
+    if (other_jl->j_trans_id < trans_id) {
+	/* do not flush all */
+	flush_journal_list(p_s_sb, other_jl, 0) ;
+
+	/* other_jl is now deleted from the list */
+	goto restart;
     }
-  }
-  return 0 ;
+    return 0 ;
 }
 
 static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
@@ -836,15 +906,27 @@ static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
     unlock_buffer(bh) ;
     put_bh(bh) ;
 }
+
 static void submit_logged_buffer(struct buffer_head *bh) {
-    lock_buffer(bh) ;
     get_bh(bh) ;
     bh->b_end_io = reiserfs_end_buffer_io_sync ;
     mark_buffer_notjournal_new(bh) ;
     clear_buffer_dirty(bh) ;
+    if (!test_and_clear_bit(BH_JTest, &bh->b_state))
+        BUG();
+    if (!buffer_uptodate(bh))
+        BUG();
     submit_bh(WRITE, bh) ;
 }
 
+static void del_from_work_list(struct super_block *s,
+                               struct reiserfs_journal_list *jl) {
+    if (!list_empty(&jl->j_working_list)) {
+	list_del_init(&jl->j_working_list);
+	SB_JOURNAL(s)->j_num_work_lists--;
+    }
+}
+
 /* flush a journal list, both commit and real blocks
 **
 ** always set flushall to 1, unless you are calling from inside
@@ -865,29 +947,26 @@ static int flush_journal_list(struct super_block *s,
   unsigned long j_len_saved = jl->j_len ;
 
   if (j_len_saved <= 0) {
-    return 0 ;
+    BUG();
   }
 
   if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) {
     reiserfs_warning("clm-2048: flush_journal_list called with wcount %d\n",
                       atomic_read(&SB_JOURNAL(s)->j_wcount)) ;
   }
-  /* if someone is getting the commit list, we must wait for them */
-  while (atomic_read(&(jl->j_commit_flushing))) { 
-    sleep_on(&(jl->j_commit_wait)) ;
-  }
-  /* if someone is flushing this list, we must wait for them */
-  while (atomic_read(&(jl->j_flushing))) {
-    sleep_on(&(jl->j_flush_wait)) ;
-  }
+  if (jl->j_trans_id == 0)
+    BUG();
 
-  /* this list is now ours, we can change anything we want */
-  atomic_set(&(jl->j_flushing), 1) ;
+  /* if flushall == 0, the lock is already held */
+  if (flushall) {
+      down(&SB_JOURNAL(s)->j_flush_sem);
+  } else if (!down_trylock(&SB_JOURNAL(s)->j_flush_sem)) {
+      BUG();
+  }
 
   count = 0 ;
   if (j_len_saved > SB_JOURNAL_TRANS_MAX(s)) {
-    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ;
-    atomic_dec(&(jl->j_flushing)) ;
+    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id);
     return 0 ;
   }
 
@@ -902,6 +981,9 @@ static int flush_journal_list(struct super_block *s,
   */
   flush_commit_list(s, jl, 1) ;
 
+  if (!(jl->j_state & LIST_DIRTY))
+      BUG();
+
   /* are we done now? */
   if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 
       atomic_read(&(jl->j_commit_left)) <= 0) {
@@ -937,13 +1019,13 @@ static int flush_journal_list(struct super_block *s,
       get_bh(saved_bh) ;
 
       if (buffer_journal_dirty(saved_bh)) {
+	if (!can_dirty(cn))
+	  BUG();
         was_jwait = 1 ;
-	mark_buffer_notjournal_dirty(saved_bh) ;
-        /* undo the inc from journal_mark_dirty */
-	put_bh(saved_bh) ;
-      }
-      if (can_dirty(cn)) {
         was_dirty = 1 ;
+      } else if (can_dirty(cn)) {
+        /* everything with !pjl && jwait should be writable */
+	BUG();
       }
     }
 
@@ -951,7 +1033,8 @@ static int flush_journal_list(struct super_block *s,
     ** sure they are commited, and don't try writing it to disk
     */
     if (pjl) {
-      flush_commit_list(s, pjl, 1) ;
+      if (atomic_read(&pjl->j_commit_left))
+        flush_commit_list(s, pjl, 1) ;
       goto free_cnode ;
     }
 
@@ -970,22 +1053,17 @@ static int flush_journal_list(struct super_block *s,
 printk("journal-813: BAD! buffer %llu %cdirty %cjwait, not in a newer tranasction\n", (unsigned long long)saved_bh->b_blocknr,
         was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ;
     }
-    /* kupdate_one_transaction waits on the buffers it is writing, so we
-    ** should never see locked buffers here
-    */
-    if (buffer_locked(saved_bh)) {
-      printk("clm-2083: locked buffer %llu in flush_journal_list\n", 
-              (unsigned long long)saved_bh->b_blocknr) ;
-      wait_on_buffer(saved_bh) ;
-      if (!buffer_uptodate(saved_bh)) {
-        reiserfs_panic(s, "journal-923: buffer write failed\n") ;
-      }
-    } 
     if (was_dirty) { 
       /* we inc again because saved_bh gets decremented at free_cnode */
       get_bh(saved_bh) ;
       set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
-      submit_logged_buffer(saved_bh) ;
+      lock_buffer(saved_bh);
+      if (cn->blocknr != saved_bh->b_blocknr)
+        BUG();
+      if (buffer_dirty(saved_bh))
+        submit_logged_buffer(saved_bh) ;
+      else
+        unlock_buffer(saved_bh);
       count++ ;
     } else {
       printk("clm-2082: Unable to flush buffer %llu in flush_journal_list\n",
@@ -1016,6 +1094,14 @@ free_cnode:
 	if (!buffer_uptodate(cn->bh)) {
 	  reiserfs_panic(s, "journal-949: buffer write failed\n") ;
 	}
+	/* note, we must clear the JDirty_wait bit after the up to date
+	** check, otherwise we race against our flushpage routine
+	*/
+	if (!test_and_clear_bit(BH_JDirty_wait, &cn->bh->b_state))
+	    BUG();
+
+        /* undo the inc from journal_mark_dirty */
+	put_bh(cn->bh) ;
         brelse(cn->bh) ;
       }
       cn = cn->next ;
@@ -1029,7 +1115,7 @@ flush_older_and_return:
   ** replayed after a crash
   */
   if (flushall) {
-    flush_older_journal_lists(s, jl, jl->j_trans_id) ;
+    flush_older_journal_lists(s, jl);
   } 
   
   /* before we can remove everything from the hash tables for this 
@@ -1044,181 +1130,246 @@ flush_older_and_return:
     update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
   }
   remove_all_from_journal_list(s, jl, 0) ;
+  list_del(&jl->j_list);
+  SB_JOURNAL(s)->j_num_lists--;
+  del_from_work_list(s, jl);
+
+  if (SB_JOURNAL(s)->j_last_flush_id != 0 &&
+     (jl->j_trans_id - SB_JOURNAL(s)->j_last_flush_id) != 1) {
+      reiserfs_warning("clm-2201: last flush %lu, current %lu\n",
+                       SB_JOURNAL(s)->j_last_flush_id,
+		       jl->j_trans_id);
+  }
+  SB_JOURNAL(s)->j_last_flush_id = jl->j_trans_id;
+
+  /* not strictly required since we are freeing the list, but it should
+   * help find code using dead lists later on
+   */
   jl->j_len = 0 ;
   atomic_set(&(jl->j_nonzerolen), 0) ;
   jl->j_start = 0 ;
   jl->j_realblock = NULL ;
   jl->j_commit_bh = NULL ;
   jl->j_trans_id = 0 ;
-  atomic_dec(&(jl->j_flushing)) ;
-  wake_up(&(jl->j_flush_wait)) ;
+  jl->j_state = 0;
+  put_journal_list(s, jl);
+  if (flushall)
+    up(&SB_JOURNAL(s)->j_flush_sem);
   return 0 ;
 } 
 
+#define CHUNK_SIZE 32
+struct buffer_chunk {
+    struct buffer_head *bh[CHUNK_SIZE];
+    int nr;
+};
 
-static int kupdate_one_transaction(struct super_block *s,
-                                    struct reiserfs_journal_list *jl) 
+static void write_chunk(struct buffer_chunk *chunk) {
+    int i;
+    for (i = 0; i < chunk->nr ; i++) {
+	submit_logged_buffer(chunk->bh[i]) ;
+    }
+    chunk->nr = 0;
+}
+
+static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) {
+    if (chunk->nr >= CHUNK_SIZE)
+        BUG();
+    chunk->bh[chunk->nr++] = bh;
+    if (chunk->nr >= CHUNK_SIZE)
+        write_chunk(chunk);
+}
+
+static int write_one_transaction(struct super_block *s,
+                                 struct reiserfs_journal_list *jl,
+				 struct buffer_chunk *chunk)
 {
-    struct reiserfs_journal_list *pjl ; /* previous list for this cn */
-    struct reiserfs_journal_cnode *cn, *walk_cn ;
-    b_blocknr_t blocknr ;
-    int run = 0 ;
-    int orig_trans_id = jl->j_trans_id ;
-    struct buffer_head *saved_bh ; 
+    struct reiserfs_journal_cnode *cn;
     int ret = 0 ;
 
-    /* if someone is getting the commit list, we must wait for them */
-    while (atomic_read(&(jl->j_commit_flushing))) {
-        sleep_on(&(jl->j_commit_wait)) ;
-    }
-    /* if someone is flushing this list, we must wait for them */
-    while (atomic_read(&(jl->j_flushing))) {
-        sleep_on(&(jl->j_flush_wait)) ;
+    jl->j_state |= LIST_TOUCHED;
+    del_from_work_list(s, jl);
+    if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
+        return 0;
     }
-    /* was it flushed while we slept? */
-    if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) {
-        return 0 ;
-    }
-
-    /* this list is now ours, we can change anything we want */
-    atomic_set(&(jl->j_flushing), 1) ;
 
-loop_start:
     cn = jl->j_realblock ;
     while(cn) {
-        saved_bh = NULL ;
         /* if the blocknr == 0, this has been cleared from the hash,
         ** skip it
         */
         if (cn->blocknr == 0) {
             goto next ;
         }
+        if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
+	    struct buffer_head *tmp_bh;
+	    /* we can race against journal_mark_freed when we try
+	     * to lock_buffer(cn->bh), so we have to inc the buffer
+	     * count, and recheck things after locking
+	     */
+	    tmp_bh = cn->bh;
+	    get_bh(tmp_bh);
+	    lock_buffer(tmp_bh);
+	    if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
+		if (!buffer_journal_dirty(tmp_bh) ||
+		    reiserfs_buffer_prepared(tmp_bh))
+		    BUG();
+		add_to_chunk(chunk, tmp_bh);
+		ret++;
+	    } else {
+		/* note, cn->bh might be null now */
+		unlock_buffer(tmp_bh);
+	    }
+	    put_bh(tmp_bh);
+        }
+next:
+        cn = cn->next ;
+	cond_resched();
+    }
+    return ret ;
+}
+
+/* used by flush_commit_list */
+static int dirty_one_transaction(struct super_block *s,
+                                 struct reiserfs_journal_list *jl)
+{
+    struct reiserfs_journal_cnode *cn;
+    struct reiserfs_journal_list *pjl;
+    int ret = 0 ;
+
+    jl->j_state |= LIST_DIRTY;
+    cn = jl->j_realblock ;
+    while(cn) {
         /* look for a more recent transaction that logged this
         ** buffer.  Only the most recent transaction with a buffer in
         ** it is allowed to send that buffer to disk
         */
-        pjl = find_newer_jl_for_cn(cn) ;
-        if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) &&
-            can_dirty(cn)) 
-        {
-            if (!test_bit(BH_JPrepared, &cn->bh->b_state)) {
-                set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
-		submit_logged_buffer(cn->bh) ;
-            } else {
-                /* someone else is using this buffer.  We can't 
-                ** send it to disk right now because they might
-                ** be changing/logging it.
-                */
-                ret = 1 ;
-            }
-        } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
-            clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
-            if (!pjl && cn->bh) {
-                wait_on_buffer(cn->bh) ;
-            }
-            /* check again, someone could have logged while we scheduled */
-            pjl = find_newer_jl_for_cn(cn) ;
-
-            /* before the JDirty_wait bit is set, the 
-            ** buffer is added to the hash list.  So, if we are
-            ** run in the middle of a do_journal_end, we will notice
-            ** if this buffer was logged and added from the latest
-            ** transaction.  In this case, we don't want to decrement
-            ** b_count
-            */
-            if (!pjl && cn->bh && buffer_journal_dirty(cn->bh)) {
-                blocknr = cn->blocknr ;
-                walk_cn = cn ;
-                saved_bh= cn->bh ;
-                /* update all older transactions to show this block
-                ** was flushed
-                */
-                mark_buffer_notjournal_dirty(cn->bh) ;
-                while(walk_cn) {
-                    if (walk_cn->bh && walk_cn->blocknr == blocknr && 
-                         walk_cn->sb == cn->sb) {
-                        if (walk_cn->jlist) {
-                            atomic_dec(&(walk_cn->jlist->j_nonzerolen)) ;
-                        }
-                        walk_cn->bh = NULL ;
-                    }
-                    walk_cn = walk_cn->hnext ;
-                }
-                if (atomic_read(&saved_bh->b_count) < 1) {
-                    reiserfs_warning("clm-2081: bad count on %lu\n", 
-                                      saved_bh->b_blocknr) ;
-                }
-                brelse(saved_bh) ;
-            }
-        }
-        /*
-        ** if the more recent transaction is committed to the log,
-        ** this buffer can be considered flushed.  Decrement our
-        ** counters to reflect one less buffer that needs writing.
-        **
-        ** note, this relies on all of the above code being
-        ** schedule free once pjl comes back non-null.
-        */
-        if (pjl && cn->bh && atomic_read(&pjl->j_commit_left) == 0) {
-            atomic_dec(&cn->jlist->j_nonzerolen) ;
-            cn->bh = NULL ;
+	pjl = find_newer_jl_for_cn(cn) ;
+        if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh))
+	{
+	    if (!can_dirty(cn))
+	        BUG();
+	    /* if the buffer is prepared, it will either be logged
+	     * or restored.  If restored, we need to make sure
+	     * it actually gets marked dirty
+	     */
+	    mark_buffer_notjournal_new(cn->bh) ;
+	    if (test_bit(BH_JPrepared, &cn->bh->b_state)) {
+	        set_bit(BH_JRestore_dirty, &cn->bh->b_state);
+	    } else {
+	        set_bit(BH_JTest, &cn->bh->b_state);
+	        mark_buffer_dirty(cn->bh);
+	    }
         } 
-next:
         cn = cn->next ;
     }
-    /* the first run through the loop sends all the dirty buffers to
-    ** ll_rw_block.
-    ** the second run through the loop does all the accounting
-    */
-    if (run++ == 0) {
-        goto loop_start ;
+    return ret ;
+}
+
+static int kupdate_transactions(struct super_block *s,
+                                   struct reiserfs_journal_list *jl,
+				   struct reiserfs_journal_list **next_jl,
+				   unsigned long *next_trans_id,
+				   int num_blocks,
+				   int num_trans) {
+    int ret = 0;
+    int written = 0 ;
+    int transactions_flushed = 0;
+    unsigned long orig_trans_id = jl->j_trans_id;
+    struct buffer_chunk chunk;
+    struct list_head *entry;
+    chunk.nr = 0;
+
+    down(&SB_JOURNAL(s)->j_flush_sem);
+    if (!journal_list_still_alive(s, orig_trans_id)) {
+	goto done;
+    }
+
+    /* we've got j_flush_sem held, nobody is going to delete any
+     * of these lists out from underneath us
+     */
+    while((num_trans && transactions_flushed < num_trans) ||
+          (!num_trans && written < num_blocks)) {
+
+	if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
+	    atomic_read(&jl->j_commit_left))
+	{
+	    del_from_work_list(s, jl);
+	    break;
+	}
+	ret = write_one_transaction(s, jl, &chunk);
+
+	if (ret < 0)
+	    goto done;
+	transactions_flushed++;
+	written += ret;
+	entry = jl->j_list.next;
+
+	/* did we wrap? */
+	if (entry == &SB_JOURNAL(s)->j_journal_list) {
+	    break;
+        }
+	jl = JOURNAL_LIST_ENTRY(entry);
+
+	/* don't bother with older transactions */
+	if (jl->j_trans_id <= orig_trans_id)
+	    break;
+    }
+    if (chunk.nr) {
+        write_chunk(&chunk);
     }
 
-    atomic_set(&(jl->j_flushing), 0) ;
-    wake_up(&(jl->j_flush_wait)) ;
-    return ret ;
+done:
+    up(&SB_JOURNAL(s)->j_flush_sem);
+    return ret;
 }
-/* since we never give dirty buffers to bdflush/kupdate, we have to
-** flush them ourselves.  This runs through the journal lists, finds
-** old metadata in need of flushing and sends it to disk.
-** this does not end transactions, commit anything, or free
-** cnodes.
+
+/* for o_sync and fsync heavy applications, they tend to use
+** all the journal list slots with tiny transactions.  These
+** trigger lots and lots of calls to update the header block, which
+** adds seeks and slows things down.
 **
-** returns the highest transaction id that was flushed last time
+** This function tries to clear out a large chunk of the journal lists
+** at once, which makes everything faster since only the newest journal
+** list updates the header block
 */
-static unsigned long reiserfs_journal_kupdate(struct super_block *s) {
-    struct reiserfs_journal_list *jl ;
-    int i ;
-    int start ;
-    time_t age ;
-    int ret = 0 ;
-
-    start = SB_JOURNAL_LIST_INDEX(s) ;
-
-    /* safety check to prevent flush attempts during a mount */
-    if (start < 0) {
-        return 0 ;
-    }
-    i = (start + 1) % JOURNAL_LIST_COUNT ;
-    while(i != start) {
-        jl = SB_JOURNAL_LIST(s) + i  ;
-        age = get_seconds() - jl->j_timestamp ;
-        if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) && 
-            atomic_read(&(jl->j_nonzerolen)) > 0 &&
-            atomic_read(&(jl->j_commit_left)) == 0) {
-
-            if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) {
-                break ;
-            }
-            /* if ret was already 1, we want to preserve that */
-            ret |= kupdate_one_transaction(s, jl) ;
-        } 
-        if (atomic_read(&(jl->j_nonzerolen)) > 0) {
-            ret |= 1 ;
-        }
-        i = (i + 1) % JOURNAL_LIST_COUNT ;
+static int flush_used_journal_lists(struct super_block *s,
+                                    struct reiserfs_journal_list *jl) {
+    unsigned long len = 0;
+    unsigned long cur_len;
+    int ret;
+    int i;
+    struct reiserfs_journal_list *tjl;
+    struct reiserfs_journal_list *flush_jl;
+    unsigned long trans_id;
+
+    flush_jl = tjl = jl;
+
+    /* flush for 256 transactions or 256 blocks, whichever comes first */
+    for(i = 0 ; i < 256 && len < 256 ; i++) {
+	if (atomic_read(&tjl->j_commit_left) ||
+	    tjl->j_trans_id < jl->j_trans_id) {
+	    break;
+	}
+	cur_len = atomic_read(&tjl->j_nonzerolen);
+	if (cur_len > 0) {
+	    tjl->j_state &= ~LIST_TOUCHED;
+	}
+	len += cur_len;
+	flush_jl = tjl;
+	if (tjl->j_list.next == &SB_JOURNAL(s)->j_journal_list)
+	    break;
+	tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
+    }
+    /* try to find a group of blocks we can flush across all the
+    ** transactions, but only bother if we've actually spanned
+    ** across multiple lists
+    */
+    if (flush_jl != jl) {
+        ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
     }
-    return ret ;
+    flush_journal_list(s, flush_jl, 1);
+    return 0;
 }
 
 /*
@@ -1262,6 +1413,10 @@ void remove_journal_hash(struct super_block *sb,
 }
 
 static void free_journal_ram(struct super_block *p_s_sb) {
+  reiserfs_kfree(SB_JOURNAL(p_s_sb)->j_current_jl,
+                 sizeof(struct reiserfs_journal_list), p_s_sb);
+  SB_JOURNAL(p_s_sb)->j_num_lists--;
+
   vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ;
   free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ;
   free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
@@ -1392,7 +1547,7 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffe
     }
     brelse(c_bh) ;
     reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid "
-                   "transaction start offset %lu, len %d id %d\n", 
+                   "transaction start offset %llu, len %d id %d\n",
 		   d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		   get_desc_trans_len(desc), get_desc_trans_id(desc)) ;
     return 1 ;
@@ -1432,7 +1587,7 @@ static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cu
   desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
   trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
   reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
-                 "journal_read_transaction, offset %lu, len %d mount_id %d\n", 
+                 "journal_read_transaction, offset %llu, len %d mount_id %d\n",
 		 d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		 get_desc_trans_len(desc), get_desc_mount_id(desc)) ;
   if (get_desc_trans_id(desc) < oldest_trans_id) {
@@ -1460,7 +1615,7 @@ static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cu
   commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
   if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
     reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, "
-                   "commit offset %ld had bad time %d or length %d\n", 
+                   "commit offset %llu had bad time %d or length %d\n",
 		   c_bh->b_blocknr -  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		   get_commit_trans_id(commit), get_commit_trans_len(commit));
     brelse(c_bh) ;
@@ -1628,7 +1783,7 @@ static int journal_read(struct super_block *p_s_sb) {
   printk("reiserfs: checking transaction log (%s) for (%s)\n",
 	 bdevname(SB_JOURNAL(p_s_sb)->j_dev_bd, b),
 	 reiserfs_bdevname(p_s_sb));
-  start = get_seconds() ;
+  start = get_seconds();
 
   /* step 1, read in the journal header block.  Check the transaction it says 
   ** is the first unflushed, and if that transaction is not valid, 
@@ -1688,7 +1843,7 @@ static int journal_read(struct super_block *p_s_sb) {
 	oldest_start = d_bh->b_blocknr ;
 	newest_mount_id = get_desc_mount_id(desc) ;
 	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting "
-	               "oldest_start to offset %lu, trans_id %lu\n", 
+	               "oldest_start to offset %llu, trans_id %lu\n",
 		       oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		       oldest_trans_id) ;
       } else if (oldest_trans_id > get_desc_trans_id(desc)) { 
@@ -1716,7 +1871,7 @@ start_log_replay:
   cur_dblock = oldest_start ;
   if (oldest_trans_id)  {
     reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay "
-                   "from offset %lu, trans_id %lu\n", 
+                   "from offset %llu, trans_id %lu\n",
 		   cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		   oldest_trans_id) ;
 
@@ -1770,70 +1925,26 @@ start_log_replay:
   return 0 ;
 }
 
-
-struct reiserfs_journal_commit_task {
-  struct super_block *p_s_sb ;
-  int jindex ;
-  int wake_on_finish ; /* if this is one, we wake the task_done queue, if it
-                       ** is zero, we free the whole struct on finish
-		       */
-  struct reiserfs_journal_commit_task *self ;
-  struct work_struct work;
-} ;
-
-static void reiserfs_journal_commit_task_func(void *__ct) {
-  struct reiserfs_journal_commit_task *ct = __ct;
-  struct reiserfs_journal_list *jl ;
-
-  reiserfs_write_lock(ct->p_s_sb);
-
-  jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ;
-
-  flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ; 
-
-  if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 &&
-      atomic_read(&(jl->j_commit_left)) == 0) {
-    kupdate_one_transaction(ct->p_s_sb, jl) ;
-  }
-  reiserfs_kfree(ct->self, sizeof(struct reiserfs_journal_commit_task), ct->p_s_sb) ;
-  reiserfs_write_unlock(ct->p_s_sb);
-}
-
-static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct,
-                                  struct super_block *p_s_sb, 
-				  int jindex) {
-  if (!ct) {
-    reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ;
-  }
-  ct->p_s_sb = p_s_sb ;
-  ct->jindex = jindex ;
-  INIT_WORK(&ct->work, reiserfs_journal_commit_task_func, ct);
-  ct->self = ct ;
-}
-
-static void commit_flush_async(struct super_block *p_s_sb, int jindex) {
-  struct reiserfs_journal_commit_task *ct ;
-  /* using GFP_NOFS, GFP_KERNEL could try to flush inodes, which will try
-  ** to start/join a transaction, which will deadlock
-  */
-  ct = reiserfs_kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_NOFS, p_s_sb) ;
-  if (ct) {
-    setup_commit_task_arg(ct, p_s_sb, jindex) ;
-    queue_work(commit_wq, &ct->work) ;
-  } else {
-#ifdef CONFIG_REISERFS_CHECK
-    reiserfs_warning("journal-1540: kmalloc failed, doing sync commit\n") ;
-#endif
-    flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
-  }
+static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
+{	/* returns a zeroed, initialised journal list; the retry loop means it never returns NULL */
+    struct reiserfs_journal_list *jl;
+retry:
+    jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s);
+    if (!jl) {
+	yield();	/* allocation failed: give up the CPU ... */
+	goto retry;	/* ... and try again, with no retry limit */
+    }
+    memset(jl, 0, sizeof(*jl));
+    INIT_LIST_HEAD(&jl->j_list);
+    INIT_LIST_HEAD(&jl->j_working_list);
+    sema_init(&jl->j_commit_lock, 1);	/* initial count 1 */
+    SB_JOURNAL(s)->j_num_lists++;	/* free_journal_ram() decrements this when it frees j_current_jl */
+    get_journal_list(jl);	/* NOTE(review): presumably takes the initial reference -- confirm */
+    return jl;
 }
 
 static void journal_list_init(struct super_block *p_s_sb) {
-  int i ;
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ;
-    init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ;
-  }
+    SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
 }
 
 static int release_journal_dev( struct super_block *super,
@@ -1924,6 +2035,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
     struct reiserfs_super_block * rs;
     struct reiserfs_journal_header *jh;
     struct reiserfs_journal *journal;
+    struct reiserfs_journal_list *jl;
     char b[BDEVNAME_SIZE];
 
     journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
@@ -1934,6 +2046,8 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
     memset(journal, 0, sizeof(struct reiserfs_journal)) ;
     INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ;
     INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_prealloc_list);
+    INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_working_list);
+    INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_journal_list);
     reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap, 
  				   SB_BMAP_NR(p_s_sb)) ;
     allocate_bitmap_nodes(p_s_sb) ;
@@ -2041,10 +2155,6 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   brelse (bhjh);
      
   SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ;
-  SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */
-
-  /* clear out the journal list array */
-  memset(SB_JOURNAL_LIST(p_s_sb), 0, sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ; 
   journal_list_init(p_s_sb) ;
 
   memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
@@ -2061,13 +2171,13 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   SB_JOURNAL(p_s_sb)->j_last = NULL ;	  
   SB_JOURNAL(p_s_sb)->j_first = NULL ;     
   init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-  init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ; 
+  sema_init(&SB_JOURNAL(p_s_sb)->j_lock, 1);
+  sema_init(&SB_JOURNAL(p_s_sb)->j_flush_sem, 1);
 
   SB_JOURNAL(p_s_sb)->j_trans_id = 10 ;  
   SB_JOURNAL(p_s_sb)->j_mount_id = 10 ; 
   SB_JOURNAL(p_s_sb)->j_state = 0 ;
   atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
-  atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ;
   SB_JOURNAL(p_s_sb)->j_cnode_free_list = allocate_cnodes(num_cnodes) ;
   SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ;
   SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ? num_cnodes : 0 ;
@@ -2075,8 +2185,9 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
 
   init_journal_hash(p_s_sb) ;
-  SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ;
-  if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) {
+  jl = SB_JOURNAL(p_s_sb)->j_current_jl;
+  jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl);
+  if (!jl->j_list_bitmap) {
     reiserfs_warning("journal-2005, get_list_bitmap failed for journal list 0\n") ;
     goto free_and_return;
   }
@@ -2084,16 +2195,12 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
     reiserfs_warning("Replay Failure, unable to mount\n") ;
     goto free_and_return;
   }
-  SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this
-                                         where it belongs */
-
-  if (reiserfs_dont_log (p_s_sb))
-    return 0;
 
   reiserfs_mounted_fs_count++ ;
   if (reiserfs_mounted_fs_count <= 1)
     commit_wq = create_workqueue("reiserfs");
 
+  INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb);
   return 0 ;
 free_and_return:
   free_journal_ram(p_s_sb);
@@ -2107,8 +2214,6 @@ free_and_return:
 */
 int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) {
   time_t now = get_seconds() ;
-  if (reiserfs_dont_log(th->t_super)) 
-    return 0 ;
   /* cannot restart while nested */
   if (th->t_refcount > 1)
     return 0 ;
@@ -2148,6 +2253,35 @@ void reiserfs_wait_on_write_block(struct super_block *s) {
                !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ;
 }
 
+static void queue_log_writer(struct super_block *s) {	/* put the caller to sleep on j_join_wait */
+    set_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state);	/* wake_queued_writers() only calls wake_up() when this bit is set */
+    sleep_on(&SB_JOURNAL(s)->j_join_wait);	/* NOTE(review): no condition is re-checked here; a wake-up before the sleep may be missed -- confirm callers' locking */
+}
+
+static void wake_queued_writers(struct super_block *s) {	/* wake tasks sleeping in queue_log_writer() */
+    if (test_and_clear_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state))	/* bit clear means nobody queued: skip the wake_up */
+        wake_up(&SB_JOURNAL(s)->j_join_wait);
+}
+
+static void let_transaction_grow(struct super_block *sb,
+                                 unsigned long trans_id)
+{	/* wait while other writers keep adding to transaction trans_id */
+    unsigned long bcount = SB_JOURNAL(sb)->j_bcount;	/* j_bcount is bumped by do_journal_begin_r() */
+    while(1) {
+	yield();
+        while ((atomic_read(&SB_JOURNAL(sb)->j_wcount) > 0 ||
+	        atomic_read(&SB_JOURNAL(sb)->j_jlock)) &&
+	       SB_JOURNAL(sb)->j_trans_id == trans_id) {
+	    queue_log_writer(sb);	/* writers active or j_jlock set: sleep on j_join_wait */
+	}
+	if (SB_JOURNAL(sb)->j_trans_id != trans_id)
+	    break;	/* trans_id is no longer the running transaction */
+	if (bcount == SB_JOURNAL(sb)->j_bcount)
+	    break;	/* j_bcount did not move during this pass: stop waiting */
+	bcount = SB_JOURNAL(sb)->j_bcount;
+    }
+}
+
 /* join == true if you must join an existing transaction.
 ** join == false if you can deal with waiting for others to finish
 **
@@ -2157,15 +2291,14 @@ void reiserfs_wait_on_write_block(struct super_block *s) {
 static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) {
   time_t now = get_seconds() ;
   int old_trans_id  ;
+  struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+  struct reiserfs_transaction_handle myth;
+  int sched_count = 0;
 
   reiserfs_check_lock_depth("journal_begin") ;
   RFALSE( p_s_sb->s_flags & MS_RDONLY, 
 	  "clm-2078: calling journal_begin on readonly FS") ;
 
-  if (reiserfs_dont_log(p_s_sb)) {
-    th->t_super = p_s_sb ; /* others will check this for the don't log flag */
-    return 0 ;
-  }
   PROC_INFO_INC( p_s_sb, journal.journal_being );
   /* set here for journal_join */
   th->t_refcount = 1;
@@ -2173,66 +2306,76 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct sup
 
 relock:
   lock_journal(p_s_sb) ;
+  journal->j_bcount++;
 
-  if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) {
+  if (test_bit(WRITERS_BLOCKED, &journal->j_state)) {
     unlock_journal(p_s_sb) ;
     reiserfs_wait_on_write_block(p_s_sb) ;
     PROC_INFO_INC( p_s_sb, journal.journal_relock_writers );
     goto relock ;
   }
+  now = get_seconds();
 
   /* if there is no room in the journal OR
   ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning 
   ** we don't sleep if there aren't other writers
   */
 
-  if (  (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) ||
-     ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) || 
-     (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && 
-      (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) ||
-     (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) ||
-     (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) {
+  if ( (!join && journal->j_must_wait > 0) ||
+     ( !join && (journal->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) ||
+     (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 &&
+      (now - journal->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) ||
+     (!join && atomic_read(&journal->j_jlock)) ||
+     (!join && journal->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) {
 
+    old_trans_id = journal->j_trans_id;
     unlock_journal(p_s_sb) ; /* allow others to finish this transaction */
 
-    /* if writer count is 0, we can just force this transaction to end, and start
-    ** a new one afterwards.
-    */
-    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
-      struct reiserfs_transaction_handle myth ;
-      journal_join(&myth, p_s_sb, 1) ;
-      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-      journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-      do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ;
+    if (!join && (journal->j_len_alloc + nblocks + 2) >=
+        SB_JOURNAL_MAX_BATCH(p_s_sb) &&
+	((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75))
+    {
+	if (atomic_read(&journal->j_wcount) > 10) {
+	    sched_count++;
+	    queue_log_writer(p_s_sb);
+	    goto relock;
+	}
+    }
+    /* don't mess with joining the transaction if all we have to do is
+     * wait for someone else to do a commit
+     */
+    if (atomic_read(&journal->j_jlock)) {
+	while (journal->j_trans_id == old_trans_id &&
+	       atomic_read(&journal->j_jlock)) {
+	    queue_log_writer(p_s_sb);
+        }
+	goto relock;
+    }
+    journal_join(&myth, p_s_sb, 1) ;
+
+    /* someone might have ended the transaction while we joined */
+    if (old_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
+        do_journal_end(&myth, p_s_sb, 1, 0) ;
     } else {
-      /* but if the writer count isn't zero, we have to wait for the current writers to finish.
-      ** They won't batch on transaction end once we set j_jlock
-      */
-      atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
-      old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
-      while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) &&
-            SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) {
-	sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-      }
+        do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ;
     }
+
     PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount );
     goto relock ;
   }
-
-  if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */
-    SB_JOURNAL(p_s_sb)->j_trans_start_time = now ;
+  /* we are the first writer, record when this transaction started */
+  if (journal->j_trans_start_time == 0) {
+    journal->j_trans_start_time = get_seconds();
   }
-  atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
-  SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ;
+  atomic_inc(&(journal->j_wcount)) ;
+  journal->j_len_alloc += nblocks ;
   th->t_blocks_logged = 0 ;
   th->t_blocks_allocated = nblocks ;
-  th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
+  th->t_trans_id = journal->j_trans_id ;
   unlock_journal(p_s_sb) ;
-  p_s_sb->s_dirt = 1; 
   return 0 ;
 }
 
-
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
   struct reiserfs_transaction_handle *cur_th = current->journal_info;
 
@@ -2277,11 +2420,6 @@ int journal_begin(struct reiserfs_transaction_handle *th, struct super_block  *
     return ret ;
 }
 
-/* not used at all */
-int journal_prepare(struct super_block  * p_s_sb, struct buffer_head *bh) {
-  return 0 ;
-}
-
 /*
 ** puts bh into the current transaction.  If it was already there, reorders removes the
 ** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order).
@@ -2297,18 +2435,14 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
   int prepared = 0 ;
 
   PROC_INFO_INC( p_s_sb, journal.mark_dirty );
-  if (reiserfs_dont_log(th->t_super)) {
-    mark_buffer_dirty(bh) ;
-    return 0 ;
-  }
-
   if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
     reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", 
                    th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id);
   }
-  p_s_sb->s_dirt = 1 ;
+  p_s_sb->s_dirt = 1;
 
   prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ;
+  clear_bit(BH_JRestore_dirty, &bh->b_state);
   /* already in this transaction, we are done */
   if (buffer_journaled(bh)) {
     PROC_INFO_INC( p_s_sb, journal.mark_dirty_already );
@@ -2319,13 +2453,12 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
   ** a dirty or journal_dirty or locked buffer to be logged, as some changes
   ** could get to disk too early.  NOT GOOD.
   */
-  if (!prepared || buffer_locked(bh)) {
+  if (!prepared || buffer_locked(bh) || buffer_dirty(bh)) {
     printk("journal-1777: buffer %llu bad state %cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT\n", (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!', 
                             buffer_locked(bh) ? ' ' : '!',
 			    buffer_dirty(bh) ? ' ' : '!',
 			    buffer_journal_dirty(bh) ? ' ' : '!') ;
   }
-  count_already_incd = clear_prepared_bits(bh) ;
 
   if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
     printk("journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ;
@@ -2344,14 +2477,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
     mark_buffer_notjournal_dirty(bh) ;
   }
 
-  if (buffer_dirty(bh)) {
-    clear_buffer_dirty(bh) ;
-  }
-
-  if (buffer_journaled(bh)) { /* must double check after getting lock */
-    goto done ;
-  }
-
   if (SB_JOURNAL(p_s_sb)->j_len > SB_JOURNAL(p_s_sb)->j_len_alloc) {
     SB_JOURNAL(p_s_sb)->j_len_alloc = SB_JOURNAL(p_s_sb)->j_len + JOURNAL_PER_BALANCE_CNT ;
   }
@@ -2391,24 +2516,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
     SB_JOURNAL(p_s_sb)->j_first = cn ;
     SB_JOURNAL(p_s_sb)->j_last = cn ;
   }
-done:
-  return 0 ;
-}
-
-/*
-** if buffer already in current transaction, do a journal_mark_dirty
-** otherwise, just mark it dirty and move on.  Used for writes to meta blocks
-** that don't need journaling
-*/
-int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) {
-  if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) || 
-      buffer_journal_dirty(bh)) {
-    return journal_mark_dirty(th, p_s_sb, bh) ;
-  }
-  if (get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_blocknr)) {
-    return journal_mark_dirty(th, p_s_sb, bh) ;
-  }
-  mark_buffer_dirty(bh) ;
   return 0 ;
 }
 
@@ -2474,7 +2581,6 @@ static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t block
     if (atomic_read(&(bh->b_count)) < 0) {
       printk("journal-1752: remove from trans, b_count < 0\n") ;
     }
-    if (!buffer_locked(bh)) reiserfs_clean_and_file_buffer(bh) ; 
     ret = 1 ;
   }
   SB_JOURNAL(p_s_sb)->j_len-- ;
@@ -2500,7 +2606,7 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) {
   int can_dirty = 1 ;
   
   /* first test hprev.  These are all newer than cn, so any node here
-  ** with the name block number and dev means this node can't be sent
+  ** with the same block number and dev means this node can't be sent
   ** to disk right now.
   */
   while(cur && can_dirty) {
@@ -2551,72 +2657,56 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block
 ** change flush_commit_lists to have a repeat parameter too.
 **
 */
-void flush_async_commits(struct super_block *p_s_sb) {
-  int i ;
+static void flush_async_commits(void *p) {	/* work function for journal->j_work (see INIT_WORK in journal_init) */
+  struct super_block *p_s_sb = p;	/* journal_init passes the super_block as the work data */
+  struct reiserfs_journal_list *jl;
+  struct list_head *entry;
 
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) {
-      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; 
-    }
+  lock_kernel();
+  if (!list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
+      /* last entry is the youngest, commit it and you get everything */
+      entry = SB_JOURNAL(p_s_sb)->j_journal_list.prev;	/* .prev of the list head is the tail entry */
+      jl = JOURNAL_LIST_ENTRY(entry);
+      flush_commit_list(p_s_sb, jl, 1);
   }
+  unlock_kernel();
 }
 
 /*
 ** flushes any old transactions to disk
 ** ends the current transaction if it is too old
-**
-** also calls flush_journal_list with old_only == 1, which allows me to reclaim
-** memory and such from the journal lists whose real blocks are all on disk.
-**
-** called by sync_dev_journal from buffer.c
 */
-int flush_old_commits(struct super_block *p_s_sb, int immediate) {
-  int i ;
-  int count = 0;
-  int start ; 
-  time_t now ; 
-  struct reiserfs_transaction_handle th ; 
-
-  start =  SB_JOURNAL_LIST_INDEX(p_s_sb) ;
-  now = get_seconds() ;
-
-  /* safety check so we don't flush while we are replaying the log during mount */
-  if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) {
-    return 0  ;
-  }
-  /* starting with oldest, loop until we get to the start */
-  i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ;
-  while(i != start) {
-    if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > SB_JOURNAL_MAX_COMMIT_AGE(p_s_sb) ||
-       immediate)) {
-      /* we have to check again to be sure the current transaction did not change */
-      if (i != SB_JOURNAL_LIST_INDEX(p_s_sb))  {
-	flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ;
-      }
-    }
-    i = (i + 1) % JOURNAL_LIST_COUNT ;
-    count++ ;
-  }
-  /* now, check the current transaction.  If there are no writers, and it is too old, finish it, and
-  ** force the commit blocks to disk
-  */
-  if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&  
-     SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && 
-     SB_JOURNAL(p_s_sb)->j_len > 0 && 
-     (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) {
-    journal_join(&th, p_s_sb, 1) ;
-    reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-    journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-    do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ;
-  } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case.  If they say to 
-                             flush, we must be sure old transactions hit the disk too. */
-    journal_join(&th, p_s_sb, 1) ;
-    reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-    journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-    do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
-  }
-   reiserfs_journal_kupdate(p_s_sb) ;
-   return 0 ;
+int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
+    time_t now ;
+    struct reiserfs_transaction_handle th ;
+    /* Ends the running transaction if it has no writers, is non-empty and is too old. */
+    now = get_seconds();
+    /* safety check so we don't flush while we are replaying the log during
+     * mount
+     */
+    if (list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
+	return 0  ;	/* j_journal_list is empty: nothing is done and 0 is returned */
+    }
+
+    /* check the current transaction.  If there are no writers, and it is
+     * too old, finish it, and force the commit blocks to disk
+     */
+    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&
+        SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
+        SB_JOURNAL(p_s_sb)->j_len > 0 &&
+        (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) >
+	SB_JOURNAL_MAX_TRANS_AGE(p_s_sb))
+    {
+	journal_join(&th, p_s_sb, 1) ;
+	reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+	journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
+
+	/* we're only being called from kreiserfsd, it makes no sense to do
+	** an async commit so that kreiserfsd can do it later
+	*/
+	do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
+    }
+    return p_s_sb->s_dirt;	/* returns the super block's current s_dirt flag */
 }
 
 /*
@@ -2637,6 +2727,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
   int flush = flags & FLUSH_ALL ;
   int commit_now = flags & COMMIT_NOW ;
   int wait_on_commit = flags & WAIT ;
+  struct reiserfs_journal_list *jl;
 
   if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
     reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", 
@@ -2653,13 +2744,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
   ** care of in this trans
   */
   if (SB_JOURNAL(p_s_sb)->j_len == 0) {
-    int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
-    unlock_journal(p_s_sb) ;
-    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock))  > 0 && wcount <= 0) {
-      atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ;
-      wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-    }
-    return 0 ;
+    BUG();
   }
   /* if wcount > 0, and we are called to with flush or commit_now,
   ** we wait on j_join_wait.  We will wake up when the last writer has
@@ -2669,24 +2754,37 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
   */
   if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) {
     if (flush || commit_now) {
-      int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
+      unsigned trans_id ;
+
+      jl = SB_JOURNAL(p_s_sb)->j_current_jl;
+      trans_id = jl->j_trans_id;
+
       atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
       if (flush) {
         SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ;
       }
       unlock_journal(p_s_sb) ;
+
       /* sleep while the current transaction is still j_jlocked */
-      while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && 
-            SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) {
-	sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-      }
-      if (commit_now) {
-	if (wait_on_commit) {
-	  flush_commit_list(p_s_sb,  SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
-	} else {
-	  commit_flush_async(p_s_sb, orig_jindex) ; 
+      while(SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
+	if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
+	    queue_log_writer(p_s_sb);
+        } else {
+	    lock_journal(p_s_sb);
+	    if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
+	        atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
+	    }
+	    unlock_journal(p_s_sb);
 	}
       }
+      if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
+          BUG();
+      }
+      if (commit_now && journal_list_still_alive(p_s_sb, trans_id) &&
+          wait_on_commit)
+      {
+	  flush_commit_list(p_s_sb, jl, 1) ;
+      }
       return 0 ;
     } 
     unlock_journal(p_s_sb) ;
@@ -2694,7 +2792,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
   }
 
   /* deal with old transactions where we are the last writers */
-  now = get_seconds() ;
+  now = get_seconds();
   if ((now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) {
     commit_now = 1 ;
     SB_JOURNAL(p_s_sb)->j_next_async_flush = 1 ;
@@ -2734,25 +2832,21 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
   struct buffer_head *bh = NULL ;
   struct reiserfs_list_bitmap *jb = NULL ;
   int cleaned = 0 ;
-  
-  if (reiserfs_dont_log(th->t_super)) {
-    bh = sb_find_get_block(p_s_sb, blocknr) ;
-    if (bh && buffer_dirty (bh)) {
-      printk ("journal_mark_freed(dont_log): dirty buffer on hash list: %lx %d\n", bh->b_state, blocknr);
-      BUG ();
-    }
-    brelse (bh);
-    return 0 ;
+
+  cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, blocknr);
+  if (cn && cn->bh) {
+      bh = cn->bh ;
+      get_bh(bh) ;
   }
-  bh = sb_find_get_block(p_s_sb, blocknr) ;
   /* if it is journal new, we just remove it from this transaction */
   if (bh && buffer_journal_new(bh)) {
     mark_buffer_notjournal_new(bh) ;
     clear_prepared_bits(bh) ;
+    reiserfs_clean_and_file_buffer(bh) ;
     cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
   } else {
     /* set the bit for this block in the journal bitmap for this transaction */
-    jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ;
+    jb = SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap;
     if (!jb) {
       reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ;
     }
@@ -2762,6 +2856,7 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
 
     if (bh) {
       clear_prepared_bits(bh) ;
+      reiserfs_clean_and_file_buffer(bh) ;
     }
     cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
 
@@ -2793,7 +2888,6 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
   }
 
   if (bh) {
-    reiserfs_clean_and_file_buffer(bh) ;
     put_bh(bh) ; /* get_hash grabs the buffer */
     if (atomic_read(&(bh->b_count)) < 0) {
       printk("journal-2165: bh->b_count < 0\n") ;
@@ -2803,50 +2897,84 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
 }
 
 void reiserfs_update_inode_transaction(struct inode *inode) {
-  
-  REISERFS_I(inode)->i_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb);
-
+  REISERFS_I(inode)->i_jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
   REISERFS_I(inode)->i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ;
 }
 
-static int reiserfs_inode_in_this_transaction(struct inode *inode) {
-  if (REISERFS_I(inode)->i_trans_id == SB_JOURNAL(inode->i_sb)->j_trans_id || 
-      REISERFS_I(inode)->i_trans_id == 0) {
-    return 1; 
-  } 
-  return 0 ;
+static void __commit_trans_jl(struct inode *inode, unsigned long id,
+                                 struct reiserfs_journal_list *jl)
+{
+    struct reiserfs_transaction_handle th ;
+    struct super_block *sb = inode->i_sb ;
+
+    /* is it from the current transaction, or from an unknown transaction? */
+    if (id == SB_JOURNAL(sb)->j_trans_id) {
+	jl = SB_JOURNAL(sb)->j_current_jl;
+	/* try to let other writers come in and grow this transaction */
+	let_transaction_grow(sb, id);
+	if (SB_JOURNAL(sb)->j_trans_id != id) {
+	    goto flush_commit_only;
+	}
+
+	journal_begin(&th, sb, 1) ;
+
+	/* someone might have ended this transaction while we joined */
+	if (SB_JOURNAL(sb)->j_trans_id != id) {
+	    reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ;
+	    journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ;
+	    journal_end(&th, sb, 1) ;
+	    goto flush_commit_only;
+	}
+
+	journal_end_sync(&th, sb, 1) ;
+
+    } else {
+	/* this gets tricky, we have to make sure the journal list in
+	 * the inode still exists.  We know the list is still around
+	 * if we've got a larger transaction id than the oldest list
+	 */
+flush_commit_only:
+	if (journal_list_still_alive(inode->i_sb, id)) {
+	    flush_commit_list(sb, jl, 1) ;
+	}
+    }
+    /* otherwise the list is gone, and long since committed */
 }
 
 void reiserfs_commit_for_inode(struct inode *inode) {
-  struct reiserfs_journal_list *jl ;
-  struct reiserfs_transaction_handle th ;
-  struct super_block *sb = inode->i_sb ;
-
-  jl = SB_JOURNAL_LIST(sb) + REISERFS_I(inode)->i_trans_index ;
-
-  /* is it from the current transaction, or from an unknown transaction? */
-  if (reiserfs_inode_in_this_transaction(inode)) {
-    journal_join(&th, sb, 1) ;
-    reiserfs_update_inode_transaction(inode) ;
-    journal_end_sync(&th, sb, 1) ;
-  } else if (jl->j_trans_id == REISERFS_I(inode)->i_trans_id) {
-    flush_commit_list(sb, jl, 1) ;
-  }
-  /* if the transaction id does not match, this list is long since flushed
-  ** and we don't have to do anything here
-  */
+    unsigned long id = REISERFS_I(inode)->i_trans_id;
+    struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
+
+    /* for the whole inode, assume unset id means it was
+     * changed in the current transaction.  More conservative
+     */
+    if (!id || !jl) {
+	reiserfs_update_inode_transaction(inode) ;
+	id = REISERFS_I(inode)->i_trans_id;
+	/* jl will be updated in __commit_trans_jl */
+    }
+
+    __commit_trans_jl(inode, id, jl);
 }
 
 void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, 
                                       struct buffer_head *bh) {
-  PROC_INFO_INC( p_s_sb, journal.restore_prepared );
-  if (reiserfs_dont_log (p_s_sb))
-    return;
-
-  if (!bh) {
-    return ;
-  }
-  clear_bit(BH_JPrepared, &bh->b_state) ;
+    PROC_INFO_INC( p_s_sb, journal.restore_prepared );
+    if (!bh) {
+	return ;
+    }
+    if (test_and_clear_bit(BH_JRestore_dirty, &bh->b_state) &&
+	buffer_journal_dirty(bh)) {
+	struct reiserfs_journal_cnode *cn;
+	cn = get_journal_hash_dev(p_s_sb,
+	                          SB_JOURNAL(p_s_sb)->j_list_hash_table,
+				  bh->b_blocknr);
+	if (cn && can_dirty(cn)) {
+	    set_bit(BH_JTest, &bh->b_state);
+	    mark_buffer_dirty(bh);
+        }
+    }
+    clear_bit(BH_JPrepared, &bh->b_state) ;
 }
 
 extern struct tree_balance *cur_tb ;
@@ -2857,29 +2985,39 @@ extern struct tree_balance *cur_tb ;
 ** wait on it.
 ** 
 */
-void reiserfs_prepare_for_journal(struct super_block *p_s_sb, 
+int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
                                   struct buffer_head *bh, int wait) {
-  int retry_count = 0 ;
-
   PROC_INFO_INC( p_s_sb, journal.prepare );
-  if (reiserfs_dont_log (p_s_sb))
-    return;
 
-  while(!test_bit(BH_JPrepared, &bh->b_state) ||
-        (wait && buffer_locked(bh))) {
-    if (buffer_journaled(bh)) {
-      set_bit(BH_JPrepared, &bh->b_state) ;
-      return ;
+    if (test_set_buffer_locked(bh)) {
+	if (!wait)
+	    return 0;
+	lock_buffer(bh);
     }
-    set_bit(BH_JPrepared, &bh->b_state) ;
-    if (wait) {
-      RFALSE( buffer_locked(bh) && cur_tb != NULL,
-	      "waiting while do_balance was running\n") ;
-      wait_on_buffer(bh) ;
+    set_bit(BH_JPrepared, &bh->b_state);
+    if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh))  {
+	clear_bit(BH_JTest, &bh->b_state);
+	set_bit(BH_JRestore_dirty, &bh->b_state);
+    }
+    unlock_buffer(bh);
+    return 1;
+}
+
+static void flush_old_journal_lists(struct super_block *s) {
+    struct reiserfs_journal_list *jl;
+    struct list_head *entry;
+    time_t now = get_seconds();
+
+    while(!list_empty(&SB_JOURNAL(s)->j_journal_list)) {
+        entry = SB_JOURNAL(s)->j_journal_list.next;
+	jl = JOURNAL_LIST_ENTRY(entry);
+	/* this check should always be run, to send old lists to disk */
+	if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
+	    flush_used_journal_lists(s, jl);
+	} else {
+	    break;
+	}
     }
-    PROC_INFO_INC( p_s_sb, journal.prepare_retry );
-    retry_count++ ;
-  }
 }
 
 /* 
@@ -2898,23 +3036,24 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   struct buffer_head *c_bh ; /* commit bh */
   struct buffer_head *d_bh ; /* desc bh */
   int cur_write_start = 0 ; /* start index of current log write */
-  int cur_blocks_left = 0 ; /* number of journal blocks left to write */
   int old_start ;
   int i ;
-  int jindex ;
-  int orig_jindex ;
   int flush = flags & FLUSH_ALL ;
-  int commit_now = flags & COMMIT_NOW ;
   int wait_on_commit = flags & WAIT ;
-  struct reiserfs_super_block *rs ; 
-  int trans_half ;
+  struct reiserfs_journal_list *jl, *temp_jl;
+  struct list_head *entry, *safe;
+  unsigned long jindex;
+  unsigned long commit_trans_id;
+  int trans_half;
 
   if (th->t_refcount > 1)
     BUG() ;
 
   current->journal_info = th->t_handle_save;
-  if (reiserfs_dont_log(th->t_super)) {
-    return 0 ;
+  reiserfs_check_lock_depth("journal end");
+  if (SB_JOURNAL(p_s_sb)->j_len == 0) {
+      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+      journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
   }
 
   lock_journal(p_s_sb) ;
@@ -2923,24 +3062,24 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
     flush = 1 ;
   }
   if (SB_JOURNAL(p_s_sb)->j_next_async_flush) {
-    flags |= COMMIT_NOW ;
-    commit_now = 1 ;
+    flags |= COMMIT_NOW | WAIT;
+    wait_on_commit = 1;
   }
 
   /* check_journal_end locks the journal, and unlocks if it does not return 1 
   ** it tells us if we should continue with the journal_end, or just return
   */
   if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
-    return 0 ;
+    p_s_sb->s_dirt = 1;
+    wake_queued_writers(p_s_sb);
+    goto out ;
   }
 
   /* check_journal_end might set these, check again */
   if (SB_JOURNAL(p_s_sb)->j_next_full_flush) {
     flush = 1 ;
   }
-  if (SB_JOURNAL(p_s_sb)->j_next_async_flush) {
-    commit_now = 1 ;
-  }
+
   /*
   ** j must wait means we have to flush the log blocks, and the real blocks for
   ** this transaction
@@ -2957,10 +3096,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   current->journal_info = th->t_handle_save ;
 #endif
   
-  rs = SB_DISK_SUPER_BLOCK(p_s_sb) ;
   /* setup description block */
   d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_JOURNAL(p_s_sb)->j_start) ; 
-  set_buffer_uptodate(d_bh) ;
+  set_buffer_uptodate(d_bh);
   desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ;
   memset(d_bh->b_data, 0, d_bh->b_size) ;
   memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ;
@@ -2975,28 +3113,33 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   set_buffer_uptodate(c_bh) ;
 
   /* init this journal list */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ;  
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2);
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
-
-  /* which is faster, locking/unlocking at the start and end of the for
-  ** or locking once per iteration around the insert_journal_hash?
-  ** eitherway, we are write locking insert_journal_hash.  The ENTIRE FOR
-  ** LOOP MUST not cause schedule to occur.
-  */
+  jl = SB_JOURNAL(p_s_sb)->j_current_jl;
 
-  /* for each real block, add it to the journal list hash,
+  /* we lock the commit before doing anything because
+   * we want to make sure nobody tries to run flush_commit_list until
+   * the new transaction is fully setup, and we've already flushed the
+   * ordered bh list
+   */
+  down(&jl->j_commit_lock);
+
+  /* save the transaction id in case we need to commit it later */
+  commit_trans_id = jl->j_trans_id;
+
+  atomic_set(&jl->j_older_commits_done, 0) ;
+  jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
+  jl->j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
+  jl->j_commit_bh = c_bh ;
+  jl->j_start = SB_JOURNAL(p_s_sb)->j_start ;
+  jl->j_len = SB_JOURNAL(p_s_sb)->j_len ;
+  atomic_set(&jl->j_nonzerolen, SB_JOURNAL(p_s_sb)->j_len) ;
+  atomic_set(&jl->j_commit_left, SB_JOURNAL(p_s_sb)->j_len + 2);
+  jl->j_realblock = NULL ;
+
+  /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
+  **  for each real block, add it to the journal list hash,
   ** copy into real block index array in the commit or desc block
   */
-  trans_half = journal_trans_half(p_s_sb->s_blocksize) ;
+  trans_half = journal_trans_half(p_s_sb->s_blocksize);
   for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) {
     if (test_bit(BH_JDirty, &cn->bh->b_state) ) {
       jl_cn = get_cnode(p_s_sb) ;
@@ -3004,7 +3147,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
         reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ;
       }
       if (i == 0) {
-        SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ;
+        jl->j_realblock = jl_cn ;
       }
       jl_cn->prev = last_cn ;
       jl_cn->next = NULL ;
@@ -3020,9 +3163,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
       }
       jl_cn->blocknr = cn->bh->b_blocknr ; 
       jl_cn->state = 0 ;
-      jl_cn->sb = p_s_sb ;
+      jl_cn->sb = p_s_sb;
       jl_cn->bh = cn->bh ;
-      jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ;
+      jl_cn->jlist = jl;
       insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ; 
       if (i < trans_half) {
 	desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ;
@@ -3033,7 +3176,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
       i-- ;
     }
   }
-  
   set_desc_trans_len(desc, SB_JOURNAL(p_s_sb)->j_len) ;
   set_desc_mount_id(desc, SB_JOURNAL(p_s_sb)->j_mount_id) ;
   set_desc_trans_id(desc, SB_JOURNAL(p_s_sb)->j_trans_id) ;
@@ -3041,53 +3183,35 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
 
   /* special check in case all buffers in the journal were marked for not logging */
   if (SB_JOURNAL(p_s_sb)->j_len == 0) {
-    brelse(d_bh) ;
-    brelse(c_bh) ;
-    unlock_journal(p_s_sb) ;
-    printk("journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ;
-    atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
-    wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-    return 0 ;
+    BUG();
   }
 
+  /* we're about to dirty all the log blocks, mark the description block
+   * dirty now too.  Don't mark the commit block dirty until all the
+   * others are on disk
+   */
+  mark_buffer_dirty(d_bh);
+
   /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
   cur_write_start = SB_JOURNAL(p_s_sb)->j_start ;
-  cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len  ;
   cn = SB_JOURNAL(p_s_sb)->j_first ;
   jindex = 1 ; /* start at one so we don't get the desc again */
-  while(cur_blocks_left > 0) {
+  while(cn) {
+    clear_bit(BH_JNew, &(cn->bh->b_state)) ;
     /* copy all the real blocks into log area.  dirty log blocks */
     if (test_bit(BH_JDirty, &cn->bh->b_state)) {
       struct buffer_head *tmp_bh ;
       tmp_bh =  journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 
 		       ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
-      set_buffer_uptodate(tmp_bh) ;
+      set_buffer_uptodate(tmp_bh);
       memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ;  
+      mark_buffer_dirty(tmp_bh);
       jindex++ ;
-    } else {
-      /* JDirty cleared sometime during transaction.  don't log this one */
-      printk("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ;
-    }
-    cn = cn->next ;
-    cur_blocks_left-- ;
-  }
-
-  /* we are done  with both the c_bh and d_bh, but
-  ** c_bh must be written after all other commit blocks,
-  ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
-  */
-
-  /* now loop through and mark all buffers from this transaction as JDirty_wait
-  ** clear the JDirty bit, clear BH_JNew too.  
-  ** if they weren't JDirty, they weren't logged, just relse them and move on
-  */
-  cn = SB_JOURNAL(p_s_sb)->j_first ; 
-  while(cn) {
-    clear_bit(BH_JNew, &(cn->bh->b_state)) ;
-    if (test_bit(BH_JDirty, &(cn->bh->b_state))) {
       set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ; 
       clear_bit(BH_JDirty, &(cn->bh->b_state)) ;
     } else {
+      /* JDirty cleared sometime during transaction.  don't log this one */
+      reiserfs_warning("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ;
       brelse(cn->bh) ;
     }
     next = cn->next ;
@@ -3095,30 +3219,17 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
     cn = next ;
   }
 
-  /* unlock the journal list for committing and flushing */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ;
-
-  orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
-  jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; 
-  SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ;
+  /* we are done  with both the c_bh and d_bh, but
+  ** c_bh must be written after all other commit blocks,
+  ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
+  */
 
-  /* write any buffers that must hit disk before this commit is done */
-  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock),
-		     &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+  SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
 
-  /* honor the flush and async wishes from the caller */
-  if (flush) {
-  
-    flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
-    flush_journal_list(p_s_sb,  SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ;  
-  } else if (commit_now) {
-    if (wait_on_commit) {
-      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
-    } else {
-      commit_flush_async(p_s_sb, orig_jindex) ; 
-    }
-  }
+  /* now it is safe to insert this transaction on the main list */
+  list_add_tail(&jl->j_list, &SB_JOURNAL(p_s_sb)->j_journal_list);
+  list_add_tail(&jl->j_working_list, &SB_JOURNAL(p_s_sb)->j_working_list);
+  SB_JOURNAL(p_s_sb)->j_num_work_lists++;
 
   /* reset journal values for the next transaction */
   old_start = SB_JOURNAL(p_s_sb)->j_start ;
@@ -3130,57 +3241,96 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   SB_JOURNAL(p_s_sb)->j_len = 0 ;
   SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;
   SB_JOURNAL(p_s_sb)->j_trans_id++ ;
+  SB_JOURNAL(p_s_sb)->j_current_jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id;
   SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
   SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
   SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ;
   SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ;
   init_journal_hash(p_s_sb) ; 
 
+  /* tail conversion targets have to hit the disk before we end the
+   * transaction.  Otherwise a later transaction might repack the tail
+   * before this transaction commits, leaving the data block unflushed and
+   * clean, if we crash before the later transaction commits, the data block
+   * is lost.
+   */
+  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock),
+		     &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+  up(&jl->j_commit_lock);
+
+  /* honor the flush wishes from the caller, simple commits can
+  ** be done outside the journal lock, they are done below
+  */
+  if (flush) {
+    flush_commit_list(p_s_sb, jl, 1) ;
+    flush_journal_list(p_s_sb, jl, 1) ;
+  }
+
+
   /* if the next transaction has any chance of wrapping, flush 
   ** transactions that might get overwritten.  If any journal lists are very 
   ** old flush them as well.  
   */
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    jindex = i ;
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ; 
+first_jl:
+  list_for_each_safe(entry, safe, &SB_JOURNAL(p_s_sb)->j_journal_list) {
+    temp_jl = JOURNAL_LIST_ENTRY(entry);
+    if (SB_JOURNAL(p_s_sb)->j_start <= temp_jl->j_start) {
+      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >=
+          temp_jl->j_start)
+      {
+	flush_used_journal_lists(p_s_sb, temp_jl);
+	goto first_jl;
+      } else if ((SB_JOURNAL(p_s_sb)->j_start +
+                  SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) <
+		  SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+      {
+          /* if we don't cross into the next transaction and we don't
+	   * wrap, there is no way we can overlap any later transactions.
+	   * Break now.
+	   */
+	  break;
       }
-    } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && 
-              (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
-      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= 
-            SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ; 
+    } else if ((SB_JOURNAL(p_s_sb)->j_start +
+                SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >
+		SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+    {
+      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) %
+            SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
+      {
+	flush_used_journal_lists(p_s_sb, temp_jl);
+	goto first_jl;
+      } else {
+	  /* we don't overlap anything from our start to the end of the
+	   * log, and our wrapped portion doesn't overlap anything at
+	   * the start of the log.  We can break
+	   */
+	  break;
       }
-    } 
-    /* this check should always be run, to send old lists to disk */
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && 
-              SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp < 
-	      (get_seconds() - (SB_JOURNAL_MAX_TRANS_AGE(p_s_sb) * 4))) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ; 
     }
   }
+  flush_old_journal_lists(p_s_sb);
 
-  /* if the next journal_list is still in use, flush it */
-  if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) {
-    flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ; 
-  }
-
-  /* we don't want anyone flushing the new transaction's list */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + 
-											 SB_JOURNAL_LIST_INDEX(p_s_sb)) ;
+  SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL(p_s_sb)->j_current_jl) ;
 
-  if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) {
+  if (!(SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap)) {
     reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
   }
-  unlock_journal(p_s_sb) ;
+
   atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
+  unlock_journal(p_s_sb) ;
   /* wake up any body waiting to join. */
+  clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state);
   wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
+
+  if (!flush) {
+      if (wait_on_commit) {
+	  if (journal_list_still_alive(p_s_sb, commit_trans_id))
+	      flush_commit_list(p_s_sb, jl, 1) ;
+      } else {
+          queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work);
+      }
+  }
+out:
+  reiserfs_check_lock_depth("journal end2");
   return 0 ;
 }
-
-
-
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 8d47a4edabd9..f6a289f4532c 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -86,7 +86,6 @@ __u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th)
     }
 
     journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
-    s->s_dirt = 1;
     return unused_objectid;
 }
 
@@ -105,8 +104,6 @@ void reiserfs_release_objectid (struct reiserfs_transaction_handle *th,
 
     reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
     journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); 
-    s->s_dirt = 1;
-
 
     /* start at the beginning of the objectid map (i = 0) and go to
        the end of it (i = disk_sb->s_oid_cursize).  Linear search is
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 0b4db272a566..d7c20a7c0e46 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -87,7 +87,7 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
     
 	seq_printf(m,	"state: \t%s\n"
-			"mount options: \t%s%s%s%s%s%s%s%s%s%s%s%s\n"
+			"mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
 			"gen. counter: \t%i\n"
 			"s_kmallocs: \t%i\n"
 			"s_disk_reads: \t%i\n"
@@ -131,7 +131,6 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 			reiserfs_test4( sb ) ? "TEST4 " : "",
 			have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ",
 			replay_only( sb ) ? "REPLAY_ONLY " : "",
-			reiserfs_dont_log( sb ) ? "DONT_LOG " : "LOG ",
 			convert_reiserfs( sb ) ? "CONV " : "",
 
 			atomic_read( &r -> s_generation_counter ),
@@ -370,7 +369,6 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 			"j_first_unflushed_offset: \t%lu\n"
 			"j_last_flush_trans_id: \t%lu\n"
 			"j_trans_start_time: \t%li\n"
-			"j_journal_list_index: \t%i\n"
 			"j_list_bitmap_index: \t%i\n"
 			"j_must_wait: \t%i\n"
 			"j_next_full_flush: \t%i\n"
@@ -416,7 +414,6 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 			JF( j_first_unflushed_offset ),
 			JF( j_last_flush_trans_id ),
 			JF( j_trans_start_time ),
-			JF( j_journal_list_index ),
 			JF( j_list_bitmap_index ),
 			JF( j_must_wait ),
 			JF( j_next_full_flush ),
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index df7baf79e889..f75349fe4787 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -59,22 +59,26 @@ static int is_any_reiserfs_magic_string (struct reiserfs_super_block * rs)
 static int reiserfs_remount (struct super_block * s, int * flags, char * data);
 static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf);
 
-static void reiserfs_write_super (struct super_block * s)
+static void reiserfs_sync_fs (struct super_block * s)
 {
+    if (!(s->s_flags & MS_RDONLY)) {
+        struct reiserfs_transaction_handle th;
+	reiserfs_write_lock(s);
+	journal_begin(&th, s, 1);
+	journal_end_sync(&th, s, 1);
+	reiserfs_flush_old_commits(s);
+	s->s_dirt = 0;
+	reiserfs_write_unlock(s);
+    }
+}
 
-  int dirty = 0 ;
-  reiserfs_write_lock(s);
-  if (!(s->s_flags & MS_RDONLY)) {
-    dirty = flush_old_commits(s, 1) ;
-  }
-  s->s_dirt = dirty;
-  reiserfs_write_unlock(s);
+static void reiserfs_write_super(struct super_block *s)
+{
+    reiserfs_sync_fs(s);
 }
 
 static void reiserfs_write_super_lockfs (struct super_block * s)
 {
-
-  int dirty = 0 ;
   struct reiserfs_transaction_handle th ;
   reiserfs_write_lock(s);
   if (!(s->s_flags & MS_RDONLY)) {
@@ -84,7 +88,7 @@ static void reiserfs_write_super_lockfs (struct super_block * s)
     reiserfs_block_writes(&th) ;
     journal_end(&th, s, 1) ;
   }
-  s->s_dirt = dirty;
+  s->s_dirt = 0;
   reiserfs_write_unlock(s);
 }
 
@@ -805,7 +809,6 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
     reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
     set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state );
     journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-    s->s_dirt = 0;
   } else {
     /* remount read-write */
     if (!(s->s_flags & MS_RDONLY))
@@ -822,12 +825,12 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
     set_sb_umount_state( rs, REISERFS_ERROR_FS );
     /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
     journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-    s->s_dirt = 0;
     REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ;
   }
   /* this will force a full flush of all journal lists */
   SB_JOURNAL(s)->j_must_wait = 1 ;
   journal_end(&th, s, 10) ;
+  s->s_dirt = 0;
 
   if (!( *mount_flags & MS_RDONLY ) )
     finish_unfinished( s );
@@ -1392,8 +1395,6 @@ static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
 	
 	/* look for files which were to be removed in previous session */
 	finish_unfinished (s);
-
-	s->s_dirt = 0;
     } else {
 	if ( old_format_only(s) && !silent) {
 	    reiserfs_warning("reiserfs: using 3.5.x disk format\n") ;
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index e4695e7b7ba3..fb0bf2af7fd7 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1702,23 +1702,39 @@ struct reiserfs_journal_header {
 	 (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
 #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
 
-/* finds n'th buffer with 0 being the start of this commit.  Needs to go away, j_ap_blocks has changed
-** since I created this.  One chunk of code in journal.c needs changing before deleting it
-*/
-#define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
-
 // We need these to make journal.c code more readable
 #define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 
+/*
+** transaction handle which is passed around for all journal calls
+*/
+struct reiserfs_transaction_handle {
+  struct super_block *t_super ; /* super for this FS when journal_begin was
+				   called. saves calls to reiserfs_get_super
+				   also used by nested transactions to make
+				   sure they are nesting on the right FS
+				   _must_ be first in the handle
+				*/
+  int t_refcount;
+  int t_blocks_logged ;         /* number of blocks this writer has logged */
+  int t_blocks_allocated ;      /* number of blocks this writer allocated */
+  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
+  void *t_handle_save ;		/* save existing current->journal_info */
+  int displace_new_blocks:1;	/* if new block allocation occurs, that block
+				   should be displaced from others */
+} ;
+
+int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
+int reiserfs_flush_old_commits(struct super_block *);
 void reiserfs_commit_for_inode(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
 void reiserfs_wait_on_write_block(struct super_block *s) ;
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
 void reiserfs_allow_writes(struct super_block *s) ;
 void reiserfs_check_lock_depth(char *caller) ;
-void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
+int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
 void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
 int journal_init(struct super_block *, const char * j_dev_name, int old_format, unsigned int) ;
 int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
@@ -1730,7 +1746,6 @@ int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
-void flush_async_commits(struct super_block *p_s_sb) ;
 
 int buffer_journaled(const struct buffer_head *bh) ;
 int mark_buffer_journal_new(struct buffer_head *bh) ;
diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h
index 87e1b74e1125..e689a12bcb9b 100644
--- a/include/linux/reiserfs_fs_i.h
+++ b/include/linux/reiserfs_fs_i.h
@@ -3,6 +3,8 @@
 
 #include <linux/list.h>
 
+struct reiserfs_journal_list;
+
 /** bitmasks for i_flags field in reiserfs-specific part of inode */
 typedef enum {
     /** this says what format of key do all items (but stat data) of
@@ -48,7 +50,7 @@ struct reiserfs_inode_info {
     ** needs to be committed in order for this inode to be properly
     ** flushed */
     unsigned long i_trans_id ;
-    unsigned long i_trans_index ;
+    struct reiserfs_journal_list *i_jl;
     struct inode vfs_inode;
 };
 
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index b848ccd7ed41..e1fe3ebe33c0 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -106,7 +106,6 @@ typedef enum {
 #define JOURNAL_MAX_CNODE   1500 /* max cnodes to allocate. */
 #define JOURNAL_HASH_SIZE 8192   
 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating.  Must be >= 2 */
-#define JOURNAL_LIST_COUNT 64
 
 /* these are bh_state bit flag offset numbers, for use in the buffer head */
 
@@ -121,6 +120,7 @@ typedef enum {
 */
 #define BH_JPrepared 20		/* block has been prepared for the log */
 #define BH_JRestore_dirty 22    /* restore the dirty bit later */
+#define BH_JTest 23             /* debugging use only */
 
 /* One of these for every block in every transaction
 ** Each one is in two hash tables.  First, a hash of the current transaction, and after journal_end, a
@@ -153,26 +153,6 @@ struct reiserfs_list_bitmap {
   struct reiserfs_bitmap_node **bitmaps ;
 } ;
 
-/*
-** transaction handle which is passed around for all journal calls
-*/
-struct reiserfs_transaction_handle {
-  struct super_block *t_super ; /* super for this FS when journal_begin was
-				   called. saves calls to reiserfs_get_super
-				   also used by nested transactions to make
-				   sure they are nesting on the right FS
-				   _must_ be first in the handle
-				*/
-  int t_refcount;
-  int t_blocks_logged ;         /* number of blocks this writer has logged */
-  int t_blocks_allocated ;      /* number of blocks this writer allocated */
-  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
-  void *t_handle_save ;		/* save existing current->journal_info */
-  int displace_new_blocks:1;	/* if new block allocation occurres, that block
-				   should be displaced from others */
-
-} ;
-
 /*
 ** one of these for each transaction.  The most important part here is the j_realblock.
 ** this list of cnodes is used to hash all the blocks in all the commits, to mark all the
@@ -181,23 +161,25 @@ struct reiserfs_transaction_handle {
 ** to be overwritten */
 struct reiserfs_journal_list {
   unsigned long j_start ;
+  unsigned long j_state;
   unsigned long j_len ;
   atomic_t j_nonzerolen ;
   atomic_t j_commit_left ;
-  atomic_t j_flushing ;
-  atomic_t j_commit_flushing ;
   atomic_t j_older_commits_done ;      /* all commits older than this on disk*/
+  struct semaphore j_commit_lock;
   unsigned long j_trans_id ;
   time_t j_timestamp ;
   struct reiserfs_list_bitmap *j_list_bitmap ;
   struct buffer_head *j_commit_bh ; /* commit buffer head */
   struct reiserfs_journal_cnode *j_realblock  ;
   struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans.  free each of these on flush */
-  wait_queue_head_t j_commit_wait ; /* wait for all the commit blocks to be flushed */
-  wait_queue_head_t j_flush_wait ; /* wait for all the real blocks to be flushed */
-} ;
+  /* time ordered list of all active transactions */
+  struct list_head j_list;
 
-struct reiserfs_page_list  ; /* defined in reiserfs_fs.h */
+  /* time ordered list of all transactions we haven't tried to flush yet */
+  struct list_head j_working_list;
+  int j_refcount;
+} ;
 
 struct reiserfs_journal {
   struct buffer_head ** j_ap_blocks ; /* journal blocks on disk */
@@ -220,16 +202,11 @@ struct reiserfs_journal {
   unsigned long j_last_flush_trans_id ;    /* last fully flushed journal timestamp */
   struct buffer_head *j_header_bh ;   
 
-  /* j_flush_pages must be flushed before the current transaction can
-  ** commit
-  */
-  struct reiserfs_page_list *j_flush_pages ;
   time_t j_trans_start_time ;         /* time this transaction started */
-  wait_queue_head_t j_wait ;         /* wait  journal_end to finish I/O */
-  atomic_t j_wlock ;                       /* lock for j_wait */
+  struct semaphore j_lock;
+  struct semaphore j_flush_sem;
   wait_queue_head_t j_join_wait ;    /* wait for current transaction to finish before starting new one */
   atomic_t j_jlock ;                       /* lock for j_join_wait */
-  int j_journal_list_index ;	      /* journal list number of the current trans */
   int j_list_bitmap_index ;	      /* number of next list bitmap to use */
   int j_must_wait ;		       /* no more journal begins allowed. MUST sleep on j_join_wait */
   int j_next_full_flush ;             /* next journal_end will flush all journal list */
@@ -246,19 +223,37 @@ struct reiserfs_journal {
   struct reiserfs_journal_cnode *j_cnode_free_list ;
   struct reiserfs_journal_cnode *j_cnode_free_orig ; /* orig pointer returned from vmalloc */
 
+  struct reiserfs_journal_list *j_current_jl;
   int j_free_bitmap_nodes ;
   int j_used_bitmap_nodes ;
+
+  int j_num_lists;      /* total number of active transactions */
+  int j_num_work_lists; /* number that need attention from kreiserfsd */
+
+  /* debugging to make sure things are flushed in order */
+  int j_last_flush_id;
+
+  /* debugging to make sure things are committed in order */
+  int j_last_commit_id;
+
   struct list_head j_bitmap_nodes ;
   struct list_head j_dirty_buffers ;
   spinlock_t j_dirty_buffers_lock ; /* protects j_dirty_buffers */
+
+  /* list of all active transactions */
+  struct list_head j_journal_list;
+  /* lists that haven't been touched by writeback attempts */
+  struct list_head j_working_list;
+
   struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ;	/* array of bitmaps to record the deleted blocks */
-  struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ;	    /* array of all the journal lists */
   struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; 	    /* hash table for real buffer heads in current trans */ 
   struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all 
   										the transactions */
   struct list_head j_prealloc_list;     /* list of inodes which have preallocated blocks */
   unsigned long j_max_trans_size ;
   unsigned long j_max_batch_size ;
+
+  struct work_struct j_work;
 };
 
 #define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick.  magic string to find desc blocks in the journal */
@@ -417,7 +412,6 @@ struct reiserfs_sb_info
 #define REISERFS_LARGETAIL 0  /* large tails will be created in a session */
 #define REISERFS_SMALLTAIL 17  /* small (for files less than block size) tails will be created in a session */
 #define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
-#define REISERFS_NOLOG 4      /* -o nolog: turn journalling off */
 #define REISERFS_CONVERT 5    /* -o conv: causes conversion of old
                                  format super block to the new
                                  format. If not specified - old
@@ -473,8 +467,6 @@ struct reiserfs_sb_info
 
 void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
-int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
-int flush_old_commits(struct super_block *s, int) ;
 int reiserfs_resize(struct super_block *, unsigned long) ;
 
 #define CARRY_ON                0
@@ -484,8 +476,6 @@ int reiserfs_resize(struct super_block *, unsigned long) ;
 #define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
 #define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
 #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
-#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list)
-#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index) 
 #define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) 
 #define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
 
-- 
cgit v1.2.3


From bb0d96728fb63cf1d2294bb1dcafd60926e49cd5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:26:08 -0700
Subject: [PATCH] reiserfs: data=ordered support

From: Chris Mason <mason@suse.com>

reiserfs data=ordered support.
---
 fs/buffer.c                    |   1 +
 fs/reiserfs/file.c             | 167 ++++++++++++-----
 fs/reiserfs/inode.c            | 285 +++++++++++++++++++++-------
 fs/reiserfs/ioctl.c            |   7 +-
 fs/reiserfs/journal.c          | 411 +++++++++++++++++++++++++++++++++--------
 fs/reiserfs/super.c            |  52 ++++++
 include/linux/reiserfs_fs.h    |  44 ++++-
 include/linux/reiserfs_fs_sb.h |  65 ++++---
 8 files changed, 803 insertions(+), 229 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 605ce2099aa5..be9cc963a178 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1583,6 +1583,7 @@ int try_to_release_page(struct page *page, int gfp_mask)
 		return mapping->a_ops->releasepage(page, gfp_mask);
 	return try_to_free_buffers(page);
 }
+EXPORT_SYMBOL(try_to_release_page);
 
 /**
  * block_invalidatepage - invalidate part of all of a buffer-backed page
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 99321f2fcdf6..4b461667b231 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -9,6 +9,8 @@
 #include <asm/uaccess.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
 
 /*
 ** We pack the tails of files on file close, not at the time they are written.
@@ -150,6 +152,7 @@ out:
    Maps all unmapped but prepared pages from the list.
    Updates metadata with newly allocated blocknumbers as needed */
 int reiserfs_allocate_blocks_for_region(
+				struct reiserfs_transaction_handle *th,
 				struct inode *inode, /* Inode we work with */
 				loff_t pos, /* Writing position */
 				int num_pages, /* number of pages write going
@@ -167,7 +170,6 @@ int reiserfs_allocate_blocks_for_region(
     struct cpu_key key; // cpu key of item that we are going to deal with
     struct item_head *ih; // pointer to item head that we are going to deal with
     struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
-    struct reiserfs_transaction_handle th; // transaction handle for transaction we are going to create.
     __u32 * item; // pointer to item we are going to deal with
     INITIALIZE_PATH(path); // path to item, that we are going to deal with.
     b_blocknr_t allocated_blocks[blocks_to_allocate]; // Pointer to a place where allocated blocknumbers would be stored. Right now statically allocated, later that will change.
@@ -194,7 +196,7 @@ int reiserfs_allocate_blocks_for_region(
     /* If we came here, it means we absolutely need to open a transaction,
        since we need to allocate some blocks */
     reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
-    journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough
+    journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough
     reiserfs_update_inode_transaction(inode) ;
 
     /* Look for the in-tree position of our write, need path for block allocator */
@@ -206,7 +208,7 @@ int reiserfs_allocate_blocks_for_region(
    
     /* Allocate blocks */
     /* First fill in "hint" structure for block allocator */
-    hint.th = &th; // transaction handle.
+    hint.th = th; // transaction handle.
     hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
     hint.inode = inode; // Inode is needed by block allocator too.
     hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
@@ -222,7 +224,7 @@ int reiserfs_allocate_blocks_for_region(
 	    /* We flush the transaction in case of no space. This way some
 	       blocks might become free */
 	    SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
-	    restart_transaction(&th, inode, &path);
+	    restart_transaction(th, inode, &path);
 
 	    /* We might have scheduled, so search again */
 	    res = search_for_position_by_key(inode->i_sb, &key, &path);
@@ -296,7 +298,7 @@ int reiserfs_allocate_blocks_for_region(
 		    /* Ok, there is existing indirect item already. Need to append it */
 		    /* Calculate position past inserted item */
 		    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
-		    res = reiserfs_paste_into_item( &th, &path, &key, (char *)zeros, UNFM_P_SIZE*to_paste);
+		    res = reiserfs_paste_into_item( th, &path, &key, (char *)zeros, UNFM_P_SIZE*to_paste);
 		    if ( res ) {
 			kfree(zeros);
 			goto error_exit_free_blocks;
@@ -326,7 +328,7 @@ int reiserfs_allocate_blocks_for_region(
 		        kfree(zeros);
 			goto error_exit_free_blocks;
 		    }
-		    res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)zeros);
+		    res = reiserfs_insert_item( th, &path, &key, &ins_ih, (char *)zeros);
 		} else {
 		    reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
 		}
@@ -336,8 +338,8 @@ int reiserfs_allocate_blocks_for_region(
 		}
 		/* Now we want to check if transaction is too full, and if it is
 		   we restart it. This will also free the path. */
-		if (journal_transaction_should_end(&th, th.t_blocks_allocated))
-		    restart_transaction(&th, inode, &path);
+		if (journal_transaction_should_end(th, th->t_blocks_allocated))
+		    restart_transaction(th, inode, &path);
 
 		/* Well, need to recalculate path and stuff */
 		set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
@@ -368,7 +370,7 @@ retry:
 	       one. */
 	    /* First if we are already modifying current item, log it */
 	    if ( modifying_this_item ) {
-		journal_mark_dirty (&th, inode->i_sb, bh);
+		journal_mark_dirty (th, inode->i_sb, bh);
 		modifying_this_item = 0;
 	    }
 	    /* Then set the key to look for a new indirect item (offset of old
@@ -432,7 +434,7 @@ retry:
 
     if ( modifying_this_item ) { // We need to log last-accessed block, if it
 				 // was modified, but not logged yet.
-	journal_mark_dirty (&th, inode->i_sb, bh);
+	journal_mark_dirty (th, inode->i_sb, bh);
     }
 
     if ( curr_block < blocks_to_allocate ) {
@@ -443,7 +445,7 @@ retry:
 	    // position. We do not need to recalculate path as it should
 	    // already point to correct place.
 	    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
-	    res = reiserfs_paste_into_item( &th, &path, &key, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
+	    res = reiserfs_paste_into_item( th, &path, &key, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
 	    if ( res ) {
 		goto error_exit_free_blocks;
 	    }
@@ -474,29 +476,18 @@ retry:
 		goto error_exit_free_blocks;
 	    }
 	    /* Insert item into the tree with the data as its body */
-	    res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)(allocated_blocks+curr_block));
+	    res = reiserfs_insert_item( th, &path, &key, &ins_ih, (char *)(allocated_blocks+curr_block));
 	} else {
 	    reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
 	}
     }
 
-    /* Now the final thing, if we have grew the file, we must update it's size*/
-    if ( pos + write_bytes > inode->i_size) {
-	inode->i_size = pos + write_bytes; // Set new size
-	/* If the file have grown so much that tail packing is no longer possible, reset
-	   "need to pack" flag */
-	if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
-	     (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
-	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
-    }
-
-    /* Amount of on-disk blocks used by file have changed, update it */
+    // unless we return an error, the caller is responsible for both
+    // closing the transaction and logging the inode.  On error, the
+    // error_exit path below does both before returning.
+    //
     inode->i_blocks += blocks_to_allocate << (inode->i_blkbits - 9);
-    reiserfs_update_sd(&th, inode); // And update on-disk metadata
-    // finish all journal stuff now, We are not going to play with metadata
-    // anymore.
     pathrelse(&path);
-    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
     reiserfs_write_unlock(inode->i_sb);
 
     // go through all the pages/buffers and map the buffers to newly allocated
@@ -527,6 +518,7 @@ retry:
 	    if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
 		map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
 		curr_block++;
+		set_buffer_new(bh);
 	    }
 	}
     }
@@ -540,10 +532,11 @@ error_exit_free_blocks:
     pathrelse(&path);
     // free blocks
     for( i = 0; i < blocks_to_allocate; i++ )
-	reiserfs_free_block( &th, le32_to_cpu(allocated_blocks[i]));
+	reiserfs_free_block(th, le32_to_cpu(allocated_blocks[i]));
 
 error_exit:
-    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
+    reiserfs_update_sd(th, inode); // update any changes we made to blk count
+    journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
     reiserfs_write_unlock(inode->i_sb);
 
     return res;
@@ -603,12 +596,63 @@ int reiserfs_copy_from_user_to_file_region(
     return page_fault?-EFAULT:0;
 }
 
+/* taken from fs/buffer.c:__block_commit_write */
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+		unsigned from, unsigned to)
+{
+    unsigned block_start, block_end;
+    int partial = 0;
+    unsigned blocksize;
+    struct buffer_head *bh, *head;
+    unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
+    int new;
+
+    blocksize = 1 << inode->i_blkbits;
+
+    for(bh = head = page_buffers(page), block_start = 0;
+        bh != head || !block_start;
+	block_start=block_end, bh = bh->b_this_page)
+    {
+
+	new = buffer_new(bh);
+	clear_buffer_new(bh);
+	block_end = block_start + blocksize;
+	if (block_end <= from || block_start >= to) {
+	    if (!buffer_uptodate(bh))
+		    partial = 1;
+	} else {
+	    set_buffer_uptodate(bh);
+	    if (!buffer_dirty(bh)) {
+		mark_buffer_dirty(bh);
+		/* do data=ordered on any page past the end
+		 * of file and any buffer marked BH_New.
+		 */
+		if (reiserfs_data_ordered(inode->i_sb) &&
+		    (new || page->index >= i_size_index)) {
+		    reiserfs_add_ordered_list(inode, bh);
+	        }
+	    }
+	}
+    }
+
+    /*
+     * If this is a partial write which happened to make all buffers
+     * uptodate then we can optimize away a bogus readpage() for
+     * the next read(). Here we 'discover' whether the page went
+     * uptodate as a result of this (potentially partial) write.
+     */
+    if (!partial)
+	SetPageUptodate(page);
+    return 0;
+}
 
 
 /* Submit pages for write. This was separated from actual file copying
    because we might want to allocate block numbers in-between.
    This function assumes that caller will adjust file size to correct value. */
 int reiserfs_submit_file_region_for_write(
+				struct reiserfs_transaction_handle *th,
+				struct inode *inode,
 				loff_t pos, /* Writing position offset */
 				int num_pages, /* Number of pages to write */
 				int write_bytes, /* number of bytes to write */
@@ -619,12 +663,14 @@ int reiserfs_submit_file_region_for_write(
     int retval = 0; // Return value we are going to return.
     int i; // loop counter
     int offset; // Writing offset in page.
+    int orig_write_bytes = write_bytes;
+    int sd_update = 0;
 
     for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
 	int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page
 	struct page *page=prepared_pages[i]; // Current page we process.
 
-	status = block_commit_write(page, offset, offset+count);
+	status = reiserfs_commit_page(inode, page, offset, offset+count);
 	if ( status )
 	    retval = status; // To not overcomplicate matters We are going to
 			     // submit all the pages even if there was error.
@@ -636,6 +682,41 @@ int reiserfs_submit_file_region_for_write(
 			  // to grab_cache_page
 	page_cache_release(page);
     }
+    /* now that we've gotten all the ordered buffers marked dirty,
+     * we can safely update i_size and close any running transaction
+     */
+    if ( pos + orig_write_bytes > inode->i_size) {
+	inode->i_size = pos + orig_write_bytes; // Set new size
+	/* If the file has grown so much that tail packing is no
+	 * longer possible, reset "need to pack" flag */
+	if ( (have_large_tails (inode->i_sb) &&
+	      inode->i_size > i_block_size (inode)*4) ||
+	     (have_small_tails (inode->i_sb) &&
+	     inode->i_size > i_block_size(inode)) )
+	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
+        else if ( (have_large_tails (inode->i_sb) &&
+	          inode->i_size < i_block_size (inode)*4) ||
+	          (have_small_tails (inode->i_sb) &&
+		  inode->i_size < i_block_size(inode)) )
+	    REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
+
+	if (th->t_trans_id) {
+	    reiserfs_write_lock(inode->i_sb);
+	    reiserfs_update_sd(th, inode); // And update on-disk metadata
+	    reiserfs_write_unlock(inode->i_sb);
+	} else
+	    inode->i_sb->s_op->dirty_inode(inode);
+
+        sd_update = 1;
+    }
+    if (th->t_trans_id) {
+	reiserfs_write_lock(inode->i_sb);
+	if (!sd_update)
+	    reiserfs_update_sd(th, inode);
+	journal_end(th, th->t_super, th->t_blocks_allocated);
+	reiserfs_write_unlock(inode->i_sb);
+    }
+    th->t_trans_id = 0;
     return retval;
 }
 
@@ -1003,19 +1084,18 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
     loff_t pos; // Current position in the file.
     size_t res; // return value of various functions that we call.
     struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
-    struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
 				/* To simplify coding at this time, we store
 				   locked pages in array for now */
-    if ( count <= PAGE_CACHE_SIZE )
-        return generic_file_write(file, buf, count, ppos);
+    struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
+    struct reiserfs_transaction_handle th;
+    th.t_trans_id = 0;
 
-    if ( file->f_flags & O_DIRECT) { // Direct IO needs some special threating.
+    if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment
 	int result, after_file_end = 0;
 	if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
 	    /* If we are appending a file, we need to put this savelink in here.
 	       If we will crash while doing direct io, finish_unfinished will
 	       cut the garbage from the file end. */
-	    struct reiserfs_transaction_handle th;
 	    reiserfs_write_lock(inode->i_sb);
 	    journal_begin(&th, inode->i_sb,  JOURNAL_PER_BALANCE_CNT );
 	    reiserfs_update_inode_transaction(inode);
@@ -1040,7 +1120,6 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
 	return result;
     }
 
-
     if ( unlikely((ssize_t) count < 0 ))
         return -EINVAL;
 
@@ -1146,11 +1225,7 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
 
 	if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
 	    /* Fill in all the possible holes and append the file if needed */
-	    res = reiserfs_allocate_blocks_for_region(inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
-	} else if ( pos + write_bytes > inode->i_size ) {
-	    /* File might have grown even though no new blocks were added */
-	    inode->i_size = pos + write_bytes;
-	    inode->i_sb->s_op->dirty_inode(inode);
+	    res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
 	}
 
 	/* well, we have allocated the blocks, so it is time to free
@@ -1173,7 +1248,8 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
 	}
 
 	/* Send the pages to disk and unlock them. */
-	res = reiserfs_submit_file_region_for_write(pos, num_pages, write_bytes, prepared_pages);
+	res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
+	                                            write_bytes,prepared_pages);
 	if ( res )
 	    break;
 
@@ -1184,10 +1260,17 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
 	balance_dirty_pages_ratelimited(inode->i_mapping);
     }
 
+    /* this is only true on error */
+    if (th.t_trans_id) {
+        reiserfs_write_lock(inode->i_sb);
+	journal_end(&th, th.t_super, th.t_blocks_allocated);
+        reiserfs_write_unlock(inode->i_sb);
+    }
     if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
 	res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
 
     up(&inode->i_sem);
+    reiserfs_async_progress_wait(inode->i_sb);
     return (already_written != 0)?already_written:res;
 
 out:
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 06635c7f18a9..cf88e52a2cfc 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,9 +22,12 @@ extern int reiserfs_default_io_size; /* default io size devuned in super.c */
 #define GET_BLOCK_NO_HOLE 2   /* return -ENOENT for file holes */
 #define GET_BLOCK_READ_DIRECT 4  /* read the tail if indirect item not found */
 #define GET_BLOCK_NO_ISEM     8 /* i_sem is not held, don't preallocate */
+#define GET_BLOCK_NO_DANGLE   16 /* don't leave any transactions running */
 
 static int reiserfs_get_block (struct inode * inode, sector_t block,
 			       struct buffer_head * bh_result, int create);
+static int reiserfs_commit_write(struct file *f, struct page *page,
+                                 unsigned from, unsigned to);
 
 void reiserfs_delete_inode (struct inode * inode)
 {
@@ -103,12 +106,6 @@ inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key
     put_ih_entry_count( ih, entry_count );
 }
 
-static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) {
-    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
-
-    buffer_insert_list(&j->j_dirty_buffers_lock, bh, &j->j_dirty_buffers) ;
-}
-
 //
 // FIXME: we might cache recently accessed indirect item
 
@@ -437,7 +434,8 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
        reiserfs_get_block() */
     bh_result->b_size = (1 << inode->i_blkbits);
 
-    ret = reiserfs_get_block(inode, iblock, bh_result, create) ;
+    ret = reiserfs_get_block(inode, iblock, bh_result,
+                             create | GET_BLOCK_NO_DANGLE) ;
 
     /* don't allow direct io onto tail pages */
     if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
@@ -510,15 +508,14 @@ static int convert_tail_for_hole(struct inode *inode,
     ** won't trigger a get_block in this case.
     */
     fix_tail_page_for_writing(tail_page) ;
-    retval = block_prepare_write(tail_page, tail_start, tail_end, 
-                                 reiserfs_get_block) ; 
+    retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
     if (retval)
         goto unlock ;
 
     /* tail conversion might change the data in the page */
     flush_dcache_page(tail_page) ;
 
-    retval = generic_commit_write(NULL, tail_page, tail_start, tail_end) ;
+    retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
 
 unlock:
     if (tail_page != hole_page) {
@@ -557,7 +554,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
     __u32 * item;
     int done;
     int fs_gen;
-    struct reiserfs_transaction_handle th ;
+    struct reiserfs_transaction_handle *th = NULL;
     /* space reserved in transaction batch: 
         . 3 balancings in direct->indirect conversion
         . 1 block involved into reiserfs_update_sd()
@@ -565,12 +562,11 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
        can incur (much) more that 3 balancings. */
     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1;
     int version;
-    int transaction_started = 0 ;
+    int dangle = 1;
     loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
 
 				/* bad.... */
     reiserfs_write_lock(inode->i_sb);
-    th.t_trans_id = 0 ;
     version = get_inode_item_key_version (inode);
 
     if (block < 0) {
@@ -594,6 +590,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	reiserfs_write_unlock(inode->i_sb);
 	return ret;
     }
+    /*
+     * if we're already in a transaction, make sure to close
+     * any new transactions we start in this func
+     */
+    if ((create & GET_BLOCK_NO_DANGLE) ||
+        reiserfs_transaction_running(inode->i_sb))
+        dangle = 0;
 
     /* If file is of such a size, that it might have a tail and tails are enabled
     ** we should mark it as possibly needing tail packing on close
@@ -606,9 +609,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
     make_cpu_key (&key, inode, new_offset,
 		  TYPE_ANY, 3/*key length*/);
     if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
-	journal_begin(&th, inode->i_sb, jbegin_count) ;
+start_trans:
+	th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
+	if (!th) {
+	    retval = -ENOMEM;
+	    goto failure;
+	}
 	reiserfs_update_inode_transaction(inode) ;
-	transaction_started = 1 ;
     }
  research:
 
@@ -628,23 +635,21 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 
     if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
 	/* we have to allocate block for the unformatted node */
-	if (!transaction_started) {
+	if (!th) {
 	    pathrelse(&path) ;
-	    journal_begin(&th, inode->i_sb, jbegin_count) ;
-	    reiserfs_update_inode_transaction(inode) ;
-	    transaction_started = 1 ;
-	    goto research ;
+	    goto start_trans;
 	}
 
-	repeat = _allocate_block(&th, block, inode, &allocated_block_nr, &path, create);
+	repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
 
 	if (repeat == NO_DISK_SPACE) {
 	    /* restart the transaction to give the journal a chance to free
 	    ** some blocks.  releases the path, so we have to go back to
 	    ** research if we succeed on the second try
 	    */
-	    restart_transaction(&th, inode, &path) ; 
-	    repeat = _allocate_block(&th, block, inode, &allocated_block_nr, NULL, create);
+	    SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
+	    restart_transaction(th, inode, &path) ;
+	    repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
 
 	    if (repeat != NO_DISK_SPACE) {
 		goto research ;
@@ -672,16 +677,18 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 		goto research;
 	    }
 	    set_buffer_new(bh_result);
+	    if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
+	    	reiserfs_add_ordered_list(inode, bh_result);
 	    put_block_num(item, pos_in_item, allocated_block_nr) ;
             unfm_ptr = allocated_block_nr;
-	    journal_mark_dirty (&th, inode->i_sb, bh);
+	    journal_mark_dirty (th, inode->i_sb, bh);
 	    inode->i_blocks += (inode->i_sb->s_blocksize / 512) ;
-	    reiserfs_update_sd(&th, inode) ;
+	    reiserfs_update_sd(th, inode) ;
 	}
 	set_block_dev_mapped(bh_result, unfm_ptr, inode);
 	pathrelse (&path);
-	if (transaction_started)
-	    journal_end(&th, inode->i_sb, jbegin_count) ;
+	if (!dangle && th)
+	    reiserfs_end_persistent_transaction(th);
 
 	reiserfs_write_unlock(inode->i_sb);
 	 
@@ -692,16 +699,9 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	return 0;
     }
 
-    if (!transaction_started) {
-	/* if we don't pathrelse, we could vs-3050 on the buffer if
-	** someone is waiting for it (they can't finish until the buffer
-	** is released, we can start a new transaction until they finish)
-	*/
+    if (!th) {
 	pathrelse(&path) ;
-	journal_begin(&th, inode->i_sb, jbegin_count) ;
-	reiserfs_update_inode_transaction(inode) ;
-	transaction_started = 1 ;
-	goto research;
+	goto start_trans;
     }
 
     /* desired position is not found or is in the direct item. We have
@@ -729,9 +729,9 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	    set_cpu_key_k_offset (&tmp_key, 1);
 	    PATH_LAST_POSITION(&path) ++;
 
-	    retval = reiserfs_insert_item (&th, &path, &tmp_key, &tmp_ih, (char *)&unp);
+	    retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, (char *)&unp);
 	    if (retval) {
-		reiserfs_free_block (&th, allocated_block_nr);
+		reiserfs_free_block (th, allocated_block_nr);
 		goto failure; // retval == -ENOSPC or -EIO or -EEXIST
 	    }
 	    if (unp)
@@ -755,8 +755,14 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 		   node. FIXME: this should also get into page cache */
 
 		pathrelse(&path) ;
-		journal_end(&th, inode->i_sb, jbegin_count) ;
-		transaction_started = 0 ;
+		/*
+		 * ugly, but we can only end the transaction if
+		 * we aren't nested
+		 */
+		if (th->t_refcount == 1) {
+		    reiserfs_end_persistent_transaction(th);
+		    th = NULL;
+		}
 
 		retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
 		if (retval) {
@@ -764,18 +770,19 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 			printk("clm-6004: convert tail failed inode %lu, error %d\n", inode->i_ino, retval) ;
 		    if (allocated_block_nr) {
 			/* the bitmap, the super, and the stat data == 3 */
-			journal_begin(&th, inode->i_sb, 3) ;
-			reiserfs_free_block (&th, allocated_block_nr);
-			transaction_started = 1 ;
+			if (!th)
+			    th = reiserfs_persistent_transaction(inode->i_sb,3);
+			if (th)
+			    reiserfs_free_block (th, allocated_block_nr);
 		    }
 		    goto failure ;
 		}
 		goto research ;
 	    }
-	    retval = direct2indirect (&th, inode, &path, unbh, tail_offset);
+	    retval = direct2indirect (th, inode, &path, unbh, tail_offset);
 	    if (retval) {
 		reiserfs_unmap_buffer(unbh);
-		reiserfs_free_block (&th, allocated_block_nr);
+		reiserfs_free_block (th, allocated_block_nr);
 		goto failure;
 	    }
 	    /* it is important the set_buffer_uptodate is done after
@@ -795,7 +802,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 		/* we've converted the tail, so we must
 		** flush unbh before the transaction commits
 		*/
-		add_to_flushlist(inode, unbh) ;
+		reiserfs_add_tail_list(inode, unbh) ;
 
 		/* mark it dirty now to prevent commit_write from adding
 		** this buffer to the inode's dirty buffer list
@@ -858,13 +865,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 		   only have space for one block */
 		blocks_needed=max_to_insert?max_to_insert:1;
 	    }
-	    retval = reiserfs_paste_into_item (&th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed);
+	    retval = reiserfs_paste_into_item (th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed);
 
 	    if (blocks_needed != 1)
 		kfree(un);
 
 	    if (retval) {
-		reiserfs_free_block (&th, allocated_block_nr);
+		reiserfs_free_block (th, allocated_block_nr);
 		goto failure;
 	    }
 	    if (done) {
@@ -889,8 +896,8 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	** release the path so that anybody waiting on the path before
 	** ending their transaction will be able to continue.
 	*/
-	if (journal_transaction_should_end(&th, th.t_blocks_allocated)) {
-	  restart_transaction(&th, inode, &path) ; 
+	if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
+	  restart_transaction(th, inode, &path) ;
 	}
 	/* inserting indirect pointers for a hole can take a 
 	** long time.  reschedule if needed
@@ -907,7 +914,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 			      "%K should not be found\n", &key);
 	    retval = -EEXIST;
 	    if (allocated_block_nr)
-	        reiserfs_free_block (&th, allocated_block_nr);
+	        reiserfs_free_block (th, allocated_block_nr);
 	    pathrelse(&path) ;
 	    goto failure;
 	}
@@ -921,9 +928,9 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
     retval = 0;
 
  failure:
-    if (transaction_started) {
-      reiserfs_update_sd(&th, inode) ;
-      journal_end(&th, inode->i_sb, jbegin_count) ;
+    if (th && !dangle) {
+      reiserfs_update_sd(th, inode) ;
+      reiserfs_end_persistent_transaction(th);
     }
     reiserfs_write_unlock(inode->i_sb);
     reiserfs_check_path(&path) ;
@@ -2007,7 +2014,8 @@ out:
     /* this is where we fill in holes in the file. */
     if (use_get_block) {
 	retval = reiserfs_get_block(inode, block, bh_result, 
-	                            GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM) ;
+	                            GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
+				    GET_BLOCK_NO_DANGLE);
 	if (!retval) {
 	    if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
 	        /* get_block failed to find a mapped unformatted node. */
@@ -2219,13 +2227,43 @@ static int reiserfs_writepage (struct page * page, struct writeback_control *wbc
     return reiserfs_write_full_page(page, wbc) ;
 }
 
-
 int reiserfs_prepare_write(struct file *f, struct page *page, 
 			   unsigned from, unsigned to) {
     struct inode *inode = page->mapping->host ;
+    int ret;
+    int old_ref = 0;	/* t_refcount seen before nesting; stays 0 if no transaction was running */
+
     reiserfs_wait_on_write_block(inode->i_sb) ;
     fix_tail_page_for_writing(page) ;
-    return block_prepare_write(page, from, to, reiserfs_get_block) ;
+    if (reiserfs_transaction_running(inode->i_sb)) {
+	struct reiserfs_transaction_handle *th;
+        th = (struct reiserfs_transaction_handle *)current->journal_info;
+	old_ref = th->t_refcount;
+	th->t_refcount++;	/* take our own reference on the running handle */
+    }
+
+    ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
+    if (ret && reiserfs_transaction_running(inode->i_sb)) {
+    	struct reiserfs_transaction_handle *th = current->journal_info;
+	/* this gets a little ugly.  If reiserfs_get_block returned an
+	 * error and left a transaction running, we've got to close it,
+	 * and we've got to free the handle if it was a persistent transaction.
+	 *
+	 * But, if we had nested into an existing transaction, we need
+	 * to just drop the ref count on the handle.
+	 *
+	 * If old_ref == 0, the transaction is from reiserfs_get_block,
+	 * and it was a persistent trans.  Otherwise, it was nested above.
+	 */
+	if (th->t_refcount > old_ref) {
+	    if (old_ref)
+	    	th->t_refcount--;
+	    else
+		reiserfs_end_persistent_transaction(th);
+	}
+    }
+    return ret;	/* result of block_prepare_write() */
+
+}
 
 
@@ -2237,16 +2275,21 @@ static int reiserfs_commit_write(struct file *f, struct page *page,
                                  unsigned from, unsigned to) {
     struct inode *inode = page->mapping->host ;
     loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-    int ret ; 
+    int ret = 0;
+    struct reiserfs_transaction_handle *th = NULL;
     
     reiserfs_wait_on_write_block(inode->i_sb) ;
+    if (reiserfs_transaction_running(inode->i_sb)) {
+        th = current->journal_info;
+    }
+    reiserfs_commit_page(inode, page, from, to);
  
     /* generic_commit_write does this for us, but does not update the
     ** transaction tracking stuff when the size changes.  So, we have
     ** to do the i_size updates here.
     */
     if (pos > inode->i_size) {
-	struct reiserfs_transaction_handle th ;
+	struct reiserfs_transaction_handle myth ;
 	reiserfs_write_lock(inode->i_sb);
 	/* If the file have grown beyond the border where it
 	   can have a tail, unmark it as needing a tail
@@ -2255,16 +2298,19 @@ static int reiserfs_commit_write(struct file *f, struct page *page,
 	     (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
 	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
 
-	journal_begin(&th, inode->i_sb, 1) ;
+	journal_begin(&myth, inode->i_sb, 1) ;
 	reiserfs_update_inode_transaction(inode) ;
 	inode->i_size = pos ;
-	reiserfs_update_sd(&th, inode) ;
-	journal_end(&th, inode->i_sb, 1) ;
+	reiserfs_update_sd(&myth, inode) ;
+	journal_end(&myth, inode->i_sb, 1) ;
+	reiserfs_write_unlock(inode->i_sb);
+    }
+    if (th) {
+	reiserfs_write_lock(inode->i_sb);
+        reiserfs_end_persistent_transaction(th);
 	reiserfs_write_unlock(inode->i_sb);
     }
  
-    ret = generic_commit_write(f, page, from, to) ;
-
     /* we test for O_SYNC here so we can commit the transaction
     ** for any packed tails the file might have had
     */
@@ -2324,16 +2370,110 @@ void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
 	}
 }
 
+/* decide if this buffer needs to stay around for data logging or ordered
+** write purposes.  Returns 1 if the buffer can be dropped, 0 if it must stay.
+*/
+static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+{
+    int ret = 1 ;	/* default: the buffer can be dropped */
+    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
+
+    spin_lock(&j->j_dirty_buffers_lock) ;
+    if (!buffer_mapped(bh)) {
+        goto free_jh;
+    }
+    /* the page is locked, and the only places that log a data buffer
+     * also lock the page.
+     */
+#if 0 /* the data logging check below is compiled out */
+    if (reiserfs_file_data_log(inode)) {
+	/* very conservative, leave the buffer pinned if anyone might need it.
+	** this should be changed to drop the buffer if it is only in the
+	** current transaction
+	*/
+        if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
+	    ret = 0 ;
+	}
+    } else
+#endif
+    if (buffer_dirty(bh) || buffer_locked(bh)) {
+	struct reiserfs_journal_list *jl;
+	struct reiserfs_jh *jh = bh->b_private;
+
+	/* why is this safe?
+	 * reiserfs_setattr updates i_size in the on disk
+	 * stat data before allowing vmtruncate to be called.
+	 *
+	 * If buffer was put onto the ordered list for this
+	 * transaction, we know for sure either this transaction
+	 * or an older one already has updated i_size on disk,
+	 * and this ordered data won't be referenced in the file
+	 * if we crash.
+	 *
+	 * if the buffer was put onto the ordered list for an older
+	 * transaction, we need to leave it around
+	 */
+	if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
+	    ret = 0;	/* attached to a journal list other than the current one */
+    }
+free_jh:
+    if (ret && bh->b_private) {
+        reiserfs_free_jh(bh);	/* dropping the buffer: detach its jh first */
+    }
+    spin_unlock(&j->j_dirty_buffers_lock) ;
+    return ret ;
+}
+
+/* clm -- taken from fs/buffer.c:block_invalidate_page.  Returns 0 if any buffer at or past offset had to be kept. */
+static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
+{
+    struct buffer_head *head, *bh, *next;
+    struct inode *inode = page->mapping->host;
+    unsigned int curr_off = 0;	/* byte offset of bh within the page */
+    int ret = 1;	/* returned unchanged when the page has no buffers */
+
+    BUG_ON(!PageLocked(page));
+    if (!page_has_buffers(page))
+	goto out;
+
+    head = page_buffers(page);
+    bh = head;
+    do {
+	unsigned int next_off = curr_off + bh->b_size;
+	next = bh->b_this_page;	/* read the link before bh is possibly unmapped */
+
+	/*
+	 * is this block fully invalidated?
+	 */
+	if (offset <= curr_off) {
+	    if (invalidatepage_can_drop(inode, bh))
+		reiserfs_unmap_buffer(bh);
+	    else
+	        ret = 0;	/* at least one buffer has to stay */
+	}
+	curr_off = next_off;
+	bh = next;
+    } while (bh != head);
+
+    /*
+     * We release buffers only if the entire page is being invalidated.
+     * The get_block cached value has been unconditionally invalidated,
+     * so real IO is not possible anymore.
+     */
+    if (!offset && ret)
+	ret = try_to_release_page(page, 0);
+out:
+    return ret;
+}
+
 /*
  * Returns 1 if the page's buffers were dropped.  The page is locked.
  *
  * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
  * in the buffers at page_buffers(page).
  *
- * FIXME: Chris says the buffer list is not used with `mount -o notail',
- * so in that case the fs can avoid the extra locking.  Create a second
- * address_space_operations with a NULL ->releasepage and install that
- * into new address_spaces.
+ * even in -o notail mode, we can't be sure an old mount without -o notail
+ * didn't create files with tails.
  */
 static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
 {
@@ -2347,11 +2487,13 @@ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
     head = page_buffers(page) ;
     bh = head ;
     do {
-	if (!buffer_dirty(bh) && !buffer_locked(bh)) {
-		list_del_init(&bh->b_assoc_buffers) ;
-	} else {
+	if (bh->b_private) {
+	    if (!buffer_dirty(bh) && !buffer_locked(bh)) {
+		reiserfs_free_jh(bh);
+	    } else {
 		ret = 0 ;
 		break ;
+	    }
 	}
 	bh = bh->b_this_page ;
     } while (bh != head) ;
@@ -2379,6 +2521,7 @@ struct address_space_operations reiserfs_address_space_operations = {
     .readpage = reiserfs_readpage, 
     .readpages = reiserfs_readpages, 
     .releasepage = reiserfs_releasepage,
+    .invalidatepage = reiserfs_invalidatepage,
     .sync_page = block_sync_page,
     .prepare_write = reiserfs_prepare_write,
     .commit_write = reiserfs_commit_write,
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 9d7a018c366f..ec59e074416a 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -92,6 +92,7 @@ int reiserfs_unpack (struct inode * inode, struct file * filp)
     int retval = 0;
     int index ;
     struct page *page ;
+    struct address_space *mapping ;
     unsigned long write_from ;
     unsigned long blocksize = inode->i_sb->s_blocksize ;
     	
@@ -122,17 +123,19 @@ int reiserfs_unpack (struct inode * inode, struct file * filp)
     ** reiserfs_get_block to unpack the tail for us.
     */
     index = inode->i_size >> PAGE_CACHE_SHIFT ;
-    page = grab_cache_page(inode->i_mapping, index) ;
+    mapping = inode->i_mapping ;
+    page = grab_cache_page(mapping, index) ;
     retval = -ENOMEM;
     if (!page) {
         goto out ;
     }
-    retval = reiserfs_prepare_write(NULL, page, write_from, blocksize) ;
+    retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ;
     if (retval)
         goto out_unlock ;
 
     /* conversion can change page contents, must flush */
     flush_dcache_page(page) ;
+    retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ;
     REISERFS_I(inode)->i_flags |= i_nopack_mask;
 
 out_unlock:
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index cfff6ec0871f..17278f415916 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -54,6 +54,7 @@
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 
 
 /* gets a struct reiserfs_journal_list * from a list head */
@@ -595,6 +596,248 @@ static int journal_list_still_alive(struct super_block *s,
     return 0;
 }
 
+static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { /* b_end_io handler installed by submit_logged_buffer() */
+    char b[BDEVNAME_SIZE];	/* scratch space for bdevname() */
+
+    if (buffer_journaled(bh)) {
+        reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n",
+	                 bh->b_blocknr, bdevname(bh->b_bdev, b)) ;
+    }
+    if (uptodate)
+    	set_buffer_uptodate(bh) ;
+    else
+    	clear_buffer_uptodate(bh) ;	/* uptodate == 0: record the failure on the buffer */
+    unlock_buffer(bh) ;
+    put_bh(bh) ;	/* pairs with get_bh() in submit_logged_buffer() */
+}
+
+static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) { /* b_end_io handler installed by submit_ordered_buffer() */
+    if (uptodate)
+    	set_buffer_uptodate(bh) ;
+    else
+    	clear_buffer_uptodate(bh) ;	/* write_ordered_buffers() turns a !uptodate buffer into -EIO */
+    unlock_buffer(bh) ;
+    put_bh(bh) ;	/* pairs with get_bh() in submit_ordered_buffer() */
+}
+
+static void submit_logged_buffer(struct buffer_head *bh) { /* submit one logged buffer as a WRITE; BUG()s unless BH_JTest was set and bh is uptodate */
+    get_bh(bh) ;	/* released by reiserfs_end_buffer_io_sync() */
+    bh->b_end_io = reiserfs_end_buffer_io_sync ;
+    mark_buffer_notjournal_new(bh) ;
+    clear_buffer_dirty(bh) ;
+    if (!test_and_clear_bit(BH_JTest, &bh->b_state))
+        BUG();
+    if (!buffer_uptodate(bh))
+        BUG();
+    submit_bh(WRITE, bh) ;
+}
+
+static void submit_ordered_buffer(struct buffer_head *bh) { /* submit one ordered-data buffer as a WRITE; BUG()s if bh is not uptodate */
+    get_bh(bh) ;	/* released by reiserfs_end_ordered_io() */
+    bh->b_end_io = reiserfs_end_ordered_io;
+    clear_buffer_dirty(bh) ;
+    if (!buffer_uptodate(bh))
+        BUG();
+    submit_bh(WRITE, bh) ;
+}
+
+#define CHUNK_SIZE 32 /* capacity of one buffer_chunk */
+struct buffer_chunk { /* small batch of buffers collected by add_to_chunk() */
+    struct buffer_head *bh[CHUNK_SIZE];
+    int nr;	/* number of slots of bh[] currently in use */
+};
+
+static void write_chunk(struct buffer_chunk *chunk) { /* submit every queued buffer as a logged buffer, then empty the chunk */
+    int i;
+    for (i = 0; i < chunk->nr ; i++) {
+	submit_logged_buffer(chunk->bh[i]) ;
+    }
+    chunk->nr = 0;
+}
+
+static void write_ordered_chunk(struct buffer_chunk *chunk) { /* submit every queued buffer as an ordered buffer, then empty the chunk */
+    int i;
+    for (i = 0; i < chunk->nr ; i++) {
+	submit_ordered_buffer(chunk->bh[i]) ;
+    }
+    chunk->nr = 0;
+}
+
+static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
+			 spinlock_t *lock,
+			 void (fn)(struct buffer_chunk *))
+{ /* queue bh; when the chunk fills up, flush it through fn and return 1, otherwise return 0 */
+    int ret = 0;
+    if (chunk->nr >= CHUNK_SIZE)
+        BUG();	/* callers must never hand us an already full chunk */
+    chunk->bh[chunk->nr++] = bh;
+    if (chunk->nr >= CHUNK_SIZE) {
+	ret = 1;
+        if (lock)
+	    spin_unlock(lock);	/* lock may be NULL; when given it is dropped around fn() */
+        fn(chunk);
+        if (lock)
+	    spin_lock(lock);
+    }
+    return ret;
+}
+
+
+atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); /* count of reiserfs_jh currently allocated; see reiserfs_free_jh() */
+static struct reiserfs_jh *alloc_jh(void) { /* never returns NULL: retries kmalloc() until it succeeds */
+    struct reiserfs_jh *jh;
+    while(1) {
+	jh = kmalloc(sizeof(*jh), GFP_NOFS);
+	if (jh) {
+	    atomic_inc(&nr_reiserfs_jh);
+	    return jh;
+	}
+        yield();	/* allocation failed: give up the CPU, then try again */
+    }
+}
+
+/*
+ * we want to free the jh when the buffer has been written
+ * and waited on.  Does nothing when bh->b_private is NULL.
+ */
+void reiserfs_free_jh(struct buffer_head *bh) {
+    struct reiserfs_jh *jh;
+
+    jh = bh->b_private;
+    if (jh) {
+	bh->b_private = NULL;
+	jh->bh = NULL;
+	list_del_init(&jh->list);	/* unlink from whichever list the jh is on */
+	kfree(jh);
+	if (atomic_read(&nr_reiserfs_jh) <= 0)
+	    BUG();	/* counter would underflow: more frees than alloc_jh() calls */
+	atomic_dec(&nr_reiserfs_jh);
+	put_bh(bh);	/* pairs with get_bh() in __add_jh() */
+    }
+}
+
+static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
+                           int tail)
+{ /* attach bh to the current journal list: j_tail_bh_list if tail, else j_bh_list; always returns 0 */
+    struct reiserfs_jh *jh;
+
+    if (bh->b_private) {
+	spin_lock(&j->j_dirty_buffers_lock);
+	if (!bh->b_private) {	/* re-check under the lock: the jh went away meanwhile */
+	    spin_unlock(&j->j_dirty_buffers_lock);
+	    goto no_jh;
+	}
+        jh = bh->b_private;
+	list_del_init(&jh->list);	/* reuse the existing jh: take it off its old list */
+    } else {
+no_jh:
+	get_bh(bh);	/* reference held for as long as the jh is attached */
+	jh = alloc_jh();
+	spin_lock(&j->j_dirty_buffers_lock);
+	/* buffer must be locked for __add_jh, should be able to have
+	 * two adds at the same time (NOTE(review): likely meant "should not", see BUG() below)
+	 */
+	if (bh->b_private)
+	    BUG();
+	jh->bh = bh;
+	bh->b_private = jh;
+    }
+    jh->jl = j->j_current_jl;
+    if (tail)
+	list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
+    else {
+	list_add_tail(&jh->list, &jh->jl->j_bh_list);
+    }
+    spin_unlock(&j->j_dirty_buffers_lock);	/* taken on both branches above */
+    return 0;
+}
+
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) { /* put bh on the current journal list's j_tail_bh_list */
+    return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
+}
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) { /* put bh on the current journal list's j_bh_list */
+    return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
+}
+
+#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) /* list_head embedded in a reiserfs_jh -> the jh itself */
+static int write_ordered_buffers(spinlock_t *lock,
+				 struct reiserfs_journal *j,
+                                 struct reiserfs_journal_list *jl,
+				 struct list_head *list)
+{ /* write out the dirty buffers on list, wait for them; returns 0, or -EIO if one is not uptodate afterwards */
+    struct buffer_head *bh;
+    struct reiserfs_jh *jh;
+    int ret = 0;
+    struct buffer_chunk chunk;
+    struct list_head tmp;	/* buffers that were submitted or are still in flight */
+    INIT_LIST_HEAD(&tmp);
+
+    chunk.nr = 0;
+    spin_lock(lock);
+    while(!list_empty(list)) {
+        jh = JH_ENTRY(list->next);
+	bh = jh->bh;
+	get_bh(bh);
+	if (test_set_buffer_locked(bh)) {	/* somebody else holds the buffer lock */
+	    if (!buffer_dirty(bh)) {
+		list_del_init(&jh->list);
+		list_add(&jh->list, &tmp);	/* clean but locked: just wait for it in the second loop */
+		goto loop_next;
+	    }
+	    spin_unlock(lock);
+	    if (chunk.nr)
+		write_ordered_chunk(&chunk);	/* flush what we queued before sleeping */
+	    wait_on_buffer(bh);
+	    if (need_resched())	/* call it: a bare function name is always true */
+	        schedule();
+	    spin_lock(lock);
+	    goto loop_next;	/* jh is still on list, so it is looked at again */
+	}
+	if (buffer_dirty(bh)) {
+	    list_del_init(&jh->list);
+	    list_add(&jh->list, &tmp);
+	    add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
+	} else {
+	    reiserfs_free_jh(bh);	/* already clean: nothing to write */
+	    unlock_buffer(bh);
+	}
+loop_next:
+	put_bh(bh);
+	if (chunk.nr == 0 && need_resched()) {	/* only reschedule with an empty chunk */
+	    spin_unlock(lock);
+	    schedule();
+	    spin_lock(lock);
+	}
+    }
+    if (chunk.nr) {
+	spin_unlock(lock);
+        write_ordered_chunk(&chunk);	/* submit the final, partially filled chunk */
+	spin_lock(lock);
+    }
+    while(!list_empty(&tmp)) {
+        jh = JH_ENTRY(tmp.prev);
+	bh = jh->bh;
+	get_bh(bh);	/* keep bh alive across reiserfs_free_jh(), which drops a reference */
+	reiserfs_free_jh(bh);
+
+	if (buffer_locked(bh)) {
+	    spin_unlock(lock);
+	    wait_on_buffer(bh);
+	    spin_lock(lock);
+	}
+	if (!buffer_uptodate(bh))
+	    ret = -EIO;	/* the end_io handler cleared uptodate: the write failed */
+	put_bh(bh);
+	if (need_resched()) {
+	    spin_unlock(lock);
+	    schedule();
+	    spin_lock(lock);
+	}
+    }
+    spin_unlock(lock);
+    return ret;
+}
+
 static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
     struct reiserfs_journal_list *other_jl;
     struct reiserfs_journal_list *first_jl;
@@ -656,6 +899,13 @@ find_first:
     }
     return 0;
 }
+int reiserfs_async_progress_wait(struct super_block *s) { /* if j_async_throttle is non-zero, call blk_congestion_wait(); always returns 0 */
+    DEFINE_WAIT(wait);	/* NOTE(review): 'wait' is not referenced below */
+    struct reiserfs_journal *j = SB_JOURNAL(s);
+    if (atomic_read(&j->j_async_throttle))
+    	blk_congestion_wait(WRITE, HZ/10);
+    return 0;
+}
 
 /*
 ** if this journal list still has commit blocks unflushed, send them to disk.
@@ -710,28 +960,40 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
     goto put_jl;
   }
 
+  if (!list_empty(&jl->j_bh_list)) {
+      unlock_kernel();
+      write_ordered_buffers(&SB_JOURNAL(s)->j_dirty_buffers_lock,
+                            SB_JOURNAL(s), jl, &jl->j_bh_list);
+      lock_kernel();
+  }
+  if (!list_empty(&jl->j_bh_list))
+      BUG();
   /*
    * for the description block and all the log blocks, submit any buffers
    * that haven't already reached the disk
    */
+  atomic_inc(&SB_JOURNAL(s)->j_async_throttle);
   for (i = 0 ; i < (jl->j_len + 1) ; i++) {
     bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %
          SB_ONDISK_JOURNAL_SIZE(s);
     tbh = journal_find_get_block(s, bn) ;
-    wait_on_buffer(tbh) ;
-    ll_rw_block(WRITE, 1, &tbh) ;
+    if (buffer_dirty(tbh))
+	ll_rw_block(WRITE, 1, &tbh) ;
     put_bh(tbh) ;
   }
+  atomic_dec(&SB_JOURNAL(s)->j_async_throttle);
 
   /* wait on everything written so far before writing the commit */
   for (i = 0 ;  i < (jl->j_len + 1) ; i++) {
     bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
 	 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
     tbh = journal_find_get_block(s, bn) ;
-
     wait_on_buffer(tbh) ;
+    // since we're using ll_rw_blk above, it might have skipped over
+    // a locked buffer.  Double check here
+    //
     if (buffer_dirty(tbh))
-      BUG();
+      sync_dirty_buffer(tbh);
     if (!buffer_uptodate(tbh)) {
       reiserfs_panic(s, "journal-601, buffer write failed\n") ;
     }
@@ -892,33 +1154,6 @@ restart:
     return 0 ;
 }
 
-static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
-    char b[BDEVNAME_SIZE];
-
-    if (buffer_journaled(bh)) {
-        reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n",
-	                 bh->b_blocknr, bdevname(bh->b_bdev, b)) ;
-    }
-    if (uptodate)
-    	set_buffer_uptodate(bh) ;
-    else
-    	clear_buffer_uptodate(bh) ;
-    unlock_buffer(bh) ;
-    put_bh(bh) ;
-}
-
-static void submit_logged_buffer(struct buffer_head *bh) {
-    get_bh(bh) ;
-    bh->b_end_io = reiserfs_end_buffer_io_sync ;
-    mark_buffer_notjournal_new(bh) ;
-    clear_buffer_dirty(bh) ;
-    if (!test_and_clear_bit(BH_JTest, &bh->b_state))
-        BUG();
-    if (!buffer_uptodate(bh))
-        BUG();
-    submit_bh(WRITE, bh) ;
-}
-
 static void del_from_work_list(struct super_block *s,
                                struct reiserfs_journal_list *jl) {
     if (!list_empty(&jl->j_working_list)) {
@@ -1158,28 +1393,6 @@ flush_older_and_return:
   return 0 ;
 } 
 
-#define CHUNK_SIZE 32
-struct buffer_chunk {
-    struct buffer_head *bh[CHUNK_SIZE];
-    int nr;
-};
-
-static void write_chunk(struct buffer_chunk *chunk) {
-    int i;
-    for (i = 0; i < chunk->nr ; i++) {
-	submit_logged_buffer(chunk->bh[i]) ;
-    }
-    chunk->nr = 0;
-}
-
-static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) {
-    if (chunk->nr >= CHUNK_SIZE)
-        BUG();
-    chunk->bh[chunk->nr++] = bh;
-    if (chunk->nr >= CHUNK_SIZE)
-        write_chunk(chunk);
-}
-
 static int write_one_transaction(struct super_block *s,
                                  struct reiserfs_journal_list *jl,
 				 struct buffer_chunk *chunk)
@@ -1214,7 +1427,7 @@ static int write_one_transaction(struct super_block *s,
 		if (!buffer_journal_dirty(tmp_bh) ||
 		    reiserfs_buffer_prepared(tmp_bh))
 		    BUG();
-		add_to_chunk(chunk, tmp_bh);
+		add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
 		ret++;
 	    } else {
 		/* note, cn->bh might be null now */
@@ -1937,6 +2150,8 @@ retry:
     memset(jl, 0, sizeof(*jl));
     INIT_LIST_HEAD(&jl->j_list);
     INIT_LIST_HEAD(&jl->j_working_list);
+    INIT_LIST_HEAD(&jl->j_tail_bh_list);
+    INIT_LIST_HEAD(&jl->j_bh_list);
     sema_init(&jl->j_commit_lock, 1);
     SB_JOURNAL(s)->j_num_lists++;
     get_journal_list(jl);
@@ -2166,6 +2381,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   SB_JOURNAL(p_s_sb)->j_len = 0 ;
   SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
   atomic_set(&(SB_JOURNAL(p_s_sb)->j_wcount), 0) ;
+  atomic_set(&(SB_JOURNAL(p_s_sb)->j_async_throttle), 0) ;
   SB_JOURNAL(p_s_sb)->j_bcount = 0 ;	  
   SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;	  
   SB_JOURNAL(p_s_sb)->j_last = NULL ;	  
@@ -2376,6 +2592,43 @@ relock:
   return 0 ;
 }
 
+struct reiserfs_transaction_handle *
+reiserfs_persistent_transaction(struct super_block *s, int nblocks) { /* returns a handle, or NULL if allocation or journal_begin() fails */
+    int ret ;
+    struct reiserfs_transaction_handle *th ;
+
+    /* if we're nesting into an existing transaction.  It will be
+    ** persistent on its own
+    */
+    if (reiserfs_transaction_running(s)) {
+        th = current->journal_info ;
+	th->t_refcount++ ;	/* share the running handle instead of allocating one */
+	if (th->t_refcount < 2) {
+	    BUG() ;
+	}
+	return th ;
+    }
+    th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
+    if (!th)
+       return NULL;
+    ret = journal_begin(th, s, nblocks) ;
+    if (ret) {
+	reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;	/* do not leak the handle on failure */
+        return NULL;
+    }
+    return th ;
+}
+
+int
+reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) { /* journal_end() the handle and free it once t_refcount is 0; returns journal_end()'s result */
+    struct super_block *s = th->t_super;	/* read before journal_end() runs */
+    int ret;
+    ret = journal_end(th, th->t_super, th->t_blocks_allocated);
+    if (th->t_refcount == 0)
+	reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+    return ret;
+}
+
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
   struct reiserfs_transaction_handle *cur_th = current->journal_info;
 
@@ -2522,7 +2775,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
 int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
   if (!current->journal_info && th->t_refcount > 1)
     printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount);
-  if (th->t_refcount > 1) {
+
+  th->t_refcount--;
+  if (th->t_refcount > 0) {
     struct reiserfs_transaction_handle *cur_th = current->journal_info ;
 
     /* we aren't allowed to close a nested transaction on a different
@@ -2531,7 +2786,6 @@ int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_
     if (cur_th->t_super != th->t_super)
       BUG() ;
 
-    th->t_refcount--;
     if (th != cur_th) {
       memcpy(current->journal_info, th, sizeof(*th));
       th->t_trans_id = 0;
@@ -2648,14 +2902,7 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block
 }
 
 /*
-** used to get memory back from async commits that are floating around
-** and to reclaim any blocks deleted but unusable because their commits
-** haven't hit disk yet.  called from bitmap.c
-**
-** if it starts flushing things, it ors SCHEDULE_OCCURRED into repeat.
-** note, this is just if schedule has a chance of occurring.  I need to 
-** change flush_commit_lists to have a repeat parameter too.
-**
+** writeback the pending async commits to disk
 */
 static void flush_async_commits(void *p) {
   struct super_block *p_s_sb = p;
@@ -2670,6 +2917,9 @@ static void flush_async_commits(void *p) {
       flush_commit_list(p_s_sb, jl, 1);
   }
   unlock_kernel();
+  atomic_inc(&SB_JOURNAL(p_s_sb)->j_async_throttle);
+  filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping);
+  atomic_dec(&SB_JOURNAL(p_s_sb)->j_async_throttle);
 }
 
 /*
@@ -3072,6 +3322,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
     p_s_sb->s_dirt = 1;
     wake_queued_writers(p_s_sb);
+    reiserfs_async_progress_wait(p_s_sb);
     goto out ;
   }
 
@@ -3248,23 +3499,38 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ;
   init_journal_hash(p_s_sb) ; 
 
+  // make sure reiserfs_add_jh sees the new current_jl before we
+  // write out the tails
+  smp_mb();
+
   /* tail conversion targets have to hit the disk before we end the
    * transaction.  Otherwise a later transaction might repack the tail
    * before this transaction commits, leaving the data block unflushed and
    * clean, if we crash before the later transaction commits, the data block
    * is lost.
    */
-  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock),
-		     &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+  if (!list_empty(&jl->j_tail_bh_list)) {
+      unlock_kernel();
+      write_ordered_buffers(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock,
+			    SB_JOURNAL(p_s_sb), jl, &jl->j_tail_bh_list);
+      lock_kernel();
+  }
+  if (!list_empty(&jl->j_tail_bh_list))
+      BUG();
   up(&jl->j_commit_lock);
 
   /* honor the flush wishes from the caller, simple commits can
   ** be done outside the journal lock, they are done below
+  **
+  ** if we don't flush the commit list right now, we put it into
+  ** the work queue so the people waiting on the async progress work
+  ** queue don't wait for this proc to flush journal lists and such.
   */
   if (flush) {
     flush_commit_list(p_s_sb, jl, 1) ;
     flush_journal_list(p_s_sb, jl, 1) ;
-  }
+  } else
+    queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work);
 
 
   /* if the next transaction has any chance of wrapping, flush 
@@ -3322,15 +3588,12 @@ first_jl:
   clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state);
   wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
 
-  if (!flush) {
-      if (wait_on_commit) {
-	  if (journal_list_still_alive(p_s_sb, commit_trans_id))
-	      flush_commit_list(p_s_sb, jl, 1) ;
-      } else {
-          queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work);
-      }
+  if (!flush && wait_on_commit &&
+      journal_list_still_alive(p_s_sb, commit_trans_id)) {
+	  flush_commit_list(p_s_sb, jl, 1) ;
   }
 out:
   reiserfs_check_lock_depth("journal end2");
+  th->t_trans_id = 0;
   return 0 ;
 }
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f75349fe4787..57991831eeef 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -510,6 +510,14 @@ typedef struct {
 		    applied BEFORE setmask */
 } opt_desc_t;
 
+/* possible values for -o data=; each entry names one mode bit and lists the other two mode bits */
+static const arg_desc_t logging_mode[] = { /* NOTE(review): 2nd/3rd fields look like set-mask / clear-mask of arg_desc_t - confirm */
+    {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)},
+    {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)},
+    {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)},
+    {NULL, 0}	/* terminator entry */
+};
+
 /* possible values for "-o block-allocator=" and bits which are to be set in
    s_mount_opt of reiserfs specific part of in-core super block */
 static const arg_desc_t balloc[] = {
@@ -664,6 +672,7 @@ static int reiserfs_parse_options (struct super_block * s, char * options, /* st
 	{"nolog", 0, 0, 0, 0}, /* This is unsupported */
 	{"replayonly", 0, 0, 1<<REPLAYONLY, 0},
 	{"block-allocator", 'a', balloc, 0, 0},
+	{"data", 'd', logging_mode, 0, 0},
 	{"resize", 'r', 0, 0, 0},
 	{"jdev", 'j', 0, 0, 0},
 	{"nolargeio", 'w', 0, 0, 0},
@@ -737,6 +746,33 @@ static int reiserfs_parse_options (struct super_block * s, char * options, /* st
     return 1;
 }
 
+static void switch_data_mode(struct super_block *s, unsigned long mode) {
+    REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
+                                       (1 << REISERFS_DATA_ORDERED) |
+				       (1 << REISERFS_DATA_WRITEBACK));
+    REISERFS_SB(s)->s_mount_opt |= (1 << mode);
+}
+
+static void handle_data_mode(struct super_block *s, unsigned long mount_options)
+{
+    if (mount_options & (1 << REISERFS_DATA_LOG)) {
+        if (!reiserfs_data_log(s)) {
+	    switch_data_mode(s, REISERFS_DATA_LOG);
+	    printk("reiserfs: switching to journaled data mode\n");
+	}
+    } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
+        if (!reiserfs_data_ordered(s)) {
+	    switch_data_mode(s, REISERFS_DATA_ORDERED);
+	    printk("reiserfs: switching to ordered data mode\n");
+	}
+    } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
+        if (!reiserfs_data_writeback(s)) {
+	    switch_data_mode(s, REISERFS_DATA_WRITEBACK);
+	    printk("reiserfs: switching to writeback data mode\n");
+	}
+    }
+}
+
 static void handle_attrs( struct super_block *s )
 {
 	struct reiserfs_super_block * rs;
@@ -814,6 +850,7 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
     if (!(s->s_flags & MS_RDONLY))
 	return 0; /* We are read-write already */
 
+    handle_data_mode(s, mount_options);
     REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ;
     s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */
     journal_begin(&th, s, 10) ;
@@ -1306,6 +1343,21 @@ static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
     SPRINTK(silent, "reiserfs:warning: - it is slow mode for debugging.\n");
 #endif
 
+    /* make data=ordered the default */
+    if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
+        !reiserfs_data_writeback(s))
+    {
+         REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
+    }
+
+    if (reiserfs_data_log(s)) {
+        printk("reiserfs: using journaled data mode\n");
+    } else if (reiserfs_data_ordered(s)) {
+        printk("reiserfs: using ordered data mode\n");
+    } else {
+        printk("reiserfs: using writeback data mode\n");
+    }
+
     // set_device_ro(s->s_dev, 1) ;
     if( journal_init(s, jdev_name, old_format, commit_max_age) ) {
 	SPRINTK(silent, "sh-2022: reiserfs_fill_super: unable to initialize journal space\n") ;
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index fb0bf2af7fd7..31e8047f0f41 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1707,6 +1707,15 @@ struct reiserfs_journal_header {
 #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 
+enum reiserfs_bh_state_bits {
+    BH_JDirty = BH_PrivateStart,
+    BH_JDirty_wait,
+    BH_JNew,
+    BH_JPrepared,
+    BH_JRestore_dirty,
+    BH_JTest, // debugging only will go away
+};
+
 /*
 ** transaction handle which is passed around for all journal calls
 */
@@ -1726,7 +1735,36 @@ struct reiserfs_transaction_handle {
 				   should be displaced from others */
 } ;
 
+/* used to keep track of ordered and tail writes, attached to the buffer
+ * head through b_journal_head.
+ */
+struct reiserfs_jh {
+    struct reiserfs_journal_list *jl;
+    struct buffer_head *bh;
+    struct list_head list;
+};
+
+void reiserfs_free_jh(struct buffer_head *bh);
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
 int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
+
+static inline int reiserfs_transaction_running(struct super_block *s) {
+    struct reiserfs_transaction_handle *th = current->journal_info ;
+    if (th && th->t_super == s)
+        return 1 ;
+    if (th && th->t_super == NULL)
+        BUG();
+    return 0 ;
+}
+
+int reiserfs_async_progress_wait(struct super_block *s);
+
+struct reiserfs_transaction_handle *
+reiserfs_persistent_transaction(struct super_block *, int count);
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+		unsigned from, unsigned to);
 int reiserfs_flush_old_commits(struct super_block *);
 void reiserfs_commit_for_inode(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
@@ -1741,7 +1779,6 @@ int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
 int journal_release_error(struct reiserfs_transaction_handle*, struct super_block *) ;
 int journal_end(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
 int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
-int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
 int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr) ;
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
@@ -1749,11 +1786,6 @@ int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_
 
 int buffer_journaled(const struct buffer_head *bh) ;
 int mark_buffer_journal_new(struct buffer_head *bh) ;
-int reiserfs_add_page_to_flush_list(struct reiserfs_transaction_handle *,
-                                    struct inode *, struct buffer_head *) ;
-int reiserfs_remove_page_from_flush_list(struct reiserfs_transaction_handle *,
-                                         struct inode *) ;
-
 int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, int) ;
 
 				/* why is this kerplunked right here? */
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index e1fe3ebe33c0..3248dcf369f2 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -107,21 +107,6 @@ typedef enum {
 #define JOURNAL_HASH_SIZE 8192   
 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating.  Must be >= 2 */
 
-/* these are bh_state bit flag offset numbers, for use in the buffer head */
-
-#define BH_JDirty       16      /* journal data needs to be written before buffer can be marked dirty */
-#define BH_JDirty_wait 18	/* commit is done, buffer marked dirty */
-#define BH_JNew 19		/* buffer allocated during this transaction, no need to write if freed during this trans too */
-
-/* ugly.  metadata blocks must be prepared before they can be logged.  
-** prepared means unlocked and cleaned.  If the block is prepared, but not
-** logged for some reason, any bits cleared while preparing it must be 
-** set again.
-*/
-#define BH_JPrepared 20		/* block has been prepared for the log */
-#define BH_JRestore_dirty 22    /* restore the dirty bit later */
-#define BH_JTest 23             /* debugging use only */
-
 /* One of these for every block in every transaction
 ** Each one is in two hash tables.  First, a hash of the current transaction, and after journal_end, a
 ** hash of all the in memory transactions.
@@ -178,6 +163,11 @@ struct reiserfs_journal_list {
 
   /* time ordered list of all transactions we haven't tried to flush yet */
   struct list_head j_working_list;
+
+  /* list of tail conversion targets in need of flush before commit */
+  struct list_head j_tail_bh_list;
+  /* list of data=ordered buffers in need of flush before commit */
+  struct list_head j_bh_list;
   int j_refcount;
 } ;
 
@@ -253,7 +243,9 @@ struct reiserfs_journal {
   unsigned long j_max_trans_size ;
   unsigned long j_max_batch_size ;
 
+  /* when flushing ordered buffers, throttle new ordered writers */
   struct work_struct j_work;
+  atomic_t j_async_throttle;
 };
 
 #define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick.  magic string to find desc blocks in the journal */
@@ -408,11 +400,12 @@ struct reiserfs_sb_info
 #define REISERFS_3_5 0
 #define REISERFS_3_6 1
 
+enum reiserfs_mount_options {
 /* Mount options */
-#define REISERFS_LARGETAIL 0  /* large tails will be created in a session */
-#define REISERFS_SMALLTAIL 17  /* small (for files less than block size) tails will be created in a session */
-#define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
-#define REISERFS_CONVERT 5    /* -o conv: causes conversion of old
+    REISERFS_LARGETAIL,  /* large tails will be created in a session */
+    REISERFS_SMALLTAIL,  /* small (for files less than block size) tails will be created in a session */
+    REPLAYONLY, /* replay journal and return 0. Use by fsck */
+    REISERFS_CONVERT,    /* -o conv: causes conversion of old
                                  format super block to the new
                                  format. If not specified - old
                                  partition will be dealt with in a
@@ -426,26 +419,29 @@ struct reiserfs_sb_info
 ** the existing hash on the FS, so if you have a tea hash disk, and mount
 ** with -o hash=rupasov, the mount will fail.
 */
-#define FORCE_TEA_HASH 6      /* try to force tea hash on mount */
-#define FORCE_RUPASOV_HASH 7  /* try to force rupasov hash on mount */
-#define FORCE_R5_HASH 8       /* try to force rupasov hash on mount */
-#define FORCE_HASH_DETECT 9   /* try to detect hash function on mount */
+    FORCE_TEA_HASH,      /* try to force tea hash on mount */
+    FORCE_RUPASOV_HASH,  /* try to force rupasov hash on mount */
+    FORCE_R5_HASH,       /* try to force r5 hash on mount */
+    FORCE_HASH_DETECT,   /* try to detect hash function on mount */
 
+    REISERFS_DATA_LOG,
+    REISERFS_DATA_ORDERED,
+    REISERFS_DATA_WRITEBACK,
 
 /* used for testing experimental features, makes benchmarking new
    features with and without more convenient, should never be used by
    users in any code shipped to users (ideally) */
 
-#define REISERFS_NO_BORDER 11
-#define REISERFS_NO_UNHASHED_RELOCATION 12
-#define REISERFS_HASHED_RELOCATION 13
-
-#define REISERFS_ATTRS 15
+    REISERFS_NO_BORDER,
+    REISERFS_NO_UNHASHED_RELOCATION,
+    REISERFS_HASHED_RELOCATION,
+    REISERFS_ATTRS,
 
-#define REISERFS_TEST1 11
-#define REISERFS_TEST2 12
-#define REISERFS_TEST3 13
-#define REISERFS_TEST4 14 
+    REISERFS_TEST1,
+    REISERFS_TEST2,
+    REISERFS_TEST3,
+    REISERFS_TEST4,
+};
 
 #define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
 #define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
@@ -459,11 +455,12 @@ struct reiserfs_sb_info
 #define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
 #define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
 #define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
-#define reiserfs_dont_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NOLOG))
 #define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
 #define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
 #define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
-
+#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
+#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
+#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
 
 void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
-- 
cgit v1.2.3


From b566678f923387aa0cf3ec6d56ff368c3053ea0f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:28:19 -0700
Subject: [PATCH] reiserfs_kfree warning fix

fs/reiserfs/journal.c: In function `reiserfs_end_persistent_transaction':
fs/reiserfs/journal.c:2616: warning: unused variable `s'

Make the functions static inline so that typechecking is enabled if
!CONFIG_REISERFS_CHECK.
---
 include/linux/reiserfs_fs.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 31e8047f0f41..dfb46b513712 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -2028,8 +2028,17 @@ extern struct address_space_operations reiserfs_address_space_operations ;
 void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s);
 void reiserfs_kfree (const void * vp, size_t size, struct super_block * s);
 #else
-#define reiserfs_kmalloc(x, y, z) kmalloc(x, y)
-#define reiserfs_kfree(x, y, z) kfree(x)
+static inline void *reiserfs_kmalloc(size_t size, int flags,
+					struct super_block *s)
+{
+	return kmalloc(size, flags);
+}
+
+static inline void reiserfs_kfree(const void *vp, size_t size,
+					struct super_block *s)
+{
+	kfree(vp);
+}
 #endif
 
 int fix_nodes (int n_op_mode, struct tree_balance * p_s_tb, 
-- 
cgit v1.2.3


From f85a96f63f300878dcc785cf2333cab15eef48f0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:29:12 -0700
Subject: [PATCH] Light-weight Auditing Framework

From: Rik Faith <faith@redhat.com>

This patch provides a low-overhead system-call auditing framework for Linux
that is usable by LSM components (e.g., SELinux).  This is an update of the
patch discussed in this thread:

    http://marc.theaimsgroup.com/?t=107815888100001&r=1&w=2

In brief, it provides for netlink-based logging of audit records that have
been generated in other parts of the kernel (e.g., SELinux) as well as the
ability to audit system calls, either independently (using simple
filtering) or as a complement to the audit record that another part of the
kernel generated.

The main goals were to provide system call auditing with 1) as low overhead
as possible, and 2) without duplicating functionality that is already
provided by SELinux (and/or other security infrastructures).  This
framework will work "stand-alone", but is not designed to provide, e.g.,
CAPP functionality without another security component in place.

This updated patch includes changes from feedback I have received,
including the ability to compile without CONFIG_NET (and better use of
tabs, so use -w if you diff against the older patch).

Please see http://people.redhat.com/faith/audit/ for an early example
user-space client (auditd-0.4.tar.gz) and instructions on how to try it.

My future intentions at the kernel level include improving filtering (e.g.,
syscall personality/exit codes) and syscall support for more architectures.
First, though, I'm going to work on documentation, a (real) audit daemon,
and patches for other user-space tools so that people can play with the
framework and understand how it can be used with and without SELinux.


Update:

Light-weight Auditing Framework receive filter fixes
From: Rik Faith <faith@redhat.com>

Since audit_receive_filter() is only called with audit_netlink_sem held, it
cannot race with either audit_del_rule() or audit_add_rule(), so the
list_for_each_entry_rcu()s may be replaced by list_for_each_entry()s, and
the rcu_read_{un,}lock()s removed.  A fix for this is part of the attached
patch.

Other features of the attached patch are:

1) generalized the ability to test for inequality

2) added syscall exit status reporting and testing

3) added ability to report and test first 4 syscall arguments (this adds
   a large amount of flexibility for little cost; not implemented or tested
   on ppc64)

4) added ability to report and test personality

User-space demo program enhanced for new fields and inequality testing:
http://people.redhat.com/faith/audit/auditd-0.5.tar.gz
---
 arch/i386/kernel/entry.S         |   6 +-
 arch/i386/kernel/ptrace.c        |  10 +
 arch/ppc64/kernel/entry.S        |  15 +-
 arch/ppc64/kernel/ptrace.c       |  29 +-
 arch/x86_64/ia32/ia32entry.S     |  18 +-
 arch/x86_64/kernel/entry.S       |  21 +-
 arch/x86_64/kernel/ptrace.c      |  30 +-
 fs/namei.c                       |  15 +-
 include/asm-i386/thread_info.h   |   6 +-
 include/asm-ppc64/thread_info.h  |   3 +
 include/asm-x86_64/thread_info.h |   5 +-
 include/linux/audit.h            | 211 +++++++++
 include/linux/fs.h               |  14 +-
 include/linux/netlink.h          |   1 +
 include/linux/sched.h            |   3 +
 init/Kconfig                     |  20 +
 kernel/Makefile                  |   2 +
 kernel/audit.c                   | 825 +++++++++++++++++++++++++++++++++++
 kernel/auditsc.c                 | 922 +++++++++++++++++++++++++++++++++++++++
 kernel/fork.c                    |  10 +-
 security/selinux/avc.c           | 168 ++++---
 security/selinux/include/avc.h   |   7 +-
 security/selinux/ss/services.c   |   2 +-
 23 files changed, 2199 insertions(+), 144 deletions(-)
 create mode 100644 include/linux/audit.h
 create mode 100644 kernel/audit.c
 create mode 100644 kernel/auditsc.c

(limited to 'include/linux')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 14e64d3ea25c..afa02ea3592c 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -264,7 +264,7 @@ sysenter_past_esp:
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 
-	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)
@@ -287,7 +287,7 @@ ENTRY(system_call)
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 					# system call tracing in operation
-	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 syscall_call:
 	call *sys_call_table(,%eax,4)
@@ -354,7 +354,7 @@ syscall_trace_entry:
 	# perform syscall exit tracing
 	ALIGN
 syscall_exit_work:
-	testb $_TIF_SYSCALL_TRACE, %cl
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT), %cl
 	jz work_pending
 	sti				# could let do_syscall_trace() call
 					# schedule() instead
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index a77061138b0c..9f9b32a3f228 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -14,6 +14,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -524,6 +525,15 @@ out:
 __attribute__((regparm(3)))
 void do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
+	if (unlikely(current->audit_context)) {
+		if (!entryexit)
+			audit_syscall_entry(current, regs->orig_eax,
+					    regs->ebx, regs->ecx,
+					    regs->edx, regs->esi);
+		else
+			audit_syscall_exit(current, regs->eax);
+	}
+
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return;
 	if (!(current->ptrace & PT_PTRACED))
diff --git a/arch/ppc64/kernel/entry.S b/arch/ppc64/kernel/entry.S
index 027967ba3ae4..4ad95bcc5b3e 100644
--- a/arch/ppc64/kernel/entry.S
+++ b/arch/ppc64/kernel/entry.S
@@ -95,7 +95,7 @@ _GLOBAL(DoSyscall)
 #endif /* SHOW_SYSCALLS */
 	clrrdi	r10,r1,THREAD_SHIFT
 	ld	r10,TI_FLAGS(r10)
-	andi.	r11,r10,_TIF_SYSCALL_TRACE
+	andi.	r11,r10,_TIF_SYSCALL_T_OR_A
 	bne-	50f
 	cmpli	0,r0,NR_syscalls
 	bge-	66f
@@ -151,7 +151,8 @@ _GLOBAL(ret_from_syscall_1)
 	b	22b
         
 /* Traced system call support */
-50:	bl	.do_syscall_trace
+50:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_syscall_trace_enter
 	ld	r0,GPR0(r1)	/* Restore original registers */
 	ld	r3,GPR3(r1)
 	ld	r4,GPR4(r1)
@@ -201,7 +202,7 @@ _GLOBAL(ret_from_syscall_2)
 	oris	r10,r10,0x1000
 	std	r10,_CCR(r1)
 60:	std	r3,GPR3(r1)	/* Update return value */
-	bl	.do_syscall_trace
+	bl	.do_syscall_trace_leave
 	b	.ret_from_except
 66:	li	r3,ENOSYS
 	b	57b
@@ -234,14 +235,14 @@ _GLOBAL(ppc64_rt_sigreturn)
 
 80:	clrrdi	r4,r1,THREAD_SHIFT
 	ld	r4,TI_FLAGS(r4)
-	andi.	r4,r4,_TIF_SYSCALL_TRACE
+	andi.	r4,r4,_TIF_SYSCALL_T_OR_A
 	bne-	81f
 	cmpi	0,r3,0
 	bge	.ret_from_except
 	b	.ret_from_syscall_1
 81:	cmpi	0,r3,0
 	blt	.ret_from_syscall_2
-	bl	.do_syscall_trace
+	bl	.do_syscall_trace_leave
 	b	.ret_from_except
 
 /*
@@ -352,9 +353,9 @@ _GLOBAL(ret_from_fork)
 	bl	.schedule_tail
 	clrrdi	r4,r1,THREAD_SHIFT
 	ld	r4,TI_FLAGS(r4)
-	andi.	r4,r4,_TIF_SYSCALL_TRACE
+	andi.	r4,r4,_TIF_SYSCALL_T_OR_A
 	beq+	.ret_from_except
-	bl	.do_syscall_trace
+	bl	.do_syscall_trace_leave
 	b	.ret_from_except
 
 _GLOBAL(ret_from_except)
diff --git a/arch/ppc64/kernel/ptrace.c b/arch/ppc64/kernel/ptrace.c
index 6bf102811810..6afe71a7d56c 100644
--- a/arch/ppc64/kernel/ptrace.c
+++ b/arch/ppc64/kernel/ptrace.c
@@ -26,6 +26,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -286,12 +287,8 @@ out:
 	return ret;
 }
 
-void do_syscall_trace(void)
+static void do_syscall_trace(void)
 {
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return;
-	if (!(current->ptrace & PT_PTRACED))
-		return;
 	/* the 0x80 provides a way for the tracing parent to distinguish
 	   between a syscall stop and SIGTRAP delivery */
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
@@ -307,3 +304,25 @@ void do_syscall_trace(void)
 		current->exit_code = 0;
 	}
 }
+
+void do_syscall_trace_enter(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(current, regs->gpr[0],
+				    regs->gpr[3], regs->gpr[4],
+				    regs->gpr[5], regs->gpr[6]);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+}
+
+void do_syscall_trace_leave(void)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, 0);	/* FIXME: pass pt_regs */
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+}
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index aea26e4a8405..4e7ab108e8ac 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -78,8 +78,8 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
-	bt  $TIF_SYSCALL_TRACE,threadinfo_flags(%r10)
-	jc  sysenter_tracesys
+	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	jnz  sysenter_tracesys
 sysenter_do_call:	
 	cmpl	$(IA32_NR_syscalls),%eax
 	jae	ia32_badsys
@@ -106,7 +106,7 @@ sysenter_tracesys:
 	CLEAR_RREGS
 	movq	$-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
-	call	syscall_trace
+	call	syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	movl	%ebp, %ebp
@@ -163,8 +163,8 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
-	bt  $TIF_SYSCALL_TRACE,threadinfo_flags(%r10)
-	jc  cstar_tracesys
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	jnz   cstar_tracesys
 cstar_do_call:	
 	cmpl $IA32_NR_syscalls,%eax
 	jae  ia32_badsys
@@ -187,7 +187,7 @@ cstar_tracesys:
 	CLEAR_RREGS
 	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace
+	call syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	movl RSP-ARGOFFSET(%rsp), %r8d
@@ -236,8 +236,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,0,1
 	GET_THREAD_INFO(%r10)
-	bt $TIF_SYSCALL_TRACE,threadinfo_flags(%r10)
-	jc ia32_tracesys
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	jnz ia32_tracesys
 ia32_do_syscall:	
 	cmpl $(IA32_NR_syscalls),%eax
 	jae  ia32_badsys
@@ -251,7 +251,7 @@ ia32_tracesys:
 	SAVE_REST
 	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace
+	call syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	jmp ia32_do_syscall
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index a6309212038d..89f74f738a2a 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -131,8 +131,8 @@ ENTRY(ret_from_fork)
 	CFI_DEFAULT_STACK
 	call schedule_tail
 	GET_THREAD_INFO(%rcx)
-	bt $TIF_SYSCALL_TRACE,threadinfo_flags(%rcx)
-	jc rff_trace
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	jnz rff_trace
 rff_action:	
 	RESTORE_REST
 	testl $3,CS-ARGOFFSET(%rsp)	# from kernel_thread?
@@ -143,7 +143,7 @@ rff_action:
 	jmp ret_from_sys_call
 rff_trace:
 	movq %rsp,%rdi
-	call syscall_trace
+	call syscall_trace_leave
 	GET_THREAD_INFO(%rcx)	
 	jmp rff_action
 	CFI_ENDPROC
@@ -185,8 +185,8 @@ ENTRY(system_call)
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
 	movq  %rcx,RIP-ARGOFFSET(%rsp)  
 	GET_THREAD_INFO(%rcx)
-	bt    $TIF_SYSCALL_TRACE,threadinfo_flags(%rcx) 
-	jc    tracesys
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
 	movq %r10,%rcx
@@ -244,7 +244,7 @@ tracesys:
 	movq $-ENOSYS,RAX(%rsp)
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
-	call syscall_trace
+	call syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	cmpq $__NR_syscall_max,%rax
@@ -254,7 +254,7 @@ tracesys:
 	movq %rax,RAX-ARGOFFSET(%rsp)
 1:	SAVE_REST
 	movq %rsp,%rdi
-	call syscall_trace
+	call syscall_trace_leave
 	RESTORE_TOP_OF_STACK %rbx
 	RESTORE_REST
 	jmp ret_from_sys_call
@@ -297,13 +297,14 @@ int_very_careful:
 	sti
 	SAVE_REST
 	/* Check for syscall exit trace */	
-	bt $TIF_SYSCALL_TRACE,%edx
-	jnc int_signal
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),%edx
+	jz int_signal
 	pushq %rdi
 	leaq 8(%rsp),%rdi	# &ptregs -> arg1	
-	call syscall_trace
+	call syscall_trace_leave
 	popq %rdi
 	btr  $TIF_SYSCALL_TRACE,%edi
+	btr  $TIF_SYSCALL_AUDIT,%edi
 	jmp int_restore_rest
 	
 int_signal:
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 8aad386340dc..a97bee6f29fa 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -16,6 +16,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -486,7 +487,7 @@ out:
 	return ret;
 }
 
-asmlinkage void syscall_trace(struct pt_regs *regs)
+static void syscall_trace(struct pt_regs *regs)
 {
 
 #if 0
@@ -496,11 +497,6 @@ asmlinkage void syscall_trace(struct pt_regs *regs)
 	       current_thread_info()->flags, current->ptrace); 
 #endif
 
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return; 
-	if (!(current->ptrace & PT_PTRACED))
-		return;
-	
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
 				? 0x80 : 0));
 	/*
@@ -513,3 +509,25 @@ asmlinkage void syscall_trace(struct pt_regs *regs)
 		current->exit_code = 0;
 	}
 }
+
+asmlinkage void syscall_trace_enter(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(current, regs->orig_rax,
+				    regs->rdi, regs->rsi,
+				    regs->rdx, regs->r10);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace(regs);
+}
+
+asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, regs->rax);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace(regs);
+}
diff --git a/fs/namei.c b/fs/namei.c
index e6320d133c5f..d2cab643cf64 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -26,6 +26,7 @@
 #include <linux/personality.h>
 #include <linux/security.h>
 #include <linux/mount.h>
+#include <linux/audit.h>
 #include <asm/namei.h>
 #include <asm/uaccess.h>
 
@@ -141,10 +142,12 @@ char * getname(const char __user * filename)
 
 		result = tmp;
 		if (retval < 0) {
-			putname(tmp);
+			__putname(tmp);
 			result = ERR_PTR(retval);
 		}
 	}
+	if (unlikely(current->audit_context) && !IS_ERR(result) && result)
+		audit_getname(result);
 	return result;
 }
 
@@ -860,6 +863,8 @@ walk_init_root(const char *name, struct nameidata *nd)
 
 int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
 {
+	int retval;
+
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
 
@@ -882,7 +887,13 @@ int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata
 	}
 	read_unlock(&current->fs->lock);
 	current->total_link_count = 0;
-	return link_path_walk(name, nd);
+	retval = link_path_walk(name, nd);
+	if (unlikely(current->audit_context
+		     && nd && nd->dentry && nd->dentry->d_inode))
+		audit_inode(name,
+			    nd->dentry->d_inode->i_ino,
+			    nd->dentry->d_inode->i_rdev);
+	return retval;
 }
 
 /*
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index da5c780f2c5c..6f59e1fe345b 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -151,6 +151,7 @@ static inline unsigned long current_stack_pointer(void)
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
+#define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -159,9 +160,12 @@ static inline unsigned long current_stack_pointer(void)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_IRET		(1<<TIF_IRET)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
-#define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
+/* work to do on interrupt/exception return */
+#define _TIF_WORK_MASK \
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
 
 /*
diff --git a/include/asm-ppc64/thread_info.h b/include/asm-ppc64/thread_info.h
index 5b74b149f04f..297c974bf220 100644
--- a/include/asm-ppc64/thread_info.h
+++ b/include/asm-ppc64/thread_info.h
@@ -97,6 +97,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_32BIT		5	/* 32 bit binary */
 #define TIF_RUN_LIGHT		6	/* iSeries run light */
 #define TIF_ABI_PENDING		7	/* 32/64 bit switch needed */
+#define TIF_SYSCALL_AUDIT	8	/* syscall auditing active */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_32BIT		(1<<TIF_32BIT)
 #define _TIF_RUN_LIGHT		(1<<TIF_RUN_LIGHT)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)
 
 #define _TIF_USER_WORK_MASK	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
 				 _TIF_NEED_RESCHED)
diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h
index 0145da994590..73e4fa13ed0c 100644
--- a/include/asm-x86_64/thread_info.h
+++ b/include/asm-x86_64/thread_info.h
@@ -101,6 +101,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
 #define TIF_IRET		5	/* force IRET */
+#define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
@@ -112,13 +113,15 @@ static inline struct thread_info *stack_thread_info(void)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_IRET		(1<<TIF_IRET)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
 
 /* work to do on interrupt/exception return */
-#define _TIF_WORK_MASK    (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SINGLESTEP))
+#define _TIF_WORK_MASK \
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP))
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK 0x0000FFFF	
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
new file mode 100644
index 000000000000..d766482451af
--- /dev/null
+++ b/include/linux/audit.h
@@ -0,0 +1,211 @@
+/* audit.h -- Auditing support -*- linux-c -*-
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ */
+
+#ifndef _LINUX_AUDIT_H_
+#define _LINUX_AUDIT_H_
+
+/* Request and reply types */
+#define AUDIT_GET      1000	/* Get status */
+#define AUDIT_SET      1001	/* Set status (enable/disable/auditd) */
+#define AUDIT_LIST     1002	/* List filtering rules */
+#define AUDIT_ADD      1003	/* Add filtering rule */
+#define AUDIT_DEL      1004	/* Delete filtering rule */
+#define AUDIT_USER     1005	/* Send a message from user-space */
+#define AUDIT_LOGIN    1006     /* Define the login id and information */
+#define AUDIT_KERNEL   2000	/* Asynchronous audit record. NOT A REQUEST. */
+
+/* Rule flags */
+#define AUDIT_PER_TASK 0x01	/* Apply rule at task creation (not syscall) */
+#define AUDIT_AT_ENTRY 0x02	/* Apply rule at syscall entry */
+#define AUDIT_AT_EXIT  0x04	/* Apply rule at syscall exit */
+#define AUDIT_PREPEND  0x10	/* Prepend to front of list */
+
+/* Rule actions */
+#define AUDIT_NEVER    0	/* Do not build context if rule matches */
+#define AUDIT_POSSIBLE 1	/* Build context if rule matches  */
+#define AUDIT_ALWAYS   2	/* Generate audit record if rule matches */
+
+/* Rule structure sizes -- if these change, different AUDIT_ADD and
+ * AUDIT_LIST commands must be implemented. */
+#define AUDIT_MAX_FIELDS   64
+#define AUDIT_BITMASK_SIZE 64
+#define AUDIT_WORD(nr) ((__u32)((nr)/32))
+#define AUDIT_BIT(nr)  (1 << ((nr) - AUDIT_WORD(nr)*32))
+
+/* Rule fields */
+				/* These are useful when checking the
+				 * task structure at task creation time
+				 * (AUDIT_PER_TASK).  */
+#define AUDIT_PID	0
+#define AUDIT_UID	1
+#define AUDIT_EUID	2
+#define AUDIT_SUID	3
+#define AUDIT_FSUID	4
+#define AUDIT_GID	5
+#define AUDIT_EGID	6
+#define AUDIT_SGID	7
+#define AUDIT_FSGID	8
+#define AUDIT_LOGINUID	9
+#define AUDIT_PERS	10
+
+				/* These are ONLY useful when checking
+				 * at syscall exit time (AUDIT_AT_EXIT). */
+#define AUDIT_DEVMAJOR	100
+#define AUDIT_DEVMINOR	101
+#define AUDIT_INODE	102
+#define AUDIT_EXIT	103
+#define AUDIT_SUCCESS   104	/* exit >= 0; value ignored */
+
+#define AUDIT_ARG0      200
+#define AUDIT_ARG1      (AUDIT_ARG0+1)
+#define AUDIT_ARG2      (AUDIT_ARG0+2)
+#define AUDIT_ARG3      (AUDIT_ARG0+3)
+
+#define AUDIT_NEGATE    0x80000000
+
+
+/* Status symbols */
+				/* Mask values */
+#define AUDIT_STATUS_ENABLED		0x0001
+#define AUDIT_STATUS_FAILURE		0x0002
+#define AUDIT_STATUS_PID		0x0004
+#define AUDIT_STATUS_RATE_LIMIT		0x0008
+#define AUDIT_STATUS_BACKLOG_LIMIT	0x0010
+				/* Failure-to-log actions */
+#define AUDIT_FAIL_SILENT	0
+#define AUDIT_FAIL_PRINTK	1
+#define AUDIT_FAIL_PANIC	2
+
+#ifndef __KERNEL__
+struct audit_message {
+	struct nlmsghdr nlh;
+	char		data[1200];
+};
+#endif
+
+struct audit_status {
+	__u32		mask;		/* Bit mask for valid entries */
+	__u32		enabled;	/* 1 = enabled, 0 = disabled */
+	__u32		failure;	/* Failure-to-log action */
+	__u32		pid;		/* pid of auditd process */
+	__u32		rate_limit;	/* messages rate limit (per second) */
+	__u32		backlog_limit;	/* waiting messages limit */
+	__u32		lost;		/* messages lost */
+	__u32		backlog;	/* messages waiting in queue */
+};
+
+struct audit_login {
+	__u32		loginuid;
+	int		msglen;
+	char		msg[1024];
+};
+
+struct audit_rule {		/* for AUDIT_LIST, AUDIT_ADD, and AUDIT_DEL */
+	__u32		flags;	/* AUDIT_PER_TASK, AUDIT_AT_{ENTRY,EXIT}, AUDIT_PREPEND */
+	__u32		action;	/* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */
+	__u32		field_count;
+	__u32		mask[AUDIT_BITMASK_SIZE];
+	__u32		fields[AUDIT_MAX_FIELDS];
+	__u32		values[AUDIT_MAX_FIELDS];
+};
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_AUDIT
+struct audit_buffer;
+struct audit_context;
+#endif
+
+#ifdef CONFIG_AUDITSYSCALL
+/* These are defined in auditsc.c */
+				/* Public API */
+extern int  audit_alloc(struct task_struct *task);
+extern void audit_free(struct task_struct *task);
+extern void audit_syscall_entry(struct task_struct *task,
+				int major, unsigned long a0, unsigned long a1,
+				unsigned long a2, unsigned long a3);
+extern void audit_syscall_exit(struct task_struct *task, int return_code);
+extern void audit_getname(const char *name);
+extern void audit_putname(const char *name);
+extern void audit_inode(const char *name, unsigned long ino, dev_t rdev);
+
+				/* Private API (for audit.c only) */
+extern int  audit_receive_filter(int type, int pid, int uid, int seq,
+				 void *data);
+extern void audit_get_stamp(struct audit_context *ctx,
+			    struct timespec *t, int *serial);
+extern int  audit_set_loginuid(struct audit_context *ctx, uid_t loginuid);
+#else
+#define audit_alloc(t) ({ 0; })
+#define audit_free(t) do { ; } while (0)
+#define audit_syscall_entry(t,a,b,c,d,e) do { ; } while (0)
+#define audit_syscall_exit(t,r) do { ; } while (0)
+#define audit_getname(n) do { ; } while (0)
+#define audit_putname(n) do { ; } while (0)
+#define audit_inode(n,i,d) do { ; } while (0)
+#endif
+
+#ifdef CONFIG_AUDIT
+/* These are defined in audit.c */
+				/* Public API */
+extern void		    audit_log(struct audit_context *ctx,
+				      const char *fmt, ...)
+			    __attribute__((format(printf,2,3)));
+
+extern struct audit_buffer *audit_log_start(struct audit_context *ctx);
+extern void		    audit_log_format(struct audit_buffer *ab,
+					     const char *fmt, ...)
+			    __attribute__((format(printf,2,3)));
+extern void		    audit_log_end(struct audit_buffer *ab);
+extern void		    audit_log_end_fast(struct audit_buffer *ab);
+extern void		    audit_log_end_irq(struct audit_buffer *ab);
+extern void		    audit_log_d_path(struct audit_buffer *ab,
+					     const char *prefix,
+					     struct dentry *dentry,
+					     struct vfsmount *vfsmnt);
+extern int		    audit_set_rate_limit(int limit);
+extern int		    audit_set_backlog_limit(int limit);
+extern int		    audit_set_enabled(int state);
+extern int		    audit_set_failure(int state);
+
+				/* Private API (for auditsc.c only) */
+extern void		    audit_send_reply(int pid, int seq, int type,
+					     int done, int multi,
+					     void *payload, int size);
+extern void		    audit_log_lost(const char *message);
+#else
+#define audit_log(t,f,...) do { ; } while (0)
+#define audit_log_start(t) ({ NULL; })
+#define audit_log_vformat(b,f,a) do { ; } while (0)
+#define audit_log_format(b,f,...) do { ; } while (0)
+#define audit_log_end(b) do { ; } while (0)
+#define audit_log_end_fast(b) do { ; } while (0)
+#define audit_log_end_irq(b) do { ; } while (0)
+#define audit_log_d_path(b,p,d,v) do { ; } while (0)
+#define audit_set_rate_limit(l) do { ; } while (0)
+#define audit_set_backlog_limit(l) do { ; } while (0)
+#define audit_set_enabled(s) do { ; } while (0)
+#define audit_set_failure(s) do { ; } while (0)
+#endif
+#endif
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bacf6bcbc7b7..39c893f8aa28 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -20,6 +20,7 @@
 #include <linux/radix-tree.h>
 #include <linux/kobject.h>
 #include <asm/atomic.h>
+#include <linux/audit.h>
 
 struct iovec;
 struct nameidata;
@@ -1159,7 +1160,18 @@ extern char * getname(const char __user *);
 extern void vfs_caches_init(unsigned long);
 
 #define __getname()	kmem_cache_alloc(names_cachep, SLAB_KERNEL)
-#define putname(name)	kmem_cache_free(names_cachep, (void *)(name))
+#define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
+#ifndef CONFIG_AUDITSYSCALL
+#define putname(name)   __putname(name)
+#else
+#define putname(name)							\
+	do {								\
+		if (unlikely(current->audit_context))			\
+			audit_putname(name);				\
+		else							\
+			__putname(name);				\
+	} while (0)
+#endif
 
 extern int register_blkdev(unsigned int, const char *);
 extern int unregister_blkdev(unsigned int, const char *);
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index e5e15ddadab5..5adca479de6e 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -13,6 +13,7 @@
 #define NETLINK_XFRM		6	/* ipsec */
 #define NETLINK_SELINUX		7	/* SELinux event notifications */
 #define NETLINK_ARPD		8
+#define NETLINK_AUDIT		9	/* auditing */
 #define NETLINK_ROUTE6		11	/* af_inet6 route comm channel */
 #define NETLINK_IP6_FW		13
 #define NETLINK_DNRTMSG		14	/* DECnet routing messages */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 22080f919266..b72c38420d71 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -371,6 +371,8 @@ int set_current_groups(struct group_info *group_info);
     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
 
 
+struct audit_context;		/* See audit.c */
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -474,6 +476,7 @@ struct task_struct {
 	sigset_t *notifier_mask;
 	
 	void *security;
+	struct audit_context *audit_context;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/init/Kconfig b/init/Kconfig
index ddd82dbad5dd..55261afdc3bf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -137,6 +137,26 @@ config SYSCTL
 	  building a kernel for install/rescue disks or your system is very
 	  limited in memory.
 
+config AUDIT
+	bool "Auditing support"
+	default y if SECURITY_SELINUX
+	default n
+	help
+	  Enable auditing infrastructure that can be used with another
+	  kernel subsystem, such as SELinux (which requires this for
+	  logging of avc messages output).  Does not do system-call
+	  auditing without CONFIG_AUDITSYSCALL.
+
+config AUDITSYSCALL
+	bool "Enable system-call auditing support"
+	depends on AUDIT && (X86 || PPC64)
+	default y if SECURITY_SELINUX
+	default n
+	help
+	  Enable low-overhead system-call auditing infrastructure that
+	  can be used independently or with another kernel subsystem,
+	  such as SELinux.
+
 config LOG_BUF_SHIFT
 	int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL
 	range 12 20
diff --git a/kernel/Makefile b/kernel/Makefile
index 3a6484838748..238c65f60d9e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,8 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000000..765822b03b91
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,825 @@
+/* audit.c -- Auditing support -*- linux-c -*-
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Goals: 1) Integrate fully with SELinux.
+ *	  2) Minimal run-time overhead:
+ *	     a) Minimal when syscall auditing is disabled (audit_enable=0).
+ *	     b) Small when syscall auditing is enabled and no audit record
+ *		is generated (defer as much work as possible to record
+ *		generation time):
+ *		i) context is allocated,
+ *		ii) names from getname are stored without a copy, and
+ *		iii) inode information stored from path_lookup.
+ *	  3) Ability to disable syscall auditing at boot time (audit=0).
+ *	  4) Usable by other parts of the kernel (if audit_log* is called,
+ *	     then a syscall record will be generated automatically for the
+ *	     current syscall).
+ *	  5) Netlink interface to user-space.
+ *	  6) Support low-overhead kernel-based filtering to minimize the
+ *	     information that must be passed to user-space.
+ *
+ * Example user-space utilities: http://people.redhat.com/faith/audit/
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+
+/* No auditing will take place until audit_initialized != 0.
+ * (Initialization happens after skb_init is called.) */
+static int	audit_initialized;
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+int		audit_enabled;
+
+/* Default state when kernel boots without any parameters. */
+static int	audit_default;
+
+/* If auditing cannot proceed, audit_failure selects what happens. */
+static int	audit_failure = AUDIT_FAIL_PRINTK;
+
+/* If audit records are to be written to the netlink socket, audit_pid
+ * contains the (non-zero) pid. */
+static int	audit_pid;
+
+/* If audit_rate_limit is non-zero, limit the rate of sending audit records
+ * to that number per second.  This prevents DoS attacks, but results in
+ * audit records being dropped. */
+static int	audit_rate_limit;
+
+/* Number of outstanding audit_buffers allowed. */
+static int	audit_backlog_limit = 64;
+static atomic_t	audit_backlog	    = ATOMIC_INIT(0);
+
+/* Records can be lost in several ways:
+   0) [suppressed in audit_alloc]
+   1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
+   2) out of memory in audit_log_move [alloc_skb]
+   3) suppressed due to audit_rate_limit
+   4) suppressed due to audit_backlog_limit
+*/
+static atomic_t    audit_lost = ATOMIC_INIT(0);
+
+/* The netlink socket. */
+static struct sock *audit_sock;
+
+/* There are two lists of audit buffers.  The txlist contains audit
+ * buffers that cannot be sent immediately to the netlink device because
+ * we are in an irq context (these are sent later in a tasklet).
+ *
+ * The second list is a list of pre-allocated audit buffers (if more
+ * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
+ * being placed on the freelist). */
+static spinlock_t  audit_txlist_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t  audit_freelist_lock = SPIN_LOCK_UNLOCKED;
+static int	   audit_freelist_count = 0;
+static LIST_HEAD(audit_txlist);
+static LIST_HEAD(audit_freelist);
+
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+/* The netlink socket is only to be read by 1 CPU, which lets us assume
+ * that list additions and deletions never happen simultaneously in
+ * auditsc.c */
+static DECLARE_MUTEX(audit_netlink_sem);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records.  Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist.  Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE  (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record.  The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue.  Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+	struct list_head     list;
+	struct sk_buff_head  sklist;	/* formatted skbs ready to send */
+	struct audit_context *ctx;	/* NULL or associated context */
+	int		     len;	/* used area of tmp */
+	char		     tmp[AUDIT_BUFSIZ];
+
+				/* Pointer to header and contents */
+	struct nlmsghdr      *nlh;
+	int		     total;
+	int		     type;
+	int		     pid;
+	int		     count; /* Times requeued */
+};
+
+struct audit_entry {
+	struct list_head  list;
+	struct audit_rule rule;
+};
+
+/* Act on an audit failure according to the current audit_failure action. */
+static void audit_panic(const char *message)
+{
+	switch (audit_failure) {
+	case AUDIT_FAIL_SILENT:	/* nothing is reported */
+		break;
+	case AUDIT_FAIL_PRINTK:
+		printk(KERN_ERR "audit: %s\n", message);
+		break;
+	case AUDIT_FAIL_PANIC:
+		panic("%s", message);	/* message is data, never a format */
+		break;
+	}
+}
+
+static inline int audit_rate_check(void)
+{
+	static unsigned long	last_check = 0;
+	static int		messages   = 0;
+	static spinlock_t	lock	   = SPIN_LOCK_UNLOCKED;
+	unsigned long		flags;
+	unsigned long		now;
+	unsigned long		elapsed;
+	int			retval	   = 0;
+
+	if (!audit_rate_limit) return 1;
+
+	spin_lock_irqsave(&lock, flags);
+	if (++messages < audit_rate_limit) {
+		retval = 1;
+	} else {
+		now     = jiffies;
+		elapsed = now - last_check;
+		if (elapsed > HZ) {
+			last_check = now;
+			messages   = 0;
+			retval     = 1;
+		}
+	}
+	spin_unlock_irqrestore(&lock, flags);
+
+	return retval;
+}
+
+/* Emit at least 1 message per second, even if audit_rate_check is
+ * throttling. */
+void audit_log_lost(const char *message)
+{
+	static unsigned long	last_msg = 0;
+	static spinlock_t	lock     = SPIN_LOCK_UNLOCKED;
+	unsigned long		flags;
+	unsigned long		now;
+	int			print;
+
+	atomic_inc(&audit_lost);
+
+	print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+	if (!print) {
+		spin_lock_irqsave(&lock, flags);
+		now = jiffies;
+		if (now - last_msg > HZ) {
+			print = 1;
+			last_msg = now;
+		}
+		spin_unlock_irqrestore(&lock, flags);
+	}
+
+	if (print) {
+		printk(KERN_WARNING
+		       "audit: audit_lost=%d audit_backlog=%d"
+		       " audit_rate_limit=%d audit_backlog_limit=%d\n",
+		       atomic_read(&audit_lost),
+		       atomic_read(&audit_backlog),
+		       audit_rate_limit,
+		       audit_backlog_limit);
+		audit_panic(message);
+	}
+
+}
+
+int audit_set_rate_limit(int limit)
+{
+	int old		 = audit_rate_limit;
+	audit_rate_limit = limit;
+	audit_log(current->audit_context, "audit_rate_limit=%d old=%d",
+		  audit_rate_limit, old);
+	return old;
+}
+
+int audit_set_backlog_limit(int limit)
+{
+	int old		 = audit_backlog_limit;
+	audit_backlog_limit = limit;
+	audit_log(current->audit_context, "audit_backlog_limit=%d old=%d",
+		  audit_backlog_limit, old);
+	return old;
+}
+
+int audit_set_enabled(int state)
+{
+	int old		 = audit_enabled;
+	if (state != 0 && state != 1)
+		return -EINVAL;
+	audit_enabled = state;
+	audit_log(current->audit_context, "audit_enabled=%d old=%d",
+		  audit_enabled, old);
+	return old;
+}
+
+int audit_set_failure(int state)
+{
+	int old		 = audit_failure;
+	if (state != AUDIT_FAIL_SILENT
+	    && state != AUDIT_FAIL_PRINTK
+	    && state != AUDIT_FAIL_PANIC)
+		return -EINVAL;
+	audit_failure = state;
+	audit_log(current->audit_context, "audit_failure=%d old=%d",
+		  audit_failure, old);
+	return old;
+}
+
+#ifdef CONFIG_NET
+void audit_send_reply(int pid, int seq, int type, int done, int multi,
+		      void *payload, int size)
+{
+	struct sk_buff	*skb;
+	struct nlmsghdr	*nlh;
+	int		len = NLMSG_SPACE(size);
+	void		*data;
+	int		flags = multi ? NLM_F_MULTI : 0;
+	int		t     = done  ? NLMSG_DONE  : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		goto nlmsg_failure;
+
+	nlh		 = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+	nlh->nlmsg_flags = flags;
+	data		 = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+	return;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	u32			uid, pid, seq;
+	void			*data;
+	struct audit_status	*status_get, status_set = { 0 };
+	struct audit_login	*login;
+	int			err = 0;
+	struct audit_buffer	*ab;
+
+	pid  = NETLINK_CREDS(skb)->pid;
+	uid  = NETLINK_CREDS(skb)->uid;
+	seq  = nlh->nlmsg_seq;
+	data = NLMSG_DATA(nlh);
+
+	switch (nlh->nlmsg_type) {
+	case AUDIT_GET:		/* mask stays 0: no stack bytes leak to user */
+		status_set.enabled	 = audit_enabled;
+		status_set.failure	 = audit_failure;
+		status_set.pid		 = audit_pid;
+		status_set.rate_limit	 = audit_rate_limit;
+		status_set.backlog_limit = audit_backlog_limit;
+		status_set.lost		 = atomic_read(&audit_lost);
+		status_set.backlog	 = atomic_read(&audit_backlog);
+		audit_send_reply(pid, seq, AUDIT_GET, 0, 0,
+				 &status_set, sizeof(status_set));
+		break;
+	case AUDIT_SET:		/* NOTE(review): payload length is not checked */
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		status_get   = (struct audit_status *)data;
+		if (status_get->mask & AUDIT_STATUS_ENABLED) {
+			err = audit_set_enabled(status_get->enabled);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_FAILURE) {
+			err = audit_set_failure(status_get->failure);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_PID) {
+			int old   = audit_pid;
+			audit_pid = status_get->pid;
+			audit_log(current->audit_context,
+				  "audit_pid=%d old=%d", audit_pid, old);
+		}
+		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+			audit_set_rate_limit(status_get->rate_limit);
+		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
+			audit_set_backlog_limit(status_get->backlog_limit);
+		break;
+	case AUDIT_USER:	/* log a message supplied by user space */
+		ab = audit_log_start(NULL);
+		if (!ab)
+			break;	/* audit_panic has been called */
+		audit_log_format(ab,
+				 "user pid=%d uid=%d length=%d msg='%.1024s'",
+				 pid, uid,
+				 (int)(nlh->nlmsg_len
+				       - ((char *)data - (char *)nlh)),
+				 (char *)data);
+		ab->type = AUDIT_USER;
+		ab->pid  = pid;
+		audit_log_end(ab);
+		break;
+	case AUDIT_LOGIN:	/* log it; loginuid set if AUDITSYSCALL */
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		login = (struct audit_login *)data;
+		ab = audit_log_start(NULL);
+		if (ab) {
+			audit_log_format(ab, "login pid=%d uid=%d loginuid=%d"
+					 " length=%d msg='%.1024s'",
+					 pid, uid,
+					 login->loginuid,
+					 login->msglen,
+					 login->msg);
+			ab->type = AUDIT_LOGIN;
+			ab->pid  = pid;
+			audit_log_end(ab);
+		}
+#ifdef CONFIG_AUDITSYSCALL
+		err = audit_set_loginuid(current->audit_context,
+					 login->loginuid);
+#endif
+		break;
+	case AUDIT_LIST:
+	case AUDIT_ADD:
+	case AUDIT_DEL:		/* filter rules are handled in auditsc.c */
+#ifdef CONFIG_AUDITSYSCALL
+		err = audit_receive_filter(nlh->nlmsg_type, pid, uid, seq,
+					   data);
+#else
+		err = -EOPNOTSUPP;
+#endif
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err < 0 ? err : 0;	/* positive results count as success */
+}
+
+/* Get message from skb (based on rtnetlink_rcv_skb).  Each message is
+ * processed by audit_receive_msg.  Malformed skbs with wrong length are
+ * discarded silently.  */
+static int audit_receive_skb(struct sk_buff *skb)
+{
+	int		err;
+	struct nlmsghdr	*nlh;
+	u32		rlen;
+
+	while (skb->len >= NLMSG_SPACE(0)) {
+		nlh = (struct nlmsghdr *)skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+			return 0;
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		if ((err = audit_receive_msg(skb, nlh))) {
+			netlink_ack(skb, nlh, -err);
+		} else if (nlh->nlmsg_flags & NLM_F_ACK)
+			netlink_ack(skb, nlh, 0);
+		skb_pull(skb, rlen);
+	}
+	return 0;
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sock *sk, int length)
+{
+	struct sk_buff  *skb;
+
+	if (down_trylock(&audit_netlink_sem))
+		return;
+
+				/* FIXME: this must not cause starvation */
+	while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+		if (audit_receive_skb(skb) && skb->len)
+			skb_queue_head(&sk->sk_receive_queue, skb);
+		else
+			kfree_skb(skb);
+	}
+	up(&audit_netlink_sem);
+}
+
+/* Move data from tmp buffer into an skb.  This is an extra copy, and
+ * that is unfortunate.  However, the copy will only occur when a record
+ * is being written to user space, which is already a high-overhead
+ * operation.  (Elimination of the copy is possible, for example, by
+ * writing directly into a pre-allocated skb, at the cost of wasting
+ * memory.) */
+static void audit_log_move(struct audit_buffer *ab)
+{
+	struct sk_buff	*skb;
+	char		*start;
+	int		extra = ab->nlh ? 0 : NLMSG_SPACE(0);
+
+	skb = skb_peek(&ab->sklist);
+	if (!skb || skb_tailroom(skb) <= ab->len + extra) {
+		skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
+		if (!skb) {
+			ab->len = 0; /* Lose information in ab->tmp */
+			audit_log_lost("out of memory in audit_log_move");
+			return;
+		}
+		__skb_queue_tail(&ab->sklist, skb);
+		if (!ab->nlh)
+			ab->nlh = (struct nlmsghdr *)skb_put(skb,
+							     NLMSG_SPACE(0));
+	}
+	start = skb_put(skb, ab->len);
+	memcpy(start, ab->tmp, ab->len);
+	ab->len = 0;
+}
+
+/* Iterate over the skbuff in the audit_buffer, sending their contents
+ * to user space. */
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&ab->sklist))) {
+		int retval = 0;
+
+		if (audit_pid) {
+			if (ab->nlh) {
+				ab->nlh->nlmsg_len   = ab->total;
+				ab->nlh->nlmsg_type  = ab->type;
+				ab->nlh->nlmsg_flags = 0;
+				ab->nlh->nlmsg_seq   = 0;
+				ab->nlh->nlmsg_pid   = ab->pid;
+			}
+			skb_get(skb); /* because netlink_* frees */
+			retval = netlink_unicast(audit_sock, skb, audit_pid,
+						 MSG_DONTWAIT);
+		}
+		if (retval == -EAGAIN && ab->count < 5) {
+			++ab->count;
+			audit_log_end_irq(ab);
+			return 1;
+		}
+		if (retval < 0) {
+			if (retval == -ECONNREFUSED) {
+				printk(KERN_ERR
+				       "audit: *NO* daemon at audit_pid=%d\n",
+				       audit_pid);
+				audit_pid = 0;
+			} else
+				audit_log_lost("netlink socket too busy");
+		}
+		if (!audit_pid) { /* No daemon */
+			int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
+			int len    = skb->len - offset;
+			printk(KERN_ERR "%*.*s\n",
+			       len, len, skb->data + offset);
+		}
+		kfree_skb(skb);
+		ab->nlh = NULL;
+	}
+	return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
+	       audit_default ? "enabled" : "disabled");
+	audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+	if (!audit_sock)
+		audit_panic("cannot initialize netlink socket");
+
+	audit_initialized = 1;
+	audit_enabled = audit_default;
+	audit_log(NULL, "initialized");
+	return 0;
+}
+
+#else
+/* Without CONFIG_NET, we have no skbuffs.  For now, print what we have
+ * in the buffer. */
+static void audit_log_move(struct audit_buffer *ab)
+{
+	printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
+	ab->len = 0;
+}
+
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+	return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+	printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
+	audit_sock = NULL;
+	audit_pid  = 0;
+
+	audit_initialized = 1;
+	audit_enabled = audit_default;
+	audit_log(NULL, "initialized");
+	return 0;
+}
+#endif
+
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+	audit_default = !!simple_strtol(str, NULL, 0);
+	printk(KERN_INFO "audit: %s%s\n",
+	       audit_default ? "enabled" : "disabled",
+	       audit_initialized ? "" : " (after initialization)");
+	if (audit_initialized)
+		audit_enabled = audit_default;
+	return 0;
+}
+
+__setup("audit=", audit_enable);
+
+
+/* Obtain an audit buffer.  This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format.  If ctx belongs to a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit.  If there is no associated task, ctx
+ * should be NULL. */
+struct audit_buffer *audit_log_start(struct audit_context *ctx)
+{
+	struct audit_buffer	*ab	= NULL;
+	unsigned long		flags;
+	struct timespec		t;
+	int			serial	= 0;
+
+	if (!audit_initialized)
+		return NULL;
+
+	if (audit_backlog_limit
+	    && atomic_read(&audit_backlog) > audit_backlog_limit) {
+		if (audit_rate_check())
+			printk(KERN_WARNING
+			       "audit: audit_backlog=%d > "
+			       "audit_backlog_limit=%d\n",
+			       atomic_read(&audit_backlog),
+			       audit_backlog_limit);
+		audit_log_lost("backlog limit exceeded");
+		return NULL;
+	}
+
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (!list_empty(&audit_freelist)) {
+		ab = list_entry(audit_freelist.next,
+				struct audit_buffer, list);
+		list_del(&ab->list);
+		--audit_freelist_count;
+	}
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+	if (!ab)
+		ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+	if (!ab)
+		audit_log_lost("audit: out of memory in audit_log_start");
+	if (!ab)
+		return NULL;
+
+	atomic_inc(&audit_backlog);
+	skb_queue_head_init(&ab->sklist);
+
+	ab->ctx   = ctx;
+	ab->len   = 0;
+	ab->nlh   = NULL;
+	ab->total = 0;
+	ab->type  = AUDIT_KERNEL;
+	ab->pid   = 0;
+	ab->count = 0;
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (ab->ctx)
+		audit_get_stamp(ab->ctx, &t, &serial);
+	else
+#endif
+		t = CURRENT_TIME;
+
+	audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+			 t.tv_sec, t.tv_nsec/1000000, serial);
+	return ab;
+}
+
+
+/* Format an audit message into the audit buffer.  If it does not fit,
+ * tmp is emptied with audit_log_move and vsnprintf is run again on a
+ * copy of the argument list (a va_list must not be reused after use).
+ * Like printk, we assume no message needs more than 1024 bytes. */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+			      va_list args)
+{
+	int	len, avail;
+	va_list	args2;
+
+	if (!ab)
+		return;
+
+	avail = sizeof(ab->tmp) - ab->len;
+	if (avail <= 0) {
+		audit_log_move(ab);
+		avail = sizeof(ab->tmp) - ab->len;
+	}
+	va_copy(args2, args);	/* keep a pristine copy for the retry */
+	len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+	if (len >= avail) {	/* truncated: empty tmp and format again */
+		audit_log_move(ab);
+		avail = sizeof(ab->tmp) - ab->len;
+		len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args2);
+	}
+	va_end(args2);
+	ab->len   += (len < avail) ? len : avail;
+	ab->total += (len < avail) ? len : avail;
+}
+
+/* Append a printf-style message to the audit buffer.  A NULL buffer is
+ * silently ignored; the formatting itself happens in audit_log_vformat. */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+	va_list ap;
+
+	if (ab) {
+		va_start(ap, fmt);
+		audit_log_vformat(ab, fmt, ap);
+		va_end(ap);
+	}
+}
+
+/* Append an optional " prefix" and the path of dentry/vfsmnt to the audit
+ * buffer, building it in place in ab->tmp.  A NULL ab is ignored. */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+		      struct dentry *dentry, struct vfsmount *vfsmnt)
+{
+	char *p;
+	int  len, avail;
+
+	if (!ab)
+		return;
+	if (prefix) audit_log_format(ab, " %s", prefix);
+	if (ab->len > 128)
+		audit_log_move(ab);
+	avail = sizeof(ab->tmp) - ab->len;
+	p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
+	if (IS_ERR(p)) {
+		/* FIXME: can we save some information here? */
+		audit_log_format(ab, "<toolong>");
+	} else {
+				/* path isn't at start of buffer */
+		len	   = (ab->tmp + sizeof(ab->tmp) - 1) - p;
+		memmove(ab->tmp + ab->len, p, len);
+		ab->len   += len;
+		ab->total += len;
+	}
+}
+
+/* Tasklet body: drain audit_txlist and push each buffer to userspace. */
+static void audit_tasklet_handler(unsigned long arg)
+{
+	unsigned long	    irqflags;
+	struct audit_buffer *buf;
+	LIST_HEAD(pending);
+
+	spin_lock_irqsave(&audit_txlist_lock, irqflags);
+	list_splice_init(&audit_txlist, &pending);
+	spin_unlock_irqrestore(&audit_txlist_lock, irqflags);
+
+	while (!list_empty(&pending)) {
+		buf = list_entry(pending.next, struct audit_buffer, list);
+		list_del(&buf->list);
+		audit_log_end_fast(buf);
+	}
+}
+
+static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
+
+/* The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is placed on a queue and a tasklet is scheduled to
+ * remove it from the queue outside the irq context.  May be called in
+ * any context. */
+void audit_log_end_irq(struct audit_buffer *ab)
+{
+	unsigned long irqflags;
+
+	if (ab) {
+		spin_lock_irqsave(&audit_txlist_lock, irqflags);
+		list_add_tail(&ab->list, &audit_txlist);
+		spin_unlock_irqrestore(&audit_txlist_lock, irqflags);
+
+		tasklet_schedule(&audit_tasklet);
+	}
+}
+
+/* Send the audit buffer directly to user space.  Not for irq context. */
+void audit_log_end_fast(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	BUG_ON(in_irq());
+	if (!ab)
+		return;
+	if (!audit_rate_check()) {
+		audit_log_lost("rate limit exceeded");
+	} else {
+		audit_log_move(ab);
+		if (audit_log_drain(ab))
+			return;
+	}
+	atomic_dec(&audit_backlog);
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (audit_freelist_count >= AUDIT_MAXFREE) {
+		kfree(ab);
+	} else {
+		++audit_freelist_count;	/* counts listed buffers only */
+		list_add(&ab->list, &audit_freelist);
+	}
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+/* Finish an audit record from any context: callers in irq context have
+ * the buffer queued for the tasklet, everyone else sends it to user
+ * space directly. */
+void audit_log_end(struct audit_buffer *ab)
+{
+	if (!in_irq())
+		audit_log_end_fast(ab);
+	else
+		audit_log_end_irq(ab);
+}
+
+/* Convenience wrapper that emits one complete audit record:
+ * audit_log_start, audit_log_vformat, then audit_log_end.  It may be
+ * called in any context. */
+void audit_log(struct audit_context *ctx, const char *fmt, ...)
+{
+	va_list ap;
+	struct audit_buffer *buf = audit_log_start(ctx);
+
+	if (!buf)
+		return;
+
+	va_start(ap, fmt);
+	audit_log_vformat(buf, fmt, ap);
+	va_end(ap);
+	audit_log_end(buf);
+}
+
+EXPORT_SYMBOL_GPL(audit_set_rate_limit);
+EXPORT_SYMBOL_GPL(audit_set_backlog_limit);
+EXPORT_SYMBOL_GPL(audit_set_enabled);
+EXPORT_SYMBOL_GPL(audit_set_failure);
+
+EXPORT_SYMBOL_GPL(audit_log_start);
+EXPORT_SYMBOL_GPL(audit_log_format);
+EXPORT_SYMBOL_GPL(audit_log_end_irq);
+EXPORT_SYMBOL_GPL(audit_log_end_fast);
+EXPORT_SYMBOL_GPL(audit_log_end);
+EXPORT_SYMBOL_GPL(audit_log);
+EXPORT_SYMBOL_GPL(audit_log_d_path);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 000000000000..342b57141fd9
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,922 @@
+/* auditsc.c -- System-call auditing support -*- linux-c -*-
+ * Handles all system-call specific auditing features.
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Many of the ideas implemented here are from Stephen C. Tweedie,
+ * especially the idea of avoiding a copy by using getname.
+ *
+ * The method for actual interception of syscall entry and exit (not in
+ * this file -- see entry.S) is based on a GPL'd patch written by
+ * okir@suse.de and Copyright 2003 SuSE Linux AG.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+#include <linux/personality.h>
+#include <linux/time.h>
+#include <asm/unistd.h>
+
+/* 0 = no checking
+   1 = put_count checking
+   2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+extern int audit_enabled;
+
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). */
+#define AUDIT_NAMES    20
+
+/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the
+ * audit_context from being used for nameless inodes from
+ * path_lookup. */
+#define AUDIT_NAMES_RESERVED 7
+
+/* Per-task audit state, listed from least to most auditing.  At task
+   start time it is set in the audit_context using a per-task filter.  At
+   syscall entry, the audit_state is augmented by the syscall filter. */
+enum audit_state {
+	AUDIT_DISABLED,		/* Do not create per-task audit_context.
+				 * No syscall-specific audit records can
+				 * be generated. */
+	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
+				 * but don't necessarily fill it in at
+				 * syscall entry time (i.e., filter
+				 * instead). */
+	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
+				 * and always fill it in at syscall
+				 * entry time.  This makes a full
+				 * syscall record available if some
+				 * other part of the kernel decides it
+				 * should be recorded. */
+	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
+				 * always fill it in at syscall entry
+				 * time, and always write out the audit
+				 * record at syscall exit time.  */
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device. */
+struct audit_names {
+	const char	*name;	/* from getname(); NULL for a nameless inode */
+	unsigned long	ino;	/* (unsigned long)-1 until audit_inode sets it */
+	dev_t		rdev;	/* -1 until audit_inode sets it */
+};
+
+/* The per-task audit context. */
+struct audit_context {
+	int		    in_syscall;	/* 1 if task is in a syscall */
+	enum audit_state    state;	/* state chosen by the filters */
+	unsigned int	    serial;     /* serial number for record */
+	struct timespec	    ctime;      /* time of syscall entry */
+	uid_t		    loginuid;   /* login uid (identity) */
+	int		    major;      /* syscall number */
+	unsigned long	    argv[4];    /* syscall arguments */
+	int		    return_valid; /* return code is valid */
+	int		    return_code;/* syscall return code */
+	int		    auditable;  /* 1 if record should be written */
+	int		    name_count;	/* number of names[] slots in use */
+	struct audit_names  names[AUDIT_NAMES]; /* saved names and inodes */
+	struct audit_context *previous; /* For nested syscalls */
+
+				/* Save things to print about task_struct */
+	pid_t		    pid;
+	uid_t		    uid, euid, suid, fsuid;
+	gid_t		    gid, egid, sgid, fsgid;
+	unsigned long	    personality;
+
+#if AUDIT_DEBUG
+	int		    put_count;	/* putname calls deferred so far */
+	int		    ino_count;	/* nameless inode slots taken */
+#endif
+};
+
+				/* Public API */
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+struct audit_entry {
+	struct list_head  list;	/* links the entry into a rule list */
+	struct rcu_head   rcu;	/* used by call_rcu when a rule is deleted */
+	struct audit_rule rule;	/* the filter rule itself */
+};
+
+/* Compare two rules member by member.  Returns 0 when they are identical
+ * and 1 otherwise.  It is called from audit_del_rule during AUDIT_DEL. */
+static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+{
+	int n;
+
+	if (a->flags != b->flags
+	    || a->action != b->action
+	    || a->field_count != b->field_count)
+		return 1;
+
+	/* Only the first field_count field/value pairs are compared. */
+	for (n = 0; n < a->field_count; n++) {
+		if (a->fields[n] != b->fields[n])
+			return 1;
+		if (a->values[n] != b->values[n])
+			return 1;
+	}
+
+	/* The syscall bitmask is always compared in full. */
+	for (n = 0; n < AUDIT_BITMASK_SIZE; n++) {
+		if (a->mask[n] != b->mask[n])
+			return 1;
+	}
+
+	return 0;
+}
+
+/* Insert entry into list: at the head when AUDIT_PREPEND is set (the
+ * flag is cleared on the way), at the tail otherwise.  Called via
+ * audit_receive() in audit.c, protected by audit_netlink_sem. */
+static inline int audit_add_rule(struct audit_entry *entry,
+				 struct list_head *list)
+{
+	if (!(entry->rule.flags & AUDIT_PREPEND)) {
+		list_add_tail_rcu(&entry->list, list);
+		return 0;
+	}
+	entry->rule.flags &= ~AUDIT_PREPEND;
+	list_add_rcu(&entry->list, list);
+	return 0;
+}
+
+static void audit_free_rule(void *arg)
+{
+	kfree(arg);	/* arg is the audit_entry that was handed to call_rcu */
+}
+
+/* Remove the first rule on list that audit_compare_rule reports as equal
+ * to *rule.  Like audit_add_rule, this is called via audit_receive() in
+ * audit.c and is protected by audit_netlink_sem. */
+static inline int audit_del_rule(struct audit_rule *rule,
+				 struct list_head *list)
+{
+	struct audit_entry  *pos;
+
+	/* Do not use the _rcu iterator here, since this is the only
+	 * deletion routine. */
+	list_for_each_entry(pos, list, list) {
+		if (audit_compare_rule(rule, &pos->rule))
+			continue;
+		list_del_rcu(&pos->list);
+		call_rcu(&pos->rcu, audit_free_rule, pos);
+		return 0;
+	}
+	return -EFAULT;		/* No matching rule */
+}
+
+#ifdef CONFIG_NET
+/* Validate rule *s and copy it to *d.  Returns 0 on success, or -1 when
+ * the action or the field count is out of range.  Used by AUDIT_ADD. */
+static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+	int n;
+
+	if (s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS
+	    || (s->action != AUDIT_NEVER
+		&& s->action != AUDIT_POSSIBLE
+		&& s->action != AUDIT_ALWAYS))
+		return -1;
+
+	d->flags	= s->flags;
+	d->action	= s->action;
+	d->field_count	= s->field_count;
+	for (n = 0; n < AUDIT_BITMASK_SIZE; n++)
+		d->mask[n] = s->mask[n];
+	for (n = 0; n < s->field_count; n++) {
+		d->values[n] = s->values[n];
+		d->fields[n] = s->fields[n];
+	}
+	return 0;
+}
+
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
+{
+	u32		   flags, nlists;
+	struct audit_entry *entry;
+	int		   err = 0;
+
+	if ((type == AUDIT_ADD || type == AUDIT_DEL)
+	    && !capable(CAP_SYS_ADMIN))
+		return -EPERM;	/* only listing the rules is unprivileged */
+	switch (type) {
+	case AUDIT_LIST:
+		/* Plain iterators suffice: audit_netlink_sem is held. */
+		list_for_each_entry(entry, &audit_tsklist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		list_for_each_entry(entry, &audit_entlist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		list_for_each_entry(entry, &audit_extlist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+		break;
+	case AUDIT_ADD:
+		if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+			return -ENOMEM;
+		/* One list_head per entry: a rule must name exactly one list. */
+		flags  = ((struct audit_rule *)data)->flags;
+		nlists = !!(flags & AUDIT_PER_TASK) + !!(flags & AUDIT_AT_ENTRY)
+			 + !!(flags & AUDIT_AT_EXIT);
+		if (nlists != 1 || audit_copy_rule(&entry->rule, data)) {
+			kfree(entry);
+			return -EINVAL;
+		}
+		err = audit_add_rule(entry,
+				     (flags & AUDIT_PER_TASK) ? &audit_tsklist :
+				     (flags & AUDIT_AT_ENTRY) ? &audit_entlist :
+				     &audit_extlist);
+		break;
+	case AUDIT_DEL:
+		flags =((struct audit_rule *)data)->flags;
+		if (!err && (flags & AUDIT_PER_TASK))
+			err = audit_del_rule(data, &audit_tsklist);
+		if (!err && (flags & AUDIT_AT_ENTRY))
+			err = audit_del_rule(data, &audit_entlist);
+		if (!err && (flags & AUDIT_AT_EXIT))
+			err = audit_del_rule(data, &audit_extlist);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return err;
+}
+#endif
+
+/* Test every field of rule against tsk (and ctx, when it is not NULL).
+ * Return 1 and set *state from rule->action on a match, 0 otherwise. */
+static int audit_filter_rules(struct task_struct *tsk,
+			      struct audit_rule *rule,
+			      struct audit_context *ctx,
+			      enum audit_state *state)
+{
+	int i, j;
+
+	for (i = 0; i < rule->field_count; i++) {
+		u32 field  = rule->fields[i] & ~AUDIT_NEGATE;
+		u32 value  = rule->values[i];
+		int result = 0;
+
+		switch (field) {
+		case AUDIT_PID:
+			result = (tsk->pid == value);
+			break;
+		case AUDIT_UID:
+			result = (tsk->uid == value);
+			break;
+		case AUDIT_EUID:
+			result = (tsk->euid == value);
+			break;
+		case AUDIT_SUID:
+			result = (tsk->suid == value);
+			break;
+		case AUDIT_FSUID:
+			result = (tsk->fsuid == value);
+			break;
+		case AUDIT_GID:
+			result = (tsk->gid == value);
+			break;
+		case AUDIT_EGID:
+			result = (tsk->egid == value);
+			break;
+		case AUDIT_SGID:
+			result = (tsk->sgid == value);
+			break;
+		case AUDIT_FSGID:
+			result = (tsk->fsgid == value);
+			break;
+		case AUDIT_PERS:
+			result = (tsk->personality == value);
+			break;
+
+		case AUDIT_EXIT:
+			if (ctx && ctx->return_valid)
+				result = (ctx->return_code == value);
+			break;
+		case AUDIT_SUCCESS:
+			if (ctx && ctx->return_valid)
+				result = (ctx->return_code >= 0);
+			break;
+		case AUDIT_DEVMAJOR:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (MAJOR(ctx->names[j].rdev)==value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_DEVMINOR:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (MINOR(ctx->names[j].rdev)==value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_INODE:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (ctx->names[j].ino == value) { /* whole inode number */
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_LOGINUID:
+			result = 0;
+			if (ctx)
+				result = (ctx->loginuid == value);
+			break;
+		case AUDIT_ARG0:
+		case AUDIT_ARG1:
+		case AUDIT_ARG2:
+		case AUDIT_ARG3:
+			if (ctx)
+				result = (ctx->argv[field-AUDIT_ARG0]==value);
+			break;
+		}
+
+		if (rule->fields[i] & AUDIT_NEGATE)
+			result = !result;
+		if (!result)
+			return 0;
+	}
+	switch (rule->action) {
+	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
+	case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT;  break;
+	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	}
+	return 1;
+}
+
+/* At process creation time, run the per-task rules to decide whether
+ * system-call auditing is completely disabled for this task.  Only the
+ * task structure is available here; no audit_context is passed along.
+ */
+static enum audit_state audit_filter_task(struct task_struct *tsk)
+{
+	struct audit_entry *rule;
+	/* Result when no per-task rule matches. */
+	enum audit_state   verdict = AUDIT_BUILD_CONTEXT;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rule, &audit_tsklist, list) {
+		/* The first matching rule decides; it fills in verdict. */
+		if (audit_filter_rules(tsk, &rule->rule, NULL, &verdict))
+			break;
+	}
+	rcu_read_unlock();
+	return verdict;
+}
+
+/* At syscall entry and exit time, this filter is called if the
+ * audit_state is not low enough that auditing cannot take place, but is
+ * also not high enough that we already know we have to write an audit
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ */
+static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+					     struct audit_context *ctx,
+					     struct list_head *list)
+{
+	struct audit_entry *rule;
+	enum audit_state   verdict = AUDIT_BUILD_CONTEXT;
+	int		   word = AUDIT_WORD(ctx->major);
+	int		   bit  = AUDIT_BIT(ctx->major);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(rule, list, list) {
+		/* Skip rules whose mask does not select this syscall. */
+		if ((rule->rule.mask[word] & bit) != bit)
+			continue;
+		if (audit_filter_rules(tsk, &rule->rule, ctx, &verdict))
+			break;
+	}
+	rcu_read_unlock();
+	return verdict;
+}
+
+/* Detach and return tsk's audit context (call with task_lock() held). */
+static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+						      int return_valid,
+						      int return_code)
+{
+	struct audit_context *context = tsk->audit_context;
+
+	if (likely(!context))
+		return NULL;
+	context->return_valid = return_valid;
+	context->return_code  = return_code;
+	/* Let the exit-time rules mark a not-yet-auditable syscall. */
+	if (context->in_syscall && !context->auditable) {
+		enum audit_state state;
+		state = audit_filter_syscall(tsk, context, &audit_extlist);
+		if (state == AUDIT_RECORD_CONTEXT)
+			context->auditable = 1;
+	}
+	/* Save the things to print about the task_struct. */
+	context->pid = tsk->pid;
+	context->uid = tsk->uid;
+	context->gid = tsk->gid;
+	context->euid = tsk->euid;
+	context->suid = tsk->suid;
+	context->fsuid = tsk->fsuid;
+	context->egid = tsk->egid;
+	context->sgid = tsk->sgid;
+	context->fsgid = tsk->fsgid;
+	context->personality = tsk->personality;
+	tsk->audit_context = NULL;	/* the task no longer points at it */
+	return context;
+}
+
+static inline void audit_free_names(struct audit_context *context)
+{
+	int i;
+
+#if AUDIT_DEBUG == 2
+	if (context->auditable
+	    ||context->put_count + context->ino_count != context->name_count) {
+		printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
+		       " name_count=%d put_count=%d"
+		       " ino_count=%d [NOT freeing]\n",
+		       __LINE__,
+		       context->serial, context->major, context->in_syscall,
+		       context->name_count, context->put_count,
+		       context->ino_count);
+		for (i = 0; i < context->name_count; i++)
+			printk(KERN_ERR "names[%d] = %p = %s\n", i,
+			       context->names[i].name,
+			       context->names[i].name);
+		dump_stack();
+		return;		/* verbose debug build: names are not freed */
+	}
+#endif
+#if AUDIT_DEBUG
+	context->put_count  = 0;
+	context->ino_count  = 0;
+#endif
+
+	for (i = 0; i < context->name_count; i++)
+		if (context->names[i].name)	/* NULL for nameless inodes */
+			__putname(context->names[i].name);
+	context->name_count = 0;	/* every slot is free again */
+}
+
+static inline void audit_zero_context(struct audit_context *context,
+				      enum audit_state state)
+{
+	const uid_t kept_uid = context->loginuid;	/* survives the wipe */
+
+	memset(context, 0, sizeof(*context));
+	context->loginuid   = kept_uid;
+	context->state      = state;
+}
+
+static inline struct audit_context *audit_alloc_context(enum audit_state state)
+{
+	struct audit_context *context = kmalloc(sizeof(*context), GFP_KERNEL);
+	if (!context)
+		return NULL;
+	context->loginuid = -1;	/* defined value for zero_context to keep */
+	audit_zero_context(context, state);
+	return context;
+}
+
+/* Run the per-task filter and, unless it disables auditing, attach a
+ * freshly allocated audit context to tsk.  Doing so turns on system
+ * call auditing for the task.  This is called from copy_process, so no
+ * lock is needed.  Returns 0, or -ENOMEM if the allocation fails. */
+int audit_alloc(struct task_struct *tsk)
+{
+	struct audit_context *ctx;
+	enum audit_state     state;
+
+	if (likely(!audit_enabled))
+		return 0; /* Return if not auditing. */
+
+	state = audit_filter_task(tsk);
+	if (likely(state == AUDIT_DISABLED))
+		return 0;
+
+	ctx = audit_alloc_context(state);
+	if (!ctx) {
+		audit_log_lost("out of memory in audit_alloc");
+		return -ENOMEM;
+	}
+
+				/* Preserve login uid */
+	ctx->loginuid = tsk->audit_context
+			? tsk->audit_context->loginuid : -1;
+
+	tsk->audit_context  = ctx;
+	set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+	return 0;
+}
+
+static inline void audit_free_context(struct audit_context *context)
+{
+	struct audit_context *previous;
+	int		     count = 0;	/* messages printed about the chain */
+
+	do {
+		previous = context->previous;	/* read before the kfree below */
+		if (previous || (count &&  count < 10)) {
+			++count;
+			printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
+			       " freeing multiple contexts (%d)\n",
+			       context->serial, context->major,
+			       context->name_count, count);
+		}
+		audit_free_names(context);
+		kfree(context);
+		context  = previous;
+	} while (context);
+	if (count >= 10)
+		printk(KERN_ERR "audit: freed %d contexts\n", count);
+}
+
+static void audit_log_exit(struct audit_context *context)
+{
+	int i;
+	struct audit_buffer *ab;
+	/* One record for the syscall, then one per saved name/inode. */
+	ab = audit_log_start(context);
+	if (!ab)
+		return;		/* audit_panic has been called */
+	audit_log_format(ab, "syscall=%d", context->major);
+	if (context->personality != PER_LINUX)
+		audit_log_format(ab, " per=%lx", context->personality);
+	if (context->return_valid)
+		audit_log_format(ab, " exit=%u", context->return_code);
+	audit_log_format(ab,
+		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
+		  " pid=%d loginuid=%d uid=%d gid=%d"
+		  " euid=%d suid=%d fsuid=%d"
+		  " egid=%d sgid=%d fsgid=%d",
+		  context->argv[0],
+		  context->argv[1],
+		  context->argv[2],
+		  context->argv[3],
+		  context->name_count,
+		  context->pid,
+		  context->loginuid,
+		  context->uid,
+		  context->gid,
+		  context->euid, context->suid, context->fsuid,
+		  context->egid, context->sgid, context->fsgid);
+	audit_log_end(ab);
+	for (i = 0; i < context->name_count; i++) {
+		ab = audit_log_start(context);
+		if (!ab)
+			continue; /* audit_panic has been called */
+		audit_log_format(ab, "item=%d", i);
+		if (context->names[i].name)	/* NULL for nameless inodes */
+			audit_log_format(ab, " name=%s",
+					 context->names[i].name);
+		if (context->names[i].ino != (unsigned long)-1)	/* -1: not set */
+			audit_log_format(ab, " inode=%lu",
+					 context->names[i].ino);
+		/* FIXME: should use format_dev_t, but ab structure is
+		 * opaque. */
+		if (context->names[i].rdev != -1)
+			audit_log_format(ab, " dev=%02x:%02x",
+					 MAJOR(context->names[i].rdev),
+					 MINOR(context->names[i].rdev));
+		audit_log_end(ab);
+	}
+}
+
+/* Free a per-task audit context, logging a pending auditable syscall
+ * first.  Called from copy_process and __put_task_struct. */
+void audit_free(struct task_struct *tsk)
+{
+	struct audit_context *context;
+
+	task_lock(tsk);
+	context = audit_get_context(tsk, 0, 0);	/* no valid return code */
+	task_unlock(tsk);
+
+	if (likely(!context))
+		return;
+
+	/* Check for system calls that do not go through the exit
+	 * function (e.g., exit_group), then free context block. */
+	if (context->in_syscall && context->auditable)
+		audit_log_exit(context);
+
+	audit_free_context(context);
+}
+
+/* Compute a serial number for the audit record.  Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces.  The timestamp of the
+ * record and this serial number are used by the user-space daemon to
+ * determine which pieces belong to the same audit record.  The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * Atomic values are only guaranteed to be 24-bit, so we count down.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit.  However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts). */
+static inline unsigned int audit_serial(void)
+{
+	static atomic_t serial = ATOMIC_INIT(0xffffff);
+	unsigned int a, b;	/* counter before and after the decrement */
+
+	do {
+		a = atomic_read(&serial);
+		if (atomic_dec_and_test(&serial))
+			atomic_set(&serial, 0xffffff);	/* hit zero: start over */
+		b = atomic_read(&serial);
+	} while (b != a - 1);	/* also loops once more right after a reset */
+
+	return 0xffffff - b;	/* counter runs down, result runs up */
+}
+
+/* Fill in audit context at syscall entry.  This only happens if the
+ * audit context was created when the task was created and the state or
+ * filters demand the audit context be built.  If the state from the
+ * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * then the record will be written at syscall exit time (otherwise, it
+ * will only be written if another part of the kernel requests that it
+ * be written). */
+void audit_syscall_entry(struct task_struct *tsk, int major,
+			 unsigned long a1, unsigned long a2,
+			 unsigned long a3, unsigned long a4)
+{
+	struct audit_context *context = tsk->audit_context;
+	enum audit_state     state;
+
+	BUG_ON(!context);
+
+	/* This happens only on certain architectures that make system
+	 * calls in kernel_thread via the entry.S interface, instead of
+	 * with direct calls.  (If you are porting to a new
+	 * architecture, hitting this condition can indicate that you
+	 * got the _exit/_leave calls backward in entry.S.)
+	 *
+	 * i386     no
+	 * x86_64   no
+	 * ppc64    yes (see arch/ppc64/kernel/misc.S)
+	 *
+	 * This also happens with vm86 emulation in a non-nested manner
+	 * (entries without exits), so this case must be caught.
+	 */
+	if (context->in_syscall) {
+		struct audit_context *newctx;
+
+#if defined(__NR_vm86) && defined(__NR_vm86old)
+		/* vm86 mode should only be entered once */
+		if (major == __NR_vm86 || major == __NR_vm86old)
+			return;
+#endif
+#if AUDIT_DEBUG
+		printk(KERN_ERR
+		       "audit(:%d) pid=%d in syscall=%d;"
+		       " entering syscall=%d\n",
+		       context->serial, tsk->pid, context->major, major);
+#endif
+		newctx = audit_alloc_context(context->state);
+		if (newctx) {
+			newctx->previous   = context;	/* chain the outer syscall */
+			context		   = newctx;
+			tsk->audit_context = newctx;
+		} else	{
+			/* If we can't alloc a new context, the best we
+			 * can do is to leak memory (any pending putname
+			 * will be lost).  The only other alternative is
+			 * to abandon auditing. */
+			audit_zero_context(context, context->state);
+		}
+	}
+	BUG_ON(context->in_syscall || context->name_count);
+
+	if (!audit_enabled)
+		return;
+
+	context->major      = major;
+	context->argv[0]    = a1;
+	context->argv[1]    = a2;
+	context->argv[2]    = a3;
+	context->argv[3]    = a4;
+	/* Only the two undecided states consult the entry rules. */
+	state = context->state;
+	if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
+		state = audit_filter_syscall(tsk, context, &audit_entlist);
+	if (likely(state == AUDIT_DISABLED))
+		return;
+	/* From here on the context is marked as being inside a syscall. */
+	context->serial     = audit_serial();
+	context->ctime      = CURRENT_TIME;
+	context->in_syscall = 1;
+	context->auditable  = !!(state == AUDIT_RECORD_CONTEXT);
+}
+
+/* Tear down after system call.  If the audit context has been marked as
+ * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * filtering, or because some other part of the kernel wrote an audit
+ * message), then write out the syscall information.  In all cases,
+ * free the names stored from getname(). */
+void audit_syscall_exit(struct task_struct *tsk, int return_code)
+{
+	struct audit_context *context;
+
+	get_task_struct(tsk);
+	task_lock(tsk);
+	context = audit_get_context(tsk, 1, return_code);
+	task_unlock(tsk);
+
+	/* Not having a context here is ok, since the parent may have
+	 * called __put_task_struct. */
+	if (likely(!context))
+		goto out;	/* still drop the reference taken above */
+
+	if (context->in_syscall && context->auditable)
+		audit_log_exit(context);
+	context->in_syscall = 0;
+	context->auditable  = 0;
+	if (context->previous) {
+		struct audit_context *new_context = context->previous;
+		context->previous  = NULL;
+		audit_free_context(context);
+		tsk->audit_context = new_context;
+	} else {
+		audit_free_names(context);
+		audit_zero_context(context, context->state);
+		tsk->audit_context = context;
+	}
+out:
+	put_task_struct(tsk);
+}
+
+/* Add a name to the list.  Called from fs/namei.c:getname(). */
+void audit_getname(const char *name)
+{
+	struct audit_context *context = current->audit_context;
+
+	BUG_ON(!context);
+	if (!context->in_syscall) {	/* not tracking a syscall: ignore */
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+		       __FILE__, __LINE__, context->serial, name);
+		dump_stack();
+#endif
+		return;
+	}
+	BUG_ON(context->name_count >= AUDIT_NAMES);	/* names[] is full */
+	context->names[context->name_count].name = name;
+	context->names[context->name_count].ino  = (unsigned long)-1;
+	context->names[context->name_count].rdev = -1;	/* see audit_inode */
+	++context->name_count;
+}
+
+/* Intercept a putname request.  Called from
+ * include/linux/fs.h:putname().  If we have stored the name from
+ * getname in the audit context, then we delay the putname until syscall
+ * exit. */
+void audit_putname(const char *name)
+{
+	struct audit_context *context = current->audit_context;
+
+	BUG_ON(!context);
+	if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+		       __FILE__, __LINE__, context->serial, name);
+		if (context->name_count) {
+			int i;
+			for (i = 0; i < context->name_count; i++)
+				printk(KERN_ERR "name[%d] = %p = %s\n", i,
+				       context->names[i].name,
+				       context->names[i].name);
+		}
+#endif
+		__putname(name);	/* outside a tracked syscall: free now */
+	}
+#if AUDIT_DEBUG
+	else {
+		++context->put_count;
+		if (context->put_count > context->name_count) {	/* more puts than names */
+			printk(KERN_ERR "%s:%d(:%d): major=%d"
+			       " in_syscall=%d putname(%p) name_count=%d"
+			       " put_count=%d\n",
+			       __FILE__, __LINE__,
+			       context->serial, context->major,
+			       context->in_syscall, name, context->name_count,
+			       context->put_count);
+			dump_stack();
+		}
+	}
+#endif
+}
+
+/* Store the inode and device from a lookup.  Called from
+ * fs/namei.c:path_lookup(). */
+void audit_inode(const char *name, unsigned long ino, dev_t rdev)
+{
+	int idx;
+	struct audit_context *context = current->audit_context;
+
+	if (!context->in_syscall)	/* NOTE(review): context is not NULL-checked */
+		return;
+	if (context->name_count
+	    && context->names[context->name_count-1].name
+	    && context->names[context->name_count-1].name == name)
+		idx = context->name_count - 1;	/* most recent getname */
+	else if (context->name_count > 1
+		 && context->names[context->name_count-2].name
+		 && context->names[context->name_count-2].name == name)
+		idx = context->name_count - 2;	/* the one before that */
+	else {
+		/* FIXME: how much do we care about inodes that have no
+		 * associated name? */
+		if (context->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED)
+			return;
+		idx = context->name_count++;
+		context->names[idx].name = NULL;	/* nameless inode slot */
+#if AUDIT_DEBUG
+		++context->ino_count;
+#endif
+	}
+	context->names[idx].ino  = ino;
+	context->names[idx].rdev = rdev;
+}
+
+void audit_get_stamp(struct audit_context *ctx,
+		     struct timespec *t, int *serial)
+{
+	if (!ctx) {
+		*serial = 0;
+		*t      = CURRENT_TIME;
+		return;
+	}
+	ctx->auditable = 1;	/* asking for the stamp marks it auditable */
+	*serial    = ctx->serial;
+	t->tv_sec  = ctx->ctime.tv_sec;
+	t->tv_nsec = ctx->ctime.tv_nsec;
+}
+
+int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid)
+{
+	if (ctx) {	/* without a context this is a successful no-op */
+		if (loginuid < 0)	/* NOTE(review): dead if uid_t is unsigned - confirm */
+			return -EINVAL;
+		ctx->loginuid = loginuid;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(audit_alloc);
+EXPORT_SYMBOL_GPL(audit_free);
+EXPORT_SYMBOL_GPL(audit_syscall_entry);
+EXPORT_SYMBOL_GPL(audit_syscall_exit);
+EXPORT_SYMBOL_GPL(audit_getname);
+EXPORT_SYMBOL_GPL(audit_putname);
+EXPORT_SYMBOL_GPL(audit_inode);
diff --git a/kernel/fork.c b/kernel/fork.c
index fc25a3a15d0e..6035db6957f8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
 #include <linux/futex.h>
 #include <linux/ptrace.h>
 #include <linux/mount.h>
+#include <linux/audit.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -83,6 +84,8 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	if (unlikely(tsk->audit_context))
+		audit_free(tsk);
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
@@ -949,13 +952,16 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	p->start_time = get_jiffies_64();
 	p->security = NULL;
 	p->io_context = NULL;
+	p->audit_context = NULL;
 
 	retval = -ENOMEM;
 	if ((retval = security_task_alloc(p)))
 		goto bad_fork_cleanup;
+	if ((retval = audit_alloc(p)))
+		goto bad_fork_cleanup_security;
 	/* copy all the process information */
 	if ((retval = copy_semundo(clone_flags, p)))
-		goto bad_fork_cleanup_security;
+		goto bad_fork_cleanup_audit;
 	if ((retval = copy_files(clone_flags, p)))
 		goto bad_fork_cleanup_semundo;
 	if ((retval = copy_fs(clone_flags, p)))
@@ -1090,6 +1096,8 @@ bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
 bad_fork_cleanup_semundo:
 	exit_sem(p);
+bad_fork_cleanup_audit:
+	audit_free(p);
 bad_fork_cleanup_security:
 	security_task_free(p);
 bad_fork_cleanup:
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index dad7ae38cc91..2e431763dc73 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -22,11 +22,14 @@
 #include <linux/un.h>
 #include <net/af_unix.h>
 #include <linux/ip.h>
+#include <linux/audit.h>
 #include <linux/ipv6.h>
 #include <net/ipv6.h>
 #include "avc.h"
 #include "avc_ss.h"
+#ifdef CONFIG_AUDIT
 #include "class_to_string.h"
+#endif
 #include "common_perm_to_string.h"
 #include "av_inherit.h"
 #include "av_perm_to_string.h"
@@ -68,14 +71,10 @@ struct avc_callback_node {
 };
 
 static spinlock_t avc_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t avc_log_lock = SPIN_LOCK_UNLOCKED;
 static struct avc_node *avc_node_freelist = NULL;
 static struct avc_cache avc_cache;
-static char *avc_audit_buffer = NULL;
 static unsigned avc_cache_stats[AVC_NSTATS];
 static struct avc_callback_node *avc_callbacks = NULL;
-static unsigned int avc_log_level = 4; /* default:  KERN_WARNING */
-static char avc_level_string[4] = "< >";
 
 static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
 {
@@ -87,14 +86,14 @@ static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
  * @tclass: target security class
  * @av: access vector
  */
-void avc_dump_av(u16 tclass, u32 av)
+void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av)
 {
 	char **common_pts = 0;
 	u32 common_base = 0;
 	int i, i2, perm;
 
 	if (av == 0) {
-		printk(" null");
+		audit_log_format(ab, " null");
 		return;
 	}
 
@@ -106,12 +105,12 @@ void avc_dump_av(u16 tclass, u32 av)
 		}
 	}
 
-	printk(" {");
+	audit_log_format(ab, " {");
 	i = 0;
 	perm = 1;
 	while (perm < common_base) {
 		if (perm & av)
-			printk(" %s", common_pts[i]);
+			audit_log_format(ab, " %s", common_pts[i]);
 		i++;
 		perm <<= 1;
 	}
@@ -124,13 +123,14 @@ void avc_dump_av(u16 tclass, u32 av)
 					break;
 			}
 			if (i2 < ARRAY_SIZE(av_perm_to_string))
-				printk(" %s", av_perm_to_string[i2].name);
+				audit_log_format(ab, " %s",
+						 av_perm_to_string[i2].name);
 		}
 		i++;
 		perm <<= 1;
 	}
 
-	printk(" }");
+	audit_log_format(ab, " }");
 }
 
 /**
@@ -139,7 +139,7 @@ void avc_dump_av(u16 tclass, u32 av)
  * @tsid: target security identifier
  * @tclass: target security class
  */
-void avc_dump_query(u32 ssid, u32 tsid, u16 tclass)
+void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tclass)
 {
 	int rc;
 	char *scontext;
@@ -147,20 +147,20 @@ void avc_dump_query(u32 ssid, u32 tsid, u16 tclass)
 
  	rc = security_sid_to_context(ssid, &scontext, &scontext_len);
 	if (rc)
-		printk("ssid=%d", ssid);
+		audit_log_format(ab, "ssid=%d", ssid);
 	else {
-		printk("scontext=%s", scontext);
+		audit_log_format(ab, "scontext=%s", scontext);
 		kfree(scontext);
 	}
 
 	rc = security_sid_to_context(tsid, &scontext, &scontext_len);
 	if (rc)
-		printk(" tsid=%d", tsid);
+		audit_log_format(ab, " tsid=%d", tsid);
 	else {
-		printk(" tcontext=%s", scontext);
+		audit_log_format(ab, " tcontext=%s", scontext);
 		kfree(scontext);
 	}
-	printk(" tclass=%s", class_to_string[tclass]);
+	audit_log_format(ab, " tclass=%s", class_to_string[tclass]);
 }
 
 /**
@@ -194,11 +194,7 @@ void __init avc_init(void)
 		avc_node_freelist = new;
 	}
 
-	avc_audit_buffer = (char *)__get_free_page(GFP_ATOMIC);
-	if (!avc_audit_buffer)
-		panic("AVC:  unable to allocate audit buffer\n");
-
-	avc_level_string[1] = '0' + avc_log_level;
+	audit_log(current->audit_context, "AVC INITIALIZED\n");
 }
 
 #if 0
@@ -430,12 +426,13 @@ static inline void avc_print_ipv6_addr(struct in6_addr *addr, u16 port,
 		printk(" %s=%d", name2, ntohs(port));
 }
 
-static inline void avc_print_ipv4_addr(u32 addr, u16 port, char *name1, char *name2)
+static inline void avc_print_ipv4_addr(struct audit_buffer *ab, u32 addr,
+				       u16 port, char *name1, char *name2)
 {
 	if (addr)
-		printk(" %s=%d.%d.%d.%d", name1, NIPQUAD(addr));
+		audit_log_format(ab, " %s=%d.%d.%d.%d", name1, NIPQUAD(addr));
 	if (port)
-		printk(" %s=%d", name2, ntohs(port));
+		audit_log_format(ab, " %s=%d", name2, ntohs(port));
 }
 
 /*
@@ -515,9 +512,8 @@ void avc_audit(u32 ssid, u32 tsid,
 {
 	struct task_struct *tsk = current;
 	struct inode *inode = NULL;
-	char *p;
 	u32 denied, audited;
-	unsigned long flags;
+	struct audit_buffer *ab;
 
 	denied = requested & ~avd->allowed;
 	if (denied) {
@@ -535,19 +531,18 @@ void avc_audit(u32 ssid, u32 tsid,
 	if (!check_avc_ratelimit())
 		return;
 
-	/* prevent overlapping printks */
-	spin_lock_irqsave(&avc_log_lock,flags);
-
-	printk("%s\n", avc_level_string);
-	printk("%savc:  %s ", avc_level_string, denied ? "denied" : "granted");
-	avc_dump_av(tclass,audited);
-	printk(" for ");
+	ab = audit_log_start(current->audit_context);
+	if (!ab)
+		return;		/* audit_panic has been called */
+	audit_log_format(ab, "avc:  %s ", denied ? "denied" : "granted");
+	avc_dump_av(ab, tclass,audited);
+	audit_log_format(ab, " for ");
 	if (a && a->tsk)
 		tsk = a->tsk;
 	if (tsk && tsk->pid) {
 		struct mm_struct *mm;
 		struct vm_area_struct *vma;
-		printk(" pid=%d", tsk->pid);
+		audit_log_format(ab, " pid=%d", tsk->pid);
 		if (tsk == current)
 			mm = current->mm;
 		else
@@ -558,11 +553,9 @@ void avc_audit(u32 ssid, u32 tsid,
 				while (vma) {
 					if ((vma->vm_flags & VM_EXECUTABLE) &&
 					    vma->vm_file) {
-						p = d_path(vma->vm_file->f_dentry,
-							   vma->vm_file->f_vfsmnt,
-							   avc_audit_buffer,
-							   PAGE_SIZE);
-						printk(" exe=%s", p);
+						audit_log_d_path(ab, "exe=",
+							vma->vm_file->f_dentry,
+							vma->vm_file->f_vfsmnt);
 						break;
 					}
 					vma = vma->vm_next;
@@ -572,29 +565,26 @@ void avc_audit(u32 ssid, u32 tsid,
 			if (tsk != current)
 				mmput(mm);
 		} else {
-			printk(" comm=%s", tsk->comm);
+			audit_log_format(ab, " comm=%s", tsk->comm);
 		}
 	}
 	if (a) {
 		switch (a->type) {
 		case AVC_AUDIT_DATA_IPC:
-			printk(" key=%d", a->u.ipc_id);
+			audit_log_format(ab, " key=%d", a->u.ipc_id);
 			break;
 		case AVC_AUDIT_DATA_CAP:
-			printk(" capability=%d", a->u.cap);
+			audit_log_format(ab, " capability=%d", a->u.cap);
 			break;
 		case AVC_AUDIT_DATA_FS:
 			if (a->u.fs.dentry) {
 				struct dentry *dentry = a->u.fs.dentry;
 				if (a->u.fs.mnt) {
-					p = d_path(dentry,
-						   a->u.fs.mnt,
-						   avc_audit_buffer,
-						   PAGE_SIZE);
-					if (p)
-						printk(" path=%s", p);
+					audit_log_d_path(ab, "path=", dentry,
+							a->u.fs.mnt);
 				} else {
-					printk(" name=%s", dentry->d_name.name);
+					audit_log_format(ab, " name=%s",
+							 dentry->d_name.name);
 				}
 				inode = dentry->d_inode;
 			} else if (a->u.fs.inode) {
@@ -602,29 +592,33 @@ void avc_audit(u32 ssid, u32 tsid,
 				inode = a->u.fs.inode;
 				dentry = d_find_alias(inode);
 				if (dentry) {
-					printk(" name=%s", dentry->d_name.name);
+					audit_log_format(ab, " name=%s",
+							 dentry->d_name.name);
 					dput(dentry);
 				}
 			}
 			if (inode)
-				printk(" dev=%s ino=%ld",
-				       inode->i_sb->s_id, inode->i_ino);
+				audit_log_format(ab, " dev=%s ino=%ld",
+						 inode->i_sb->s_id,
+						 inode->i_ino);
 			break;
 		case AVC_AUDIT_DATA_NET:
 			if (a->u.net.sk) {
 				struct sock *sk = a->u.net.sk;
 				struct unix_sock *u;
+				int len = 0;
+				char *p = NULL;
 
 				switch (sk->sk_family) {
 				case AF_INET: {
 					struct inet_opt *inet = inet_sk(sk);
 
-					avc_print_ipv4_addr(inet->rcv_saddr,
-					                    inet->sport,
-					                    "laddr", "lport");
-					avc_print_ipv4_addr(inet->daddr,
-					                    inet->dport,
-					                    "faddr", "fport");
+					avc_print_ipv4_addr(ab, inet->rcv_saddr,
+							    inet->sport,
+							    "laddr", "lport");
+					avc_print_ipv4_addr(ab, inet->daddr,
+							    inet->dport,
+							    "faddr", "fport");
 					break;
 				}
 				case AF_INET6: {
@@ -642,34 +636,32 @@ void avc_audit(u32 ssid, u32 tsid,
 				case AF_UNIX:
 					u = unix_sk(sk);
 					if (u->dentry) {
-						p = d_path(u->dentry,
-							   u->mnt,
-							   avc_audit_buffer,
-							   PAGE_SIZE);
-						printk(" path=%s", p);
-					} else if (u->addr) {
-						p = avc_audit_buffer;
-						memcpy(p,
-						       u->addr->name->sun_path,
-						       u->addr->len-sizeof(short));
-						if (*p == 0) {
-							*p = '@';
-							p += u->addr->len-sizeof(short);
-							*p = 0;
-						}
-						printk(" path=%s",
-						       avc_audit_buffer);
+						audit_log_d_path(ab, "path=",
+							u->dentry, u->mnt);
+						break;
 					}
+					if (!u->addr)
+						break;
+					len = u->addr->len-sizeof(short);
+					p = &u->addr->name->sun_path[0];
+					if (*p)
+						audit_log_format(ab,
+							"path=%*.*s", len,
+							len, p);
+					else
+						audit_log_format(ab,
+							"path=@%*.*s", len-1,
+							len-1, p+1);
 					break;
 				}
 			}
 			
 			switch (a->u.net.family) {
 			case AF_INET:
-				avc_print_ipv4_addr(a->u.net.v4info.saddr,
+				avc_print_ipv4_addr(ab, a->u.net.v4info.saddr,
 						    a->u.net.sport,
 						    "saddr", "src");
-				avc_print_ipv4_addr(a->u.net.v4info.daddr,
+				avc_print_ipv4_addr(ab, a->u.net.v4info.daddr,
 						    a->u.net.dport,
 						    "daddr", "dest");
 				break;
@@ -683,15 +675,14 @@ void avc_audit(u32 ssid, u32 tsid,
 				break;
 			}
 			if (a->u.net.netif)
-				printk(" netif=%s", a->u.net.netif);
+				audit_log_format(ab, " netif=%s",
+					a->u.net.netif);
 			break;
 		}
 	}
-	printk(" ");
-	avc_dump_query(ssid, tsid, tclass);
-	printk("\n");
-
-	spin_unlock_irqrestore(&avc_log_lock,flags);
+	audit_log_format(ab, " ");
+	avc_dump_query(ab, ssid, tsid, tclass);
+	audit_log_end(ab);
 }
 
 /**
@@ -1120,14 +1111,3 @@ int avc_has_perm(u32 ssid, u32 tsid, u16 tclass,
 	avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata);
 	return rc;
 }
-
-static int __init avc_log_level_setup(char *str)
-{
-	avc_log_level = simple_strtol(str, NULL, 0);
-	if (avc_log_level > 7)
-		avc_log_level = 7;
-	return 1;
-}
-
-__setup("avc_log_level=", avc_log_level_setup);
-
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index c143db4ca685..86bdeef585a4 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -127,9 +127,10 @@ static inline void avc_cache_stats_add(int type, unsigned val)
 /*
  * AVC display support
  */
-void avc_dump_av(u16 tclass, u32 av);
-void avc_dump_query(u32 ssid, u32 tsid, u16 tclass);
-void avc_dump_cache(char *tag);
+struct audit_buffer;
+void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av);
+void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tclass);
+void avc_dump_cache(struct audit_buffer *ab, char *tag);
 
 /*
  * AVC operations
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index d8c18ed0087b..f2a53e22b060 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -399,7 +399,7 @@ int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len)
 			char *scontextp;
 
 			*scontext_len = strlen(initial_sid_to_string[sid]) + 1;
-			scontextp = kmalloc(*scontext_len,GFP_KERNEL);
+			scontextp = kmalloc(*scontext_len,GFP_ATOMIC);
 			strcpy(scontextp, initial_sid_to_string[sid]);
 			*scontext = scontextp;
 			goto out;
-- 
cgit v1.2.3


From 0e568881178ff0e0aceeafdb51f9fecab39e1923 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:33:21 -0700
Subject: [PATCH] fix posix-timers to have proper per-process scope

From: Roland McGrath <roland@redhat.com>

The posix-timers implementation associates timers with the creating thread
and destroys timers when their creator thread dies.  POSIX clearly
specifies that these timers are per-process, and a timer should not be torn
down when the thread that created it exits.  I hope there won't be any
controversy on what the correct semantics are here, since POSIX is clear
and the Linux feature is called "posix-timers".

The attached program built with NPTL -lrt -lpthread demonstrates the bug.
The program is correct by POSIX, but fails on Linux.  Note that until
just the other day, NPTL had a trivial bug that always disabled its use of
kernel timer syscalls (check strace for lack of timer_create/SYS_259).  So
unless you have built your own NPTL libs very recently, you probably won't
see the kernel calls actually used by this program.

Also attached is my patch to fix this.  It (you guessed it) moves the
posix_timers field from task_struct to signal_struct.  Access is now
governed by the siglock instead of the task lock.  exit_itimers is called
from __exit_signal, i.e.  only on the death of the last thread in the
group, rather than from do_exit for every thread.  Timers' it_process
fields store the group leader's pointer, which won't die.  For the case of
SIGEV_THREAD_ID, I hold a ref on the task_struct for it_process to stay
robust in case the target thread dies; the ref is released and the dangling
pointer cleared when the timer fires and the target thread is dead.  (This
should only come up in a buggy user program, so no one cares exactly how the
kernel handles that case.  But I think what I did is robust and sensible.)

/* Test for bogus per-thread deletion of timers.  */

#include <stdio.h>
#include <error.h>
#include <time.h>
#include <signal.h>
#include <stdint.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#include <pthread.h>

/* Creating timers in another thread should work too.
 *
 * Thread start routine.  ARG points to a struct sigevent whose
 * sigev_value.sival_ptr holds the address of the timer_t to fill in.
 * Returns that timer_t pointer on success, or NULL (after calling
 * perror) if timer_create fails.  */
static void *do_timer_create(void *arg)
{
	struct sigevent *const sigev = arg;
	/* The output slot is passed in through the sigevent itself.  */
	timer_t *const timerId = sigev->sigev_value.sival_ptr;
	if (timer_create(CLOCK_REALTIME, sigev, timerId) < 0) {
		perror("timer_create");
		return NULL;
	}
	return timerId;
}

/* Runs 100 iterations of: create a timer in a helper thread, join that
 * thread, then arm and delete the timer from the main thread.  Errors
 * are reported with error()/perror() but never abort the loop; the
 * program always returns 0.  */
int main(void)
{
	int i, res;
	timer_t timerId;
	struct itimerspec itval;
	struct sigevent sigev;

	/* First expiry after 2 seconds, then every 2 seconds.  */
	itval.it_interval.tv_sec = 2;
	itval.it_interval.tv_nsec = 0;
	itval.it_value.tv_sec = 2;
	itval.it_value.tv_nsec = 0;

	/* Signal-based notification; sival_ptr doubles as the pointer that
	 * do_timer_create uses to locate timerId.  */
	sigev.sigev_notify = SIGEV_SIGNAL;
	sigev.sigev_signo = SIGALRM;
	sigev.sigev_value.sival_ptr = (void *)&timerId;

	for (i = 0; i < 100; i++) {
		printf("cnt = %d\n", i);

		pthread_t thr;
		res = pthread_create(&thr, NULL, &do_timer_create, &sigev);
		if (res) {
			error(0, res, "pthread_create");
			continue;
		}
		/* Wait for the creating thread to finish before touching
		 * the timer from this thread.  */
		void *val;
		res = pthread_join(thr, &val);
		if (res) {
			error(0, res, "pthread_join");
			continue;
		}
		/* NULL means timer_create failed in the helper thread.  */
		if (val == NULL)
			continue;

		/* NOTE(review): presumably these are the calls that fail
		 * when the kernel has torn the timer down together with
		 * its creating thread -- confirm against a failing run.  */
		res = timer_settime(timerId, 0, &itval, NULL);
		if (res < 0)
			perror("timer_settime");

		res = timer_delete(timerId);
		if (res < 0)
			perror("timer_delete");
	}

	return 0;
}
---
 fs/exec.c                 |  1 -
 include/linux/init_task.h |  2 +-
 include/linux/sched.h     |  6 ++-
 kernel/exit.c             |  1 -
 kernel/fork.c             |  2 +-
 kernel/posix-timers.c     | 97 ++++++++++++++++++++++++++++++++++-------------
 kernel/signal.c           |  2 +-
 7 files changed, 78 insertions(+), 33 deletions(-)

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 26e3392b6369..5fb9f8f7c38f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -856,7 +856,6 @@ int flush_old_exec(struct linux_binprm * bprm)
 			
 	flush_signal_handlers(current, 0);
 	flush_old_files(current->files);
-	exit_itimers(current);
 
 	return 0;
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5c4843a08917..29189706ea57 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -49,6 +49,7 @@
 	.shared_pending	= { 				\
 		.list = LIST_HEAD_INIT(sig.shared_pending.list),	\
 		.signal =  {{0}}}, \
+	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
 }
 
 #define INIT_SIGHAND(sighand) {	\
@@ -107,7 +108,6 @@ extern struct group_info init_groups;
 		.list = LIST_HEAD_INIT(tsk.pending.list),		\
 		.signal = {{0}}},					\
 	.blocked	= {{0}},					\
-	.posix_timers	 = LIST_HEAD_INIT(tsk.posix_timers),		\
 	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b72c38420d71..17bbedd6bb3d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -270,6 +270,9 @@ struct signal_struct {
 	/* thread group stop support, overloads group_exit_code too */
 	int			group_stop_count;
 
+	/* POSIX.1b Interval Timers */
+	struct list_head posix_timers;
+
 	/* job control IDs */
 	pid_t pgrp;
 	pid_t tty_old_pgrp;
@@ -433,7 +436,6 @@ struct task_struct {
 	unsigned long it_real_value, it_prof_value, it_virt_value;
 	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
 	struct timer_list real_timer;
-	struct list_head posix_timers; /* POSIX.1b Interval Timers */
 	unsigned long utime, stime, cutime, cstime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
 	u64 start_time;
@@ -728,7 +730,7 @@ extern void exit_signal(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
 extern void exit_sighand(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
-extern void exit_itimers(struct task_struct *);
+extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 8157dbc037d6..0ec66729ead8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -776,7 +776,6 @@ asmlinkage NORET_TYPE void do_exit(long code)
 	__exit_files(tsk);
 	__exit_fs(tsk);
 	exit_namespace(tsk);
-	exit_itimers(tsk);
 	exit_thread();
 
 	if (tsk->signal->leader)
diff --git a/kernel/fork.c b/kernel/fork.c
index 6035db6957f8..b4cbfd04847b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -815,6 +815,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	sig->group_stop_count = 0;
 	sig->curr_target = NULL;
 	init_sigpending(&sig->shared_pending);
+	INIT_LIST_HEAD(&sig->posix_timers);
 
 	sig->tty = current->signal->tty;
 	sig->pgrp = process_group(current);
@@ -932,7 +933,6 @@ struct task_struct *copy_process(unsigned long clone_flags,
 
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
-	INIT_LIST_HEAD(&p->posix_timers);
 	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 082693e383cf..3de4d0ae9d26 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -317,12 +317,21 @@ static void timer_notify_task(struct k_itimer *timr)
 	if (timr->it_incr)
 		timr->sigq->info.si_sys_private = ++timr->it_requeue_pending;
 
-	if (timr->it_sigev_notify & SIGEV_THREAD_ID )
+	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+		if (unlikely(timr->it_process->flags & PF_EXITING)) {
+			timr->it_sigev_notify = SIGEV_SIGNAL;
+			put_task_struct(timr->it_process);
+			timr->it_process = timr->it_process->group_leader;
+			goto group;
+		}
 		ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
-	else
+	}
+	else {
+	group:
 		ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
+	}
 	if (ret) {
 		/*
 		 * signal was not sent because of sig_ignor
@@ -352,7 +361,7 @@ static void posix_timer_fn(unsigned long __data)
 
 static inline struct task_struct * good_sigevent(sigevent_t * event)
 {
-	struct task_struct *rtn = current;
+	struct task_struct *rtn = current->group_leader;
 
 	if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
 		(!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
@@ -395,11 +404,15 @@ static struct k_itimer * alloc_posix_timer(void)
 static void release_posix_timer(struct k_itimer *tmr)
 {
 	if (tmr->it_id != -1) {
-		spin_lock_irq(&idr_lock);
+		unsigned long flags;
+		spin_lock_irqsave(&idr_lock, flags);
 		idr_remove(&posix_timers_id, tmr->it_id);
-		spin_unlock_irq(&idr_lock);
+		spin_unlock_irqrestore(&idr_lock, flags);
 	}
 	sigqueue_free(tmr->sigq);
+	if (unlikely(tmr->it_process) &&
+	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+		put_task_struct(tmr->it_process);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
@@ -414,6 +427,7 @@ sys_timer_create(clockid_t which_clock,
 	struct k_itimer *new_timer = NULL;
 	timer_t new_timer_id;
 	struct task_struct *process = 0;
+	unsigned long flags;
 	sigevent_t event;
 
 	if ((unsigned) which_clock >= MAX_CLOCKS ||
@@ -458,7 +472,7 @@ sys_timer_create(clockid_t which_clock,
 			 * We may be setting up this process for another
 			 * thread.  It may be exiting.  To catch this
 			 * case the we check the PF_EXITING flag.  If
-			 * the flag is not set, the task_lock will catch
+			 * the flag is not set, the siglock will catch
 			 * him before it is too late (in exit_itimers).
 			 *
 			 * The exec case is a bit more invloved but easy
@@ -469,13 +483,14 @@ sys_timer_create(clockid_t which_clock,
 			 * for us to die which means we can finish this
 			 * linkage with our last gasp. I.e. no code :)
 			 */
-			task_lock(process);
+			spin_lock_irqsave(&process->sighand->siglock, flags);
 			if (!(process->flags & PF_EXITING)) {
 				list_add(&new_timer->list,
-					 &process->posix_timers);
-				task_unlock(process);
+					 &process->signal->posix_timers);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
+				get_task_struct(process);
 			} else {
-				task_unlock(process);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
 				process = 0;
 			}
 		}
@@ -491,10 +506,10 @@ sys_timer_create(clockid_t which_clock,
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;
 		new_timer->it_sigev_signo = SIGALRM;
 		new_timer->it_sigev_value.sival_int = new_timer->it_id;
-		process = current;
-		task_lock(process);
-		list_add(&new_timer->list, &process->posix_timers);
-		task_unlock(process);
+		process = current->group_leader;
+		spin_lock_irqsave(&process->sighand->siglock, flags);
+		list_add(&new_timer->list, &process->signal->posix_timers);
+		spin_unlock_irqrestore(&process->sighand->siglock, flags);
 	}
 
 	new_timer->it_clock = which_clock;
@@ -925,14 +940,18 @@ retry_delete:
 #else
 	p_timer_del(&posix_clocks[timer->it_clock], timer);
 #endif
-	task_lock(timer->it_process);
+	spin_lock(&current->sighand->siglock);
 	list_del(&timer->list);
-	task_unlock(timer->it_process);
+	spin_unlock(&current->sighand->siglock);
 	/*
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
 	timer->it_process = NULL;
+	}
 	unlock_timer(timer, flags);
 	release_posix_timer(timer);
 	return 0;
@@ -942,24 +961,50 @@ retry_delete:
  */
 static inline void itimer_delete(struct k_itimer *timer)
 {
-	if (sys_timer_delete(timer->it_id))
-		BUG();
+	unsigned long flags;
+
+#ifdef CONFIG_SMP
+	int error;
+retry_delete:
+#endif
+	spin_lock_irqsave(&timer->it_lock, flags);
+
+#ifdef CONFIG_SMP
+	error = p_timer_del(&posix_clocks[timer->it_clock], timer);
+
+	if (error == TIMER_RETRY) {
+		unlock_timer(timer, flags);
+		goto retry_delete;
+	}
+#else
+	p_timer_del(&posix_clocks[timer->it_clock], timer);
+#endif
+	list_del(&timer->list);
+	/*
+	 * This keeps any tasks waiting on the spin lock from thinking
+	 * they got something (see the lock code above).
+	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
+		timer->it_process = NULL;
+	}
+	unlock_timer(timer, flags);
+	release_posix_timer(timer);
 }
+
 /*
- * This is exported to exit and exec
+ * This is called by __exit_signal, only when there are no more
+ * references to the shared signal_struct.
  */
-void exit_itimers(struct task_struct *tsk)
+void exit_itimers(struct signal_struct *sig)
 {
 	struct k_itimer *tmr;
 
-	task_lock(tsk);
-	while (!list_empty(&tsk->posix_timers)) {
-		tmr = list_entry(tsk->posix_timers.next, struct k_itimer, list);
-		task_unlock(tsk);
+	while (!list_empty(&sig->posix_timers)) {
+		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
 		itimer_delete(tmr);
-		task_lock(tsk);
 	}
-	task_unlock(tsk);
 }
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index 7a4b479a6f45..c69671600bef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -352,6 +352,7 @@ void __exit_signal(struct task_struct *tsk)
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 		tsk->signal = NULL;
+		exit_itimers(sig);
 		spin_unlock(&sighand->siglock);
 		flush_sigqueue(&sig->shared_pending);
 		kmem_cache_free(signal_cachep, sig);
@@ -2555,4 +2556,3 @@ void __init signals_init(void)
 	if (!sigqueue_cachep)
 		panic("signals_init(): cannot create sigqueue SLAB cache");
 }
-
-- 
cgit v1.2.3


From c8b976af1af10de3d92968bf7d4bd5415e8a3778 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:41:36 -0700
Subject: [PATCH] hugetlb consolidation

From: William Lee Irwin III <wli@holomorphy.com>

The following patch consolidates redundant code in various hugetlb
implementations.  I took the liberty of renaming a few things, since the
code was all moved anyway, and it has the benefit of helping to catch
missed conversions and/or consolidations.
---
 arch/i386/mm/hugetlbpage.c    | 264 +-----------------------------------------
 arch/ia64/mm/hugetlbpage.c    | 251 +--------------------------------------
 arch/ppc64/mm/hugetlbpage.c   | 258 +----------------------------------------
 arch/sh/mm/hugetlbpage.c      | 258 +----------------------------------------
 arch/sparc64/mm/hugetlbpage.c | 259 +----------------------------------------
 fs/hugetlbfs/inode.c          |   2 +-
 include/linux/hugetlb.h       |   7 +-
 kernel/sysctl.c               |   6 +-
 mm/Makefile                   |   1 +
 mm/hugetlb.c                  | 245 +++++++++++++++++++++++++++++++++++++++
 10 files changed, 263 insertions(+), 1288 deletions(-)
 create mode 100644 mm/hugetlb.c

(limited to 'include/linux')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 7224ddcb6a11..a702f96373af 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -20,68 +20,6 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static long    htlbpagemem;
-int     htlbpage_max;
-static long    htlbzone_pages;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next, struct page, lru);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-				HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-static void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	int i;
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
-	return page;
-}
-
 static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -276,26 +214,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 }
 #endif
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end)
 {
@@ -319,16 +237,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-void
-zap_hugepage_range(struct vm_area_struct *vma,
-		unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -360,7 +268,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -380,173 +288,3 @@ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
-
-static void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-static int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	/* all lowmem is on node 0 */
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, lru);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-static int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(ctl_table *table, int write,
-		struct file *file, void *buffer, size_t *length)
-{
-	if (!cpu_has_pse)
-		return -ENODEV;
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	if (!cpu_has_pse)
-		return -ENODEV;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 3dec8e2f4056..8b5b1cac3a1c 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -22,69 +22,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static long	htlbpagemem;
-int		htlbpage_max;
-static long	htlbzone_pages;
-unsigned int	hpage_shift=HPAGE_SHIFT_DEFAULT;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next, struct page, lru);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	int i;
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
-	return page;
-}
+unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;
 
 static pte_t *
 huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
@@ -244,26 +182,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri
 	return NULL;
 }
 
-void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 /*
  * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
  * are hugetlb region specific.
@@ -339,14 +257,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsig
 	flush_tlb_range(vma, start, end);
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -378,7 +288,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -422,106 +332,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, u
 		addr = ALIGN(vmm->vm_end, HPAGE_SIZE);
 	}
 }
-void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, lru);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-int set_hugetlb_mem_size(int count)
-{
-	int  lcount;
-	struct page *page ;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(ctl_table *table, int write, struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
 
 static int __init hugetlb_setup_sz(char *str)
 {
@@ -551,60 +361,3 @@ static int __init hugetlb_setup_sz(char *str)
 	return 1;
 }
 __setup("hugepagesz=", hugetlb_setup_sz);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-__initcall(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	if (size > (htlbpagemem << HPAGE_SHIFT))
-		return 0;
-	return 1;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage =	hugetlb_nopage,
-};
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index a7b2c63c700f..e81eeec9b009 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -29,65 +29,6 @@
 
 #include <linux/sysctl.h>
 
-int htlbpage_max;
-
-/* This lock protects the two counters and list below */
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static int htlbpage_free; /* = 0 */
-static int htlbpage_total; /* = 0 */
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-/* XXX make this a sysctl */
-unsigned long largepage_roundrobin = 1;
-
-static struct page *dequeue_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page = NULL;
-	int i;
-
-	if (!largepage_roundrobin)
-		nid = numa_node_id();
-
-	for (i = 0; i < numnodes; i++) {
-		if (!list_empty(&hugepage_freelists[nid]))
-			break;
-		nid = (nid + 1) % numnodes;
-	}
-
-	if (!list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next, struct page, lru);
-		list_del(&page->lru);
-	}
-
-	if (largepage_roundrobin)
-		nid = (nid + 1) % numnodes;
-
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	if (!page)
-		return NULL;
-
-	nid = page_zone(page)->zone_pgdat->node_id;
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
 /* HugePTE layout:
  *
  * 31 30 ... 15 14 13 12 10 9  8  7   6    5    4    3    2    1    0
@@ -119,7 +60,6 @@ typedef struct {unsigned int val;} hugepte_t;
 #define hugepte_none(x)	(!(hugepte_val(x) & _HUGEPAGE_PFN))
 
 
-static void free_huge_page(struct page *page);
 static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
 				hugepte_t pte, int local);
 
@@ -146,27 +86,6 @@ static inline void set_hugepte(hugepte_t *ptep, hugepte_t pte)
 		       hugepte_val(pte) & ~_HUGEPAGE_HPTEFLAGS);
 }
 
-static struct page *alloc_hugetlb_page(void)
-{
-	int i;
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-
-	htlbpage_free--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
-	return page;
-}
-
 static hugepte_t *hugepte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -448,26 +367,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return page;
 }
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpage_free++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end)
 {
@@ -510,16 +409,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	mm->rss -= (end - start) >> PAGE_SHIFT;
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -554,7 +443,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -876,148 +765,3 @@ static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
 
 	ppc_md.hpte_invalidate(slot, va, 1, local);
 }
-
-static void split_and_free_hugepage(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbpage_total--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (!(cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE))
-		return 0;
-	
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbpage_total;
-
-	if (lcount == 0)
-		return htlbpage_total;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpage_free++;
-			htlbpage_total++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return htlbpage_total;
-	}
-	/* Shrink the memory size. */
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		split_and_free_hugepage(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return htlbpage_total;
-}
-
-int hugetlb_sysctl_handler(ctl_table *table, int write,
-		struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	if (cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE) {
-		for (i = 0; i < MAX_NUMNODES; ++i)
-			INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-		for (i = 0; i < htlbpage_max; ++i) {
-			page = alloc_fresh_huge_page();
-			if (!page)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			spin_unlock(&htlbpage_lock);
-		}
-		htlbpage_max = htlbpage_free = htlbpage_total = i;
-		printk(KERN_INFO "Total HugeTLB memory allocated, %d\n",
-		       htlbpage_free);
-	} else {
-		htlbpage_max = 0;
-		printk(KERN_INFO "CPU does not support HugeTLB\n");
-	}
-
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5d\n"
-			"HugePages_Free:  %5d\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbpage_total,
-			htlbpage_free,
-			HPAGE_SIZE/1024);
-}
-
-/* This is advisory only, so we can get away with accesing
- * htlbpage_free without taking the lock. */
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpage_free;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbpage_total * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 6f72d865e8d2..751a7d1a666d 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -24,68 +24,6 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-static long	htlbpagemem;
-int		htlbpage_max;
-static long	htlbzone_pages;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next,
-				  struct page, list);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-static void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	memset(page_address(page), 0, HPAGE_SIZE);
-	return page;
-}
-
 static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -250,25 +188,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end)
 {
@@ -297,16 +216,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -338,7 +247,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -358,168 +267,3 @@ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
-
-static void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-static int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	/* all lowmem is on node 0 */
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, list);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-static int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				   unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 771ec3757d73..b4e6dfa0833a 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -21,68 +21,6 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-static long	htlbpagemem;
-int		htlbpage_max;
-static long	htlbzone_pages;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next,
-				  struct page, lru);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-static void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	memset(page_address(page), 0, HPAGE_SIZE);
-	return page;
-}
-
 static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -247,26 +185,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end)
 {
@@ -295,16 +213,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -336,7 +244,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -356,168 +264,3 @@ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
-
-static void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-static int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	/* all lowmem is on node 0 */
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, lru);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-static int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				   unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c60d937b202e..5e37a271dd2e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -573,7 +573,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			unsigned long long size = memparse(value, &rest);
 			if (*rest == '%') {
 				size <<= HPAGE_SHIFT;
-				size *= htlbpage_max;
+				size *= max_huge_pages;
 				do_div(size, 100);
 				rest++;
 			}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index da3fc826a0de..b0e98cfe15f9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -28,8 +28,11 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 				pmd_t *pmd, int write);
 int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
 int pmd_huge(pmd_t pmd);
+struct page *alloc_huge_page(void);
+void free_huge_page(struct page *);
 
-extern int htlbpage_max;
+extern unsigned long max_huge_pages;
+extern const unsigned long hugetlb_zero, hugetlb_infinity;
 
 static inline void
 mark_mm_hugetlb(struct mm_struct *mm, struct vm_area_struct *vma)
@@ -78,6 +81,8 @@ static inline unsigned long hugetlb_total_pages(void)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(addr, len)	0
 #define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0)
+#define alloc_huge_page()			({ NULL; })
+#define free_huge_page(p)			({ (void)(p); BUG(); })
 
 #ifndef HPAGE_MASK
 #define HPAGE_MASK	0		/* Keep the compiler happy */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 05ea59ae4276..69e9123cdd0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -710,10 +710,12 @@ static ctl_table vm_table[] = {
 	 {
 		.ctl_name	= VM_HUGETLB_PAGES,
 		.procname	= "nr_hugepages",
-		.data		= &htlbpage_max,
-		.maxlen		= sizeof(int),
+		.data		= &max_huge_pages,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &hugetlb_sysctl_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
 	 },
 #endif
 	{
diff --git a/mm/Makefile b/mm/Makefile
index c66aba5886f8..5f3baecd85a7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,3 +12,4 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   slab.o swap.o truncate.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
new file mode 100644
index 000000000000..cb72a40c38b6
--- /dev/null
+++ b/mm/hugetlb.c
@@ -0,0 +1,245 @@
+/*
+ * Generic hugetlb support.
+ * (C) William Irwin, April 2004
+ */
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/sysctl.h>
+
+const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
+static unsigned long nr_huge_pages, free_huge_pages;
+unsigned long max_huge_pages;
+static struct list_head hugepage_freelists[MAX_NUMNODES];
+static spinlock_t hugetlb_lock = SPIN_LOCK_UNLOCKED;
+
+static void enqueue_huge_page(struct page *page)
+{
+	list_add(&page->lru,
+		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
+}
+
+static struct page *dequeue_huge_page(void)
+{
+	int nid = numa_node_id();
+	struct page *page = NULL;
+
+	if (list_empty(&hugepage_freelists[nid])) {
+		for (nid = 0; nid < MAX_NUMNODES; ++nid)
+			if (!list_empty(&hugepage_freelists[nid]))
+				break;
+	}
+	if (nid >= 0 && nid < MAX_NUMNODES &&
+	    !list_empty(&hugepage_freelists[nid])) {
+		page = list_entry(hugepage_freelists[nid].next,
+				  struct page, lru);
+		list_del(&page->lru);
+	}
+	return page;
+}
+
+static struct page *alloc_fresh_huge_page(void)
+{
+	static int nid = 0;
+	struct page *page;
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
+	nid = (nid + 1) % numnodes;
+	return page;
+}
+
+void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	free_huge_pages++;
+	spin_unlock(&hugetlb_lock);
+}
+
+struct page *alloc_huge_page(void)
+{
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	page = dequeue_huge_page();
+	if (!page) {
+		spin_unlock(&hugetlb_lock);
+		return NULL;
+	}
+	free_huge_pages--;
+	spin_unlock(&hugetlb_lock);
+	set_page_count(page, 1);
+	page->lru.prev = (void *)free_huge_page;
+	memset(page_address(page), 0, HPAGE_SIZE);
+	return page;
+}
+
+void huge_page_release(struct page *page)
+{
+	if (!put_page_testzero(page))
+		return;
+
+	free_huge_page(page);
+}
+
+static int __init hugetlb_init(void)
+{
+	unsigned long i;
+	struct page *page;
+	/* Give every possible node an empty free list before queueing pages. */
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		INIT_LIST_HEAD(&hugepage_freelists[i]);
+	/* Preallocate up to max_huge_pages pages; stop at the first failure. */
+	for (i = 0; i < max_huge_pages; ++i) {
+		page = alloc_fresh_huge_page();
+		if (!page)
+			break;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		spin_unlock(&hugetlb_lock);
+	}
+	max_huge_pages = free_huge_pages = nr_huge_pages = i;	/* pages actually obtained */
+	printk("Total HugeTLB memory allocated, %lu\n", free_huge_pages);	/* unsigned long => %lu */
+	return 0;
+}
+module_init(hugetlb_init);
+
+static int __init hugetlb_setup(char *s)
+{
+	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
+		max_huge_pages = 0;
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static void update_and_free_page(struct page *page)
+{
+	int i;
+	nr_huge_pages--;
+	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
+		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+				1 << PG_private | 1<< PG_writeback);
+		set_page_count(&page[i], 0);
+	}
+	set_page_count(page, 1);
+	__free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
+#ifdef CONFIG_HIGHMEM	/* release free non-highmem huge pages until the pool is down to count */
+static unsigned long try_to_free_low(unsigned long count)	/* count: target pool size */
+{
+	int i;
+	for (i = 0; i < MAX_NUMNODES; ++i) {
+		struct page *page, *next;	/* _safe walk: the current entry is freed mid-loop */
+		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
+			if (PageHighMem(page))
+				continue;
+			list_del(&page->lru);
+			update_and_free_page(page);	/* also decrements nr_huge_pages */
+			--free_huge_pages;
+			if (count >= nr_huge_pages)	/* target reached */
+				return count;
+		}
+	}
+	return count;	/* target is returned unchanged; the caller keeps using it */
+}
+#else
+static inline unsigned long try_to_free_low(unsigned long count)
+{
+	return count;
+}
+#endif
+
+static unsigned long set_max_huge_pages(unsigned long count)
+{
+	while (count > nr_huge_pages) {
+		struct page *page = alloc_fresh_huge_page();
+		if (!page)
+			return nr_huge_pages;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		free_huge_pages++;
+		nr_huge_pages++;
+		spin_unlock(&hugetlb_lock);
+	}
+	if (count >= nr_huge_pages)
+		return nr_huge_pages;
+
+	spin_lock(&hugetlb_lock);
+	for (count = try_to_free_low(count); count < nr_huge_pages; --free_huge_pages) {
+		struct page *page = dequeue_huge_page();
+		if (!page)
+			break;
+		update_and_free_page(page);
+	}
+	spin_unlock(&hugetlb_lock);
+	return nr_huge_pages;
+}
+
+#ifdef CONFIG_SYSCTL
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			   struct file *file, void *buffer, size_t *length)
+{
+	proc_doulongvec_minmax(table, write, file, buffer, length);
+	max_huge_pages = set_max_huge_pages(max_huge_pages);
+	return 0;
+}
+#endif /* CONFIG_SYSCTL */
+
+int hugetlb_report_meminfo(char *buf)
+{
+	return sprintf(buf,
+			"HugePages_Total: %5lu\n"
+			"HugePages_Free:  %5lu\n"
+			"Hugepagesize:    %5lu kB\n",
+			nr_huge_pages,
+			free_huge_pages,
+			HPAGE_SIZE/1024);
+}
+
+int is_hugepage_mem_enough(size_t size)
+{
+	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
+}
+
+/* Return the number of pages of memory we physically have, in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
+}
+EXPORT_SYMBOL(hugetlb_total_pages);
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all.  They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
+ * this far.
+ */
+static struct page *hugetlb_nopage(struct vm_area_struct *vma,
+				unsigned long address, int *unused)
+{
+	BUG();
+	return NULL;
+}
+
+struct vm_operations_struct hugetlb_vm_ops = {
+	.nopage = hugetlb_nopage,
+};
+
+void zap_hugepage_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long length)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	spin_lock(&mm->page_table_lock);
+	unmap_hugepage_range(vma, start, start + length);
+	spin_unlock(&mm->page_table_lock);
+}
-- 
cgit v1.2.3


From 6a435d69de04e96de8001edbd4a3da94eaec56b3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:15:12 -0700
Subject: [PATCH] Add queue congestion callout

From: Miquel van Smoorenburg <miquels@cistron.nl>

The VM and VFS use the address_space_backing_dev_info to track the realtime
status of the device which backs the mapping.  The read_congested and
write_congested fields are used to determine whether a read or write
against that device may block.

We use this infrastructure to

a) allow pdflush to service many queues in parallel (by not getting
   stuck on any particular one) and

b) to avoid undesirable and uncontrolled latencies in places such as
   page reclaim and

c) To avoid blocking in readahead operations

The current code only supports simple disk queues (and I have a patch here
for NFS).  Stacked queues (MD and DM) don't get this information right and
problems were expected.  Efficiency problems have now been noted and it's
time to fix it.

This patch lays down the infrastructure which permits the queue
implementation to get control when someone at a higher level is querying
the queue's congestion state.  So DM (for example) can run around and
examine all the queues which contribute to the higher-level queue.


It also adds bdi_rw_congested() for code in xfs and ext2 that calls both
bdi_read_congested() and bdi_write_congested() in a row, and it was "free"
anyway.
---
 include/linux/backing-dev.h | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 94c93c9c5f66..e34916ddd1d7 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -20,10 +20,14 @@ enum bdi_state {
 	BDI_unused,		/* Available bits start here */
 };
 
+typedef int (congested_fn)(void *, int);
+
 struct backing_dev_info {
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
 	int memory_backed;	/* Cannot clean pages with writepage */
+	congested_fn *congested_fn; /* Function pointer if device is md/dm */
+	void *congested_data;	/* Pointer to aux data for congested func */
 };
 
 extern struct backing_dev_info default_backing_dev_info;
@@ -32,14 +36,27 @@ int writeback_acquire(struct backing_dev_info *bdi);
 int writeback_in_progress(struct backing_dev_info *bdi);
 void writeback_release(struct backing_dev_info *bdi);
 
+static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
+{
+	if (bdi->congested_fn)
+		return bdi->congested_fn(bdi->congested_data, bdi_bits);
+	return (bdi->state & bdi_bits);
+}
+
 static inline int bdi_read_congested(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_read_congested, &bdi->state);
+	return bdi_congested(bdi, 1 << BDI_read_congested);
 }
 
 static inline int bdi_write_congested(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_write_congested, &bdi->state);
+	return bdi_congested(bdi, 1 << BDI_write_congested);
+}
+
+static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, (1 << BDI_read_congested)|
+				  (1 << BDI_write_congested));
 }
 
 #endif		/* _LINUX_BACKING_DEV_H */
-- 
cgit v1.2.3


From 6d27f67bf6ee2b9ad0c8814118264bc273d916a1 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:15:51 -0700
Subject: [PATCH] per-backing dev unplugging

From: Jens Axboe <axboe@suse.de>,
      Chris Mason,
      me, others.

The global unplug list causes horrid spinlock contention on many-disk
many-CPU setups - throughput is worse than halved.

The other problem with the global unplugging is of course that it will cause
the unplugging of queues which are unrelated to the I/O upon which the caller
is about to wait.

So what we do to solve these problems is to remove the global unplug and set
up the infrastructure under which the VFS can tell the block layer to unplug
only those queues which are relevant to the page or buffer_head which is
about to be waited upon.

We do this via the very appropriate address_space->backing_dev_info structure.

Most of the complexity is in devicemapper, MD and swapper_space, because for
these backing devices, multiple queues may need to be unplugged to complete a
page/buffer I/O.  In each case we ensure that data structures are in place to
permit us to identify all the lower-level queues which contribute to the
higher-level backing_dev_info.  Each contributing queue is told to unplug in
response to a higher-level unplug.

To simplify things in various places we also introduce the concept of a
"synchronous BIO": it is tagged with BIO_RW_SYNC.  The block layer will
perform an immediate unplug when it sees one of these go past.
---
 drivers/block/ll_rw_blk.c    | 96 +++++++++++++-------------------------------
 drivers/block/loop.c         | 15 ++++++-
 drivers/block/rd.c           |  1 +
 drivers/block/umem.c         |  3 +-
 drivers/md/dm-crypt.c        |  2 +-
 drivers/md/dm-table.c        | 16 ++++++++
 drivers/md/dm.c              | 23 +++++++++--
 drivers/md/dm.h              |  1 +
 drivers/md/md.c              | 32 +++++++++++++--
 drivers/md/raid1.c           |  3 ++
 drivers/md/raid5.c           |  4 +-
 drivers/md/raid6main.c       |  3 +-
 drivers/mtd/devices/blkmtd.c |  6 +--
 fs/buffer.c                  | 12 ++++--
 fs/direct-io.c               |  4 +-
 fs/jfs/jfs_logmgr.c          |  6 +--
 fs/ntfs/compress.c           |  3 +-
 fs/ufs/truncate.c            |  3 +-
 fs/xfs/linux/xfs_buf.c       | 24 ++++-------
 include/linux/backing-dev.h  |  3 ++
 include/linux/bio.h          |  3 ++
 include/linux/blkdev.h       | 23 ++++++++---
 include/linux/fs.h           |  2 +
 include/linux/raid/md.h      |  1 +
 include/linux/raid/md_k.h    | 26 ------------
 include/linux/swap.h         |  3 ++
 kernel/power/disk.c          |  1 -
 kernel/power/pmdisk.c        |  3 +-
 kernel/power/swsusp.c        |  5 ---
 mm/filemap.c                 |  4 +-
 mm/mempool.c                 |  2 -
 mm/nommu.c                   |  5 +++
 mm/readahead.c               |  8 +++-
 mm/shmem.c                   |  1 +
 mm/swap_state.c              |  1 +
 mm/swapfile.c                | 65 +++++++++++++++++++++++++++++-
 36 files changed, 254 insertions(+), 159 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index fc4b6c698fcf..209fdef4d986 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -42,12 +42,6 @@ static void blk_unplug_timeout(unsigned long data);
  */
 static kmem_cache_t *request_cachep;
 
-/*
- * plug management
- */
-static LIST_HEAD(blk_plug_list);
-static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-
 static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -251,8 +245,6 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 	 */
 	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
-	INIT_LIST_HEAD(&q->plug_list);
-
 	blk_queue_activity_fn(q, NULL, NULL);
 }
 
@@ -1104,13 +1096,11 @@ void blk_plug_device(request_queue_t *q)
 	 * don't plug a stopped queue, it must be paired with blk_start_queue()
 	 * which will restart the queueing
 	 */
-	if (!blk_queue_plugged(q)
-	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
-		spin_lock(&blk_plug_lock);
-		list_add_tail(&q->plug_list, &blk_plug_list);
+	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
+		return;
+
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		spin_unlock(&blk_plug_lock);
-	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1122,15 +1112,12 @@ EXPORT_SYMBOL(blk_plug_device);
 int blk_remove_plug(request_queue_t *q)
 {
 	WARN_ON(!irqs_disabled());
-	if (blk_queue_plugged(q)) {
-		spin_lock(&blk_plug_lock);
-		list_del_init(&q->plug_list);
-		del_timer(&q->unplug_timer);
-		spin_unlock(&blk_plug_lock);
-		return 1;
-	}
 
-	return 0;
+	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+		return 0;
+
+	del_timer(&q->unplug_timer);
+	return 1;
 }
 
 EXPORT_SYMBOL(blk_remove_plug);
@@ -1161,24 +1148,32 @@ static inline void __generic_unplug_device(request_queue_t *q)
  *   Linux uses plugging to build bigger requests queues before letting
  *   the device have at them. If a queue is plugged, the I/O scheduler
  *   is still adding and merging requests on the queue. Once the queue
- *   gets unplugged (either by manually calling this function, or by
- *   calling blk_run_queues()), the request_fn defined for the
- *   queue is invoked and transfers started.
+ *   gets unplugged, the request_fn defined for the queue is invoked and
+ *   transfers started.
  **/
-void generic_unplug_device(void *data)
+void generic_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
-
 	spin_lock_irq(q->queue_lock);
 	__generic_unplug_device(q);
 	spin_unlock_irq(q->queue_lock);
 }
-
 EXPORT_SYMBOL(generic_unplug_device);
 
+static void blk_backing_dev_unplug(struct backing_dev_info *bdi)
+{
+	request_queue_t *q = bdi->unplug_io_data;
+
+	/*
+	 * devices don't necessarily have an ->unplug_fn defined
+	 */
+	if (q->unplug_fn)
+		q->unplug_fn(q);
+}
+
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
+
 	q->unplug_fn(q);
 }
 
@@ -1255,42 +1250,6 @@ void blk_run_queue(struct request_queue *q)
 
 EXPORT_SYMBOL(blk_run_queue);
 
-/**
- * blk_run_queues - fire all plugged queues
- *
- * Description:
- *   Start I/O on all plugged queues known to the block layer. Queues that
- *   are currently stopped are ignored. This is equivalent to the older
- *   tq_disk task queue run.
- **/
-#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
-void blk_run_queues(void)
-{
-	LIST_HEAD(local_plug_list);
-
-	spin_lock_irq(&blk_plug_lock);
-
-	/*
-	 * this will happen fairly often
-	 */
-	if (list_empty(&blk_plug_list))
-		goto out;
-
-	list_splice_init(&blk_plug_list, &local_plug_list);
-	
-	while (!list_empty(&local_plug_list)) {
-		request_queue_t *q = blk_plug_entry(local_plug_list.next);
-
-		spin_unlock_irq(&blk_plug_lock);
-		q->unplug_fn(q);
-		spin_lock_irq(&blk_plug_lock);
-	}
-out:
-	spin_unlock_irq(&blk_plug_lock);
-}
-
-EXPORT_SYMBOL(blk_run_queues);
-
 /**
  * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
  * @q:    the request queue to be released
@@ -1390,6 +1349,10 @@ request_queue_t *blk_alloc_queue(int gfp_mask)
 	memset(q, 0, sizeof(*q));
 	init_timer(&q->unplug_timer);
 	atomic_set(&q->refcnt, 1);
+
+	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+	q->backing_dev_info.unplug_io_data = q;
+
 	return q;
 }
 
@@ -2050,7 +2013,6 @@ long blk_congestion_wait(int rw, long timeout)
 	DEFINE_WAIT(wait);
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
 
-	blk_run_queues();
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 	ret = io_schedule_timeout(timeout);
 	finish_wait(wqh, &wait);
@@ -2315,7 +2277,7 @@ out:
 	if (blk_queue_plugged(q)) {
 		int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
 
-		if (nr_queued == q->unplug_thresh)
+		if (nr_queued == q->unplug_thresh || bio_sync(bio))
 			__generic_unplug_device(q);
 	}
 	spin_unlock_irq(q->queue_lock);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f29f72ee30d0..a43c545071cb 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -434,6 +434,17 @@ inactive:
 	goto out;
 }
 
+/*
+ * kick off io on the underlying address space
+ */
+static void loop_unplug(request_queue_t *q)
+{
+	struct loop_device *lo = q->queuedata;
+
+	clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+	blk_run_address_space(lo->lo_backing_file->f_mapping);
+}
+
 struct switch_request {
 	struct file *file;
 	struct completion wait;
@@ -614,7 +625,6 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 {
 	struct file	*file;
 	struct inode	*inode;
-	struct block_device *lo_device = NULL;
 	struct address_space *mapping;
 	unsigned lo_blocksize;
 	int		lo_flags = 0;
@@ -671,7 +681,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
 	lo->lo_blocksize = lo_blocksize;
-	lo->lo_device = lo_device;
+	lo->lo_device = bdev;
 	lo->lo_flags = lo_flags;
 	lo->lo_backing_file = file;
 	lo->transfer = NULL;
@@ -688,6 +698,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	 */
 	blk_queue_make_request(lo->lo_queue, loop_make_request);
 	lo->lo_queue->queuedata = lo;
+	lo->lo_queue->unplug_fn = loop_unplug;
 
 	set_capacity(disks[lo->lo_number], size);
 	bd_set_size(bdev, size << 9);
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index e626344c9b58..3dd9163a64e2 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -271,6 +271,7 @@ static int rd_ioctl(struct inode *inode, struct file *file,
 static struct backing_dev_info rd_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn = default_unplug_io_fn,
 };
 
 static int rd_open(struct inode *inode, struct file *filp)
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 31cd010f4d56..5a1e349b131d 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -368,9 +368,8 @@ static inline void reset_page(struct mm_page *page)
 	page->biotail = & page->bio;
 }
 
-static void mm_unplug_device(void *data)
+static void mm_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	struct cardinfo *card = q->queuedata;
 	unsigned long flags;
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8e1798115e2f..a17b25380fce 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -668,7 +668,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 
 		/* out of memory -> run queues */
 		if (remaining)
-			blk_run_queues();
+			blk_congestion_wait(bio_data_dir(clone), HZ/100);
 	}
 
 	/* drop reference, clones could have returned before we reach this */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4aa6c43ffd01..93dc0e6361c0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -885,8 +885,24 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 	return r;
 }
 
+void dm_table_unplug_all(struct dm_table *t)
+{
+	struct list_head *d, *devices = dm_table_get_devices(t);
+
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *q = bdev_get_queue(dd->bdev);
+
+		if (q->unplug_fn)
+			q->unplug_fn(q);
+	}
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
 EXPORT_SYMBOL(dm_table_event);
 EXPORT_SYMBOL(dm_table_get_mode);
+EXPORT_SYMBOL(dm_table_put);
+EXPORT_SYMBOL(dm_table_get);
+EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6dc34c8b4604..542f9cd0acc0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -575,6 +575,17 @@ static int dm_request(request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
+static void dm_unplug_all(request_queue_t *q)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
+}
+
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r;
@@ -672,6 +683,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
+	md->queue->unplug_fn = dm_unplug_all;
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
 				     mempool_free_slab, _io_cache);
@@ -896,11 +908,17 @@ int dm_suspend(struct mapped_device *md)
 	add_wait_queue(&md->wait, &wait);
 	up_write(&md->lock);
 
+	/* unplug */
+	map = dm_get_table(md);
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
+
 	/*
 	 * Then we wait for the already mapped ios to
 	 * complete.
 	 */
-	blk_run_queues();
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -945,10 +963,9 @@ int dm_resume(struct mapped_device *md)
 	def = bio_list_get(&md->deferred);
 	__flush_deferred_io(md, def);
 	up_write(&md->lock);
+	dm_table_unplug_all(map);
 	dm_table_put(map);
 
-	blk_run_queues();
-
 	return 0;
 }
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 780185db38d0..34bf0e7cceb2 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -116,6 +116,7 @@ int dm_table_get_mode(struct dm_table *t);
 void dm_table_suspend_targets(struct dm_table *t);
 void dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+void dm_table_unplug_all(struct dm_table *t);
 
 /*-----------------------------------------------------------------
  * A registry of target types.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index aa6fef11aa4e..72d6a2da5827 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -160,6 +160,30 @@ static int md_fail_request (request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
+void md_unplug_mddev(mddev_t *mddev)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	/*
+	 * this list iteration is done without any locking in md?!
+	 */
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+EXPORT_SYMBOL(md_unplug_mddev);
+
+static void md_unplug_all(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+
+	md_unplug_mddev(mddev);
+}
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -335,6 +359,8 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	struct bio_vec vec;
 	struct completion event;
 
+	rw |= (1 << BIO_RW_SYNC);
+
 	bio_init(&bio);
 	bio.bi_io_vec = &vec;
 	vec.bv_page = page;
@@ -349,7 +375,6 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	bio.bi_private = &event;
 	bio.bi_end_io = bi_complete;
 	submit_bio(rw, &bio);
-	blk_run_queues();
 	wait_for_completion(&event);
 
 	return test_bit(BIO_UPTODATE, &bio.bi_flags);
@@ -1644,6 +1669,7 @@ static int do_md_run(mddev_t * mddev)
 	 */
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
+	mddev->queue->unplug_fn = md_unplug_all;
 
 	mddev->changed = 1;
 	return 0;
@@ -2718,7 +2744,7 @@ int md_thread(void * arg)
 		run = thread->run;
 		if (run) {
 			run(thread->mddev);
-			blk_run_queues();
+			md_unplug_mddev(thread->mddev);
 		}
 		if (signal_pending(current))
 			flush_signals(current);
@@ -3287,7 +3313,7 @@ static void md_do_sync(mddev_t *mddev)
 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
 			break;
 
-		blk_run_queues();
+		md_unplug_mddev(mddev);
 
 	repeat:
 		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f308d5fe946f..6616cd46c50f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -451,6 +451,7 @@ rb_out:
 
 static void device_barrier(conf_t *conf, sector_t sect)
 {
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);
 	
@@ -478,6 +479,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	conf->nr_pending++;
@@ -644,6 +646,7 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	spin_unlock_irq(&conf->resync_lock);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b5cc6c4ba6ba..5c9d3fd66913 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -249,6 +249,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 				break;
 			if (!sh) {
 				conf->inactive_blocked = 1;
+				md_unplug_mddev(conf->mddev);
 				wait_event_lock_irq(conf->wait_for_stripe,
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
@@ -1292,9 +1293,8 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
 		}
 	}
 }
-static void raid5_unplug_device(void *data)
+static void raid5_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 747085a6dac0..131f4a1f34eb 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1454,9 +1454,8 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
 		}
 	}
 }
-static void raid6_unplug_device(void *data)
+static void raid6_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid6_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff --git a/drivers/mtd/devices/blkmtd.c b/drivers/mtd/devices/blkmtd.c
index b4b4178943a1..4bd5d3219458 100644
--- a/drivers/mtd/devices/blkmtd.c
+++ b/drivers/mtd/devices/blkmtd.c
@@ -147,8 +147,7 @@ static int blkmtd_readpage(struct blkmtd_dev *dev, struct page *page)
 		bio->bi_private = &event;
 		bio->bi_end_io = bi_read_complete;
 		if(bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
-			submit_bio(READ, bio);
-			blk_run_queues();
+			submit_bio(READ_SYNC, bio);
 			wait_for_completion(&event);
 			err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
 			bio_put(bio);
@@ -179,8 +178,7 @@ static int blkmtd_write_out(struct bio *bio)
 	init_completion(&event);
 	bio->bi_private = &event;
 	bio->bi_end_io = bi_write_complete;
-	submit_bio(WRITE, bio);
-	blk_run_queues();
+	submit_bio(WRITE_SYNC, bio);
 	wait_for_completion(&event);
 	DEBUG(3, "submit_bio completed, bi_vcnt = %d\n", bio->bi_vcnt);
 	err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
diff --git a/fs/buffer.c b/fs/buffer.c
index be9cc963a178..8ab66d0b7548 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -132,7 +132,11 @@ void __wait_on_buffer(struct buffer_head * bh)
 	do {
 		prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 		if (buffer_locked(bh)) {
-			blk_run_queues();
+			struct block_device *bd;
+			smp_mb();
+			bd = bh->b_bdev;
+			if (bd)
+				blk_run_address_space(bd->bd_inode->i_mapping);
 			io_schedule();
 		}
 	} while (buffer_locked(bh));
@@ -492,7 +496,6 @@ static void free_more_memory(void)
 	pg_data_t *pgdat;
 
 	wakeup_bdflush(1024);
-	blk_run_queues();
 	yield();
 
 	for_each_pgdat(pgdat) {
@@ -2927,7 +2930,10 @@ EXPORT_SYMBOL(try_to_free_buffers);
 
 int block_sync_page(struct page *page)
 {
-	blk_run_queues();
+	struct address_space *mapping;
+	smp_mb();
+	mapping = page->mapping;
+	blk_run_address_space(mapping);
 	return 0;
 }
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d022a233820f..79534d258f37 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -364,7 +364,7 @@ static struct bio *dio_await_one(struct dio *dio)
 		if (dio->bio_list == NULL) {
 			dio->waiter = current;
 			spin_unlock_irqrestore(&dio->bio_lock, flags);
-			blk_run_queues();
+			blk_run_address_space(dio->inode->i_mapping);
 			io_schedule();
 			spin_lock_irqsave(&dio->bio_lock, flags);
 			dio->waiter = NULL;
@@ -1035,7 +1035,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		if (ret == 0)
 			ret = dio->result;
 		finished_one_bio(dio);		/* This can free the dio */
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 		if (should_wait) {
 			unsigned long flags;
 			/*
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index b72fb4a40adc..b90aa961dd5a 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1975,8 +1975,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
-	submit_bio(READ, bio);
-	blk_run_queues();
+	submit_bio(READ_SYNC, bio);
 
 	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
 
@@ -2120,9 +2119,8 @@ static void lbmStartIO(struct lbuf * bp)
 
 	/* check if journaling to disk has been disabled */
 	if (!log->no_integrity) {
-		submit_bio(WRITE, bio);
+		submit_bio(WRITE_SYNC, bio);
 		INCREMENT(lmStat.submitted);
-		blk_run_queues();
 	}
 	else {
 		bio->bi_size = 0;
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index a8618f107ead..68231e909496 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -23,6 +23,7 @@
 
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 
 #include "ntfs.h"
 
@@ -668,7 +669,7 @@ lock_retry_remap:
 					"uptodate! Unplugging the disk queue "
 					"and rescheduling.");
 			get_bh(tbh);
-			blk_run_queues();
+			blk_run_address_space(mapping);
 			schedule();
 			put_bh(tbh);
 			if (unlikely(!buffer_uptodate(tbh)))
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 04e50f696202..b22169e7ba76 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -38,6 +38,7 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 #include <linux/sched.h>
 
 #include "swab.h"
@@ -456,7 +457,7 @@ void ufs_truncate (struct inode * inode)
 			break;
 		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
 			ufs_sync_inode (inode);
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 		yield();
 	}
 	offset = inode->i_size & uspi->s_fshift;
diff --git a/fs/xfs/linux/xfs_buf.c b/fs/xfs/linux/xfs_buf.c
index c5f06aad5234..2d4cf586cf85 100644
--- a/fs/xfs/linux/xfs_buf.c
+++ b/fs/xfs/linux/xfs_buf.c
@@ -1013,7 +1013,7 @@ pagebuf_lock(
 {
 	PB_TRACE(pb, "lock", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_sema);
 	PB_SET_OWNER(pb);
 	PB_TRACE(pb, "locked", 0);
@@ -1109,7 +1109,7 @@ _pagebuf_wait_unpin(
 		if (atomic_read(&pb->pb_pin_count) == 0)
 			break;
 		if (atomic_read(&pb->pb_io_remaining))
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 		schedule();
 	}
 	remove_wait_queue(&pb->pb_waiters, &wait);
@@ -1407,7 +1407,7 @@ submit_io:
 	if (pb->pb_flags & PBF_RUN_QUEUES) {
 		pb->pb_flags &= ~PBF_RUN_QUEUES;
 		if (atomic_read(&pb->pb_io_remaining) > 1)
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 	}
 }
 
@@ -1471,7 +1471,7 @@ pagebuf_iowait(
 {
 	PB_TRACE(pb, "iowait", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_iodonesema);
 	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
 	return pb->pb_error;
@@ -1617,7 +1617,6 @@ STATIC int
 pagebuf_daemon(
 	void			*data)
 {
-	int			count;
 	page_buf_t		*pb;
 	struct list_head	*curr, *next, tmp;
 
@@ -1640,7 +1639,6 @@ pagebuf_daemon(
 
 		spin_lock(&pbd_delwrite_lock);
 
-		count = 0;
 		list_for_each_safe(curr, next, &pbd_delwrite_queue) {
 			pb = list_entry(curr, page_buf_t, pb_list);
 
@@ -1657,7 +1655,6 @@ pagebuf_daemon(
 				pb->pb_flags &= ~PBF_DELWRI;
 				pb->pb_flags |= PBF_WRITE;
 				list_move(&pb->pb_list, &tmp);
-				count++;
 			}
 		}
 
@@ -1667,12 +1664,11 @@ pagebuf_daemon(
 			list_del_init(&pb->pb_list);
 
 			pagebuf_iostrategy(pb);
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 		}
 
 		if (as_list_len > 0)
 			purge_addresses();
-		if (count)
-			blk_run_queues();
 
 		force_flush = 0;
 	} while (pagebuf_daemon_active);
@@ -1689,7 +1685,6 @@ pagebuf_delwri_flush(
 	page_buf_t		*pb;
 	struct list_head	*curr, *next, tmp;
 	int			pincount = 0;
-	int			flush_cnt = 0;
 
 	pagebuf_runall_queues(pagebuf_dataio_workqueue);
 	pagebuf_runall_queues(pagebuf_logio_workqueue);
@@ -1733,14 +1728,8 @@ pagebuf_delwri_flush(
 
 		pagebuf_lock(pb);
 		pagebuf_iostrategy(pb);
-		if (++flush_cnt > 32) {
-			blk_run_queues();
-			flush_cnt = 0;
-		}
 	}
 
-	blk_run_queues();
-
 	while (!list_empty(&tmp)) {
 		pb = list_entry(tmp.next, page_buf_t, pb_list);
 
@@ -1751,6 +1740,9 @@ pagebuf_delwri_flush(
 		pagebuf_rele(pb);
 	}
 
+	if (flags & PBDF_WAIT)
+		blk_run_address_space(target->pbr_mapping);
+
 	if (pinptr)
 		*pinptr = pincount;
 }
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e34916ddd1d7..00371734995c 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -28,9 +28,12 @@ struct backing_dev_info {
 	int memory_backed;	/* Cannot clean pages with writepage */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
+	void (*unplug_io_fn)(struct backing_dev_info *);
+	void *unplug_io_data;
 };
 
 extern struct backing_dev_info default_backing_dev_info;
+void default_unplug_io_fn(struct backing_dev_info *bdi);
 
 int writeback_acquire(struct backing_dev_info *bdi);
 int writeback_in_progress(struct backing_dev_info *bdi);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c421c46bfbb2..c4dd287dd1c8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -119,11 +119,13 @@ struct bio {
  * bit 1 -- rw-ahead when set
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
+ * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
  */
 #define BIO_RW		0
 #define BIO_RW_AHEAD	1
 #define BIO_RW_BARRIER	2
 #define BIO_RW_FAILFAST	3
+#define BIO_RW_SYNC	4
 
 /*
  * various member access, note that bio_data should of course not be used
@@ -138,6 +140,7 @@ struct bio {
 #define bio_cur_sectors(bio)	(bio_iovec(bio)->bv_len >> 9)
 #define bio_data(bio)		(page_address(bio_page((bio))) + bio_offset((bio)))
 #define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
+#define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
 
 /*
  * will die
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1a521e16b398..572f96e6940a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -243,7 +243,7 @@ typedef int (merge_requests_fn) (request_queue_t *, struct request *,
 typedef void (request_fn_proc) (request_queue_t *q);
 typedef int (make_request_fn) (request_queue_t *q, struct bio *bio);
 typedef int (prep_rq_fn) (request_queue_t *, struct request *);
-typedef void (unplug_fn) (void *q);
+typedef void (unplug_fn) (request_queue_t *);
 
 struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
@@ -315,8 +315,6 @@ struct request_queue
 	unsigned long		bounce_pfn;
 	int			bounce_gfp;
 
-	struct list_head	plug_list;
-
 	/*
 	 * various queue flags, see QUEUE_* below
 	 */
@@ -370,8 +368,9 @@ struct request_queue
 #define QUEUE_FLAG_WRITEFULL	4	/* read queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
+#define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
 
-#define blk_queue_plugged(q)	!list_empty(&(q)->plug_list)
+#define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 
@@ -515,7 +514,7 @@ extern int scsi_cmd_ioctl(struct gendisk *, unsigned int, unsigned long);
 extern void blk_start_queue(request_queue_t *q);
 extern void blk_stop_queue(request_queue_t *q);
 extern void __blk_stop_queue(request_queue_t *q);
-extern void blk_run_queue(request_queue_t *q);
+extern void blk_run_queue(request_queue_t *);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
 extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
 extern int blk_rq_unmap_user(struct request *, void __user *, unsigned int);
@@ -526,6 +525,18 @@ static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 	return bdev->bd_disk->queue;
 }
 
+static inline void blk_run_backing_dev(struct backing_dev_info *bdi)
+{
+	if (bdi && bdi->unplug_io_fn)
+		bdi->unplug_io_fn(bdi);
+}
+
+static inline void blk_run_address_space(struct address_space *mapping)
+{
+	if (mapping)
+		blk_run_backing_dev(mapping->backing_dev_info);
+}
+
 /*
  * end_request() and friends. Must be called with the request queue spinlock
  * acquired. All functions called within end_request() _must_be_ atomic.
@@ -572,7 +583,7 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-extern void generic_unplug_device(void *);
+extern void generic_unplug_device(request_queue_t *);
 extern long nr_blockdev_pages(void);
 
 int blk_get_queue(request_queue_t *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 39c893f8aa28..c7f0052b4abd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -83,6 +83,8 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
+#define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
+#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
 
 #define SEL_IN		1
 #define SEL_OUT		2
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 240dc450dcd3..9c06e776cfc2 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -76,6 +76,7 @@ extern void md_handle_safemode(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
 extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
+extern void md_unplug_mddev(mddev_t *mddev);
 
 extern void md_print_devices (void);
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index bea64b0fb6c1..42c973c53d04 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -326,7 +326,6 @@ do {									\
 		if (condition)						\
 			break;						\
 		spin_unlock_irq(&lock);					\
-		blk_run_queues();					\
 		schedule();						\
 		spin_lock_irq(&lock);					\
 	}								\
@@ -341,30 +340,5 @@ do {									\
 	__wait_event_lock_irq(wq, condition, lock);			\
 } while (0)
 
-
-#define __wait_disk_event(wq, condition) 				\
-do {									\
-	wait_queue_t __wait;						\
-	init_waitqueue_entry(&__wait, current);				\
-									\
-	add_wait_queue(&wq, &__wait);					\
-	for (;;) {							\
-		set_current_state(TASK_UNINTERRUPTIBLE);		\
-		if (condition)						\
-			break;						\
-		blk_run_queues();					\
-		schedule();						\
-	}								\
-	current->state = TASK_RUNNING;					\
-	remove_wait_queue(&wq, &__wait);				\
-} while (0)
-
-#define wait_disk_event(wq, condition) 					\
-do {									\
-	if (condition)	 						\
-		break;							\
-	__wait_disk_event(wq, condition);				\
-} while (0)
-
 #endif
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b000c56803b8..d189090cf63a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -197,6 +197,8 @@ extern int shmem_unuse(swp_entry_t entry, struct page *page);
 #define	SWAP_AGAIN	1
 #define	SWAP_FAIL	2
 
+extern void swap_unplug_io_fn(struct backing_dev_info *);
+
 #ifdef CONFIG_SWAP
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct file *, struct page *);
@@ -232,6 +234,7 @@ extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
 extern spinlock_t swaplock;
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 7e035a9b42d1..6abcf99b7ada 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -84,7 +84,6 @@ static void free_some_memory(void)
 	while (shrink_all_memory(10000))
 		printk(".");
 	printk("|\n");
-	blk_run_queues();
 }
 
 
diff --git a/kernel/power/pmdisk.c b/kernel/power/pmdisk.c
index d54147214bea..22855abbdd6e 100644
--- a/kernel/power/pmdisk.c
+++ b/kernel/power/pmdisk.c
@@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsigned int num, int err)
 
 static void wait_io(void)
 {
-	blk_run_queues();
 	while(atomic_read(&io_done))
 		io_schedule();
 }
@@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_off, void * page)
 	if (rw == WRITE)
 		bio_set_pages_dirty(bio);
 	start_io();
-	submit_bio(rw,bio);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 	wait_io();
  Done:
 	bio_put(bio);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 20134ab8e0b2..ae748a467af5 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -707,11 +707,6 @@ int software_suspend(void)
 
 		free_some_memory();
 		
-		/* No need to invalidate any vfsmnt list -- 
-		 * they will be valid after resume, anyway.
-		 */
-		blk_run_queues();
-
 		/* Save state of all device drivers, and stop them. */		   
 		if ((res = device_suspend(4))==0)
 			/* If stopping device drivers worked, we proceed basically into
diff --git a/mm/filemap.c b/mm/filemap.c
index ec1952db8baf..dc2f0992d879 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,8 +119,10 @@ void remove_from_page_cache(struct page *page)
 
 static inline int sync_page(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping;
 
+	smp_mb();
+	mapping = page->mapping;
 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 		return mapping->a_ops->sync_page(page);
 	return 0;
diff --git a/mm/mempool.c b/mm/mempool.c
index 756e60ee18d6..da6ad1e12c97 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -234,8 +234,6 @@ repeat_alloc:
 	if (!(gfp_mask & __GFP_WAIT))
 		return NULL;
 
-	blk_run_queues();
-
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
 	mb();
 	if (!pool->curr_nr)
diff --git a/mm/nommu.c b/mm/nommu.c
index c940756b49e5..1432dbab85eb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -571,3 +572,7 @@ unsigned long get_unmapped_area(struct file *file, unsigned long addr,
 void pte_chain_init(void)
 {
 }
+
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi)
+{	/* intentionally empty stub */
+}
diff --git a/mm/readahead.c b/mm/readahead.c
index 08a2d9f1051d..71bf2462d097 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -15,11 +15,16 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 
+void default_unplug_io_fn(struct backing_dev_info *bdi)
+{
+}
+EXPORT_SYMBOL(default_unplug_io_fn);
+
 struct backing_dev_info default_backing_dev_info = {
 	.ra_pages	= (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
 	.state		= 0,
+	.unplug_io_fn	= default_unplug_io_fn,
 };
-
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 /*
@@ -32,7 +37,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
 	ra->average = ra->ra_pages / 2;
 }
-
 EXPORT_SYMBOL(file_ra_state_init);
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 4116ea26daf1..345e04cb0f6c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -169,6 +169,7 @@ static struct vm_operations_struct shmem_vm_ops;
 static struct backing_dev_info shmem_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn = default_unplug_io_fn,
 };
 
 LIST_HEAD(shmem_inodes);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 22946f0d9ecf..97f80d20807c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -19,6 +19,7 @@
 static struct backing_dev_info swap_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
 extern struct address_space_operations swap_aops;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e5cebb1800b9..f885e6d17a49 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/rmap-locking.h>
 #include <linux/security.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -44,8 +45,64 @@ struct swap_list_t swap_list = {-1, -1};
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
+/*
+ * Array of backing blockdevs, for swap_unplug_io_fn.  We need this because
+ * the bdev->unplug_fn can sleep and we cannot hold swap_list_lock while
+ * calling the unplug_fn.  And swap_list_lock cannot be turned into a semaphore.
+ */
+static DECLARE_MUTEX(swap_bdevs_sem);
+static struct block_device *swap_bdevs[MAX_SWAPFILES];
+
 #define SWAPFILE_CLUSTER 256
 
+/*
+ * Caller holds swap_bdevs_sem
+ */
+static void install_swap_bdev(struct block_device *bdev)
+{
+	int i;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (swap_bdevs[i] == NULL) {
+			swap_bdevs[i] = bdev;
+			return;
+		}
+	}
+	BUG();
+}
+
+static void remove_swap_bdev(struct block_device *bdev)
+{
+	int i;		/* caller holds swap_bdevs_sem */
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (swap_bdevs[i] == bdev) {
+			memmove(&swap_bdevs[i], &swap_bdevs[i + 1], /* ranges overlap */
+				(MAX_SWAPFILES - i - 1) * sizeof(*swap_bdevs));
+			swap_bdevs[MAX_SWAPFILES - 1] = NULL;
+			return;
+		}
+	}
+	BUG();		/* bdev was not found in swap_bdevs[] */
+}
+
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi)
+{
+	int i;
+
+	down(&swap_bdevs_sem);
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		struct block_device *bdev = swap_bdevs[i];
+		struct backing_dev_info *bdi;
+
+		if (bdev == NULL)
+			break;
+		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
+		(*bdi->unplug_io_fn)(bdi);
+	}
+	up(&swap_bdevs_sem);
+}
+
 static inline int scan_swap_map(struct swap_info_struct *si)
 {
 	unsigned long offset;
@@ -1088,6 +1145,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 		swap_list_unlock();
 		goto out_dput;
 	}
+	down(&swap_bdevs_sem);
 	swap_list_lock();
 	swap_device_lock(p);
 	swap_file = p->swap_file;
@@ -1099,6 +1157,8 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	destroy_swap_extents(p);
 	swap_device_unlock(p);
 	swap_list_unlock();
+	remove_swap_bdev(p->bdev);
+	up(&swap_bdevs_sem);
 	vfree(swap_map);
 	if (S_ISBLK(mapping->host->i_mode)) {
 		struct block_device *bdev = I_BDEV(mapping->host);
@@ -1440,6 +1500,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	if (error)
 		goto bad_swap;
 
+	down(&swap_bdevs_sem);
 	swap_list_lock();
 	swap_device_lock(p);
 	p->flags = SWP_ACTIVE;
@@ -1465,6 +1526,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	}
 	swap_device_unlock(p);
 	swap_list_unlock();
+	install_swap_bdev(p->bdev);
+	up(&swap_bdevs_sem);
 	error = 0;
 	goto out;
 bad_swap:
@@ -1484,7 +1547,7 @@ bad_swap_2:
 	destroy_swap_extents(p);
 	if (swap_map)
 		vfree(swap_map);
-	if (swap_file && !IS_ERR(swap_file))
+	if (swap_file)
 		filp_close(swap_file, NULL);
 out:
 	if (page && !IS_ERR(page)) {
-- 
cgit v1.2.3


From 66db15b4577185624ae95ffe99a66305c8c63ef7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:16:17 -0700
Subject: [PATCH] unplugging: md update

From: Neil Brown <neilb@cse.unsw.edu.au>

I've made a bunch of changes to the 'md' bits - largely moving the
unplugging into the individual personalities which know more about which
drives are actually in use.
---
 drivers/md/linear.c       | 15 +++++++++++++
 drivers/md/md.c           | 35 +++++------------------------
 drivers/md/multipath.c    | 23 +++++++++++++++++++
 drivers/md/raid0.c        | 17 ++++++++++++++
 drivers/md/raid1.c        | 56 ++++++++++++++++++++++++++++++++++++++---------
 drivers/md/raid5.c        | 36 ++++++++++++++++++++++++++----
 drivers/md/raid6main.c    | 36 +++++++++++++++++++++++++++---
 include/linux/raid/md_k.h |  7 +++---
 8 files changed, 175 insertions(+), 50 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 1198e07e7abe..e0aa017a26b7 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -80,6 +80,20 @@ static int linear_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio
 	return maxsectors << 9;
 }
 
+static void linear_unplug(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+	linear_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i < mddev->raid_disks; i++) {
+		request_queue_t *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+
+
 static int linear_run (mddev_t *mddev)
 {
 	linear_conf_t *conf;
@@ -185,6 +199,7 @@ static int linear_run (mddev_t *mddev)
 		BUG();
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
+	mddev->queue->unplug_fn = linear_unplug;
 	return 0;
 
 out:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 72d6a2da5827..b521ca509b1e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -160,30 +160,6 @@ static int md_fail_request (request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
-void md_unplug_mddev(mddev_t *mddev)
-{
-	struct list_head *tmp;
-	mdk_rdev_t *rdev;
-
-	/*
-	 * this list iteration is done without any locking in md?!
-	 */
-	ITERATE_RDEV(mddev, rdev, tmp) {
-		request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
-
-		if (r_queue->unplug_fn)
-			r_queue->unplug_fn(r_queue);
-	}
-}
-EXPORT_SYMBOL(md_unplug_mddev);
-
-static void md_unplug_all(request_queue_t *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	md_unplug_mddev(mddev);
-}
-
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -1669,7 +1645,6 @@ static int do_md_run(mddev_t * mddev)
 	 */
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
-	mddev->queue->unplug_fn = md_unplug_all;
 
 	mddev->changed = 1;
 	return 0;
@@ -2742,10 +2717,9 @@ int md_thread(void * arg)
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 
 		run = thread->run;
-		if (run) {
+		if (run)
 			run(thread->mddev);
-			md_unplug_mddev(thread->mddev);
-		}
+
 		if (signal_pending(current))
 			flush_signals(current);
 	}
@@ -3313,8 +3287,6 @@ static void md_do_sync(mddev_t *mddev)
 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
 			break;
 
-		md_unplug_mddev(mddev);
-
 	repeat:
 		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
 			/* step marks */
@@ -3347,6 +3319,7 @@ static void md_do_sync(mddev_t *mddev)
 		 * about not overloading the IO subsystem. (things like an
 		 * e2fsck being done on the RAID array should execute fast)
 		 */
+		mddev->queue->unplug_fn(mddev->queue);
 		cond_resched();
 
 		currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
@@ -3365,6 +3338,8 @@ static void md_do_sync(mddev_t *mddev)
 	 * this also signals 'finished resyncing' to md_stop
 	 */
  out:
+	mddev->queue->unplug_fn(mddev->queue);
+
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
 	/* tell personality that we are finished */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index bf9980a8b1fd..9114c7c269ed 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -155,6 +155,27 @@ static int multipath_read_balance (multipath_conf_t *conf)
 	return 0;
 }
 
+static void unplug_slaves(mddev_t *mddev)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->multipaths[i].rdev;
+		if (rdev && !rdev->faulty) {
+			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+			if (r_queue->unplug_fn)
+				r_queue->unplug_fn(r_queue);
+		}
+	}
+}
+static void multipath_unplug(request_queue_t *q)
+{
+	unplug_slaves(q->queuedata);
+}
+
+
 static int multipath_make_request (request_queue_t *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
@@ -419,6 +440,8 @@ static int multipath_run (mddev_t *mddev)
 	}
 	memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks);
 
+	mddev->queue->unplug_fn = multipath_unplug;
+
 	conf->working_disks = 0;
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		disk_idx = rdev->raid_disk;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 3cbf14021820..5f4b8bfefc91 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,6 +25,21 @@
 #define MD_DRIVER
 #define MD_PERSONALITY
 
+static void raid0_unplug(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		request_queue_t *r_queue = bdev_get_queue(devlist[i]->bdev);
+
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+
 static int create_strip_zones (mddev_t *mddev)
 {
 	int i, c, j;
@@ -202,6 +217,8 @@ static int create_strip_zones (mddev_t *mddev)
 			conf->hash_spacing = sz;
 	}
 
+	mddev->queue->unplug_fn = raid0_unplug;
+
 	printk("raid0: done.\n");
 	return 0;
  abort:
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6616cd46c50f..bcc81ef13a35 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,6 +37,9 @@ static mdk_personality_t raid1_personality;
 static spinlock_t retry_list_lock = SPIN_LOCK_UNLOCKED;
 static LIST_HEAD(retry_list_head);
 
+static void unplug_slaves(mddev_t *mddev);
+
+
 static void * r1bio_pool_alloc(int gfp_flags, void *data)
 {
 	mddev_t *mddev = data;
@@ -47,6 +50,8 @@ static void * r1bio_pool_alloc(int gfp_flags, void *data)
 			 gfp_flags);
 	if (r1_bio)
 		memset(r1_bio, 0, sizeof(*r1_bio) + sizeof(struct bio*)*mddev->raid_disks);
+	else
+		unplug_slaves(mddev);
 
 	return r1_bio;
 }
@@ -71,8 +76,10 @@ static void * r1buf_pool_alloc(int gfp_flags, void *data)
 	int i, j;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, conf->mddev);
-	if (!r1_bio)
+	if (!r1_bio) {
+		unplug_slaves(conf->mddev);
 		return NULL;
+	}
 
 	/*
 	 * Allocate bios : 1 for reading, n-1 for writing
@@ -443,6 +450,29 @@ rb_out:
 	return new_disk;
 }
 
+static void unplug_slaves(mddev_t *mddev)
+{
+	conf_t *conf = mddev_to_conf(mddev);
+	int i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		if (rdev && !rdev->faulty) {
+			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+			if (r_queue->unplug_fn)
+				r_queue->unplug_fn(r_queue);
+		}
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+static void raid1_unplug(request_queue_t *q)
+{
+	unplug_slaves(q->queuedata);
+}
+
 /*
  * Throttle resync depth, so that we can both get proper overlapping of
  * requests, but are still able to handle normal requests quickly.
@@ -451,16 +481,18 @@ rb_out:
 
 static void device_barrier(conf_t *conf, sector_t sect)
 {
-	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);
+	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
+			    conf->resync_lock, unplug_slaves(conf->mddev));
 	
 	if (!conf->barrier++) {
-		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, conf->resync_lock);
+		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
+				    conf->resync_lock, unplug_slaves(conf->mddev));
 		if (conf->nr_pending)
 			BUG();
 	}
-	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, conf->resync_lock);
+	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
+			    conf->resync_lock, unplug_slaves(conf->mddev));
 	conf->next_resync = sect;
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -479,9 +511,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
+	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
 
@@ -646,9 +677,9 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
-	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
+	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
+			    conf->resync_lock, 	unplug_slaves(conf->mddev));
 	spin_unlock_irq(&conf->resync_lock);
 
 	if (conf->barrier) BUG();
@@ -862,6 +893,7 @@ static void raid1d(mddev_t *mddev)
 	struct bio *bio;
 	unsigned long flags;
 	conf_t *conf = mddev_to_conf(mddev);
+	int unplug=0;
 	mdk_rdev_t *rdev;
 
 	md_check_recovery(mddev);
@@ -881,6 +913,7 @@ static void raid1d(mddev_t *mddev)
 		bio = r1_bio->master_bio;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
+			unplug = 1;
 		} else {
 			if (map(mddev, &rdev) == -1) {
 				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -896,12 +929,14 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_rw = READ;
-
+				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 	}
 	spin_unlock_irqrestore(&retry_list_lock, flags);
+	if (unplug)
+		unplug_slaves(mddev);
 }
 
 
@@ -1104,6 +1139,7 @@ static int run(mddev_t *mddev)
 			mdname(mddev));
 		goto out_free_conf;
 	}
+	mddev->queue->unplug_fn = raid1_unplug;
 
 
 	ITERATE_RDEV(mddev, rdev, tmp) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5c9d3fd66913..05087b8ae056 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -231,6 +231,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
 	return NULL;
 }
 
+static void unplug_slaves(mddev_t *mddev);
+
 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
 					     int pd_idx, int noblock) 
 {
@@ -249,12 +251,13 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 				break;
 			if (!sh) {
 				conf->inactive_blocked = 1;
-				md_unplug_mddev(conf->mddev);
 				wait_event_lock_irq(conf->wait_for_stripe,
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
 						     || !conf->inactive_blocked),
-						    conf->device_lock);
+						    conf->device_lock,
+						    unplug_slaves(conf->mddev);
+					);
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, pd_idx);
@@ -1293,6 +1296,25 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
 		}
 	}
 }
+
+static void unplug_slaves(mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			if (bdev) {
+				request_queue_t *r_queue = bdev_get_queue(bdev);
+				if (r_queue && r_queue->unplug_fn)
+					r_queue->unplug_fn(r_queue);
+			}
+		}
+	}
+}
+
 static void raid5_unplug_device(request_queue_t *q)
 {
 	mddev_t *mddev = q->queuedata;
@@ -1306,6 +1328,8 @@ static void raid5_unplug_device(request_queue_t *q)
 	md_wakeup_thread(mddev->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
+
+	unplug_slaves(mddev);
 }
 
 static inline void raid5_plug_device(raid5_conf_t *conf)
@@ -1392,9 +1416,11 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks-1;
 
-	if (sector_nr >= mddev->size <<1)
-		/* just being told to finish up .. nothing to do */
+	if (sector_nr >= mddev->size <<1) {
+		/* just being told to finish up .. nothing much to do */
+		unplug_slaves(mddev);
 		return 0;
+	}
 
 	x = sector_nr;
 	chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1474,6 +1500,8 @@ static void raid5d (mddev_t *mddev)
 
 	spin_unlock_irq(&conf->device_lock);
 
+	unplug_slaves(mddev);
+
 	PRINTK("--- raid5d inactive\n");
 }
 
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 131f4a1f34eb..99d08d67342f 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -250,6 +250,8 @@ static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
 	return NULL;
 }
 
+static void unplug_slaves(mddev_t *mddev);
+
 static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector,
 					     int pd_idx, int noblock)
 {
@@ -272,7 +274,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
 						     || !conf->inactive_blocked),
-						    conf->device_lock);
+						    conf->device_lock,
+						    unplug_slaves(conf->mddev);
+					);
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, pd_idx);
@@ -1454,6 +1458,26 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
 		}
 	}
 }
+
+static void unplug_slaves(mddev_t *mddev)
+{
+	/* NOTE(review): callers shown here release device_lock first */
+	raid6_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			if (bdev) {
+				request_queue_t *r_queue = bdev_get_queue(bdev);
+				if (r_queue && r_queue->unplug_fn)
+					r_queue->unplug_fn(r_queue);
+			}
+		}
+	}
+}
+
 static void raid6_unplug_device(request_queue_t *q)
 {
 	mddev_t *mddev = q->queuedata;
@@ -1467,6 +1491,8 @@ static void raid6_unplug_device(request_queue_t *q)
 	md_wakeup_thread(mddev->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
+
+	unplug_slaves(mddev);
 }
 
 static inline void raid6_plug_device(raid6_conf_t *conf)
@@ -1553,9 +1579,11 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks - 2;
 
-	if (sector_nr >= mddev->size <<1)
-		/* just being told to finish up .. nothing to do */
+	if (sector_nr >= mddev->size <<1) {
+		/* just being told to finish up .. nothing much to do */
+		unplug_slaves(mddev);
 		return 0;
+	}
 
 	x = sector_nr;
 	chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1635,6 +1663,8 @@ static void raid6d (mddev_t *mddev)
 
 	spin_unlock_irq(&conf->device_lock);
 
+	unplug_slaves(mddev);
+
 	PRINTK("--- raid6d inactive\n");
 }
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 42c973c53d04..0b6b5e6f34eb 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -315,7 +315,7 @@ typedef struct mdk_thread_s {
 
 #define THREAD_WAKEUP  0
 
-#define __wait_event_lock_irq(wq, condition, lock) 			\
+#define __wait_event_lock_irq(wq, condition, lock, cmd) 		\
 do {									\
 	wait_queue_t __wait;						\
 	init_waitqueue_entry(&__wait, current);				\
@@ -326,6 +326,7 @@ do {									\
 		if (condition)						\
 			break;						\
 		spin_unlock_irq(&lock);					\
+		cmd;							\
 		schedule();						\
 		spin_lock_irq(&lock);					\
 	}								\
@@ -333,11 +334,11 @@ do {									\
 	remove_wait_queue(&wq, &__wait);				\
 } while (0)
 
-#define wait_event_lock_irq(wq, condition, lock) 			\
+#define wait_event_lock_irq(wq, condition, lock, cmd) 			\
 do {									\
 	if (condition)	 						\
 		break;							\
-	__wait_event_lock_irq(wq, condition, lock);			\
+	__wait_event_lock_irq(wq, condition, lock, cmd);		\
 } while (0)
 
 #endif
-- 
cgit v1.2.3


From 1dc841edc41a3014ece92b72013b3b57b0424e6b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:16:32 -0700
Subject: [PATCH] Correct unplugs on nr_queued

From: Jens Axboe <axboe@suse.de>

There's a small discrepancy in when we decide to unplug a queue based on
q->unplug_thresh.  Basically it doesn't work for tagged queues, since
q->rq.count[READ] + q->rq.count[WRITE] is just the number of allocated
requests, not the number of requests stuck in the io scheduler.  We could
just change the nr_queued == to a nr_queued >=, however that is still
suboptimal.

This patch adds accounting for requests that have been dequeued from the io
scheduler, but not freed yet.  These are q->in_flight.  allocated_requests
- q->in_flight == requests_in_scheduler.  So the condition correctly
becomes

	if (requests_in_scheduler == q->unplug_thresh)

instead.  I did a quick round of testing, and for dbench on a SCSI disk the
number of timer induced unplugs was reduced from 13 to 5 :-).  Not a huge
number, but there might be cases where it's more significant.  Either way,
it gets ->unplug_thresh always right, which the old logic didn't.
---
 drivers/block/elevator.c  | 23 +++++++++++++++++++++++
 drivers/block/ll_rw_blk.c |  4 ++--
 include/linux/blkdev.h    |  5 +++++
 3 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index 40377d4a030a..c42fd0ddd75f 100644
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@@ -149,6 +149,13 @@ void elv_merge_requests(request_queue_t *q, struct request *rq,
 
 void elv_requeue_request(request_queue_t *q, struct request *rq)
 {
+	/*
+	 * it already went through dequeue, we need to decrement the
+	 * in_flight count again
+	 */
+	if (blk_account_rq(rq))
+		q->in_flight--;
+
 	/*
 	 * if iosched has an explicit requeue hook, then use that. otherwise
 	 * just put the request at the front of the queue
@@ -232,6 +239,16 @@ void elv_remove_request(request_queue_t *q, struct request *rq)
 {
 	elevator_t *e = &q->elevator;
 
+	/*
+	 * the time frame between a request being removed from the lists
+	 * and it being freed is accounted as io that is in progress at
+	 * the driver side. note that we only account requests that the
+	 * driver has seen (REQ_STARTED set), to avoid false accounting
+	 * for request-request merges
+	 */
+	if (blk_account_rq(rq))
+		q->in_flight++;
+
 	/*
 	 * the main clearing point for q->last_merge is on retrieval of
 	 * request by driver (it calls elv_next_request()), but it _can_
@@ -321,6 +338,12 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
 {
 	elevator_t *e = &q->elevator;
 
+	/*
+	 * request is released from the driver, io must be done
+	 */
+	if (blk_account_rq(rq))
+		q->in_flight--;
+
 	if (e->elevator_completed_req_fn)
 		e->elevator_completed_req_fn(q, rq);
 }
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 209fdef4d986..6b0ff2c5f092 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -2275,9 +2275,9 @@ out:
 		__blk_put_request(q, freereq);
 
 	if (blk_queue_plugged(q)) {
-		int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
+		int nrq = q->rq.count[READ] + q->rq.count[WRITE] - q->in_flight;
 
-		if (nr_queued == q->unplug_thresh || bio_sync(bio))
+		if (nrq == q->unplug_thresh || bio_sync(bio))
 			__generic_unplug_device(q);
 	}
 	spin_unlock_irq(q->queue_lock);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 572f96e6940a..44c722d4b67b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -348,6 +348,8 @@ struct request_queue
 
 	atomic_t		refcnt;
 
+	unsigned int		in_flight;
+
 	/*
 	 * sg stuff
 	 */
@@ -377,6 +379,9 @@ struct request_queue
 #define blk_fs_request(rq)	((rq)->flags & REQ_CMD)
 #define blk_pc_request(rq)	((rq)->flags & REQ_BLOCK_PC)
 #define blk_noretry_request(rq)	((rq)->flags & REQ_FAILFAST)
+#define blk_rq_started(rq)	((rq)->flags & REQ_STARTED)
+
+#define blk_account_rq(rq)	(blk_rq_started(rq) && blk_fs_request(rq))
 
 #define blk_pm_suspend_request(rq)	((rq)->flags & REQ_PM_SUSPEND)
 #define blk_pm_resume_request(rq)	((rq)->flags & REQ_PM_RESUME)
-- 
cgit v1.2.3


From 3e2ea65d7be031644c6d6a3ed5bd89af5ef2f090 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:16:44 -0700
Subject: [PATCH] CFQ io scheduler

From: Jens Axboe <axboe@suse.de>

CFQ I/O scheduler
---
 drivers/block/Kconfig.iosched |   7 +
 drivers/block/Makefile        |   1 +
 drivers/block/cfq-iosched.c   | 707 ++++++++++++++++++++++++++++++++++++++++++
 drivers/block/ll_rw_blk.c     |   6 +
 include/linux/elevator.h      |   5 +
 5 files changed, 726 insertions(+)
 create mode 100644 drivers/block/cfq-iosched.c

(limited to 'include/linux')

diff --git a/drivers/block/Kconfig.iosched b/drivers/block/Kconfig.iosched
index fa5476571677..d938c5fd130b 100644
--- a/drivers/block/Kconfig.iosched
+++ b/drivers/block/Kconfig.iosched
@@ -27,3 +27,10 @@ config IOSCHED_DEADLINE
 	  a disk at any one time, its behaviour is almost identical to the
 	  anticipatory I/O scheduler and so is a good choice.
 
+config IOSCHED_CFQ
+	bool "CFQ I/O scheduler" if EMBEDDED
+	default y
+	---help---
+	  The CFQ I/O scheduler tries to distribute bandwidth equally
+	  among all processes in the system. It should provide a fair
+	  working environment, suitable for desktop systems.
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index da1ce1b1361f..33b14e84cd8c 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -18,6 +18,7 @@ obj-y	:= elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
+obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_MAC_FLOPPY)	+= swim3.o
 obj-$(CONFIG_BLK_DEV_FD)	+= floppy.o
 obj-$(CONFIG_BLK_DEV_FD98)	+= floppy98.o
diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c
new file mode 100644
index 000000000000..11528978acf4
--- /dev/null
+++ b/drivers/block/cfq-iosched.c
@@ -0,0 +1,707 @@
+/*
+ *  linux/drivers/block/cfq-iosched.c
+ *
+ *  CFQ, or complete fairness queueing, disk scheduler.
+ *
+ *  Based on ideas from a previously unfinished io
+ *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
+ *
+ *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/hash.h>
+#include <linux/rbtree.h>
+#include <linux/mempool.h>
+
+/*
+ * tunables
+ */
+static int cfq_quantum = 4;
+static int cfq_queued = 8;
+
+#define CFQ_QHASH_SHIFT		6
+#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
+#define list_entry_qhash(entry)	list_entry((entry), struct cfq_queue, cfq_hash)
+
+#define CFQ_MHASH_SHIFT		8
+#define CFQ_MHASH_BLOCK(sec)	((sec) >> 3)
+#define CFQ_MHASH_ENTRIES	(1 << CFQ_MHASH_SHIFT)
+#define CFQ_MHASH_FN(sec)	(hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
+#define ON_MHASH(crq)		!list_empty(&(crq)->hash)
+#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
+#define list_entry_hash(ptr)	list_entry((ptr), struct cfq_rq, hash)
+
+#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
+
+#define RQ_DATA(rq)		((struct cfq_rq *) (rq)->elevator_private)
+
+static kmem_cache_t *crq_pool;
+static kmem_cache_t *cfq_pool;
+static mempool_t *cfq_mpool;
+
+struct cfq_data {
+	struct list_head rr_list;
+	struct list_head *dispatch;
+	struct list_head *cfq_hash;
+
+	struct list_head *crq_hash;
+
+	unsigned int busy_queues;
+	unsigned int max_queued;
+
+	mempool_t *crq_pool;
+};
+
+struct cfq_queue {
+	struct list_head cfq_hash;	/* link in a cfqd->cfq_hash[] bucket */
+	struct list_head cfq_list;	/* link on cfqd->rr_list */
+	struct rb_root sort_list;	/* rb tree of queued cfq_rq, ordered by rb_key */
+	int pid;			/* lookup key; callers pass current->tgid */
+	int queued[2];			/* requests in sort_list, indexed by rq_data_dir() */
+#if 0
+	/*
+	 * with a simple addition like this, we can do io priorities. almost.
+	 * does need a split request free list, too.
+	 */
+	int io_prio;
+#endif
+};
+
+struct cfq_rq {
+	struct rb_node rb_node;
+	sector_t rb_key;
+
+	struct request *request;
+
+	struct cfq_queue *cfq_queue;
+
+	struct list_head hash;
+};
+
+static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
+static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid);
+static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq);
+
+/*
+ * lots of deadline iosched dupes, can be abstracted later...
+ */
+static inline void __cfq_del_crq_hash(struct cfq_rq *crq)
+{
+	list_del_init(&crq->hash);
+}
+
+static inline void cfq_del_crq_hash(struct cfq_rq *crq)
+{
+	if (ON_MHASH(crq))
+		__cfq_del_crq_hash(crq);
+}
+
+static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
+{
+	cfq_del_crq_hash(crq);
+
+	if (q->last_merge == crq->request)
+		q->last_merge = NULL;
+}
+
+static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
+{
+	struct request *rq = crq->request;
+
+	BUG_ON(ON_MHASH(crq));
+
+	list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]);
+}
+
+static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
+{
+	struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
+	struct list_head *entry, *next = hash_list->next;
+
+	while ((entry = next) != hash_list) {
+		struct cfq_rq *crq = list_entry_hash(entry);
+		struct request *__rq = crq->request;
+
+		next = entry->next;
+
+		BUG_ON(!ON_MHASH(crq));
+
+		if (!rq_mergeable(__rq)) {
+			__cfq_del_crq_hash(crq);
+			continue;
+		}
+
+		if (rq_hash_key(__rq) == offset)
+			return __rq;
+	}
+
+	return NULL;
+}
+
+/*
+ * rb tree support functions
+ */
+#define RB_NONE		(2)
+#define RB_EMPTY(node)	((node)->rb_node == NULL)
+#define RB_CLEAR(node)	((node)->rb_color = RB_NONE)
+#define RB_CLEAR_ROOT(root)	((root)->rb_node = NULL)
+#define ON_RB(node)	((node)->rb_color != RB_NONE)
+#define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
+#define rq_rb_key(rq)		(rq)->sector
+
+static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
+{
+	if (ON_RB(&crq->rb_node)) {
+		cfqq->queued[rq_data_dir(crq->request)]--;
+		rb_erase(&crq->rb_node, &cfqq->sort_list);
+		crq->cfq_queue = NULL;
+	}
+}
+
+static struct cfq_rq *
+__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
+{
+	struct rb_node **p = &cfqq->sort_list.rb_node;
+	struct rb_node *parent = NULL;
+	struct cfq_rq *__crq;
+
+	while (*p) {	/* walk down to the empty slot for crq->rb_key */
+		parent = *p;
+		__crq = rb_entry_crq(parent);
+
+		if (crq->rb_key < __crq->rb_key)
+			p = &(*p)->rb_left;
+		else if (crq->rb_key > __crq->rb_key)
+			p = &(*p)->rb_right;
+		else
+			return __crq;	/* key already present: hand back the alias, link nothing */
+	}
+
+	rb_link_node(&crq->rb_node, parent, p);
+	return NULL;	/* linked; the caller still has to rb_insert_color() */
+}
+
+static void
+cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
+{
+	struct request *rq = crq->request;
+	struct cfq_rq *__alias;
+
+	crq->rb_key = rq_rb_key(rq);
+	cfqq->queued[rq_data_dir(rq)]++;
+retry:
+	__alias = __cfq_add_crq_rb(cfqq, crq);
+	if (!__alias) {
+		rb_insert_color(&crq->rb_node, &cfqq->sort_list);
+		crq->cfq_queue = cfqq;
+		return;
+	}
+
+	cfq_del_crq_rb(cfqq, __alias);
+	cfq_dispatch_sort(cfqd->dispatch, __alias);
+	goto retry;
+}
+
+static struct request *
+cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
+{
+	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
+	struct rb_node *n;
+
+	if (!cfqq)
+		goto out;
+
+	n = cfqq->sort_list.rb_node;
+	while (n) {
+		struct cfq_rq *crq = rb_entry_crq(n);
+
+		if (sector < crq->rb_key)
+			n = n->rb_left;
+		else if (sector > crq->rb_key)
+			n = n->rb_right;
+		else
+			return crq->request;
+	}
+
+out:
+	return NULL;
+}
+
+static void cfq_remove_request(request_queue_t *q, struct request *rq)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = RQ_DATA(rq);
+
+	if (crq) {
+		struct cfq_queue *cfqq = crq->cfq_queue;
+
+		cfq_remove_merge_hints(q, crq);
+		list_del_init(&rq->queuelist);
+
+		if (cfqq) {
+			cfq_del_crq_rb(cfqq, crq);
+
+			if (RB_EMPTY(&cfqq->sort_list))
+				cfq_put_queue(cfqd, cfqq);
+		}
+	}
+}
+
+static int
+cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct request *__rq;
+	int ret;
+
+	ret = elv_try_last_merge(q, bio);
+	if (ret != ELEVATOR_NO_MERGE) {
+		__rq = q->last_merge;
+		goto out_insert;
+	}
+
+	__rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
+	if (__rq) {
+		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
+
+		if (elv_rq_merge_ok(__rq, bio)) {
+			ret = ELEVATOR_BACK_MERGE;
+			goto out;
+		}
+	}
+
+	__rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
+	if (__rq) {
+		if (elv_rq_merge_ok(__rq, bio)) {
+			ret = ELEVATOR_FRONT_MERGE;
+			goto out;
+		}
+	}
+
+	return ELEVATOR_NO_MERGE;
+out:
+	q->last_merge = __rq;
+out_insert:
+	*req = __rq;
+	return ret;
+}
+
+static void cfq_merged_request(request_queue_t *q, struct request *req)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = RQ_DATA(req);
+
+	cfq_del_crq_hash(crq);
+	cfq_add_crq_hash(cfqd, crq);
+
+	if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
+		struct cfq_queue *cfqq = crq->cfq_queue;
+
+		cfq_del_crq_rb(cfqq, crq);
+		cfq_add_crq_rb(cfqd, cfqq, crq);
+	}
+
+	q->last_merge = req;
+}
+
+static void
+cfq_merged_requests(request_queue_t *q, struct request *req,
+		    struct request *next)
+{
+	cfq_merged_request(q, req);
+	cfq_remove_request(q, next);
+}
+
+static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq)
+{
+	struct list_head *entry = head;
+	struct request *__rq;
+
+	if (!list_empty(head)) {
+		__rq = list_entry_rq(head->next);
+
+		if (crq->request->sector < __rq->sector) {
+			entry = head->prev;
+			goto link;
+		}
+	}
+
+	while ((entry = entry->prev) != head) {
+		__rq = list_entry_rq(entry);
+
+		if (crq->request->sector <= __rq->sector)
+			break;
+	}
+
+link:
+	list_add_tail(&crq->request->queuelist, entry);
+}
+
+static inline void
+__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
+			struct cfq_queue *cfqq)
+{
+	struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list));
+
+	cfq_del_crq_rb(cfqq, crq);
+	cfq_remove_merge_hints(q, crq);
+	cfq_dispatch_sort(cfqd->dispatch, crq);
+}
+
+static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd)
+{
+	struct cfq_queue *cfqq;
+	struct list_head *entry, *tmp;
+	int ret, queued, good_queues;	/* ret: 1 once anything was moved to the dispatch list */
+
+	if (list_empty(&cfqd->rr_list))
+		return 0;
+
+	queued = ret = 0;
+restart:
+	good_queues = 0;
+	list_for_each_safe(entry, tmp, &cfqd->rr_list) {
+		cfqq = list_entry_cfqq(entry);	/* one request per queue per pass, not the head each time */
+
+		BUG_ON(RB_EMPTY(&cfqq->sort_list));
+
+		__cfq_dispatch_requests(q, cfqd, cfqq);	/* moves this queue's first sorted request */
+
+		if (RB_EMPTY(&cfqq->sort_list))
+			cfq_put_queue(cfqd, cfqq);	/* unlinks entry; tmp keeps the walk valid */
+		else
+			good_queues++;
+
+		queued++;
+		ret = 1;
+	}
+
+	if ((queued < cfq_quantum) && good_queues)	/* go round again while under quantum and work remains */
+		goto restart;
+
+	return ret;
+}
+
+static struct request *cfq_next_request(request_queue_t *q)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct request *rq;
+
+	if (!list_empty(cfqd->dispatch)) {
+		struct cfq_rq *crq;
+dispatch:
+		rq = list_entry_rq(cfqd->dispatch->next);
+
+		BUG_ON(q->last_merge == rq);
+		crq = RQ_DATA(rq);
+		if (crq)
+			BUG_ON(ON_MHASH(crq));
+
+		return rq;
+	}
+
+	if (cfq_dispatch_requests(q, cfqd))
+		goto dispatch;
+
+	return NULL;
+}
+
+static inline struct cfq_queue *
+__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval)
+{
+	struct list_head *hash_list = &cfqd->cfq_hash[hashval];
+	struct list_head *entry;
+
+	list_for_each(entry, hash_list) {
+		struct cfq_queue *__cfqq = list_entry_qhash(entry);
+
+		if (__cfqq->pid == pid)
+			return __cfqq;
+	}
+
+	return NULL;
+}
+
+static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid)
+{
+	const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
+
+	return __cfq_find_cfq_hash(cfqd, pid, hashval);
+}
+
+static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	cfqd->busy_queues--;
+	list_del(&cfqq->cfq_list);
+	list_del(&cfqq->cfq_hash);
+	mempool_free(cfqq, cfq_mpool);
+}
+
+static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid)
+{
+	const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
+	struct cfq_queue *cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval);
+
+	if (!cfqq) {
+		cfqq = mempool_alloc(cfq_mpool, GFP_NOIO);
+
+		INIT_LIST_HEAD(&cfqq->cfq_hash);
+		INIT_LIST_HEAD(&cfqq->cfq_list);
+		RB_CLEAR_ROOT(&cfqq->sort_list);
+
+		cfqq->pid = pid;
+		cfqq->queued[0] = cfqq->queued[1] = 0;
+		list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
+	}
+
+	return cfqq;
+}
+
+static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
+{
+	struct cfq_queue *cfqq;
+
+	cfqq = cfq_get_queue(cfqd, current->tgid);
+
+	cfq_add_crq_rb(cfqd, cfqq, crq);
+
+	if (list_empty(&cfqq->cfq_list)) {
+		list_add(&cfqq->cfq_list, &cfqd->rr_list);
+		cfqd->busy_queues++;
+	}
+}
+
+static void
+cfq_insert_request(request_queue_t *q, struct request *rq, int where)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = RQ_DATA(rq);
+
+	switch (where) {
+		case ELEVATOR_INSERT_BACK:
+			while (cfq_dispatch_requests(q, cfqd))
+				;
+			list_add_tail(&rq->queuelist, cfqd->dispatch);
+			break;
+		case ELEVATOR_INSERT_FRONT:
+			list_add(&rq->queuelist, cfqd->dispatch);
+			break;
+		case ELEVATOR_INSERT_SORT:
+			BUG_ON(!blk_fs_request(rq));
+			cfq_enqueue(cfqd, crq);
+			break;
+		default:
+			printk("%s: bad insert point %d\n", __FUNCTION__,where);
+			return;
+	}
+
+	if (rq_mergeable(rq)) {
+		cfq_add_crq_hash(cfqd, crq);
+
+		if (!q->last_merge)
+			q->last_merge = rq;
+	}
+}
+
+static int cfq_queue_empty(request_queue_t *q)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+
+	if (list_empty(cfqd->dispatch) && list_empty(&cfqd->rr_list))
+		return 1;
+
+	return 0;
+}
+
+static struct request *
+cfq_former_request(request_queue_t *q, struct request *rq)
+{
+	struct cfq_rq *crq = RQ_DATA(rq);
+	struct rb_node *rbprev = rb_prev(&crq->rb_node);
+
+	if (rbprev)
+		return rb_entry_crq(rbprev)->request;
+
+	return NULL;
+}
+
+static struct request *
+cfq_latter_request(request_queue_t *q, struct request *rq)
+{
+	struct cfq_rq *crq = RQ_DATA(rq);
+	struct rb_node *rbnext = rb_next(&crq->rb_node);
+
+	if (rbnext)
+		return rb_entry_crq(rbnext)->request;
+
+	return NULL;
+}
+
+static int cfq_may_queue(request_queue_t *q, int rw)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_queue *cfqq;
+	int ret = 1;
+
+	if (!cfqd->busy_queues)
+		goto out;
+
+	cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
+	if (cfqq) {
+		int limit = (q->nr_requests - cfq_queued) / cfqd->busy_queues;
+
+		if (limit < 3)
+			limit = 3;
+		else if (limit > cfqd->max_queued)
+			limit = cfqd->max_queued;
+
+		if (cfqq->queued[rw] > limit)
+			ret = 0;
+	}
+out:
+	return ret;
+}
+
+static void cfq_put_request(request_queue_t *q, struct request *rq)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = RQ_DATA(rq);
+
+	if (crq) {
+		BUG_ON(q->last_merge == rq);
+		BUG_ON(ON_MHASH(crq));
+
+		mempool_free(crq, cfqd->crq_pool);
+		rq->elevator_private = NULL;
+	}
+}
+
+static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
+
+	if (crq) {
+		RB_CLEAR(&crq->rb_node);
+		crq->request = rq;
+		crq->cfq_queue = NULL;
+		INIT_LIST_HEAD(&crq->hash);
+		rq->elevator_private = crq;
+		return 0;
+	}
+
+	return 1;
+}
+
+static void cfq_exit(request_queue_t *q, elevator_t *e)
+{
+	struct cfq_data *cfqd = e->elevator_data;
+
+	e->elevator_data = NULL;
+	mempool_destroy(cfqd->crq_pool);
+	kfree(cfqd->crq_hash);
+	kfree(cfqd->cfq_hash);
+	kfree(cfqd);
+}
+
+static int cfq_init(request_queue_t *q, elevator_t *e)
+{
+	struct cfq_data *cfqd;
+	int i;
+
+	cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
+	if (!cfqd)
+		return -ENOMEM;
+
+	memset(cfqd, 0, sizeof(*cfqd));
+	INIT_LIST_HEAD(&cfqd->rr_list);
+
+	cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES, GFP_KERNEL);
+	if (!cfqd->crq_hash)
+		goto out_crqhash;
+
+	cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES, GFP_KERNEL);
+	if (!cfqd->cfq_hash)
+		goto out_cfqhash;
+
+	cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab, mempool_free_slab, crq_pool);
+	if (!cfqd->crq_pool)
+		goto out_crqpool;
+
+	for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
+		INIT_LIST_HEAD(&cfqd->crq_hash[i]);
+	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
+		INIT_LIST_HEAD(&cfqd->cfq_hash[i]);
+
+	cfqd->dispatch = &q->queue_head;
+	e->elevator_data = cfqd;
+
+	/*
+	 * just set it to some high value, we want anyone to be able to queue
+	 * some requests. fairness is handled differently
+	 */
+	cfqd->max_queued = q->nr_requests;
+	q->nr_requests = 8192;
+
+	return 0;
+out_crqpool:
+	kfree(cfqd->cfq_hash);
+out_cfqhash:
+	kfree(cfqd->crq_hash);
+out_crqhash:
+	kfree(cfqd);
+	return -ENOMEM;
+}
+
+static int __init cfq_slab_setup(void)
+{
+	crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq), 0, 0,
+					NULL, NULL);
+
+	if (!crq_pool)
+		panic("cfq_iosched: can't init crq pool\n");
+
+	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue), 0, 0,
+					NULL, NULL);
+
+	if (!cfq_pool)
+		panic("cfq_iosched: can't init cfq pool\n");
+
+	cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab, cfq_pool);
+
+	if (!cfq_mpool)
+		panic("cfq_iosched: can't init cfq mpool\n");
+
+	return 0;
+}
+
+subsys_initcall(cfq_slab_setup);
+
+elevator_t iosched_cfq = {
+	.elevator_name =		"cfq",
+	.elevator_merge_fn = 		cfq_merge,
+	.elevator_merged_fn =		cfq_merged_request,
+	.elevator_merge_req_fn =	cfq_merged_requests,
+	.elevator_next_req_fn =		cfq_next_request,
+	.elevator_add_req_fn =		cfq_insert_request,
+	.elevator_remove_req_fn =	cfq_remove_request,
+	.elevator_queue_empty_fn =	cfq_queue_empty,
+	.elevator_former_req_fn =	cfq_former_request,
+	.elevator_latter_req_fn =	cfq_latter_request,
+	.elevator_set_req_fn =		cfq_set_request,
+	.elevator_put_req_fn =		cfq_put_request,
+	.elevator_may_queue_fn =	cfq_may_queue,
+	.elevator_init_fn =		cfq_init,
+	.elevator_exit_fn =		cfq_exit,
+};
+
+EXPORT_SYMBOL(iosched_cfq);
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 6b0ff2c5f092..5ee752d64f4a 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -1311,6 +1311,8 @@ static elevator_t *chosen_elevator =
 	&iosched_as;
 #elif defined(CONFIG_IOSCHED_DEADLINE)
 	&iosched_deadline;
+#elif defined(CONFIG_IOSCHED_CFQ)
+	&iosched_cfq;
 #elif defined(CONFIG_IOSCHED_NOOP)
 	&elevator_noop;
 #else
@@ -1329,6 +1331,10 @@ static int __init elevator_setup(char *str)
 	if (!strcmp(str, "as"))
 		chosen_elevator = &iosched_as;
 #endif
+#ifdef CONFIG_IOSCHED_CFQ
+	if (!strcmp(str, "cfq"))
+		chosen_elevator = &iosched_cfq;
+#endif
 #ifdef CONFIG_IOSCHED_NOOP
 	if (!strcmp(str, "noop"))
 		chosen_elevator = &elevator_noop;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ce58f47126c1..27e8183f4776 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -94,6 +94,11 @@ extern elevator_t iosched_deadline;
  */
 extern elevator_t iosched_as;
 
+/*
+ * completely fair queueing I/O scheduler
+ */
+extern elevator_t iosched_cfq;
+
 extern int elevator_init(request_queue_t *, elevator_t *);
 extern void elevator_exit(request_queue_t *);
 extern int elv_rq_merge_ok(struct request *, struct bio *);
-- 
cgit v1.2.3


From 4c4acd2447ef473f23aee53f04518f93840a8693 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:53:50 -0700
Subject: [PATCH] rmap 1 linux/rmap.h

From: Hugh Dickins <hugh@veritas.com>

First of a batch of three rmap patches: this initial batch of three paving
the way for a move to some form of object-based rmap (probably Andrea's, but
drawing from mine too), and making almost no functional change by itself.  A
few days will intervene before the next batch, to give the struct page
changes in the second patch some exposure before proceeding.

rmap 1 create include/linux/rmap.h

Start small: linux/rmap-locking.h has already gathered some declarations
unrelated to locking, and the rest of the rmap declarations were over in
linux/swap.h: gather them all together in linux/rmap.h, and rename the
pte_chain_lock to rmap_lock.
---
 fs/exec.c                    |  2 +-
 include/linux/page-flags.h   |  2 +-
 include/linux/rmap-locking.h | 23 --------------------
 include/linux/rmap.h         | 52 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/swap.h         | 16 --------------
 mm/fremap.c                  |  2 +-
 mm/memory.c                  |  2 +-
 mm/mremap.c                  |  2 +-
 mm/rmap.c                    | 20 ++++++++---------
 mm/swapfile.c                |  2 +-
 mm/vmscan.c                  | 24 ++++++++++----------
 11 files changed, 79 insertions(+), 68 deletions(-)
 delete mode 100644 include/linux/rmap-locking.h
 create mode 100644 include/linux/rmap.h

(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index 5fb9f8f7c38f..1ea7c8d6c898 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -45,7 +45,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index bd6ddb279c55..93f22640b6cb 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -69,7 +69,7 @@
 #define PG_private		12	/* Has something at ->private */
 #define PG_writeback		13	/* Page is under writeback */
 #define PG_nosave		14	/* Used for system suspend/resume */
-#define PG_chainlock		15	/* lock bit for ->pte_chain */
+#define PG_maplock		15	/* Lock bit for rmap to ptes */
 
 #define PG_direct		16	/* ->pte_chain points directly at pte */
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
diff --git a/include/linux/rmap-locking.h b/include/linux/rmap-locking.h
deleted file mode 100644
index cb30ed470cf6..000000000000
--- a/include/linux/rmap-locking.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * include/linux/rmap-locking.h
- *
- * Locking primitives for exclusive access to a page's reverse-mapping
- * pte chain.
- */
-
-#include <linux/slab.h>
-
-struct pte_chain;
-extern kmem_cache_t *pte_chain_cache;
-
-#define pte_chain_lock(page)	bit_spin_lock(PG_chainlock, (unsigned long *)&page->flags)
-#define pte_chain_unlock(page)	bit_spin_unlock(PG_chainlock, (unsigned long *)&page->flags)
-
-struct pte_chain *pte_chain_alloc(int gfp_flags);
-void __pte_chain_free(struct pte_chain *pte_chain);
-
-static inline void pte_chain_free(struct pte_chain *pte_chain)
-{
-	if (pte_chain)
-		__pte_chain_free(pte_chain);
-}
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
new file mode 100644
index 000000000000..5f9b35f2fa65
--- /dev/null
+++ b/include/linux/rmap.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_RMAP_H
+#define _LINUX_RMAP_H
+/*
+ * Declarations for Reverse Mapping functions in mm/rmap.c
+ * Its structures are declared within that file.
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+
+#define rmap_lock(page) \
+	bit_spin_lock(PG_maplock, (unsigned long *)&(page)->flags)
+#define rmap_unlock(page) \
+	bit_spin_unlock(PG_maplock, (unsigned long *)&(page)->flags)
+
+#ifdef CONFIG_MMU
+
+struct pte_chain;
+struct pte_chain *pte_chain_alloc(int gfp_flags);
+void __pte_chain_free(struct pte_chain *pte_chain);
+
+static inline void pte_chain_free(struct pte_chain *pte_chain)
+{
+	if (pte_chain)
+		__pte_chain_free(pte_chain);
+}
+
+struct pte_chain * fastcall
+	page_add_rmap(struct page *, pte_t *, struct pte_chain *);
+void fastcall page_remove_rmap(struct page *, pte_t *);
+
+/*
+ * Called from mm/vmscan.c to handle paging out
+ */
+int fastcall page_referenced(struct page *);
+int fastcall try_to_unmap(struct page *);
+
+#else	/* !CONFIG_MMU */
+
+#define page_referenced(page)	TestClearPageReferenced(page)
+#define try_to_unmap(page)	SWAP_FAIL
+
+#endif	/* CONFIG_MMU */
+
+/*
+ * Return values of try_to_unmap
+ */
+#define SWAP_SUCCESS	0
+#define SWAP_AGAIN	1
+#define SWAP_FAIL	2
+
+#endif	/* _LINUX_RMAP_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d189090cf63a..f911d8afb8a5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -76,7 +76,6 @@ struct reclaim_state {
 #ifdef __KERNEL__
 
 struct address_space;
-struct pte_chain;
 struct sysinfo;
 struct writeback_control;
 struct zone;
@@ -177,26 +176,11 @@ extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
-/* linux/mm/rmap.c */
 #ifdef CONFIG_MMU
-int FASTCALL(page_referenced(struct page *));
-struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *,
-					struct pte_chain *));
-void FASTCALL(page_remove_rmap(struct page *, pte_t *));
-int FASTCALL(try_to_unmap(struct page *));
-
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
-#else
-#define page_referenced(page)	TestClearPageReferenced(page)
-#define try_to_unmap(page)	SWAP_FAIL
 #endif /* CONFIG_MMU */
 
-/* return values of try_to_unmap */
-#define	SWAP_SUCCESS	0
-#define	SWAP_AGAIN	1
-#define	SWAP_FAIL	2
-
 extern void swap_unplug_io_fn(struct backing_dev_info *);
 
 #ifdef CONFIG_SWAP
diff --git a/mm/fremap.c b/mm/fremap.c
index dacebb172f6f..2c8abe6d1f5a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -12,7 +12,7 @@
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/swapops.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/module.h>
 
 #include <asm/mmu_context.h>
diff --git a/mm/memory.c b/mm/memory.c
index f7f1649b848c..40695793393c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -43,7 +43,7 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
diff --git a/mm/mremap.c b/mm/mremap.c
index e59e9355055e..c355d4da4afe 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -15,7 +15,7 @@
 #include <linux/swap.h>
 #include <linux/fs.h>
 #include <linux/highmem.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/security.h>
 
 #include <asm/uaccess.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index b960734c8724..3f304d8fd38a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -13,7 +13,7 @@
 
 /*
  * Locking:
- * - the page->pte.chain is protected by the PG_chainlock bit,
+ * - the page->pte.chain is protected by the PG_maplock bit,
  *   which nests within the the mm->page_table_lock,
  *   which nests within the page lock.
  * - because swapout locking is opposite to the locking order
@@ -26,7 +26,7 @@
 #include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/cache.h>
 #include <linux/percpu.h>
 
@@ -108,7 +108,7 @@ pte_chain_encode(struct pte_chain *pte_chain, int idx)
  *
  * Quick test_and_clear_referenced for all mappings to a page,
  * returns the number of processes which referenced the page.
- * Caller needs to hold the pte_chain_lock.
+ * Caller needs to hold the rmap lock.
  *
  * If the page has a single-entry pte_chain, collapse that back to a PageDirect
  * representation.  This way, it's only done under memory pressure.
@@ -175,7 +175,7 @@ page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
 	if (PageReserved(page))
 		return pte_chain;
 
-	pte_chain_lock(page);
+	rmap_lock(page);
 
 	if (page->pte.direct == 0) {
 		page->pte.direct = pte_paddr;
@@ -208,7 +208,7 @@ page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
 	cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
 	cur_pte_chain->next_and_idx--;
 out:
-	pte_chain_unlock(page);
+	rmap_unlock(page);
 	return pte_chain;
 }
 
@@ -230,7 +230,7 @@ void fastcall page_remove_rmap(struct page *page, pte_t *ptep)
 	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 		return;
 
-	pte_chain_lock(page);
+	rmap_lock(page);
 
 	if (!page_mapped(page))
 		goto out_unlock;	/* remap_page_range() from a driver? */
@@ -276,8 +276,7 @@ out:
 	if (!page_mapped(page))
 		dec_page_state(nr_mapped);
 out_unlock:
-	pte_chain_unlock(page);
-	return;
+	rmap_unlock(page);
 }
 
 /**
@@ -290,10 +289,9 @@ out_unlock:
  * to the locking order used by the page fault path, we use trylocks.
  * Locking:
  *	    page lock			shrink_list(), trylock
- *		pte_chain_lock		shrink_list()
+ *		rmap lock		shrink_list()
  *		    mm->page_table_lock	try_to_unmap_one(), trylock
  */
-static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
 static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
 {
 	pte_t *ptep = rmap_ptep_map(paddr);
@@ -376,7 +374,7 @@ out_unlock:
  *
  * Tries to remove all the page table entries which are mapping this
  * page, used in the pageout path.  Caller must hold the page lock
- * and its pte chain lock.  Return values are:
+ * and its rmap lock.  Return values are:
  *
  * SWAP_SUCCESS	- we succeeded in removing all mappings
  * SWAP_AGAIN	- we missed a trylock, try again later
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f885e6d17a49..44e214da0270 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -21,7 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 39e8ed0fcdd6..35fbca1c5168 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -28,7 +28,7 @@
 #include <linux/mm_inline.h>
 #include <linux/pagevec.h>
 #include <linux/backing-dev.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
@@ -173,7 +173,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask)
 	return 0;
 }
 
-/* Must be called with page's pte_chain_lock held. */
+/* Must be called with page's rmap lock held. */
 static inline int page_mapping_inuse(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
@@ -278,11 +278,11 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 		if (PageWriteback(page))
 			goto keep_locked;
 
-		pte_chain_lock(page);
+		rmap_lock(page);
 		referenced = page_referenced(page);
 		if (referenced && page_mapping_inuse(page)) {
 			/* In active use or really unfreeable.  Activate it. */
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 			goto activate_locked;
 		}
 
@@ -296,10 +296,10 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 		 * XXX: implement swap clustering ?
 		 */
 		if (page_mapped(page) && !mapping && !PagePrivate(page)) {
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 			if (!add_to_swap(page))
 				goto activate_locked;
-			pte_chain_lock(page);
+			rmap_lock(page);
 			mapping = page->mapping;
 		}
 #endif /* CONFIG_SWAP */
@@ -314,16 +314,16 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page)) {
 			case SWAP_FAIL:
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				goto activate_locked;
 			case SWAP_AGAIN:
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				goto keep_locked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
 		}
-		pte_chain_unlock(page);
+		rmap_unlock(page);
 
 		/*
 		 * If the page is dirty, only perform writeback if that write
@@ -657,13 +657,13 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			pte_chain_lock(page);
+			rmap_lock(page);
 			if (page_referenced(page)) {
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 		}
 		/*
 		 * FIXME: need to consider page_count(page) here if/when we
-- 
cgit v1.2.3


From 4875a6018bcc53201ddbf745bff35ed723b468eb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:54:03 -0700
Subject: [PATCH] rmap 2 anon and swapcache

From: Hugh Dickins <hugh@veritas.com>

Tracking anonymous pages by anon_vma,pgoff or mm,address needs a
pointer,offset pair in struct page: mapping,index is the natural choice.  But
swapcache already uses those for &swapper_space,swp_entry_t.

It's trivial to separate swapcache from pagecache with radix tree; most of
swapper_space is actually unused, just a fiction to pretend swap like file;
and page->private is a good place to keep swp_entry_t, now that swap never
uses bufferheads.

Define PG_anon bit; page_add_rmap does SetPageAnon and puts an oopsable
address in page->mapping to test that we're not confused by it.  Define
page_mapping(page) inline to give NULL when PageAnon, whatever may be in
page->mapping.  Define PG_swapcache bit, and deduce swapper_space from that
in the few places we need it.

add_to_swap_cache is now distinct from add_to_page_cache.  Separating the
caches somewhat simplifies the tmpfs swizzling in swap_state.c: now the page
can briefly be in both caches.

The rmap method remains pte chains, no change to that yet.  But one small
functional difference: the use of PageAnon implies that a page truncated
while still mapped will no longer be found and freed (swapped out) by
try_to_unmap; it will only be freed by exit or munmap.  But normally pages are
unmapped by vmtruncate: this should only affect nonlinear mappings, and a
later patch not in this batch will fix that.
---
 fs/buffer.c                |  19 ++----
 include/linux/mm.h         |  38 +++++------
 include/linux/page-flags.h |  17 +++--
 mm/filemap.c               |  25 ++++---
 mm/memory.c                |   4 +-
 mm/page-writeback.c        |  28 ++++++--
 mm/page_alloc.c            |   9 +++
 mm/page_io.c               |  38 ++---------
 mm/rmap.c                  |  50 +++++++++-----
 mm/swap_state.c            | 163 +++++++++++++++++++++++----------------------
 mm/swapfile.c              |  34 ++++++----
 mm/vmscan.c                |  34 +++++-----
 12 files changed, 242 insertions(+), 217 deletions(-)

(limited to 'include/linux')

diff --git a/fs/buffer.c b/fs/buffer.c
index 8ab66d0b7548..99f1ce112ea9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -836,19 +836,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  *
  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
  * address_space though.
- *
- * For now, we treat swapper_space specially.  It doesn't use the normal
- * block a_ops.
  */
 int __set_page_dirty_buffers(struct page *page)
 {
 	struct address_space * const mapping = page->mapping;
-	int ret = 0;
-
-	if (mapping == NULL) {
-		SetPageDirty(page);
-		goto out;
-	}
 
 	spin_lock(&mapping->private_lock);
 	if (page_has_buffers(page)) {
@@ -877,8 +868,7 @@ int __set_page_dirty_buffers(struct page *page)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 	
-out:
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
 
@@ -1577,8 +1567,7 @@ int try_to_release_page(struct page *page, int gfp_mask)
 {
 	struct address_space * const mapping = page->mapping;
 
-	if (!PageLocked(page))
-		BUG();
+	BUG_ON(!PageLocked(page));
 	if (PageWriteback(page))
 		return 0;
 	
@@ -2895,14 +2884,14 @@ int try_to_free_buffers(struct page *page)
 	if (PageWriteback(page))
 		return 0;
 
-	if (mapping == NULL) {		/* swapped-in anon page */
+	if (mapping == NULL) {		/* can this still happen? */
 		ret = drop_buffers(page, &buffers_to_free);
 		goto out;
 	}
 
 	spin_lock(&mapping->private_lock);
 	ret = drop_buffers(page, &buffers_to_free);
-	if (ret && !PageSwapCache(page)) {
+	if (ret) {
 		/*
 		 * If the filesystem writes its buffers by hand (eg ext3)
 		 * then we can have clean buffers against a dirty page.  We
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6d6abe8c656e..796f498658d6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -189,8 +189,11 @@ struct page {
 					 * protected by PG_chainlock */
 		pte_addr_t direct;
 	} pte;
-	unsigned long private;		/* mapping-private opaque data */
-
+	unsigned long private;		/* Mapping-private opaque data:
+					 * usually used for buffer_heads
+					 * if PagePrivate set; used for
+					 * swp_entry_t if PageSwapCache
+					 */
 	/*
 	 * On machines where all RAM is mapped into kernel address space,
 	 * we can simply calculate the virtual address. On machines with
@@ -402,6 +405,19 @@ void page_address_init(void);
 #define page_address_init()  do { } while(0)
 #endif
 
+/*
+ * On an anonymous page mapped into a user virtual memory area,
+ * page->mapping points to its anon_vma, not to a struct address_space.
+ *
+ * Please note that, confusingly, "page_mapping" refers to the inode
+ * address_space which maps the page from disk; whereas "page_mapped"
+ * refers to user virtual address space into which the page is mapped.
+ */
+static inline struct address_space *page_mapping(struct page *page)
+{
+	return PageAnon(page)? NULL: page->mapping;
+}
+
 /*
  * Return true if this page is mapped into pagetables.  Subtle: test pte.direct
  * rather than pte.chain.  Because sometimes pte.direct is 64-bit, and .chain
@@ -471,6 +487,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
+int FASTCALL(set_page_dirty(struct page *page));
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
@@ -497,23 +514,6 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
-/*
- * If the mapping doesn't provide a set_page_dirty a_op, then
- * just fall through and assume that it wants buffer_heads.
- * FIXME: make the method unconditional.
- */
-static inline int set_page_dirty(struct page *page)
-{
-	if (page->mapping) {
-		int (*spd)(struct page *);
-
-		spd = page->mapping->a_ops->set_page_dirty;
-		if (spd)
-			return (*spd)(page);
-	}
-	return __set_page_dirty_buffers(page);
-}
-
 /*
  * On a two-level page table, this ends up being trivial. Thus the
  * inlining and the symmetry break with pte_alloc_map() that does all
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 93f22640b6cb..6959827c9f62 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -75,6 +75,8 @@
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
+#define PG_anon			20	/* Anonymous page: anon_vma in mapping*/
+#define PG_swapcache		21	/* Swap page: swp_entry_t in private */
 
 
 /*
@@ -298,15 +300,16 @@ extern void get_full_page_state(struct page_state *ret);
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
 
-/*
- * The PageSwapCache predicate doesn't use a PG_flag at this time,
- * but it may again do so one day.
- */
+#define PageAnon(page)		test_bit(PG_anon, &(page)->flags)
+#define SetPageAnon(page)	set_bit(PG_anon, &(page)->flags)
+#define ClearPageAnon(page)	clear_bit(PG_anon, &(page)->flags)
+
 #ifdef CONFIG_SWAP
-extern struct address_space swapper_space;
-#define PageSwapCache(page) ((page)->mapping == &swapper_space)
+#define PageSwapCache(page)	test_bit(PG_swapcache, &(page)->flags)
+#define SetPageSwapCache(page)	set_bit(PG_swapcache, &(page)->flags)
+#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags)
 #else
-#define PageSwapCache(page) 0
+#define PageSwapCache(page)	0
 #endif
 
 struct page;	/* forward declaration */
diff --git a/mm/filemap.c b/mm/filemap.c
index dc2f0992d879..ca8fc1148296 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -122,9 +122,13 @@ static inline int sync_page(struct page *page)
 	struct address_space *mapping;
 
 	smp_mb();
-	mapping = page->mapping;
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		return mapping->a_ops->sync_page(page);
+	mapping = page_mapping(page);
+	if (mapping) {
+		if (mapping->a_ops && mapping->a_ops->sync_page)
+			return mapping->a_ops->sync_page(page);
+	} else if (PageSwapCache(page)) {
+		swap_unplug_io_fn(NULL);
+	}
 	return 0;
 }
 
@@ -242,13 +246,9 @@ int filemap_write_and_wait(struct address_space *mapping)
  * This function is used for two things: adding newly allocated pagecache
  * pages and for moving existing anon pages into swapcache.
  *
- * In the case of pagecache pages, the page is new, so we can just run
- * SetPageLocked() against it.  The other page state flags were set by
- * rmqueue()
- *
- * In the case of swapcache, try_to_swap_out() has already locked the page, so
- * SetPageLocked() is ugly-but-OK there too.  The required page state has been
- * set up by swap_out_add_to_swap_cache().
+ * This function is used to add newly allocated pagecache pages:
+ * the page is new, so we can just run SetPageLocked() against it.
+ * The other page state flags were set by rmqueue().
  *
  * This function does not add the page to the LRU.  The caller must do that.
  */
@@ -263,7 +263,10 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
 			SetPageLocked(page);
-			___add_to_page_cache(page, mapping, offset);
+			page->mapping = mapping;
+			page->index = offset;
+			mapping->nrpages++;
+			pagecache_acct(1);
 		} else {
 			page_cache_release(page);
 		}
diff --git a/mm/memory.c b/mm/memory.c
index 40695793393c..95b9b84d8478 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -417,8 +417,8 @@ zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd,
 				if (!PageReserved(page)) {
 					if (pte_dirty(pte))
 						set_page_dirty(page);
-					if (page->mapping && pte_young(pte) &&
-							!PageSwapCache(page))
+					if (pte_young(pte) &&
+							page_mapping(page))
 						mark_page_accessed(page);
 					tlb->freed++;
 					page_remove_rmap(page, ptep);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9cf47af10ccc..22e17333982a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -579,6 +579,24 @@ int __set_page_dirty_nobuffers(struct page *page)
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 
+/*
+ * If the mapping doesn't provide a set_page_dirty a_op, then
+ * just fall through and assume that it wants buffer_heads.
+ */
+int fastcall set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+	int (*spd)(struct page *);
+
+	if (!mapping) {
+		SetPageDirty(page);
+		return 0;
+	}
+	spd = mapping->a_ops->set_page_dirty;
+	return spd? (*spd)(page): __set_page_dirty_buffers(page);
+}
+EXPORT_SYMBOL(set_page_dirty);
+
 /*
  * set_page_dirty() is racy if the caller has no reference against
  * page->mapping->host, and if the page is unlocked.  This is because another
@@ -606,7 +624,7 @@ EXPORT_SYMBOL(set_page_dirty_lock);
  */
 int test_clear_page_dirty(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 	unsigned long flags;
 
 	if (mapping) {
@@ -642,7 +660,7 @@ EXPORT_SYMBOL(test_clear_page_dirty);
  */
 int clear_page_dirty_for_io(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
@@ -661,7 +679,7 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
  */
 int __clear_page_dirty(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 
 	if (mapping) {
 		unsigned long flags;
@@ -681,7 +699,7 @@ int __clear_page_dirty(struct page *page)
 
 int test_clear_page_writeback(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 	int ret;
 
 	if (mapping) {
@@ -701,7 +719,7 @@ int test_clear_page_writeback(struct page *page)
 
 int test_set_page_writeback(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 	int ret;
 
 	if (mapping) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4148e94eee13..6b4d5dc0c930 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -84,6 +84,9 @@ static void bad_page(const char *function, struct page *page)
 			1 << PG_lru	|
 			1 << PG_active	|
 			1 << PG_dirty	|
+			1 << PG_maplock |
+			1 << PG_anon    |
+			1 << PG_swapcache |
 			1 << PG_writeback);
 	set_page_count(page, 0);
 	page->mapping = NULL;
@@ -224,6 +227,9 @@ static inline void free_pages_check(const char *function, struct page *page)
 			1 << PG_active	|
 			1 << PG_reclaim	|
 			1 << PG_slab	|
+			1 << PG_maplock |
+			1 << PG_anon    |
+			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(function, page);
 	if (PageDirty(page))
@@ -331,6 +337,9 @@ static void prep_new_page(struct page *page, int order)
 			1 << PG_active	|
 			1 << PG_dirty	|
 			1 << PG_reclaim	|
+			1 << PG_maplock |
+			1 << PG_anon    |
+			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(__FUNCTION__, page);
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 7ec159ded5ca..dbbc4e5b2e1e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -16,8 +16,6 @@
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/swapops.h>
-#include <linux/buffer_head.h>	/* for block_sync_page() */
-#include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
@@ -32,7 +30,7 @@ get_swap_bio(int gfp_flags, struct page *page, bio_end_io_t end_io)
 		swp_entry_t entry;
 
 		BUG_ON(!PageSwapCache(page));
-		entry.val = page->index;
+		entry.val = page->private;
 		sis = get_swap_info_struct(swp_type(entry));
 
 		bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
@@ -132,13 +130,6 @@ out:
 	return ret;
 }
 
-struct address_space_operations swap_aops = {
-	.writepage	= swap_writepage,
-	.readpage	= swap_readpage,
-	.sync_page	= block_sync_page,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
-};
-
 #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK)
 
 /*
@@ -148,25 +139,15 @@ struct address_space_operations swap_aops = {
 int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
 {
 	int ret;
+	unsigned long save_private;
 	struct writeback_control swap_wbc = {
 		.sync_mode = WB_SYNC_ALL,
 	};
 
 	lock_page(page);
-
-	BUG_ON(page->mapping);
-	ret = add_to_page_cache(page, &swapper_space,
-				entry.val, GFP_NOIO|__GFP_NOFAIL);
-	if (ret) {
-		unlock_page(page);
-		goto out;
-	}
-
-	/*
-	 * get one more reference to make page non-exclusive so
-	 * remove_exclusive_swap_page won't mess with it.
-	 */
-	page_cache_get(page);
+	SetPageSwapCache(page);
+	save_private = page->private;
+	page->private = entry.val;
 
 	if (rw == READ) {
 		ret = swap_readpage(NULL, page);
@@ -176,15 +157,10 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
 		wait_on_page_writeback(page);
 	}
 
-	lock_page(page);
-	remove_from_page_cache(page);
-	unlock_page(page);
-	page_cache_release(page);
-	page_cache_release(page);	/* For add_to_page_cache() */
-
+	ClearPageSwapCache(page);
+	page->private = save_private;
 	if (ret == 0 && (!PageUptodate(page) || PageError(page)))
 		ret = -EIO;
-out:
 	return ret;
 }
 #endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 3f304d8fd38a..455b498a9591 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -35,7 +35,18 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-/* #define DEBUG_RMAP */
+/*
+ * Something oopsable to put for now in the page->mapping
+ * of an anonymous page, to test that it is ignored.
+ */
+#define ANON_MAPPING_DEBUG	((struct address_space *) 0xADB)
+
+static inline void clear_page_anon(struct page *page)
+{
+	BUG_ON(page->mapping != ANON_MAPPING_DEBUG);
+	page->mapping = NULL;
+	ClearPageAnon(page);
+}
 
 /*
  * Shared pages have a chain of pte_chain structures, used to locate
@@ -180,6 +191,10 @@ page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
 	if (page->pte.direct == 0) {
 		page->pte.direct = pte_paddr;
 		SetPageDirect(page);
+		if (!page->mapping) {
+			SetPageAnon(page);
+			page->mapping = ANON_MAPPING_DEBUG;
+		}
 		inc_page_state(nr_mapped);
 		goto out;
 	}
@@ -271,10 +286,13 @@ void fastcall page_remove_rmap(struct page *page, pte_t *ptep)
 		}
 	}
 out:
-	if (page->pte.direct == 0 && page_test_and_clear_dirty(page))
-		set_page_dirty(page);
-	if (!page_mapped(page))
+	if (!page_mapped(page)) {
+		if (page_test_and_clear_dirty(page))
+			set_page_dirty(page);
+		if (PageAnon(page))
+			clear_page_anon(page);
 		dec_page_state(nr_mapped);
+	}
 out_unlock:
 	rmap_unlock(page);
 }
@@ -330,12 +348,13 @@ static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
 	flush_cache_page(vma, address);
 	pte = ptep_clear_flush(vma, address, ptep);
 
-	if (PageSwapCache(page)) {
+	if (PageAnon(page)) {
+		swp_entry_t entry = { .val = page->private };
 		/*
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
 		 */
-		swp_entry_t entry = { .val = page->index };
+		BUG_ON(!PageSwapCache(page));
 		swap_duplicate(entry);
 		set_pte(ptep, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*ptep));
@@ -345,6 +364,7 @@ static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
 		 * If a nonlinear mapping then store the file page offset
 		 * in the pte.
 		 */
+		BUG_ON(!page->mapping);
 		pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
 		pgidx += vma->vm_pgoff;
 		pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
@@ -391,20 +411,15 @@ int fastcall try_to_unmap(struct page * page)
 		BUG();
 	if (!PageLocked(page))
 		BUG();
-	/* We need backing store to swap out a page. */
-	if (!page->mapping)
-		BUG();
 
 	if (PageDirect(page)) {
 		ret = try_to_unmap_one(page, page->pte.direct);
 		if (ret == SWAP_SUCCESS) {
-			if (page_test_and_clear_dirty(page))
-				set_page_dirty(page);
 			page->pte.direct = 0;
 			ClearPageDirect(page);
 		}
 		goto out;
-	}		
+	}
 
 	start = page->pte.chain;
 	victim_i = pte_chain_idx(start);
@@ -436,9 +451,6 @@ int fastcall try_to_unmap(struct page * page)
 				} else {
 					start->next_and_idx++;
 				}
-				if (page->pte.direct == 0 &&
-				    page_test_and_clear_dirty(page))
-					set_page_dirty(page);
 				break;
 			case SWAP_AGAIN:
 				/* Skip this pte, remembering status. */
@@ -451,8 +463,14 @@ int fastcall try_to_unmap(struct page * page)
 		}
 	}
 out:
-	if (!page_mapped(page))
+	if (!page_mapped(page)) {
+		if (page_test_and_clear_dirty(page))
+			set_page_dirty(page);
+		if (PageAnon(page))
+			clear_page_anon(page);
 		dec_page_state(nr_mapped);
+		ret = SWAP_SUCCESS;
+	}
 	return ret;
 }
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 97f80d20807c..d76b2d1bcf79 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -16,25 +16,24 @@
 
 #include <asm/pgtable.h>
 
+/*
+ * swapper_space is a fiction, retained to simplify the path through
+ * vmscan's shrink_list.  Only those fields initialized below are used.
+ */
+static struct address_space_operations swap_aops = {
+	.writepage	= swap_writepage,
+};
+
 static struct backing_dev_info swap_backing_dev_info = {
-	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
 	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
-extern struct address_space_operations swap_aops;
-
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
-	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
-	.i_mmap_shared	= LIST_HEAD_INIT(swapper_space.i_mmap_shared),
-	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
-	.truncate_count  = ATOMIC_INIT(0),
-	.private_lock	= SPIN_LOCK_UNLOCKED,
-	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
@@ -56,30 +55,55 @@ void show_swap_cache_info(void)
 		swap_cache_info.noent_race, swap_cache_info.exist_race);
 }
 
+/*
+ * __add_to_swap_cache resembles add_to_page_cache on swapper_space,
+ * but sets SwapCache flag and private instead of mapping and index.
+ */
+static int __add_to_swap_cache(struct page *page,
+		swp_entry_t entry, int gfp_mask)
+{
+	int error;
+
+	BUG_ON(PageSwapCache(page));
+	BUG_ON(PagePrivate(page));
+	error = radix_tree_preload(gfp_mask);
+	if (!error) {
+		page_cache_get(page);
+		spin_lock(&swapper_space.tree_lock);
+		error = radix_tree_insert(&swapper_space.page_tree,
+						entry.val, page);
+		if (!error) {
+			SetPageLocked(page);
+			SetPageSwapCache(page);
+			page->private = entry.val;
+			total_swapcache_pages++;
+			pagecache_acct(1);
+		} else
+			page_cache_release(page);
+		spin_unlock(&swapper_space.tree_lock);
+		radix_tree_preload_end();
+	}
+	return error;
+}
+
 static int add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
 	int error;
 
-	if (page->mapping)
-		BUG();
 	if (!swap_duplicate(entry)) {
 		INC_CACHE_INFO(noent_race);
 		return -ENOENT;
 	}
-	error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL);
+	error = __add_to_swap_cache(page, entry, GFP_KERNEL);
 	/*
 	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
 	 */
-	if (error != 0) {
+	if (error) {
 		swap_free(entry);
 		if (error == -EEXIST)
 			INC_CACHE_INFO(exist_race);
 		return error;
 	}
-	if (!PageLocked(page))
-		BUG();
-	if (!PageSwapCache(page))
-		BUG();
 	INC_CACHE_INFO(add_total);
 	return 0;
 }
@@ -93,7 +117,12 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!PageSwapCache(page));
 	BUG_ON(PageWriteback(page));
-	__remove_from_page_cache(page);
+
+	radix_tree_delete(&swapper_space.page_tree, page->private);
+	page->private = 0;
+	ClearPageSwapCache(page);
+	total_swapcache_pages--;
+	pagecache_acct(-1);
 	INC_CACHE_INFO(del_total);
 }
 
@@ -137,8 +166,7 @@ int add_to_swap(struct page * page)
 		/*
 		 * Add it to the swap cache and mark it dirty
 		 */
-		err = add_to_page_cache(page, &swapper_space,
-					entry.val, GFP_ATOMIC);
+		err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
 
 		if (pf_flags & PF_MEMALLOC)
 			current->flags |= PF_MEMALLOC;
@@ -146,8 +174,7 @@ int add_to_swap(struct page * page)
 		switch (err) {
 		case 0:				/* Success */
 			SetPageUptodate(page);
-			__clear_page_dirty(page);
-			set_page_dirty(page);
+			SetPageDirty(page);
 			INC_CACHE_INFO(add_total);
 			return 1;
 		case -EEXIST:
@@ -173,81 +200,55 @@ void delete_from_swap_cache(struct page *page)
 {
 	swp_entry_t entry;
 
+	BUG_ON(!PageSwapCache(page));
 	BUG_ON(!PageLocked(page));
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
   
-	entry.val = page->index;
+	entry.val = page->private;
 
-	spin_lock_irq(&swapper_space.tree_lock);
+	spin_lock(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
-	spin_unlock_irq(&swapper_space.tree_lock);
+	spin_unlock(&swapper_space.tree_lock);
 
 	swap_free(entry);
 	page_cache_release(page);
 }
 
+/*
+ * Strange swizzling function only for use by shmem_writepage
+ */
 int move_to_swap_cache(struct page *page, swp_entry_t entry)
 {
-	struct address_space *mapping = page->mapping;
-	int err;
-
-	spin_lock_irq(&swapper_space.tree_lock);
-	spin_lock(&mapping->tree_lock);
-
-	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
-	if (!err) {
-		__remove_from_page_cache(page);
-		___add_to_page_cache(page, &swapper_space, entry.val);
-	}
-
-	spin_unlock(&mapping->tree_lock);
-	spin_unlock_irq(&swapper_space.tree_lock);
-
+	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
 	if (!err) {
+		remove_from_page_cache(page);
+		page_cache_release(page);	/* pagecache ref */
 		if (!swap_duplicate(entry))
 			BUG();
-		BUG_ON(PageDirty(page));
-		set_page_dirty(page);
+		SetPageDirty(page);
 		INC_CACHE_INFO(add_total);
 	} else if (err == -EEXIST)
 		INC_CACHE_INFO(exist_race);
 	return err;
 }
 
+/*
+ * Strange swizzling function for shmem_getpage (and shmem_unuse)
+ */
 int move_from_swap_cache(struct page *page, unsigned long index,
 		struct address_space *mapping)
 {
-	swp_entry_t entry;
-	int err;
-
-	BUG_ON(!PageLocked(page));
-	BUG_ON(PageWriteback(page));
-	BUG_ON(PagePrivate(page));
-
-	entry.val = page->index;
-
-	spin_lock_irq(&swapper_space.tree_lock);
-	spin_lock(&mapping->tree_lock);
-
-	err = radix_tree_insert(&mapping->page_tree, index, page);
+	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
 	if (!err) {
-		__delete_from_swap_cache(page);
-		___add_to_page_cache(page, mapping, index);
-	}
-
-	spin_unlock(&mapping->tree_lock);
-	spin_unlock_irq(&swapper_space.tree_lock);
-
-	if (!err) {
-		swap_free(entry);
-		__clear_page_dirty(page);
+		delete_from_swap_cache(page);
+		/* shift page from clean_pages to dirty_pages list */
+		ClearPageDirty(page);
 		set_page_dirty(page);
 	}
 	return err;
 }
 
-
 /* 
  * If we are the only user, then try to free up the swap cache. 
  * 
@@ -305,19 +306,17 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
  */
 struct page * lookup_swap_cache(swp_entry_t entry)
 {
-	struct page *found;
+	struct page *page;
 
-	found = find_get_page(&swapper_space, entry.val);
-	/*
-	 * Unsafe to assert PageSwapCache and mapping on page found:
-	 * if SMP nothing prevents swapoff from deleting this page from
-	 * the swap cache at this moment.  find_lock_page would prevent
-	 * that, but no need to change: we _have_ got the right page.
-	 */
-	INC_CACHE_INFO(find_total);
-	if (found)
+	spin_lock(&swapper_space.tree_lock);
+	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+	if (page) {
+		page_cache_get(page);
 		INC_CACHE_INFO(find_success);
-	return found;
+	}
+	spin_unlock(&swapper_space.tree_lock);
+	INC_CACHE_INFO(find_total);
+	return page;
 }
 
 /* 
@@ -335,10 +334,14 @@ struct page * read_swap_cache_async(swp_entry_t entry)
 		/*
 		 * First check the swap cache.  Since this is normally
 		 * called after lookup_swap_cache() failed, re-calling
-		 * that would confuse statistics: use find_get_page()
-		 * directly.
+		 * that would confuse statistics.
 		 */
-		found_page = find_get_page(&swapper_space, entry.val);
+		spin_lock(&swapper_space.tree_lock);
+		found_page = radix_tree_lookup(&swapper_space.page_tree,
+						entry.val);
+		if (found_page)
+			page_cache_get(found_page);
+		spin_unlock(&swapper_space.tree_lock);
 		if (found_page)
 			break;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 44e214da0270..c3ece5503ddb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -304,16 +304,16 @@ static int exclusive_swap_page(struct page *page)
 	struct swap_info_struct * p;
 	swp_entry_t entry;
 
-	entry.val = page->index;
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (p) {
 		/* Is the only swap cache user the cache itself? */
 		if (p->swap_map[swp_offset(entry)] == 1) {
-			/* Recheck the page count with the pagecache lock held.. */
-			spin_lock_irq(&swapper_space.tree_lock);
-			if (page_count(page) - !!PagePrivate(page) == 2)
+			/* Recheck the page count with the swapcache lock held.. */
+			spin_lock(&swapper_space.tree_lock);
+			if (page_count(page) == 2)
 				retval = 1;
-			spin_unlock_irq(&swapper_space.tree_lock);
+			spin_unlock(&swapper_space.tree_lock);
 		}
 		swap_info_put(p);
 	}
@@ -372,7 +372,7 @@ int remove_exclusive_swap_page(struct page *page)
 	if (page_count(page) != 2) /* 2: us + cache */
 		return 0;
 
-	entry.val = page->index;
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (!p)
 		return 0;
@@ -380,14 +380,14 @@ int remove_exclusive_swap_page(struct page *page)
 	/* Is the only swap cache user the cache itself? */
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
-		/* Recheck the page count with the pagecache lock held.. */
-		spin_lock_irq(&swapper_space.tree_lock);
+		/* Recheck the page count with the swapcache lock held.. */
+		spin_lock(&swapper_space.tree_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		spin_unlock_irq(&swapper_space.tree_lock);
+		spin_unlock(&swapper_space.tree_lock);
 	}
 	swap_info_put(p);
 
@@ -410,8 +410,14 @@ void free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, swp_offset(entry)) == 1)
-			page = find_trylock_page(&swapper_space, entry.val);
+		if (swap_entry_free(p, swp_offset(entry)) == 1) {
+			spin_lock(&swapper_space.tree_lock);
+			page = radix_tree_lookup(&swapper_space.page_tree,
+				entry.val);
+			if (page && TestSetPageLocked(page))
+				page = NULL;
+			spin_unlock(&swapper_space.tree_lock);
+		}
 		swap_info_put(p);
 	}
 	if (page) {
@@ -1053,14 +1059,14 @@ int page_queue_congested(struct page *page)
 
 	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
 
-	bdi = page->mapping->backing_dev_info;
 	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page->index };
+		swp_entry_t entry = { .val = page->private };
 		struct swap_info_struct *sis;
 
 		sis = get_swap_info_struct(swp_type(entry));
 		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
-	}
+	} else
+		bdi = page->mapping->backing_dev_info;
 	return bdi_write_congested(bdi);
 }
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 35fbca1c5168..34151f9aed30 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -176,20 +176,20 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask)
 /* Must be called with page's rmap lock held. */
 static inline int page_mapping_inuse(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping;
 
 	/* Page is in somebody's page tables. */
 	if (page_mapped(page))
 		return 1;
 
-	/* XXX: does this happen ? */
-	if (!mapping)
-		return 0;
-
 	/* Be more reluctant to reclaim swapcache than pagecache */
 	if (PageSwapCache(page))
 		return 1;
 
+	mapping = page_mapping(page);
+	if (!mapping)
+		return 0;
+
 	/* File is mmap'd by somebody. */
 	if (!list_empty(&mapping->i_mmap))
 		return 1;
@@ -233,7 +233,7 @@ static void handle_write_error(struct address_space *mapping,
 				struct page *page, int error)
 {
 	lock_page(page);
-	if (page->mapping == mapping) {
+	if (page_mapping(page) == mapping) {
 		if (error == -ENOSPC)
 			set_bit(AS_ENOSPC, &mapping->flags);
 		else
@@ -286,27 +286,28 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 			goto activate_locked;
 		}
 
-		mapping = page->mapping;
+		mapping = page_mapping(page);
+		may_enter_fs = (gfp_mask & __GFP_FS);
 
 #ifdef CONFIG_SWAP
 		/*
-		 * Anonymous process memory without backing store. Try to
-		 * allocate it some swap space here.
+		 * Anonymous process memory has backing store?
+		 * Try to allocate it some swap space here.
 		 *
 		 * XXX: implement swap clustering ?
 		 */
-		if (page_mapped(page) && !mapping && !PagePrivate(page)) {
+		if (PageAnon(page) && !PageSwapCache(page)) {
 			rmap_unlock(page);
 			if (!add_to_swap(page))
 				goto activate_locked;
 			rmap_lock(page);
-			mapping = page->mapping;
+		}
+		if (PageSwapCache(page)) {
+			mapping = &swapper_space;
+			may_enter_fs = (gfp_mask & __GFP_IO);
 		}
 #endif /* CONFIG_SWAP */
 
-		may_enter_fs = (gfp_mask & __GFP_FS) ||
-				(PageSwapCache(page) && (gfp_mask & __GFP_IO));
-
 		/*
 		 * The page is mapped into the page tables of one or more
 		 * processes. Try to unmap it here.
@@ -427,7 +428,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page->index };
+			swp_entry_t swap = { .val = page->private };
 			__delete_from_swap_cache(page);
 			spin_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
@@ -669,8 +670,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
 		 * FIXME: need to consider page_count(page) here if/when we
 		 * reap orphaned pages via the LRU (Daniel's locking stuff)
 		 */
-		if (total_swap_pages == 0 && !page->mapping &&
-						!PagePrivate(page)) {
+		if (total_swap_pages == 0 && PageAnon(page)) {
 			list_add(&page->lru, &l_active);
 			continue;
 		}
-- 
cgit v1.2.3


From fbf7adfafae19dd118facbbfe011510ba6aa8315 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:54:31 -0700
Subject: [PATCH] rmap 3 arches + mapping_mapped

From: Hugh Dickins <hugh@veritas.com>

Some arches refer to page->mapping for their dcache flushing: use
page_mapping(page) for safety, to avoid confusion on anon pages, which will
store a different pointer there - though in most cases flush_dcache_page is
being applied to pagecache pages.

arm has a useful mapping_mapped macro: move that to generic, and add
mapping_writably_mapped, to avoid explicit list_empty checks on i_mmap and
i_mmap_shared in several places.

Very tempted to add page_mapped(page) tests, perhaps along with the
mapping_writably_mapped tests in do_generic_mapping_read and
do_shmem_file_read, to cut down on wasted flush_dcache effort; but the
serialization is not obvious, too unsafe to do in a hurry.
---
 arch/arm/mm/fault-armv.c        |  4 ++--
 arch/mips/mm/cache.c            |  9 +++------
 arch/parisc/kernel/cache.c      |  4 ++--
 arch/sparc64/kernel/smp.c       |  8 ++++----
 arch/sparc64/mm/init.c          | 14 ++++++--------
 fs/locks.c                      | 22 ++++++++--------------
 fs/xfs/linux/xfs_vnode.h        |  4 +---
 include/asm-arm/cacheflush.h    | 12 ++++--------
 include/asm-parisc/cacheflush.h |  3 +--
 include/asm-sh/pgalloc.h        |  4 ++--
 include/linux/fs.h              | 20 ++++++++++++++++++++
 mm/filemap.c                    |  2 +-
 mm/shmem.c                      |  2 +-
 mm/vmscan.c                     |  9 ++-------
 14 files changed, 57 insertions(+), 60 deletions(-)

(limited to 'include/linux')

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 7aa6398abdb0..8c5ad6a4c2c0 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -191,7 +191,7 @@ void __flush_dcache_page(struct page *page)
 
 	__cpuc_flush_dcache_page(page_address(page));
 
-	if (!page->mapping)
+	if (!page_mapping(page))
 		return;
 
 	/*
@@ -292,7 +292,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 	if (!pfn_valid(pfn))
 		return;
 	page = pfn_to_page(pfn);
-	if (page->mapping) {
+	if (page_mapping(page)) {
 		int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags);
 
 		if (dirty)
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index d384be0cb00e..5c9e9855caa8 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -57,16 +57,13 @@ void flush_dcache_page(struct page *page)
 {
 	unsigned long addr;
 
-	if (page->mapping &&
-	    list_empty(&page->mapping->i_mmap) &&
-	    list_empty(&page->mapping->i_mmap_shared)) {
+	if (page_mapping(page) && !mapping_mapped(page->mapping)) {
 		SetPageDcacheDirty(page);
-
 		return;
 	}
 
 	/*
-	 * We could delay the flush for the !page->mapping case too.  But that
+	 * We could delay the flush for the !page_mapping case too.  But that
 	 * case is for exec env/arg pages and those are %99 certainly going to
 	 * get faulted into the tlb (and thus flushed) anyways.
 	 */
@@ -81,7 +78,7 @@ void __update_cache(struct vm_area_struct *vma, unsigned long address,
 	unsigned long pfn, addr;
 
 	pfn = pte_pfn(pte);
-	if (pfn_valid(pfn) && (page = pfn_to_page(pfn), page->mapping) &&
+	if (pfn_valid(pfn) && (page = pfn_to_page(pfn), page_mapping(page)) &&
 	    Page_dcache_dirty(page)) {
 		if (pages_do_alias((unsigned long)page_address(page),
 		                   address & PAGE_MASK)) {
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index a23bb15dc2f8..ac36c927ab5b 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -68,7 +68,7 @@ update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte)
 {
 	struct page *page = pte_page(pte);
 
-	if (VALID_PAGE(page) && page->mapping &&
+	if (VALID_PAGE(page) && page_mapping(page) &&
 	    test_bit(PG_dcache_dirty, &page->flags)) {
 
 		flush_kernel_dcache_page(page_address(page));
@@ -234,7 +234,7 @@ void __flush_dcache_page(struct page *page)
 
 	flush_kernel_dcache_page(page_address(page));
 
-	if (!page->mapping)
+	if (!page_mapping(page))
 		return;
 	/* check shared list first if it's not empty...it's usually
 	 * the shortest */
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index ce479585c484..88fe647652f4 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -671,9 +671,9 @@ static __inline__ void __local_flush_dcache_page(struct page *page)
 #if (L1DCACHE_SIZE > PAGE_SIZE)
 	__flush_dcache_page(page->virtual,
 			    ((tlb_type == spitfire) &&
-			     page->mapping != NULL));
+			     page_mapping(page) != NULL));
 #else
-	if (page->mapping != NULL &&
+	if (page_mapping(page) != NULL &&
 	    tlb_type == spitfire)
 		__flush_icache_page(__pa(page->virtual));
 #endif
@@ -694,7 +694,7 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
 		if (tlb_type == spitfire) {
 			data0 =
 				((u64)&xcall_flush_dcache_page_spitfire);
-			if (page->mapping != NULL)
+			if (page_mapping(page) != NULL)
 				data0 |= ((u64)1 << 32);
 			spitfire_xcall_deliver(data0,
 					       __pa(page->virtual),
@@ -727,7 +727,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 		goto flush_self;
 	if (tlb_type == spitfire) {
 		data0 = ((u64)&xcall_flush_dcache_page_spitfire);
-		if (page->mapping != NULL)
+		if (page_mapping(page) != NULL)
 			data0 |= ((u64)1 << 32);
 		spitfire_xcall_deliver(data0,
 				       __pa(page->virtual),
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 690120faf6c8..81e68ee52f8d 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -139,9 +139,9 @@ __inline__ void flush_dcache_page_impl(struct page *page)
 #if (L1DCACHE_SIZE > PAGE_SIZE)
 	__flush_dcache_page(page->virtual,
 			    ((tlb_type == spitfire) &&
-			     page->mapping != NULL));
+			     page_mapping(page) != NULL));
 #else
-	if (page->mapping != NULL &&
+	if (page_mapping(page) != NULL &&
 	    tlb_type == spitfire)
 		__flush_icache_page(__pa(page->virtual));
 #endif
@@ -203,7 +203,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
 
 	pfn = pte_pfn(pte);
 	if (pfn_valid(pfn) &&
-	    (page = pfn_to_page(pfn), page->mapping) &&
+	    (page = pfn_to_page(pfn), page_mapping(page)) &&
 	    ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) {
 		int cpu = ((pg_flags >> 24) & (NR_CPUS - 1UL));
 
@@ -227,9 +227,7 @@ void flush_dcache_page(struct page *page)
 	int dirty = test_bit(PG_dcache_dirty, &page->flags);
 	int dirty_cpu = dcache_dirty_cpu(page);
 
-	if (page->mapping &&
-	    list_empty(&page->mapping->i_mmap) &&
-	    list_empty(&page->mapping->i_mmap_shared)) {
+	if (page_mapping(page) && !mapping_mapped(page->mapping)) {
 		if (dirty) {
 			if (dirty_cpu == smp_processor_id())
 				return;
@@ -237,7 +235,7 @@ void flush_dcache_page(struct page *page)
 		}
 		set_dcache_dirty(page);
 	} else {
-		/* We could delay the flush for the !page->mapping
+		/* We could delay the flush for the !page_mapping
 		 * case too.  But that case is for exec env/arg
 		 * pages and those are %99 certainly going to get
 		 * faulted into the tlb (and thus flushed) anyways.
@@ -279,7 +277,7 @@ static inline void flush_cache_pte_range(struct mm_struct *mm, pmd_t *pmd, unsig
 			if (!pfn_valid(pfn))
 				continue;
 			page = pfn_to_page(pfn);
-			if (PageReserved(page) || !page->mapping)
+			if (PageReserved(page) || !page_mapping(page))
 				continue;
 			pgaddr = (unsigned long) page_address(page);
 			uaddr = address + offset;
diff --git a/fs/locks.c b/fs/locks.c
index c6a6010a7218..da593493962c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1453,13 +1453,10 @@ int fcntl_setlk(struct file *filp, unsigned int cmd, struct flock __user *l)
 	 * and shared.
 	 */
 	if (IS_MANDLOCK(inode) &&
-	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
-		struct address_space *mapping = filp->f_mapping;
-
-		if (!list_empty(&mapping->i_mmap_shared)) {
-			error = -EAGAIN;
-			goto out;
-		}
+	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
+	    mapping_writably_mapped(filp->f_mapping)) {
+		error = -EAGAIN;
+		goto out;
 	}
 
 	error = flock_to_posix_lock(filp, file_lock, &flock);
@@ -1591,13 +1588,10 @@ int fcntl_setlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
 	 * and shared.
 	 */
 	if (IS_MANDLOCK(inode) &&
-	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
-		struct address_space *mapping = filp->f_mapping;
-
-		if (!list_empty(&mapping->i_mmap_shared)) {
-			error = -EAGAIN;
-			goto out;
-		}
+	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
+	    mapping_writably_mapped(filp->f_mapping)) {
+		error = -EAGAIN;
+		goto out;
 	}
 
 	error = flock64_to_posix_lock(filp, file_lock, &flock);
diff --git a/fs/xfs/linux/xfs_vnode.h b/fs/xfs/linux/xfs_vnode.h
index 6736f7aa2b97..af0b65fe5136 100644
--- a/fs/xfs/linux/xfs_vnode.h
+++ b/fs/xfs/linux/xfs_vnode.h
@@ -596,9 +596,7 @@ static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
 /*
  * Some useful predicates.
  */
-#define VN_MAPPED(vp)	\
-	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
-	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
+#define VN_MAPPED(vp)	mapping_mapped(LINVFS_GET_IP(vp)->i_mapping)
 #define VN_CACHED(vp)	(LINVFS_GET_IP(vp)->i_mapping->nrpages)
 #define VN_DIRTY(vp)	mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
 					PAGECACHE_TAG_DIRTY)
diff --git a/include/asm-arm/cacheflush.h b/include/asm-arm/cacheflush.h
index 6968e8e90c3e..91b16cc3f502 100644
--- a/include/asm-arm/cacheflush.h
+++ b/include/asm-arm/cacheflush.h
@@ -283,23 +283,19 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr)
  * flush_dcache_page is used when the kernel has written to the page
  * cache page at virtual address page->virtual.
  *
- * If this page isn't mapped (ie, page->mapping = NULL), or it has
- * userspace mappings (page->mapping->i_mmap or page->mapping->i_mmap_shared)
- * then we _must_ always clean + invalidate the dcache entries associated
- * with the kernel mapping.
+ * If this page isn't mapped (ie, page_mapping == NULL), or it might
+ * have userspace mappings, then we _must_ always clean + invalidate
+ * the dcache entries associated with the kernel mapping.
  *
  * Otherwise we can defer the operation, and clean the cache when we are
  * about to change to user space.  This is the same method as used on SPARC64.
  * See update_mmu_cache for the user space part.
  */
-#define mapping_mapped(map)	(!list_empty(&(map)->i_mmap) || \
-				 !list_empty(&(map)->i_mmap_shared))
-
 extern void __flush_dcache_page(struct page *);
 
 static inline void flush_dcache_page(struct page *page)
 {
-	if (page->mapping && !mapping_mapped(page->mapping))
+	if (page_mapping(page) && !mapping_mapped(page->mapping))
 		set_bit(PG_dcache_dirty, &page->flags);
 	else
 		__flush_dcache_page(page);
diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h
index 52b0c6a96aea..7a77986e3738 100644
--- a/include/asm-parisc/cacheflush.h
+++ b/include/asm-parisc/cacheflush.h
@@ -69,8 +69,7 @@ extern void __flush_dcache_page(struct page *page);
 
 static inline void flush_dcache_page(struct page *page)
 {
-	if (page->mapping && list_empty(&page->mapping->i_mmap) &&
-			list_empty(&page->mapping->i_mmap_shared)) {
+	if (page_mapping(page) && !mapping_mapped(page->mapping)) {
 		set_bit(PG_dcache_dirty, &page->flags);
 	} else {
 		__flush_dcache_page(page);
diff --git a/include/asm-sh/pgalloc.h b/include/asm-sh/pgalloc.h
index bd02728a69d5..4584c9e37a75 100644
--- a/include/asm-sh/pgalloc.h
+++ b/include/asm-sh/pgalloc.h
@@ -101,8 +101,8 @@ static inline pte_t ptep_get_and_clear(pte_t *ptep)
 		unsigned long pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (!page->mapping
-			    || list_empty(&page->mapping->i_mmap_shared))
+			if (!page_mapping(page) ||
+			    !mapping_writably_mapped(page->mapping))
 				__clear_bit(PG_mapped, &page->flags);
 		}
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c7f0052b4abd..3d7c320d675e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -373,6 +373,26 @@ struct block_device {
 
 int mapping_tagged(struct address_space *mapping, int tag);
 
+/*
+ * Might pages of this file be mapped into userspace?
+ */
+static inline int mapping_mapped(struct address_space *mapping)
+{
+	return	!list_empty(&mapping->i_mmap) ||
+		!list_empty(&mapping->i_mmap_shared);
+}
+
+/*
+ * Might pages of this file have been modified in userspace?
+ * Note that i_mmap_shared holds all the VM_SHARED vmas: do_mmap_pgoff
+ * marks vma as VM_SHARED if it is shared, and the file was opened for
+ * writing i.e. vma may be mprotected writable even if now readonly.
+ */
+static inline int mapping_writably_mapped(struct address_space *mapping)
+{
+	return	!list_empty(&mapping->i_mmap_shared);
+}
+
 /*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
diff --git a/mm/filemap.c b/mm/filemap.c
index ca8fc1148296..c83a97b5aed7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -660,7 +660,7 @@ page_ok:
 		 * virtual addresses, take care about potential aliasing
 		 * before reading the page on the kernel side.
 		 */
-		if (!list_empty(&mapping->i_mmap_shared))
+		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
 		/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 345e04cb0f6c..31001df23c3d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1340,7 +1340,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 			 * virtual addresses, take care about potential aliasing
 			 * before reading the page on the kernel side.
 			 */
-			if (!list_empty(&mapping->i_mmap_shared))
+			if (mapping_writably_mapped(mapping))
 				flush_dcache_page(page);
 			/*
 			 * Mark the page accessed if we read the beginning.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 34151f9aed30..55d8feae21a2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -190,13 +190,8 @@ static inline int page_mapping_inuse(struct page *page)
 	if (!mapping)
 		return 0;
 
-	/* File is mmap'd by somebody. */
-	if (!list_empty(&mapping->i_mmap))
-		return 1;
-	if (!list_empty(&mapping->i_mmap_shared))
-		return 1;
-
-	return 0;
+	/* File is mmap'd by somebody? */
+	return mapping_mapped(mapping);
 }
 
 static inline int is_page_cache_freeable(struct page *page)
-- 
cgit v1.2.3


From 66fb1123391ed704997523cb6daea7fae00dcb83 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:54:44 -0700
Subject: [PATCH] rename page_to_nodenum()

From: "Martin J. Bligh" <mbligh@aracnet.com>

I'd prefer we renamed this to page_to_nid() before anyone starts using it.
This fits with the naming convention of everything else (pfn_to_nid, etc).
Nobody uses it right now - I grepped the whole tree.
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 796f498658d6..d453a0ab62a7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -351,7 +351,7 @@ static inline unsigned long page_zonenum(struct page *page)
 {
 	return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT));
 }
-static inline unsigned long page_nodenum(struct page *page)
+static inline unsigned long page_to_nid(struct page *page)
 {
 	return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT));
 }
-- 
cgit v1.2.3


From 69a03dedc92b7968fc9ca5c701e8d2d6c481750d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:57:22 -0700
Subject: [PATCH] swsusp update: supports discontingmem/highmem

From: Pavel Machek <pavel@ucw.cz>

Bill Irwin did some work on this.  It makes swsusp behave correctly w.r.t.
discontigmem, and adds highmem handling (very simple-minded, but should work
ok with 1GB).  It should now behave correctly with more than one swap
device, and it fixes the double restoring of the console.
---
 include/linux/suspend.h |   2 +-
 kernel/power/swsusp.c   | 244 ++++++++++++++++++++++++++++++++++--------------
 2 files changed, 173 insertions(+), 73 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 810947658d59..7e4409b7c55b 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -24,7 +24,7 @@ typedef struct pbe {
 #define SWAP_FILENAME_MAXLENGTH	32
 
 struct suspend_header {
-	__u32 version_code;
+	u32 version_code;
 	unsigned long num_physpages;
 	char machine[8];
 	char version[20];
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index ae748a467af5..23e577559fd9 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,11 +1,11 @@
 /*
- * linux/kernel/suspend.c
+ * linux/kernel/power/swsusp.c
  *
  * This file is to realize architecture-independent
  * machine suspend feature using pretty near only high-level routines
  *
  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
  *
  * This file is released under the GPLv2.
  *
@@ -61,6 +61,7 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/console.h>
+#include <linux/highmem.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -74,11 +75,6 @@ unsigned char software_suspend_enabled = 0;
 #define NORESUME		1
 #define RESUME_SPECIFIED	2
 
-
-#define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
-#define ADDRESS(x) __ADDRESS((x) << PAGE_SHIFT)
-#define ADDRESS2(x) __ADDRESS(__pa(x))		/* Needed for x86-64 where some pages are in memory twice */
-
 /* References to section boundaries */
 extern char __nosave_begin, __nosave_end;
 
@@ -105,6 +101,10 @@ unsigned int nr_copy_pages __nosavedata = 0;
    time of suspend, that must be freed. Second is "pagedir_nosave", 
    allocated at time of resume, that travels through memory not to
    collide with anything.
+
+   Warning: this is even more evil than it seems. Pagedirs this file
+   talks about are completely different from page directories used by
+   MMU hardware.
  */
 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
 static suspend_pagedir_t *pagedir_save;
@@ -139,15 +139,15 @@ static const char name_resume[] = "Resume Machine: ";
 #define TEST_SWSUSP 0		/* Set to 1 to reboot instead of halt machine after suspension */
 
 #ifdef DEBUG_DEFAULT
-# define PRINTK(f, a...)       printk(f, ## a)
+# define PRINTK(f, a...)	printk(f, ## a)
 #else
-# define PRINTK(f, a...)
+# define PRINTK(f, a...)       	do { } while(0)
 #endif
 
 #ifdef DEBUG_SLOW
 #define MDELAY(a) mdelay(a)
 #else
-#define MDELAY(a)
+#define MDELAY(a) do { } while(0)
 #endif
 
 /*
@@ -225,6 +225,7 @@ static void mark_swapfiles(swp_entry_t prev, int mode)
 static void read_swapfiles(void) /* This is called before saving image */
 {
 	int i, len;
+	char buff[sizeof(resume_file)], *sname;
 	
 	len=strlen(resume_file);
 	root_swap = 0xFFFF;
@@ -243,8 +244,11 @@ static void read_swapfiles(void) /* This is called before saving image */
 					swapfile_used[i] = SWAPFILE_IGNORED;				  
 			} else {
 	  			/* we ignore all swap devices that are not the resume_file */
-				if (1) {
-// FIXME				if(resume_device == swap_info[i].swap_device) {
+				sname = d_path(swap_info[i].swap_file->f_dentry,
+					       swap_info[i].swap_file->f_vfsmnt,
+					       buff,
+					       sizeof(buff));
+				if (!strcmp(sname, resume_file)) {
 					swapfile_used[i] = SWAPFILE_SUSPEND;
 					root_swap = i;
 				} else {
@@ -346,7 +350,7 @@ static int write_suspend_image(void)
 
 	cur = (void *) buffer;
 	if (fill_suspend_header(&cur->sh))
-		panic("\nOut of memory while writing header");
+		BUG();		/* Not a BUG_ON(): we want fill_suspend_header to be called, always */
 		
 	cur->link.next = prev;
 
@@ -362,73 +366,165 @@ static int write_suspend_image(void)
 	return 0;
 }
 
-/* if pagedir_p != NULL it also copies the counted pages */
-static int count_and_copy_data_pages(struct pbe *pagedir_p)
-{
-	int chunk_size;
-	int nr_copy_pages = 0;
-	int pfn;
+struct highmem_page {
+	char *data;
 	struct page *page;
-	
-#ifdef CONFIG_DISCONTIGMEM
-	panic("Discontingmem not supported");
-#else
-	BUG_ON (max_pfn != num_physpages);
-#endif
-	for (pfn = 0; pfn < max_pfn; pfn++) {
+	struct highmem_page *next;
+};
+
+struct highmem_page *highmem_copy = NULL;
+
+static void save_highmem_zone(struct zone *zone)
+{
+	unsigned long zone_pfn;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		struct highmem_page *save;
+		void *kaddr;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		int chunk_size;
+
+		if (!(pfn%200))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
-		if (PageHighMem(page))
-			panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
+		/*
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
+		 */
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
+			BUG();
+		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
+		}
+		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+		if (!save)
+			panic("Not enough memory");
+		save->next = highmem_copy;
+		save->page = page;
+		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+		if (!save->data)
+			panic("Not enough memory");
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(save->data, kaddr, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		highmem_copy = save;
+	}
+}
 
-		if (!PageReserved(page)) {
-			if (PageNosave(page))
-				continue;
+static void save_highmem(void)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			save_highmem_zone(zone);
+	}
+}
 
-			if ((chunk_size=is_head_of_free_region(page))!=0) {
-				pfn += chunk_size - 1;
-				continue;
-			}
-		} else if (PageReserved(page)) {
-			BUG_ON (PageNosave(page));
+static int restore_highmem(void)
+{
+	while (highmem_copy) {
+		struct highmem_page *save = highmem_copy;
+		void *kaddr;
+		highmem_copy = save->next;
+
+		kaddr = kmap_atomic(save->page, KM_USER0);
+		memcpy(kaddr, save->data, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long) save->data);
+		kfree(save);
+	}
+	return 0;
+}
 
-			/*
-			 * Just copy whole code segment. Hopefully it is not that big.
-			 */
-			if ((ADDRESS(pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) && 
-			    (ADDRESS(pfn) <  (unsigned long) ADDRESS2(&__nosave_end))) {
-				PRINTK("[nosave %lx]", ADDRESS(pfn));
-				continue;
-			}
-			/* Hmm, perhaps copying all reserved pages is not too healthy as they may contain 
-			   critical bios data? */
-		} else	BUG();
+static int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
 
-		nr_copy_pages++;
-		if (pagedir_p) {
-			pagedir_p->orig_address = ADDRESS(pfn);
-			copy_page((void *) pagedir_p->address, (void *) pagedir_p->orig_address);
-			pagedir_p++;
+/* if *pagedir_p != NULL it also copies the counted pages */
+static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
+{
+	unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
+	struct pbe *pbe = *pagedir_p;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+		if (!(pfn%200))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		BUG_ON(PageReserved(page) && PageNosave(page));
+		if (PageNosave(page))
+			continue;
+		if (PageReserved(page) && pfn_is_nosave(pfn)) {
+			PRINTK("[nosave pfn 0x%lx]", pfn);
+			continue;
 		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
+		}
+		nr_copy_pages++;
+		if (!pbe)
+			continue;
+		pbe->orig_address = (long) page_address(page);
+		copy_page((void *)pbe->address, (void *)pbe->orig_address);
+		pbe++;
 	}
+	*pagedir_p = pbe;
 	return nr_copy_pages;
 }
 
-static void free_suspend_pagedir(unsigned long this_pagedir)
+static int count_and_copy_data_pages(struct pbe *pagedir_p)
 {
-	struct page *page;
-	int pfn;
-	unsigned long this_pagedir_end = this_pagedir +
-		(PAGE_SIZE << pagedir_order);
+	int nr_copy_pages = 0;
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
+	}
+	return nr_copy_pages;
+}
 
-	for(pfn = 0; pfn < num_physpages; pfn++) {
+static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
+{
+	unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
+	pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
+	pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
+	pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
 		if (!TestClearPageNosave(page))
 			continue;
+		else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
+			continue;
+		__free_page(page);
+	}
+}
 
-		if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
-			continue; /* old pagedir gets freed in one */
-		
-		free_page(ADDRESS(pfn));
+static void free_suspend_pagedir(unsigned long this_pagedir)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			free_suspend_pagedir_zone(zone, this_pagedir);
 	}
 	free_pages(this_pagedir, pagedir_order);
 }
@@ -443,7 +539,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
 	pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
 
 	p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
-	if(!pagedir)
+	if (!pagedir)
 		return NULL;
 
 	page = virt_to_page(pagedir);
@@ -492,10 +588,12 @@ static int suspend_prepare_image(void)
 	struct sysinfo i;
 	unsigned int nr_needed_pages = 0;
 
-	drain_local_pages();
-
 	pagedir_nosave = NULL;
-	printk( "/critical section: Counting pages to copy" );
+	printk( "/critical section: Handling highmem" );
+	save_highmem();
+
+	printk(", counting pages to copy" );
+	drain_local_pages();
 	nr_copy_pages = count_and_copy_data_pages(NULL);
 	nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
 	
@@ -603,21 +701,23 @@ asmlinkage void do_magic_resume_2(void)
 
 	PRINTK( "Freeing prev allocated pagedir\n" );
 	free_suspend_pagedir((unsigned long) pagedir_save);
+
+	printk( "Restoring highmem\n" );
+	restore_highmem();
+	printk("done, devices\n");
+
 	device_power_up();
 	spin_unlock_irq(&suspend_pagedir_lock);
 	device_resume();
 
-	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
-	release_console_sem();
-
+	/* Fixme: this is too late; we should do this ASAP to avoid "infinite reboots" problem */
 	PRINTK( "Fixing swap signatures... " );
 	mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
 	PRINTK( "ok\n" );
 
 #ifdef SUSPEND_CONSOLE
 	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
+	update_screen(fg_console);
 	release_console_sem();
 #endif
 }
-- 
cgit v1.2.3


From 93616c25b89090f28ad3ee509c33d69bddbeb7a8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:01:04 -0700
Subject: [PATCH] remove concatenation with __FUNCTION__ include/*

From: Tony Breeds <tony@bakeyournoodle.com>
---
 include/linux/jbd.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 62c7f363ae74..e9f6c69f79db 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -1012,10 +1012,10 @@ extern int	cleanup_journal_tail(journal_t *);
 /* Debugging code only: */
 
 #define jbd_ENOSYS() \
-do {								      \
-	printk (KERN_ERR "JBD unimplemented function " __FUNCTION__); \
-	current->state = TASK_UNINTERRUPTIBLE;			      \
-	schedule();						      \
+do {								           \
+	printk (KERN_ERR "JBD unimplemented function %s\n", __FUNCTION__); \
+	current->state = TASK_UNINTERRUPTIBLE;			           \
+	schedule();						           \
 } while (1)
 
 /*
-- 
cgit v1.2.3


From 8447ac2688647d261af7a7397a53548a2a1afc13 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:03:29 -0700
Subject: [PATCH] Rename bitmap_clear to bitmap_zero, remove CLEAR_BITMAP

From: Rusty Russell <rusty@rustcorp.com.au>

clear_bit(n, addr) clears the nth bit.
test_and_clear_bit(n, addr) clears the nth bit.
cpu_clear(n, cpumask) clears the nth bit (vs. cpus_clear()).
bitmap_clear(bitmap, n) clears out all the bits up to n.

Moreover, there's a CLEAR_BITMAP() in linux/types.h which bitmap_clear() is
a wrapper for.

Rename bitmap_clear to bitmap_zero, which is harder to confuse (yes, it bit
me), and make everyone use it.
---
 arch/ia64/sn/kernel/sn2/sn2_smp.c   | 2 +-
 drivers/atm/lanai.c                 | 6 +++---
 drivers/ieee1394/ieee1394_types.h   | 2 +-
 drivers/scsi/atari_NCR5380.c        | 4 ++--
 include/asm-generic/cpumask_array.h | 2 +-
 include/asm-i386/mpspec.h           | 2 +-
 include/asm-x86_64/mpspec.h         | 2 +-
 include/linux/bitmap.h              | 4 ++--
 include/linux/types.h               | 2 --
 lib/bitmap.c                        | 2 +-
 mm/page_alloc.c                     | 2 +-
 11 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'include/linux')

diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index 3cfb3cd74d51..e8bc389edebb 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -91,7 +91,7 @@ sn2_global_tlb_purge (unsigned long start, unsigned long end, unsigned long nbit
 	short			nasids[NR_NODES], nix;
 	DECLARE_BITMAP(nodes_flushed, NR_NODES);
 
-	CLEAR_BITMAP(nodes_flushed, NR_NODES);
+	bitmap_zero(nodes_flushed, NR_NODES);
 
 	i = 0;
 
diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c
index 402e78a5ac97..5a7037af6add 100644
--- a/drivers/atm/lanai.c
+++ b/drivers/atm/lanai.c
@@ -1743,7 +1743,7 @@ static void run_service(struct lanai_dev *lanai)
 		read_lock(&vcc_sklist_lock);
 		vci_bitfield_iterate(lanai, lanai->transmit_ready,
 		    iter_transmit);
-		CLEAR_BITMAP(&lanai->transmit_ready, NUM_VCI);
+		bitmap_zero(lanai->transmit_ready, NUM_VCI);
 		read_unlock(&vcc_sklist_lock);
 	}
 }
@@ -2158,8 +2158,8 @@ static int __init lanai_dev_open(struct atm_dev *atmdev)
 	/* Basic device fields */
 	lanai->number = atmdev->number;
 	lanai->num_vci = NUM_VCI;
-	CLEAR_BITMAP(&lanai->backlog_vccs, NUM_VCI);
-	CLEAR_BITMAP(&lanai->transmit_ready, NUM_VCI);
+	bitmap_zero(lanai->backlog_vccs, NUM_VCI);
+	bitmap_zero(lanai->transmit_ready, NUM_VCI);
 	lanai->naal0 = 0;
 #ifdef USE_POWERDOWN
 	lanai->nbound = 0;
diff --git a/drivers/ieee1394/ieee1394_types.h b/drivers/ieee1394/ieee1394_types.h
index 552667142ce1..3165609ec1ec 100644
--- a/drivers/ieee1394/ieee1394_types.h
+++ b/drivers/ieee1394/ieee1394_types.h
@@ -24,7 +24,7 @@ struct hpsb_tlabel_pool {
 
 #define HPSB_TPOOL_INIT(_tp)			\
 do {						\
-	CLEAR_BITMAP((_tp)->pool, 64);		\
+	bitmap_zero((_tp)->pool, 64);		\
 	spin_lock_init(&(_tp)->lock);		\
 	(_tp)->next = 0;			\
 	(_tp)->allocations = 0;			\
diff --git a/drivers/scsi/atari_NCR5380.c b/drivers/scsi/atari_NCR5380.c
index cd8ddb7084a2..5d1e78ebed83 100644
--- a/drivers/scsi/atari_NCR5380.c
+++ b/drivers/scsi/atari_NCR5380.c
@@ -329,7 +329,7 @@ static void __init init_tags( void )
     for( target = 0; target < 8; ++target ) {
 	for( lun = 0; lun < 8; ++lun ) {
 	    ta = &TagAlloc[target][lun];
-	    CLEAR_BITMAP( ta->allocated, MAX_TAGS );
+	    bitmap_zero(ta->allocated, MAX_TAGS);
 	    ta->nr_allocated = 0;
 	    /* At the beginning, assume the maximum queue size we could
 	     * support (MAX_TAGS). This value will be decreased if the target
@@ -438,7 +438,7 @@ static void free_all_tags( void )
     for( target = 0; target < 8; ++target ) {
 	for( lun = 0; lun < 8; ++lun ) {
 	    ta = &TagAlloc[target][lun];
-	    CLEAR_BITMAP( ta->allocated, MAX_TAGS );
+	    bitmap_zero(ta->allocated, MAX_TAGS);
 	    ta->nr_allocated = 0;
 	}
     }
diff --git a/include/asm-generic/cpumask_array.h b/include/asm-generic/cpumask_array.h
index bd5c49133c6c..c7e2db29dc53 100644
--- a/include/asm-generic/cpumask_array.h
+++ b/include/asm-generic/cpumask_array.h
@@ -16,7 +16,7 @@
 
 #define cpus_and(dst,src1,src2)	bitmap_and((dst).mask,(src1).mask, (src2).mask, NR_CPUS)
 #define cpus_or(dst,src1,src2)	bitmap_or((dst).mask, (src1).mask, (src2).mask, NR_CPUS)
-#define cpus_clear(map)		bitmap_clear((map).mask, NR_CPUS)
+#define cpus_clear(map)		bitmap_zero((map).mask, NR_CPUS)
 #define cpus_complement(map)	bitmap_complement((map).mask, NR_CPUS)
 #define cpus_equal(map1, map2)	bitmap_equal((map1).mask, (map2).mask, NR_CPUS)
 #define cpus_empty(map)		bitmap_empty(map.mask, NR_CPUS)
diff --git a/include/asm-i386/mpspec.h b/include/asm-i386/mpspec.h
index 78bd12b7ae42..b376b093749c 100644
--- a/include/asm-i386/mpspec.h
+++ b/include/asm-i386/mpspec.h
@@ -52,7 +52,7 @@ typedef struct physid_mask physid_mask_t;
 
 #define physids_and(dst, src1, src2)		bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
 #define physids_or(dst, src1, src2)		bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-#define physids_clear(map)			bitmap_clear((map).mask, MAX_APICS)
+#define physids_clear(map)			bitmap_zero((map).mask, MAX_APICS)
 #define physids_complement(map)			bitmap_complement((map).mask, MAX_APICS)
 #define physids_empty(map)			bitmap_empty((map).mask, MAX_APICS)
 #define physids_equal(map1, map2)		bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h
index 896b99f11cec..cbe6058e9270 100644
--- a/include/asm-x86_64/mpspec.h
+++ b/include/asm-x86_64/mpspec.h
@@ -211,7 +211,7 @@ typedef struct physid_mask physid_mask_t;
 
 #define physids_and(dst, src1, src2)		bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
 #define physids_or(dst, src1, src2)		bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-#define physids_clear(map)			bitmap_clear((map).mask, MAX_APICS)
+#define physids_clear(map)			bitmap_zero((map).mask, MAX_APICS)
 #define physids_complement(map)			bitmap_complement((map).mask, MAX_APICS)
 #define physids_empty(map)			bitmap_empty((map).mask, MAX_APICS)
 #define physids_equal(map1, map2)		bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 2ad5fb97fa26..81e73cdc1a62 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -16,9 +16,9 @@ int bitmap_equal(const unsigned long *bitmap1,
 			unsigned long *bitmap2, int bits);
 void bitmap_complement(unsigned long *bitmap, int bits);
 
-static inline void bitmap_clear(unsigned long *bitmap, int bits)
+static inline void bitmap_zero(unsigned long *bitmap, int bits)
 {
-	CLEAR_BITMAP((unsigned long *)bitmap, bits);
+	memset(bitmap, 0, BITS_TO_LONGS(bits)*sizeof(unsigned long));
 }
 
 static inline void bitmap_fill(unsigned long *bitmap, int bits)
diff --git a/include/linux/types.h b/include/linux/types.h
index 93f5f3653561..23c414f11cbe 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -8,8 +8,6 @@
 	(((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
 #define DECLARE_BITMAP(name,bits) \
 	unsigned long name[BITS_TO_LONGS(bits)]
-#define CLEAR_BITMAP(name,bits) \
-	memset(name, 0, BITS_TO_LONGS(bits)*sizeof(unsigned long))
 #endif
 
 #include <linux/posix_types.h>
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 602b919ef551..779d30365e46 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -273,7 +273,7 @@ int bitmap_parse(const char __user *ubuf, unsigned int ubuflen,
 	int c, old_c, totaldigits, ndigits, nchunks, nbits;
 	u32 chunk;
 
-	bitmap_clear(maskp, nmaskbits);
+	bitmap_zero(maskp, nmaskbits);
 
 	nchunks = nbits = totaldigits = c = 0;
 	do {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6b4d5dc0c930..8d3f6f46105e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1222,7 +1222,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	local_node = pgdat->node_id;
 	load = numnodes;
 	prev_node = local_node;
-	CLEAR_BITMAP(used_mask, MAX_NUMNODES);
+	bitmap_zero(used_mask, MAX_NUMNODES);
 	while ((node = find_next_best_node(local_node, used_mask)) >= 0) {
 		/*
 		 * We don't want to pressure a particular node.
-- 
cgit v1.2.3


From d0d15d849f62d00edbc95de914f3bc655d3b8f7b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:06:06 -0700
Subject: [PATCH] Add CONFIG_SYSFS

From: Patrick Mochel <mochel@digitalimplant.org>

Here is a patch to make sysfs optional.  Note that with CONFIG_SYSFS=n you
must specify the boot device's major:minor on the kernel boot command line
with

	root=03:01

For embedded systems, it will save a significant amount of memory during
runtime.  And, it saves 4k from the built kernel image for me.
---
 fs/Kconfig            | 24 +++++++++++++++
 fs/Makefile           |  2 +-
 fs/namespace.c        |  8 +++++
 include/linux/sysfs.h | 83 ++++++++++++++++++++++++++++++++++++++++++++++-----
 init/do_mounts.c      |  2 ++
 5 files changed, 110 insertions(+), 9 deletions(-)

(limited to 'include/linux')

diff --git a/fs/Kconfig b/fs/Kconfig
index c748a2ce35ee..3c0d06f5e359 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -780,6 +780,30 @@ config PROC_KCORE
 	bool
 	default y if !ARM
 
+config SYSFS
+	bool "sysfs file system support" if EMBEDDED
+	default y
+	help
+	The sysfs filesystem is a virtual filesystem that the kernel uses to
+	export internal kernel objects, their attributes, and their
+	relationships to one another.
+
+	Users can use sysfs to ascertain useful information about the running
+	kernel, such as the devices the kernel has discovered on each bus and
+	which driver each is bound to. sysfs can also be used to tune devices
+	and other kernel subsystems.
+
+	Some system agents rely on the information in sysfs to operate.
+	/sbin/hotplug uses device and object attributes in sysfs to assist in
+	delegating policy decisions, like persistently naming devices.
+
+	sysfs is currently used by the block subsystem to mount the root
+	partition.  If sysfs is disabled you must specify the boot device on
+	the kernel boot command line via its major and minor numbers.  For
+	example, "root=03:01" for /dev/hda1.
+
+	Designers of embedded systems may wish to say N here to conserve space.
+
 config DEVFS_FS
 	bool "/dev file system support (OBSOLETE)"
 	depends on EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index 9647bebd4895..a288c0cb3645 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_QUOTACTL)		+= quota.o
 
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
-obj-y				+= sysfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
diff --git a/fs/namespace.c b/fs/namespace.c
index 4584a684c685..3bb33614d764 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -24,7 +24,15 @@
 #include <asm/uaccess.h>
 
 extern int __init init_rootfs(void);
+
+#ifdef CONFIG_SYSFS
 extern int __init sysfs_init(void);
+#else
+static inline int sysfs_init(void)
+{
+	return 0;
+}
+#endif
 
 /* spinlock for vfsmount related operations, inplace of dcache_lock */
 spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index b34de79dcf3b..de2083939b74 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -18,6 +18,12 @@ struct attribute {
 	mode_t			mode;
 };
 
+struct attribute_group {
+	char			* name;
+	struct attribute	** attrs;
+};
+
+
 struct bin_attribute {
 	struct attribute	attr;
 	size_t			size;
@@ -25,14 +31,13 @@ struct bin_attribute {
 	ssize_t (*write)(struct kobject *, char *, loff_t, size_t);
 };
 
-int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr);
-int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr);
-
 struct sysfs_ops {
 	ssize_t	(*show)(struct kobject *, struct attribute *,char *);
 	ssize_t	(*store)(struct kobject *,struct attribute *,const char *, size_t);
 };
 
+#ifdef CONFIG_SYSFS
+
 extern int
 sysfs_create_dir(struct kobject *);
 
@@ -57,13 +62,75 @@ sysfs_create_link(struct kobject * kobj, struct kobject * target, char * name);
 extern void
 sysfs_remove_link(struct kobject *, char * name);
 
-
-struct attribute_group {
-	char			* name;
-	struct attribute	** attrs;
-};
+int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr);
+int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr);
 
 int sysfs_create_group(struct kobject *, const struct attribute_group *);
 void sysfs_remove_group(struct kobject *, const struct attribute_group *);
 
+#else /* CONFIG_SYSFS */
+
+static inline int sysfs_create_dir(struct kobject * k)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_dir(struct kobject * k)
+{
+	;
+}
+
+static inline void sysfs_rename_dir(struct kobject * k, const char *new_name)
+{
+	;
+}
+
+static inline int sysfs_create_file(struct kobject * k, const struct attribute * a)
+{
+	return 0;
+}
+
+static inline int sysfs_update_file(struct kobject * k, const struct attribute * a)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_file(struct kobject * k, const struct attribute * a)
+{
+	;
+}
+
+static inline int sysfs_create_link(struct kobject * k, struct kobject * t, char * n)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_link(struct kobject * k, char * name)
+{
+	;
+}
+
+
+static inline int sysfs_create_bin_file(struct kobject * k, struct bin_attribute * a)
+{
+	return 0;
+}
+
+static inline int sysfs_remove_bin_file(struct kobject * k, struct bin_attribute * a)
+{
+	return 0;
+}
+
+static inline int sysfs_create_group(struct kobject * k, const struct attribute_group *g)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_group(struct kobject * k, const struct attribute_group * g)
+{
+	;
+}
+
+#endif /* CONFIG_SYSFS */
+
 #endif /* _SYSFS_H_ */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index fef791e4dcb6..02385f3c7697 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -141,9 +141,11 @@ dev_t __init name_to_dev_t(char *name)
 	dev_t res = 0;
 	int part;
 
+#ifdef CONFIG_SYSFS
 	sys_mkdir("/sys", 0700);
 	if (sys_mount("sysfs", "/sys", "sysfs", 0, NULL) < 0)
 		goto out;
+#endif
 
 	if (strncmp(name, "/dev/", 5) != 0) {
 		unsigned maj, min;
-- 
cgit v1.2.3


From c28abd70dfe80c4806d0b39fa7314aa50754dbf3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:06:19 -0700
Subject: [PATCH] JBD: BH_Revoke cleanup

Use the bh bit test/set infrastructure rather than open-coding everything.
No functional changes.
---
 fs/jbd/revoke.c     | 42 ++++++++++++++++++------------------------
 include/linux/jbd.h |  4 ++++
 2 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include/linux')

diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index a084064cb741..1564a48163d2 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -358,17 +358,15 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
 		bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
 		if (bh2) {
 			/* ... and it has RevokeValid status... */
-			if ((bh2 != bh) &&
-			    test_bit(BH_RevokeValid, &bh2->b_state))
+			if (bh2 != bh && buffer_revokevalid(bh2))
 				/* ...then it better be revoked too,
 				 * since it's illegal to create a revoke
 				 * record against a buffer_head which is
 				 * not marked revoked --- that would
 				 * risk missing a subsequent revoke
 				 * cancel. */
-				J_ASSERT_BH(bh2, test_bit(BH_Revoked, &
-							  bh2->b_state));
-			__brelse(bh2);
+				J_ASSERT_BH(bh2, buffer_revoked(bh2));
+			put_bh(bh2);
 		}
 	}
 #endif
@@ -377,9 +375,9 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
            first having the revoke cancelled: it's illegal to free a
            block twice without allocating it in between! */
 	if (bh) {
-		J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state));
-		set_bit(BH_Revoked, &bh->b_state);
-		set_bit(BH_RevokeValid, &bh->b_state);
+		J_ASSERT_BH(bh, !buffer_revoked(bh));
+		set_buffer_revoked(bh);
+		set_buffer_revokevalid(bh);
 		if (bh_in) {
 			BUFFER_TRACE(bh_in, "call journal_forget");
 			journal_forget(handle, bh_in);
@@ -400,7 +398,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
  * Cancel an outstanding revoke.  For use only internally by the
  * journaling code (called from journal_get_write_access).
  *
- * We trust the BH_Revoked bit on the buffer if the buffer is already
+ * We trust buffer_revoked() on the buffer if the buffer is already
  * being journaled: if there is no revoke pending on the buffer, then we
  * don't do anything here.
  *
@@ -427,11 +425,11 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	 * only perform the full cancel if the revoke bit is set.  If
 	 * not, we can't trust the revoke bit, and we need to do the
 	 * full search for a revoke record. */
-	if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
-		need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
-	else {
+	if (test_set_buffer_revokevalid(bh)) {
+		need_cancel = test_clear_buffer_revoked(bh);
+	} else {
 		need_cancel = 1;
-		clear_bit(BH_Revoked, &bh->b_state);
+		clear_buffer_revoked(bh);
 	}
 
 	if (need_cancel) {
@@ -462,7 +460,7 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 		bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
 		if (bh2) {
 			if (bh2 != bh)
-				clear_bit(BH_Revoked, &bh2->b_state);
+				clear_buffer_revoked(bh2);
 			__brelse(bh2);
 		}
 	}
@@ -597,24 +595,20 @@ static void flush_descriptor(journal_t *journal,
 			     int offset)
 {
 	journal_revoke_header_t *header;
+	struct buffer_head *bh = jh2bh(descriptor);
 
 	if (is_journal_aborted(journal)) {
-		JBUFFER_TRACE(descriptor, "brelse");
-		__brelse(jh2bh(descriptor));
+		put_bh(bh);
 		return;
 	}
 
 	header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
 	header->r_count = htonl(offset);
-	set_bit(BH_JWrite, &jh2bh(descriptor)->b_state);
-	{
-		struct buffer_head *bh = jh2bh(descriptor);
-		BUFFER_TRACE(bh, "write");
-		set_buffer_uptodate(bh);
-		ll_rw_block (WRITE, 1, &bh);
-	}
+	set_buffer_jwrite(bh);
+	BUFFER_TRACE(bh, "write");
+	set_buffer_uptodate(bh);
+	ll_rw_block(WRITE, 1, &bh);
 }
-
 #endif
 
 /* 
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index e9f6c69f79db..0a625c3cd38b 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -305,6 +305,10 @@ BUFFER_FNS(JBD, jbd)
 BUFFER_FNS(JWrite, jwrite)
 BUFFER_FNS(JBDDirty, jbddirty)
 TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
 BUFFER_FNS(Freed, freed)
 
 static inline struct buffer_head *jh2bh(struct journal_head *jh)
-- 
cgit v1.2.3