From 6ed12ff83c765aeda7d38d3bf9df7d46d24bfb11 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 30 Sep 2002 22:17:42 -0700 Subject: [PATCH] Workqueue Abstraction This is the next iteration of the workqueue abstraction. The framework includes: - per-CPU queueing support. on SMP there is a per-CPU worker thread (bound to its CPU) and per-CPU work queues - this feature is completely transparent to workqueue-users. keventd automatically uses this feature. XFS can now update to work-queues and have the same per-CPU performance as it had with its per-CPU worker threads. - delayed work submission there's a new queue_delayed_work(wq, work, delay) function and a new schedule_delayed_work(work, delay) function. The latter one is used to correctly fix former tq_timer users. I've reverted those changes in 2.5.40 that changed tq_timer uses to schedule_work() - eg. in the case of random.c or the tty flip queue it was definitely the wrong thing to do. delayed work means a timer embedded in struct work_struct. I considered using split struct work_struct and delayed_work_struct types, but lots of code actively uses task-queues in both delayed and non-delayed mode, so i went for the more generic approach that allows both methods of work submission. Delayed timers do not cause any other overhead in the normal submission path otherwise. - multithreaded run_workqueue() implementation the run_workqueue() function can now be called from multiple contexts, and a worker thread will only use up a single entry - this property is used by the flushing code, and can potentially be used in the future to extend the number of per-CPU worker threads. - more reliable flushing there's now a 'pending work' counter, which is used to accurately detect when the last work-function has finished execution. It's also used to correctly flush against timed requests. I'm not convinced whether the old keventd implementation got this detail right. 
- i switched the arguments of the queueing function(s) per Jeff's suggestion, it's more straightforward this way. Driver fixes: i have converted almost every affected driver to the new framework. This cleaned up tons of code. I also fixed a number of drivers that were still using BHs (these drivers did not compile in 2.5.40). while this means lots of changes, it might ease the QA decision whether to put this patch into 2.5. The patch converts roughly 80% of all tqueue-using code to workqueues - and all the places that are not converted to workqueues yet are places that do not compile in vanilla 2.5.40 anyway, due to unrelated changes. I've converted a fair number of drivers that do not compile in 2.5.40, and i think i've managed to convert every driver that compiles under 2.5.40. --- include/linux/blkdev.h | 1 - include/linux/compatmac.h | 2 +- include/linux/cyclades.h | 2 +- include/linux/hayesesp.h | 4 +-- include/linux/if_wanpipe_common.h | 2 +- include/linux/isdn.h | 2 +- include/linux/isicom.h | 4 +-- include/linux/istallion.h | 2 +- include/linux/jffs2_fs_sb.h | 4 +-- include/linux/kbd_kern.h | 2 +- include/linux/ncp_fs_sb.h | 8 ++--- include/linux/reiserfs_fs.h | 2 +- include/linux/reiserfs_fs_sb.h | 2 +- include/linux/sched.h | 3 -- include/linux/serial167.h | 2 +- include/linux/serialP.h | 4 +-- include/linux/stallion.h | 2 +- include/linux/sunrpc/debug.h | 2 +- include/linux/sunrpc/sched.h | 1 - include/linux/sunrpc/types.h | 2 +- include/linux/suspend.h | 2 +- include/linux/tqueue.h | 55 --------------------------------- include/linux/tty.h | 8 ++--- include/linux/tty_flip.h | 2 +- include/linux/wanpipe.h | 8 ++--- include/linux/workqueue.h | 64 +++++++++++++++++++++++++++++++++++++++ 26 files changed, 98 insertions(+), 94 deletions(-) delete mode 100644 include/linux/tqueue.h create mode 100644 include/linux/workqueue.h (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ee1171a3ae67..55e0742783ed 100644 --- 
a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/compatmac.h b/include/linux/compatmac.h index 1e28380cabb7..5ae68a6b58e7 100644 --- a/include/linux/compatmac.h +++ b/include/linux/compatmac.h @@ -102,7 +102,7 @@ static inline void *ioremap(unsigned long base, long length) #define my_iounmap(x, b) (((long)x<0x100000)?0:vfree ((void*)x)) -#define tty_flip_buffer_push(tty) queue_task(&tty->flip.tqueue, &tq_timer) +#define tty_flip_buffer_push(tty) schedule_delayed_work(&tty->flip.work, 1) #define signal_pending(current) (current->signal & ~current->blocked) #define schedule_timeout(to) do {current->timeout = jiffies + (to);schedule ();} while (0) #define time_after(t1,t2) (((long)t1-t2) > 0) diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h index 0fe53b96a2e4..4d6235fb96cf 100644 --- a/include/linux/cyclades.h +++ b/include/linux/cyclades.h @@ -605,7 +605,7 @@ struct cyclades_port { struct cyclades_monitor mon; struct cyclades_idle_stats idle_stats; struct cyclades_icount icount; - struct tq_struct tqueue; + struct work_struct tqueue; wait_queue_head_t open_wait; wait_queue_head_t close_wait; wait_queue_head_t shutdown_wait; diff --git a/include/linux/hayesesp.h b/include/linux/hayesesp.h index 7efc5e31dc11..00774e5d4550 100644 --- a/include/linux/hayesesp.h +++ b/include/linux/hayesesp.h @@ -102,8 +102,8 @@ struct esp_struct { int xmit_head; int xmit_tail; int xmit_cnt; - struct tq_struct tqueue; - struct tq_struct tqueue_hangup; + struct work_struct tqueue; + struct work_struct tqueue_hangup; struct termios normal_termios; struct termios callout_termios; wait_queue_head_t open_wait; diff --git a/include/linux/if_wanpipe_common.h b/include/linux/if_wanpipe_common.h index e1c95b5d4cda..a5cab08d6990 100644 --- a/include/linux/if_wanpipe_common.h +++ b/include/linux/if_wanpipe_common.h @@ -39,7 +39,7 @@ typedef struct { int (*func) (struct 
sk_buff *, netdevice_t *, struct sock *); - struct tq_struct wanpipe_task; /* Immediate BH handler task */ + struct work_struct wanpipe_work; /* deferred keventd work */ unsigned char rw_bind; /* Sock bind state */ unsigned char usedby; unsigned char state; diff --git a/include/linux/isdn.h b/include/linux/isdn.h index 9bcc7e3dba80..7de3324606ee 100644 --- a/include/linux/isdn.h +++ b/include/linux/isdn.h @@ -362,7 +362,7 @@ typedef struct isdn_net_local_s { char cisco_line_state; /* state of line according to keepalive packets */ char cisco_debserint; /* debugging flag of cisco hdlc with slarp */ struct timer_list cisco_timer; - struct tq_struct tqueue; + struct work_struct tqueue; struct isdn_netif_ops *ops; } isdn_net_local; diff --git a/include/linux/isicom.h b/include/linux/isicom.h index 6b0fc14f83e5..72114196a2fc 100644 --- a/include/linux/isicom.h +++ b/include/linux/isicom.h @@ -154,8 +154,8 @@ struct isi_port { struct tty_struct * tty; wait_queue_head_t close_wait; wait_queue_head_t open_wait; - struct tq_struct hangup_tq; - struct tq_struct bh_tqueue; + struct work_struct hangup_tq; + struct work_struct bh_tqueue; unsigned char * xmit_buf; int xmit_head; int xmit_tail; diff --git a/include/linux/istallion.h b/include/linux/istallion.h index e8a2709f66fb..c93624048244 100644 --- a/include/linux/istallion.h +++ b/include/linux/istallion.h @@ -79,7 +79,7 @@ typedef struct { wait_queue_head_t close_wait; wait_queue_head_t raw_wait; #endif - struct tq_struct tqhangup; + struct work_struct tqhangup; struct termios normaltermios; struct termios callouttermios; asysigs_t asig; diff --git a/include/linux/jffs2_fs_sb.h b/include/linux/jffs2_fs_sb.h index cab82ddbddc5..611aa1b5b129 100644 --- a/include/linux/jffs2_fs_sb.h +++ b/include/linux/jffs2_fs_sb.h @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include #include @@ -82,7 +82,7 @@ struct jffs2_sb_info { uint32_t wbuf_ofs; uint32_t wbuf_len; uint32_t wbuf_pagesize; - struct tq_struct wbuf_task; 
/* task for timed wbuf flush */ + struct work_struct wbuf_task; /* task for timed wbuf flush */ struct timer_list wbuf_timer; /* timer for flushing wbuf */ /* OS-private pointer for getting back to master superblock info */ diff --git a/include/linux/kbd_kern.h b/include/linux/kbd_kern.h index abc0c3e7bad2..8e6d600dc534 100644 --- a/include/linux/kbd_kern.h +++ b/include/linux/kbd_kern.h @@ -150,7 +150,7 @@ extern unsigned int keymap_count; static inline void con_schedule_flip(struct tty_struct *t) { - schedule_task(&t->flip.tqueue); + schedule_work(&t->flip.work); } #endif diff --git a/include/linux/ncp_fs_sb.h b/include/linux/ncp_fs_sb.h index 02ef9bb50b51..9febc18c6a85 100644 --- a/include/linux/ncp_fs_sb.h +++ b/include/linux/ncp_fs_sb.h @@ -13,7 +13,7 @@ #ifdef __KERNEL__ -#include +#include #define NCP_DEFAULT_OPTIONS 0 /* 2 for packet signatures */ @@ -91,7 +91,7 @@ struct ncp_server { void (*error_report)(struct sock* sk); void (*write_space)(struct sock* sk); /* STREAM mode only */ struct { - struct tq_struct tq; /* STREAM/DGRAM: data/error ready */ + struct work_struct tq; /* STREAM/DGRAM: data/error ready */ struct ncp_request_reply* creq; /* STREAM/DGRAM: awaiting reply from this request */ struct semaphore creq_sem; /* DGRAM only: lock accesses to rcv.creq */ @@ -110,11 +110,11 @@ struct ncp_server { } rcv; struct { struct list_head requests; /* STREAM only: queued requests */ - struct tq_struct tq; /* STREAM only: transmitter ready */ + struct work_struct tq; /* STREAM only: transmitter ready */ struct ncp_request_reply* creq; /* STREAM only: currently transmitted entry */ } tx; struct timer_list timeout_tm; /* DGRAM only: timeout timer */ - struct tq_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ + struct work_struct timeout_tq; /* DGRAM only: associated queue, we run timers from process context */ int timeout_last; /* DGRAM only: current timeout length */ int timeout_retries; /* DGRAM only: retries left */ 
struct { diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 0deab78dea6b..ee20abf5bb6c 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -16,7 +16,7 @@ #ifdef __KERNEL__ #include #include -#include +#include #include #include #include diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h index 74bd8e0a1d3e..28b31edd08dc 100644 --- a/include/linux/reiserfs_fs_sb.h +++ b/include/linux/reiserfs_fs_sb.h @@ -5,7 +5,7 @@ #define _LINUX_REISER_FS_SB #ifdef __KERNEL__ -#include +#include #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 8a361b76cf43..850a01b4455c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -172,9 +172,6 @@ extern unsigned long cache_decay_ticks; extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); -extern int start_context_thread(void); -extern int current_is_keventd(void); - struct namespace; /* Maximum number of active map areas.. 
This is a random (large) number */ diff --git a/include/linux/serial167.h b/include/linux/serial167.h index 2c805a1791cf..1683e8941c04 100644 --- a/include/linux/serial167.h +++ b/include/linux/serial167.h @@ -51,7 +51,7 @@ struct cyclades_port { int xmit_cnt; int default_threshold; int default_timeout; - struct tq_struct tqueue; + struct work_struct tqueue; struct termios normal_termios; struct termios callout_termios; wait_queue_head_t open_wait; diff --git a/include/linux/serialP.h b/include/linux/serialP.h index 40c5b938472c..6fcb341a8776 100644 --- a/include/linux/serialP.h +++ b/include/linux/serialP.h @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include #if (LINUX_VERSION_CODE < 0x020300) @@ -86,7 +86,7 @@ struct async_struct { u8 *iomem_base; u16 iomem_reg_shift; int io_type; - struct tq_struct tqueue; + struct work_struct work; #ifdef DECLARE_WAITQUEUE wait_queue_head_t open_wait; wait_queue_head_t close_wait; diff --git a/include/linux/stallion.h b/include/linux/stallion.h index 072f89508a13..50cc96401196 100644 --- a/include/linux/stallion.h +++ b/include/linux/stallion.h @@ -104,7 +104,7 @@ typedef struct stlport { #endif struct termios normaltermios; struct termios callouttermios; - struct tq_struct tqueue; + struct work_struct tqueue; comstats_t stats; stlrq_t tx; } stlport_t; diff --git a/include/linux/sunrpc/debug.h b/include/linux/sunrpc/debug.h index 5d1f842c8512..b338aa49201e 100644 --- a/include/linux/sunrpc/debug.h +++ b/include/linux/sunrpc/debug.h @@ -12,7 +12,7 @@ #include #include -#include +#include /* * Enable RPC debugging/profiling. 
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index ff3a4ad22a13..601899c6ed87 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -10,7 +10,6 @@ #define _LINUX_SUNRPC_SCHED_H_ #include -#include #include #include diff --git a/include/linux/sunrpc/types.h b/include/linux/sunrpc/types.h index d524016fb4ba..d222f47550af 100644 --- a/include/linux/sunrpc/types.h +++ b/include/linux/sunrpc/types.h @@ -10,7 +10,7 @@ #define _LINUX_SUNRPC_TYPES_H_ #include -#include +#include #include #include diff --git a/include/linux/suspend.h b/include/linux/suspend.h index ccc76b9ba88b..23e0ccdab015 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -43,7 +43,7 @@ struct suspend_header { #define SUSPEND_PD_PAGES(x) (((x)*sizeof(struct pbe))/PAGE_SIZE+1) -extern struct tq_struct suspend_tq; +extern struct work_struct suspend_tq; /* mm/vmscan.c */ extern int shrink_mem(void); diff --git a/include/linux/tqueue.h b/include/linux/tqueue.h deleted file mode 100644 index cca0b193617b..000000000000 --- a/include/linux/tqueue.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * tqueue.h --- task queue handling for Linux. - * - * Modified version of previous incarnations of task-queues, - * written by: - * - * (C) 1994 Kai Petzke, wpp@marie.physik.tu-berlin.de - * Modified for use in the Linux kernel by Theodore Ts'o, - * tytso@mit.edu. 
- */ - -#ifndef _LINUX_TQUEUE_H -#define _LINUX_TQUEUE_H - -#include -#include -#include -#include - -struct tq_struct { - struct list_head list; /* linked list of active tq's */ - unsigned long sync; /* must be initialized to zero */ - void (*routine)(void *); /* function to call */ - void *data; /* argument to function */ -}; - -/* - * Emit code to initialise a tq_struct's routine and data pointers - */ -#define PREPARE_TQUEUE(_tq, _routine, _data) \ - do { \ - (_tq)->routine = _routine; \ - (_tq)->data = _data; \ - } while (0) - -/* - * Emit code to initialise all of a tq_struct - */ -#define INIT_TQUEUE(_tq, _routine, _data) \ - do { \ - INIT_LIST_HEAD(&(_tq)->list); \ - (_tq)->sync = 0; \ - PREPARE_TQUEUE((_tq), (_routine), (_data)); \ - } while (0) - -#define DECLARE_TASK_QUEUE(q) LIST_HEAD(q) - -/* Schedule a tq to run in process context */ -extern int schedule_task(struct tq_struct *task); - -/* finish all currently pending tasks - do not call from irq context */ -extern void flush_scheduled_tasks(void); - -#endif - diff --git a/include/linux/tty.h b/include/linux/tty.h index 3474a7dba9c2..81c8d745f708 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include @@ -138,7 +138,7 @@ extern struct screen_info screen_info; #define TTY_FLIPBUF_SIZE 512 struct tty_flip_buffer { - struct tq_struct tqueue; + struct work_struct work; struct semaphore pty_sem; char *char_buf_ptr; unsigned char *flag_buf_ptr; @@ -279,7 +279,7 @@ struct tty_struct { int alt_speed; /* For magic substitution of 38400 bps */ wait_queue_head_t write_wait; wait_queue_head_t read_wait; - struct tq_struct tq_hangup; + struct work_struct hangup_work; void *disc_data; void *driver_data; struct list_head tty_files; @@ -309,7 +309,7 @@ struct tty_struct { struct semaphore atomic_write; spinlock_t read_lock; /* If the tty has a pending do_SAK, queue it here - akpm */ - struct tq_struct SAK_tq; + struct work_struct 
SAK_work; }; /* tty magic number */ diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h index 738ffcd53264..abe9bfcf226c 100644 --- a/include/linux/tty_flip.h +++ b/include/linux/tty_flip.h @@ -19,7 +19,7 @@ _INLINE_ void tty_insert_flip_char(struct tty_struct *tty, _INLINE_ void tty_schedule_flip(struct tty_struct *tty) { - schedule_task(&tty->flip.tqueue); + schedule_delayed_work(&tty->flip.work, 1); } #undef _INLINE_ diff --git a/include/linux/wanpipe.h b/include/linux/wanpipe.h index 061090a368f8..74b35ec9099c 100644 --- a/include/linux/wanpipe.h +++ b/include/linux/wanpipe.h @@ -320,7 +320,7 @@ typedef struct { #include /* SDLA support module API definitions */ #include /* SDLA firmware module definitions */ -#include +#include #ifdef LINUX_2_4 #include #include @@ -389,7 +389,7 @@ typedef struct sdla unsigned int tty_open; unsigned char *tty_buf; unsigned char *tty_rx; - struct tq_struct tty_task_queue; + struct work_struct tty_work; union { @@ -422,7 +422,7 @@ typedef struct sdla u8 oob_on_modem; /* Option to send modem status to the api */ u16 num_of_ch; /* Number of channels configured by the user */ - struct tq_struct x25_poll_task; + struct work_struct x25_poll_work; struct timer_list x25_timer; } x; struct @@ -545,7 +545,7 @@ int wsppp_init (sdla_t* card, wandev_conf_t* conf); /* Sync PPP on top of RAW CH extern sdla_t * wanpipe_find_card(char *); extern sdla_t * wanpipe_find_card_num (int); -extern void wanpipe_queue_tq (struct tq_struct *); +extern void wanpipe_queue_work (struct work_struct *); extern void wanpipe_mark_bh (void); extern void wakeup_sk_bh (netdevice_t *); extern int change_dev_flags (netdevice_t *, unsigned); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h new file mode 100644 index 000000000000..7828c7bef55f --- /dev/null +++ b/include/linux/workqueue.h @@ -0,0 +1,64 @@ +/* + * workqueue.h --- work queue handling for Linux. 
+ */ + +#ifndef _LINUX_WORKQUEUE_H +#define _LINUX_WORKQUEUE_H + +#include + +struct workqueue_struct; + +struct work_struct { + unsigned long pending; + struct list_head entry; + void (*func)(void *); + void *data; + void *wq_data; + timer_t timer; +}; + +#define __WORK_INITIALIZER(n, f, d) { \ + .entry = { &(n).entry, &(n).entry }, \ + .func = (f), \ + .data = (d) } + +#define DECLARE_WORK(n, f, d) \ + struct work_struct n = __WORK_INITIALIZER(n, f, d) + +/* + * initialize a work-struct's func and data pointers: + */ +#define PREPARE_WORK(_work, _func, _data) \ + do { \ + (_work)->func = _func; \ + (_work)->data = _data; \ + } while (0) + +/* + * initialize all of a work-struct: + */ +#define INIT_WORK(_work, _func, _data) \ + do { \ + INIT_LIST_HEAD(&(_work)->entry); \ + (_work)->pending = 0; \ + PREPARE_WORK((_work), (_func), (_data)); \ + init_timer(&(_work)->timer); \ + } while (0) + +extern struct workqueue_struct *create_workqueue(const char *name); +extern void destroy_workqueue(struct workqueue_struct *wq); + +extern int FASTCALL(queue_work(struct workqueue_struct *wq, struct work_struct *work)); +extern int FASTCALL(queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay)); +extern void FASTCALL(flush_workqueue(struct workqueue_struct *wq)); + +extern int FASTCALL(schedule_work(struct work_struct *work)); +extern int FASTCALL(schedule_delayed_work(struct work_struct *work, unsigned long delay)); +extern void flush_scheduled_work(void); +extern int current_is_keventd(void); + +extern void init_workqueues(void); + +#endif + -- cgit v1.2.3 From 1d819b9d528aecf6a1792713164a73560978cd6f Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 1 Oct 2002 01:52:15 -0700 Subject: [PATCH] Swsusp updates, do not thrash ide disk on suspend This cleans up swsusp a little bit and fixes ide disk corruption on suspend/resume. 
Pavel --- Documentation/swsusp.txt | 17 +++++++++++ drivers/ide/ide-disk.c | 73 +++++++++++++++++++++++++++++++++++++++++++++++- drivers/ide/ide-pnp.c | 4 ++- drivers/ide/ide-probe.c | 6 ++++ include/linux/ide.h | 2 ++ kernel/suspend.c | 19 +++++++------ 6 files changed, 111 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/swsusp.txt b/Documentation/swsusp.txt index 88bcefb64403..9a8fd611d139 100644 --- a/Documentation/swsusp.txt +++ b/Documentation/swsusp.txt @@ -156,6 +156,23 @@ Drivers that need support - do IDE cdroms need some kind of support? - IDE CD-RW -- how to deal with that? +FAQ: + +Q: well, suspending a server is IMHO a really stupid thing, +but... (Diego Zuccato): + +A: You bought new UPS for your server. How do you install it without +bringing machine down? Suspend to disk, rearrange power cables, +resume. + +You have your server on UPS. Power died, and UPS is indicating 30 +seconds to failure. What do you do? Suspend to disk. + +Ethernet card in your server died. You want to replace it. Your +server is not hotplug capable. What do you do? Suspend to disk, +replace ethernet card, resume. If you are fast your users will not +even see broken connections. + Any other idea you might have tell me! 
Contacting the author diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 58438b363cf9..b5927f2e2787 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -544,6 +544,7 @@ static ide_startstop_t lba_48_rw_disk(ide_drive_t *, struct request *, unsigned */ static ide_startstop_t do_rw_disk (ide_drive_t *drive, struct request *rq, unsigned long block) { + BUG_ON(drive->blocked); if (!blk_fs_request(rq)) { blk_dump_rq_flags(rq, "do_rw_disk - bad command"); ide_end_request(drive, 0, 0); @@ -1495,8 +1496,70 @@ static void idedisk_add_settings(ide_drive_t *drive) ide_add_setting(drive, "max_failures", SETTING_RW, -1, -1, TYPE_INT, 0, 65535, 1, 1, &drive->max_failures, NULL); } +static int idedisk_suspend(struct device *dev, u32 state, u32 level) +{ + ide_drive_t *drive = dev->driver_data; + + printk("Suspending device %lx\n", dev->driver_data); + + /* I hope that every freeze operation from the upper levels have + * already been done... + */ + + if (level != SUSPEND_SAVE_STATE) + return 0; + BUG_ON(in_interrupt()); + + printk("Waiting for commands to finish\n"); + + /* wait until all commands are finished */ + /* FIXME: waiting for spinlocks should be done instead. */ + if (!(HWGROUP(drive))) + printk("No hwgroup?\n"); + while (HWGROUP(drive)->handler) + yield(); + + /* set the drive to standby */ + printk(KERN_INFO "suspending: %s ", drive->name); + if (drive->driver) { + if (drive->driver->standby) + drive->driver->standby(drive); + } + drive->blocked = 1; + + while (HWGROUP(drive)->handler) + yield(); + + return 0; +} + +static int idedisk_resume(struct device *dev, u32 level) +{ + ide_drive_t *drive = dev->driver_data; + + if (level != RESUME_RESTORE_STATE) + return 0; + if (!drive->blocked) + panic("ide: Resume but not suspended?\n"); + + drive->blocked = 0; + return 0; +} + + +/* This is just a hook for the overall driver tree. 
+ */ + +static struct device_driver idedisk_devdrv = { + .bus = &ide_bus_type, + .name = "IDE disk driver", + + .suspend = idedisk_suspend, + .resume = idedisk_resume, +}; + static int idedisk_ioctl (ide_drive_t *drive, struct inode *inode, - struct file *file, unsigned int cmd, unsigned long arg) + struct file *file, unsigned int cmd, unsigned long arg) { #if 0 HDIO_GET_ADDRESS @@ -1540,6 +1603,11 @@ static void idedisk_setup (ide_drive_t *drive) drive->doorlocking = 1; } } + { + sprintf(drive->disk->disk_dev.name, "ide-disk"); + drive->disk->disk_dev.driver = &idedisk_devdrv; + drive->disk->disk_dev.driver_data = drive; + } #if 1 (void) probe_lba_addressing(drive, 1); @@ -1623,6 +1691,8 @@ static void idedisk_setup (ide_drive_t *drive) static int idedisk_cleanup (ide_drive_t *drive) { struct gendisk *g = drive->disk; + + put_device(&drive->disk->disk_dev); if ((drive->id->cfs_enable_2 & 0x3000) && drive->wcache) if (do_idedisk_flushcache(drive)) printk (KERN_INFO "%s: Write Cache FAILED Flushing!\n", @@ -1721,6 +1791,7 @@ static void __exit idedisk_exit (void) static int idedisk_init (void) { ide_register_driver(&idedisk_driver); + driver_register(&idedisk_devdrv); return 0; } diff --git a/drivers/ide/ide-pnp.c b/drivers/ide/ide-pnp.c index e0209a663bbe..6dcf255e7a7c 100644 --- a/drivers/ide/ide-pnp.c +++ b/drivers/ide/ide-pnp.c @@ -52,6 +52,7 @@ struct pnp_dev_t { static int __init pnpide_generic_init(struct pci_dev *dev, int enable) { hw_regs_t hw; + ide_hwif_t *hwif; int index; if (!enable) @@ -67,10 +68,11 @@ static int __init pnpide_generic_init(struct pci_dev *dev, int enable) // generic_pnp_ide_iops, DEV_IRQ(dev, 0)); - index = ide_register_hw(&hw, NULL); + index = ide_register_hw(&hw, &hwif); if (index != -1) { printk(KERN_INFO "ide%d: %s IDE interface\n", index, DEV_NAME(dev)); + hwif->pci_dev = dev; return 0; } diff --git a/drivers/ide/ide-probe.c b/drivers/ide/ide-probe.c index 7f8c04f6125c..2ce345fc60a4 100644 --- a/drivers/ide/ide-probe.c +++ 
b/drivers/ide/ide-probe.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -566,6 +567,11 @@ static void hwif_register (ide_hwif_t *hwif) /* register with global device tree */ strncpy(hwif->gendev.bus_id,hwif->name,BUS_ID_SIZE); snprintf(hwif->gendev.name,DEVICE_NAME_SIZE,"IDE Controller"); + hwif->gendev.driver_data = hwif; + if (hwif->pci_dev) + hwif->gendev.parent = &hwif->pci_dev->dev; + else + hwif->gendev.parent = NULL; /* Would like to do = &device_legacy */ device_register(&hwif->gendev); if (hwif->mmio == 2) diff --git a/include/linux/ide.h b/include/linux/ide.h index ca112e24acdb..74fff672eaef 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -789,6 +790,7 @@ typedef struct ide_drive_s { unsigned autotune : 2; /* 1=autotune, 2=noautotune, 0=default */ unsigned remap_0_to_1 : 2; /* 0=remap if ezdrive, 1=remap, 2=noremap */ unsigned ata_flash : 1; /* 1=present, 0=default */ + unsigned blocked : 1; /* 1=powermanagment told us not to do anything, so sleep nicely */ unsigned addressing; /* : 3; * 0=28-bit * 1=48-bit diff --git a/kernel/suspend.c b/kernel/suspend.c index 05b32764dc1f..6eec7ed03b07 100644 --- a/kernel/suspend.c +++ b/kernel/suspend.c @@ -470,19 +470,22 @@ static int count_and_copy_data_pages(struct pbe *pagedir_p) int nr_copy_pages = 0; int pfn; struct page *page; - -#ifndef CONFIG_DISCONTIGMEM - if (max_mapnr != num_physpages) - panic("mapnr is not expected"); + +#ifdef CONFIG_DISCONTIGMEM + panic("Discontingmem not supported"); +#else + BUG_ON (max_mapnr != num_physpages); #endif - for (pfn = 0; pfn < num_physpages; pfn++) { + for (pfn = 0; pfn < max_mapnr; pfn++) { page = pfn_to_page(pfn); if (PageHighMem(page)) panic("Swsusp not supported on highmem boxes. 
Send 1GB of RAM to and try again ;-)."); + if (!PageReserved(page)) { if (PageNosave(page)) continue; + if ((chunk_size=is_head_of_free_region(page))!=0) { pfn += chunk_size - 1; continue; @@ -776,9 +779,10 @@ void do_magic_resume_2(void) BUG_ON (nr_copy_pages_check != nr_copy_pages); BUG_ON (pagedir_order_check != pagedir_order); + __flush_tlb_global(); /* Even mappings of "global" things (vmalloc) need to be fixed */ + PRINTK( "Freeing prev allocated pagedir\n" ); free_suspend_pagedir((unsigned long) pagedir_save); - __flush_tlb_global(); /* Even mappings of "global" things (vmalloc) need to be fixed */ drivers_resume(RESUME_ALL_PHASES); spin_unlock_irq(&suspend_pagedir_lock); @@ -809,12 +813,10 @@ void do_magic_suspend_2(void) barrier(); mb(); - drivers_resume(RESUME_PHASE2); spin_lock_irq(&suspend_pagedir_lock); /* Done to disable interrupts */ mdelay(1000); free_pages((unsigned long) pagedir_nosave, pagedir_order); - drivers_resume(RESUME_PHASE1); spin_unlock_irq(&suspend_pagedir_lock); mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME); PRINTK(KERN_WARNING "%sLeaving do_magic_suspend_2...\n", name_suspend); @@ -1037,6 +1039,7 @@ static int bdev_write_page(struct block_device *bdev, long pos, void *buf) return 0; #endif printk(KERN_CRIT "%sWarning %s: Fixing swap signatures unimplemented...\n", name_resume, resume_file); + return 0; } extern kdev_t __init name_to_kdev_t(const char *line); -- cgit v1.2.3 From 8069b968cec4d0f78b066bdd10ca7277d1bacd02 Mon Sep 17 00:00:00 2001 From: Art Haas Date: Tue, 1 Oct 2002 02:32:20 -0700 Subject: [PATCH] C99 designated initializers for include/linux/isapnp.h --- include/linux/isapnp.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/isapnp.h b/include/linux/isapnp.h index 136dac530b93..4884d178909a 100644 --- a/include/linux/isapnp.h +++ b/include/linux/isapnp.h @@ -132,11 +132,11 @@ struct isapnp_resources { #define ISAPNP_CARD_DEVS 8 #define 
ISAPNP_CARD_ID(_va, _vb, _vc, _device) \ - card_vendor: ISAPNP_VENDOR(_va, _vb, _vc), card_device: ISAPNP_DEVICE(_device) + .card_vendor = ISAPNP_VENDOR(_va, _vb, _vc), .card_device = ISAPNP_DEVICE(_device) #define ISAPNP_CARD_END \ - card_vendor: 0, card_device: 0 + .card_vendor = 0, .card_device = 0 #define ISAPNP_DEVICE_ID(_va, _vb, _vc, _function) \ - { vendor: ISAPNP_VENDOR(_va, _vb, _vc), function: ISAPNP_FUNCTION(_function) } + { .vendor = ISAPNP_VENDOR(_va, _vb, _vc), .function = ISAPNP_FUNCTION(_function) } /* export used IDs outside module */ #define ISAPNP_CARD_TABLE(name) \ @@ -151,10 +151,10 @@ struct isapnp_card_id { }; #define ISAPNP_DEVICE_SINGLE(_cva, _cvb, _cvc, _cdevice, _dva, _dvb, _dvc, _dfunction) \ - card_vendor: ISAPNP_VENDOR(_cva, _cvb, _cvc), card_device: ISAPNP_DEVICE(_cdevice), \ - vendor: ISAPNP_VENDOR(_dva, _dvb, _dvc), function: ISAPNP_FUNCTION(_dfunction) + .card_vendor = ISAPNP_VENDOR(_cva, _cvb, _cvc), .card_device = ISAPNP_DEVICE(_cdevice), \ + .vendor = ISAPNP_VENDOR(_dva, _dvb, _dvc), .function = ISAPNP_FUNCTION(_dfunction) #define ISAPNP_DEVICE_SINGLE_END \ - card_vendor: 0, card_device: 0 + .card_vendor = 0, .card_device = 0 struct isapnp_device_id { unsigned short card_vendor, card_device; -- cgit v1.2.3 From 28c8df48a1548c9eb3f39d902c38da1e2c438474 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 1 Oct 2002 03:08:39 -0700 Subject: bitmap_member() => DECLARE_BITMAP() --- drivers/zorro/zorro.c | 2 +- include/linux/types.h | 2 +- include/linux/zorro.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/zorro/zorro.c b/drivers/zorro/zorro.c index 9b793823655e..5c03d4a7ba54 100644 --- a/drivers/zorro/zorro.c +++ b/drivers/zorro/zorro.c @@ -80,7 +80,7 @@ struct zorro_dev *zorro_find_device(zorro_id id, struct zorro_dev *from) * FIXME: use the normal resource management */ -bitmap_member(zorro_unused_z2ram, 128); +DECLARE_BITMAP(zorro_unused_z2ram, 128); static void 
__init mark_region(unsigned long start, unsigned long end, diff --git a/include/linux/types.h b/include/linux/types.h index 3ee38ccc272c..582d35492314 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -4,7 +4,7 @@ #ifdef __KERNEL__ #include -#define bitmap_member(name,bits) \ +#define DECLARE_BITMAP(name,bits) \ unsigned long name[((bits)+BITS_PER_LONG-1)/BITS_PER_LONG] #endif diff --git a/include/linux/zorro.h b/include/linux/zorro.h index 38d128833a76..a623f17fd028 100644 --- a/include/linux/zorro.h +++ b/include/linux/zorro.h @@ -199,7 +199,7 @@ extern struct zorro_dev *zorro_find_device(zorro_id id, * the corresponding bits. */ -extern bitmap_member(zorro_unused_z2ram, 128); +extern DECLARE_BITMAP(zorro_unused_z2ram, 128); #define Z2RAM_START (0x00200000) #define Z2RAM_END (0x00a00000) -- cgit v1.2.3 From d7d2454a2adf6d816a63dfc080186b30d60ec275 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 2 Oct 2002 14:20:42 -0700 Subject: [EQL]: Rewrite to be SMP safe. --- drivers/net/eql.c | 1007 +++++++++++++++--------------------------------- include/linux/if_eql.h | 55 +-- 2 files changed, 330 insertions(+), 732 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/eql.c b/drivers/net/eql.c index 300f28d51b36..bfe053425de6 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -4,6 +4,7 @@ * (c) Copyright 1995 Simon "Guru Aleph-Null" Janes * NCM: Network and Communications Management, Inc. * + * (c) Copyright 2002 David S. Miller (davem@redhat.com) * * This software may be used and distributed according to the terms * of the GNU General Public License, incorporated herein by reference. 
@@ -107,6 +108,7 @@ * the locking mechanism and timer stuff must be written however, * this version will not work otherwise * + * Sorry, I had to rewrite most of this for 2.5.x -DaveM */ #include @@ -121,112 +123,69 @@ #include -static char version[] __initdata = - "Equalizer1996: $Revision: 1.2.1 $ $Date: 1996/09/22 13:52:00 $ Simon Janes (simon@ncm.com)\n"; - -#ifndef EQL_DEBUG -/* #undef EQL_DEBUG -* print nothing at all, not even a boot-banner */ -/* #define EQL_DEBUG 1 -* print only the boot-banner */ -/* #define EQL_DEBUG 5 -* print major function entries */ -/* #define EQL_DEBUG 20 -* print subfunction entries */ -/* #define EQL_DEBUG 50 -* print utility entries */ -/* #define EQL_DEBUG 100 -* print voluminous function entries */ -#define EQL_DEBUG 1 -#endif -static unsigned int eql_debug = EQL_DEBUG; - -static int eql_init(struct net_device *dev); /* */ -static int eql_open(struct net_device *dev); /* */ -static int eql_close(struct net_device *dev); /* */ -static int eql_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); /* */ -static int eql_slave_xmit(struct sk_buff *skb, struct net_device *dev); /* */ - -static struct net_device_stats *eql_get_stats(struct net_device *dev); /* */ - -/* ioctl() handlers - ---------------- */ -static int eql_enslave(struct net_device *dev, slaving_request_t *srq); /* */ -static int eql_emancipate(struct net_device *dev, slaving_request_t *srq); /* */ - -static int eql_g_slave_cfg(struct net_device *dev, slave_config_t *sc); /* */ -static int eql_s_slave_cfg(struct net_device *dev, slave_config_t *sc); /* */ - -static int eql_g_master_cfg(struct net_device *dev, master_config_t *mc); /* */ -static int eql_s_master_cfg(struct net_device *dev, master_config_t *mc); /* */ - -static inline int eql_is_slave(struct net_device *dev); /* */ -static inline int eql_is_master(struct net_device *dev); /* */ - -static slave_t *eql_new_slave(void); /* */ -static void eql_delete_slave(slave_t *slave); /* */ +static int 
eql_open(struct net_device *dev); +static int eql_close(struct net_device *dev); +static int eql_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); +static int eql_slave_xmit(struct sk_buff *skb, struct net_device *dev); +static struct net_device_stats *eql_get_stats(struct net_device *dev); -/* static long eql_slave_priority(slave_t *slave); -* */ -static inline int eql_number_slaves(slave_queue_t *queue); /* */ +#define eql_is_slave(dev) ((dev->flags & IFF_SLAVE) == IFF_SLAVE) +#define eql_is_master(dev) ((dev->flags & IFF_MASTER) == IFF_MASTER) -static inline int eql_is_empty(slave_queue_t *queue); /* */ -static inline int eql_is_full(slave_queue_t *queue); /* */ +static void eql_kill_one_slave(slave_t *slave); -static slave_queue_t *eql_new_slave_queue(struct net_device *dev); /* */ -static void eql_delete_slave_queue(slave_queue_t *queue); /* */ - -static int eql_insert_slave(slave_queue_t *queue, slave_t *slave); /* */ -static slave_t *eql_remove_slave(slave_queue_t *queue, slave_t *slave); /* */ - -/* static int eql_insert_slave_dev(slave_queue_t *queue, struct net_device *dev); -* */ -static int eql_remove_slave_dev(slave_queue_t *queue, struct net_device *dev); /* */ - -static inline struct net_device *eql_best_slave_dev(slave_queue_t *queue); /* */ -static inline slave_t *eql_best_slave(slave_queue_t *queue); /* */ -static inline slave_t *eql_first_slave(slave_queue_t *queue); /* */ -static inline slave_t *eql_next_slave(slave_queue_t *queue, slave_t *slave); /* */ - -static inline void eql_set_best_slave(slave_queue_t *queue, slave_t *slave); /* */ -static inline void eql_schedule_slaves(slave_queue_t *queue); /* */ +static void eql_timer(unsigned long param) +{ + equalizer_t *eql = (equalizer_t *) param; + struct list_head *this, *tmp, *head; + + spin_lock_bh(&eql->queue.lock); + head = &eql->queue.all_slaves; + list_for_each_safe(this, tmp, head) { + slave_t *slave = list_entry(this, slave_t, list); + + if ((slave->dev->flags & IFF_UP) == 
IFF_UP) { + slave->bytes_queued -= slave->priority_Bps; + if (slave->bytes_queued < 0) + slave->bytes_queued = 0; + } else { + eql_kill_one_slave(slave); + } -static slave_t *eql_find_slave_dev(slave_queue_t *queue, struct net_device *dev); /* */ + } + spin_unlock_bh(&eql->queue.lock); -/* static inline eql_lock_slave_queue(slave_queue_t *queue); -* */ -/* static inline eql_unlock_slave_queue(slave_queue_t *queue); -* */ + eql->timer.expires = jiffies + EQL_DEFAULT_RESCHED_IVAL; + add_timer(&eql->timer); +} -static void eql_timer(unsigned long param); /* */ - -/* struct net_device * interface functions - --------------------------------------------------------- - */ +static char version[] __initdata = + "Equalizer2002: Simon Janes (simon@ncm.com) and David S. Miller (davem@redhat.com)\n"; static int __init eql_init(struct net_device *dev) { - static unsigned version_printed; - /* static unsigned num_masters = 0; */ - equalizer_t *eql = 0; + static unsigned int version_printed; + equalizer_t *eql; SET_MODULE_OWNER(dev); - if ( version_printed++ == 0 && eql_debug > 0) + if (version_printed++ == 0) printk(version); - /* - * Initialize the device structure. 
- */ - dev->priv = kmalloc (sizeof (equalizer_t), GFP_KERNEL); + + dev->priv = kmalloc(sizeof (equalizer_t), GFP_KERNEL); if (dev->priv == NULL) return -ENOMEM; - memset (dev->priv, 0, sizeof (equalizer_t)); - eql = (equalizer_t *) dev->priv; - - eql->stats = kmalloc (sizeof (struct net_device_stats), GFP_KERNEL); - if (eql->stats == NULL) - { - kfree(dev->priv); - dev->priv = NULL; - return -ENOMEM; - } - memset (eql->stats, 0, sizeof (struct net_device_stats)); + memset(dev->priv, 0, sizeof (equalizer_t)); + eql = dev->priv; - init_timer (&eql->timer); + init_timer(&eql->timer); eql->timer.data = (unsigned long) dev->priv; - eql->timer.expires = jiffies+EQL_DEFAULT_RESCHED_IVAL; - eql->timer.function = &eql_timer; - eql->timer_on = 0; + eql->timer.expires = jiffies + EQL_DEFAULT_RESCHED_IVAL; + eql->timer.function = eql_timer; + + spin_lock_init(&eql->queue.lock); + INIT_LIST_HEAD(&eql->queue.all_slaves); + eql->queue.master_dev = dev; dev->open = eql_open; dev->stop = eql_close; @@ -234,17 +193,12 @@ static int __init eql_init(struct net_device *dev) dev->hard_start_xmit = eql_slave_xmit; dev->get_stats = eql_get_stats; - /* - * Fill in the fields of the device structure with - * eql-generic values. 
- */ - /* * Now we undo some of the things that eth_setup does * that we don't like */ - dev->mtu = EQL_DEFAULT_MTU; /* set to 576 in eql.h */ + dev->mtu = EQL_DEFAULT_MTU; /* set to 576 in if_eql.h */ dev->flags = IFF_MASTER; dev->type = ARPHRD_SLIP; @@ -255,257 +209,330 @@ static int __init eql_init(struct net_device *dev) static int eql_open(struct net_device *dev) { - equalizer_t *eql = (equalizer_t *) dev->priv; - slave_queue_t *new_queue; - -#ifdef EQL_DEBUG - if (eql_debug >= 5) - printk ("%s: open\n", dev->name); -#endif - - printk ("%s: remember to turn off Van-Jacobson compression on your slave devices.\n", dev->name); - - new_queue = eql_new_slave_queue (dev); - - if (new_queue != 0) - { - new_queue->master_dev = dev; - eql->queue = new_queue; - eql->queue->lock = 0; - eql->min_slaves = 1; - eql->max_slaves = EQL_DEFAULT_MAX_SLAVES; /* 4 usually... */ - - printk ("%s: adding timer\n", dev->name); - eql->timer_on = 1; - add_timer (&eql->timer); + equalizer_t *eql = dev->priv; - return 0; - } - return -ENOMEM; + /* XXX We should force this off automatically for the user. */ + printk(KERN_INFO "%s: remember to turn off Van-Jacobson compression on " + "your slave devices.\n", dev->name); + + if (!list_empty(&eql->queue.all_slaves)) + BUG(); + + eql->min_slaves = 1; + eql->max_slaves = EQL_DEFAULT_MAX_SLAVES; /* 4 usually... 
*/ + + add_timer(&eql->timer); + + return 0; } +static void eql_kill_one_slave(slave_t *slave) +{ + list_del(&slave->list); + slave->dev->flags &= ~IFF_SLAVE; + dev_put(slave->dev); + kfree(slave); +} + +static void eql_kill_slave_queue(slave_queue_t *queue) +{ + struct list_head *head, *tmp, *this; + + spin_lock_bh(&queue->lock); + + head = &queue->all_slaves; + list_for_each_safe(this, tmp, head) { + slave_t *s = list_entry(this, slave_t, list); + + eql_kill_one_slave(s); + queue->num_slaves--; + } + + spin_unlock_bh(&queue->lock); +} static int eql_close(struct net_device *dev) { - equalizer_t *eql = (equalizer_t *) dev->priv; + equalizer_t *eql = dev->priv; -#ifdef EQL_DEBUG - if ( eql_debug >= 5) - printk ("%s: close\n", dev->name); -#endif /* * The timer has to be stopped first before we start hacking away * at the data structure it scans every so often... */ -#ifdef EQL_DEBUG - printk ("%s: stopping timer\n", dev->name); -#endif - eql->timer_on = 0; - del_timer (&eql->timer); + del_timer_sync(&eql->timer); - eql_delete_slave_queue (eql->queue); + eql_kill_slave_queue(&eql->queue); return 0; } +static int eql_enslave(struct net_device *dev, slaving_request_t *srq); +static int eql_emancipate(struct net_device *dev, slaving_request_t *srq); + +static int eql_g_slave_cfg(struct net_device *dev, slave_config_t *sc); +static int eql_s_slave_cfg(struct net_device *dev, slave_config_t *sc); + +static int eql_g_master_cfg(struct net_device *dev, master_config_t *mc); +static int eql_s_master_cfg(struct net_device *dev, master_config_t *mc); static int eql_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { - if(cmd!=EQL_GETMASTRCFG && cmd!=EQL_GETSLAVECFG && - !capable(CAP_NET_ADMIN)) + if (cmd != EQL_GETMASTRCFG && cmd != EQL_GETSLAVECFG && + !capable(CAP_NET_ADMIN)) return -EPERM; - switch (cmd) - { + + switch (cmd) { case EQL_ENSLAVE: - return eql_enslave (dev, (slaving_request_t *) ifr->ifr_data); + return eql_enslave(dev, + (slaving_request_t *) 
ifr->ifr_data); case EQL_EMANCIPATE: - return eql_emancipate (dev, (slaving_request_t *) ifr->ifr_data); + return eql_emancipate(dev, + (slaving_request_t *) ifr->ifr_data); case EQL_GETSLAVECFG: - return eql_g_slave_cfg (dev, (slave_config_t *) ifr->ifr_data); + return eql_g_slave_cfg(dev, + (slave_config_t *) ifr->ifr_data); case EQL_SETSLAVECFG: - return eql_s_slave_cfg (dev, (slave_config_t *) ifr->ifr_data); + return eql_s_slave_cfg(dev, + (slave_config_t *) ifr->ifr_data); case EQL_GETMASTRCFG: - return eql_g_master_cfg (dev, (master_config_t *) ifr->ifr_data); + return eql_g_master_cfg(dev, + (master_config_t *) ifr->ifr_data); case EQL_SETMASTRCFG: - return eql_s_master_cfg (dev, (master_config_t *) ifr->ifr_data); + return eql_s_master_cfg(dev, + (master_config_t *) ifr->ifr_data); default: return -EOPNOTSUPP; - } + }; } +/* queue->lock must be held */ +static slave_t *__eql_schedule_slaves(slave_queue_t *queue) +{ + unsigned long best_load = ~0UL; + struct list_head *this, *tmp, *head; + slave_t *best_slave; + + best_slave = NULL; + + /* Make a pass to set the best slave. */ + head = &queue->all_slaves; + list_for_each_safe(this, tmp, head) { + slave_t *slave = list_entry(this, slave_t, list); + unsigned long slave_load, bytes_queued, priority_Bps; + + /* Go through the slave list once, updating best_slave + * whenever a new best_load is found. + */ + bytes_queued = slave->bytes_queued; + priority_Bps = slave->priority_Bps; + if ((slave->dev->flags & IFF_UP) == IFF_UP) { + slave_load = (~0UL - (~0UL / 2)) - + (priority_Bps) + bytes_queued * 8; + + if (slave_load < best_load) { + best_load = slave_load; + best_slave = slave; + } + } else { + /* We found a dead slave, kill it. 
*/ + eql_kill_one_slave(slave); + } + } + return best_slave; +} static int eql_slave_xmit(struct sk_buff *skb, struct net_device *dev) { - equalizer_t *eql = (equalizer_t *) dev->priv; - struct net_device *slave_dev = 0; + equalizer_t *eql = dev->priv; slave_t *slave; - if (skb == NULL) - return 0; + spin_lock(&eql->queue.lock); + + slave = __eql_schedule_slaves(&eql->queue); + if (slave) { + struct net_device *slave_dev = slave->dev; - eql_schedule_slaves (eql->queue); - - slave = eql_best_slave (eql->queue); - slave_dev = slave ? slave->dev : 0; - - if ( slave_dev != 0 ) - { -#ifdef EQL_DEBUG - if (eql_debug >= 100) - printk ("%s: %d slaves xmitng %d B %s\n", - dev->name, eql_number_slaves (eql->queue), skb->len, - slave_dev->name); -#endif skb->dev = slave_dev; skb->priority = 1; slave->bytes_queued += skb->len; - dev_queue_xmit (skb); - eql->stats->tx_packets++; - } - else - { - /* - * The alternative for this is the return 1 and have - * dev_queue_xmit just queue it up on the eql's queue. 
- */ - - eql->stats->tx_dropped++; + dev_queue_xmit(skb); + eql->stats.tx_packets++; + } else { + eql->stats.tx_dropped++; dev_kfree_skb(skb); } + + spin_unlock(&eql->queue.lock); + return 0; } - static struct net_device_stats * eql_get_stats(struct net_device *dev) { - equalizer_t *eql = (equalizer_t *) dev->priv; - return eql->stats; + equalizer_t *eql = dev->priv; + return &eql->stats; } /* * Private ioctl functions */ -static int eql_enslave(struct net_device *dev, slaving_request_t *srqp) +/* queue->lock must be held */ +static slave_t *__eql_find_slave_dev(slave_queue_t *queue, struct net_device *dev) +{ + struct list_head *this, *head; + + head = &queue->all_slaves; + list_for_each(this, head) { + slave_t *slave = list_entry(this, slave_t, list); + + if (slave->dev == dev) + return slave; + } + + return NULL; +} + +static inline int eql_is_full(slave_queue_t *queue) +{ + equalizer_t *eql = queue->master_dev->priv; + + if (queue->num_slaves >= eql->max_slaves) + return 1; + return 0; +} + +/* queue->lock must be held */ +static int __eql_insert_slave(slave_queue_t *queue, slave_t *slave) +{ + if (!eql_is_full(queue)) { + slave_t *duplicate_slave = 0; + + duplicate_slave = __eql_find_slave_dev(queue, slave->dev); + if (duplicate_slave != 0) + eql_kill_one_slave(duplicate_slave); + + list_add(&slave->list, &queue->all_slaves); + queue->num_slaves++; + slave->dev->flags |= IFF_SLAVE; + + return 0; + } + + return -ENOSPC; +} + +static int eql_enslave(struct net_device *master_dev, slaving_request_t *srqp) { - struct net_device *master_dev; struct net_device *slave_dev; slaving_request_t srq; if (copy_from_user(&srq, srqp, sizeof (slaving_request_t))) - { -#ifdef EQL_DEBUG - if (eql_debug >= 20) - printk ("EQL enslave: error detected by copy_from_user\n"); -#endif return -EFAULT; - } - -#ifdef EQL_DEBUG - if (eql_debug >= 20) - printk ("%s: enslave '%s' %ld bps\n", dev->name, - srq.slave_name, srq.priority); -#endif - master_dev = dev; /* for "clarity" */ - 
slave_dev = __dev_get_by_name (srq.slave_name); - - if (master_dev != 0 && slave_dev != 0) - { - if ((master_dev->flags & IFF_UP) == IFF_UP) - { - /*slave is not a master & not already a slave:*/ - if (! eql_is_master (slave_dev) && - ! eql_is_slave (slave_dev) ) - { - slave_t *s = eql_new_slave (); - equalizer_t *eql = - (equalizer_t *) master_dev->priv; - if (!s) + + slave_dev = dev_get_by_name(srq.slave_name); + if (slave_dev) { + if ((master_dev->flags & IFF_UP) == IFF_UP) { + /* slave is not a master & not already a slave: */ + if (!eql_is_master(slave_dev) && + !eql_is_slave(slave_dev)) { + slave_t *s = kmalloc(sizeof(*s), GFP_KERNEL); + equalizer_t *eql = master_dev->priv; + int ret; + + if (!s) { + dev_put(slave_dev); return -ENOMEM; + } + + memset(s, 0, sizeof(*s)); s->dev = slave_dev; s->priority = srq.priority; s->priority_bps = srq.priority; s->priority_Bps = srq.priority / 8; - slave_dev->flags |= IFF_SLAVE; - eql_insert_slave (eql->queue, s); - return 0; + + spin_lock_bh(&eql->queue.lock); + ret = __eql_insert_slave(&eql->queue, s); + if (ret) { + dev_put(slave_dev); + kfree(s); + } + spin_unlock_bh(&eql->queue.lock); + + return ret; } -#ifdef EQL_DEBUG - else if (eql_debug >= 20) - printk ("EQL enslave: slave is master or slave is already slave\n"); -#endif } -#ifdef EQL_DEBUG - else if (eql_debug >= 20) - printk ("EQL enslave: master device not up!\n"); -#endif + dev_put(slave_dev); } -#ifdef EQL_DEBUG - else if (eql_debug >= 20) - printk ("EQL enslave: master or slave are NULL"); -#endif + return -EINVAL; } -static int eql_emancipate(struct net_device *dev, slaving_request_t *srqp) +static int eql_emancipate(struct net_device *master_dev, slaving_request_t *srqp) { - struct net_device *master_dev; + equalizer_t *eql = master_dev->priv; struct net_device *slave_dev; slaving_request_t srq; + int ret; if (copy_from_user(&srq, srqp, sizeof (slaving_request_t))) return -EFAULT; -#ifdef EQL_DEBUG - if (eql_debug >= 20) - printk ("%s: emancipate `%s`\n", 
dev->name, srq.slave_name); -#endif - master_dev = dev; /* for "clarity" */ - slave_dev = __dev_get_by_name (srq.slave_name); - - if ( eql_is_slave (slave_dev) ) /* really is a slave */ - { - equalizer_t *eql = (equalizer_t *) master_dev->priv; - slave_dev->flags = slave_dev->flags & ~IFF_SLAVE; - eql_remove_slave_dev (eql->queue, slave_dev); - return 0; + slave_dev = dev_get_by_name(srq.slave_name); + ret = -EINVAL; + if (slave_dev) { + spin_lock_bh(&eql->queue.lock); + + if (eql_is_slave(slave_dev)) { + slave_t *slave = __eql_find_slave_dev(&eql->queue, + slave_dev); + + if (slave) { + eql_kill_one_slave(slave); + ret = 0; + } + } + dev_put(slave_dev); + + spin_unlock_bh(&eql->queue.lock); } - return -EINVAL; -} + return ret; +} static int eql_g_slave_cfg(struct net_device *dev, slave_config_t *scp) { + equalizer_t *eql = dev->priv; slave_t *slave; - equalizer_t *eql; struct net_device *slave_dev; slave_config_t sc; + int ret; - if (copy_from_user (&sc, scp, sizeof (slave_config_t))) + if (copy_from_user(&sc, scp, sizeof (slave_config_t))) return -EFAULT; -#ifdef EQL_DEBUG - if (eql_debug >= 20) - printk ("%s: get config for slave `%s'\n", dev->name, sc.slave_name); -#endif - eql = (equalizer_t *) dev->priv; - slave_dev = __dev_get_by_name (sc.slave_name); - - if ( eql_is_slave (slave_dev) ) - { - slave = eql_find_slave_dev (eql->queue, slave_dev); - if (slave != 0) - { + slave_dev = dev_get_by_name(sc.slave_name); + + ret = -EINVAL; + + spin_lock_bh(&eql->queue.lock); + if (eql_is_slave(slave_dev)) { + slave = __eql_find_slave_dev(&eql->queue, slave_dev); + if (slave) { sc.priority = slave->priority; - if (copy_to_user (scp, &sc, sizeof (slave_config_t))) - return -EFAULT; - return 0; + ret = 0; } } - return -EINVAL; -} + spin_unlock_bh(&eql->queue.lock); + + dev_put(slave_dev); + if (!ret && copy_to_user(scp, &sc, sizeof (slave_config_t))) + ret = -EFAULT; + + return ret; +} static int eql_s_slave_cfg(struct net_device *dev, slave_config_t *scp) { @@ -513,71 
+540,57 @@ static int eql_s_slave_cfg(struct net_device *dev, slave_config_t *scp) equalizer_t *eql; struct net_device *slave_dev; slave_config_t sc; + int ret; - if (copy_from_user (&sc, scp, sizeof (slave_config_t))) + if (copy_from_user(&sc, scp, sizeof (slave_config_t))) return -EFAULT; -#ifdef EQL_DEBUG - if (eql_debug >= 20) - printk ("%s: set config for slave `%s'\n", dev->name, sc.slave_name); -#endif - + eql = dev->priv; + slave_dev = dev_get_by_name(sc.slave_name); - eql = (equalizer_t *) dev->priv; - slave_dev = __dev_get_by_name (sc.slave_name); + ret = -EINVAL; - if ( eql_is_slave (slave_dev) ) - { - slave = eql_find_slave_dev (eql->queue, slave_dev); - if (slave != 0) - { + spin_lock_bh(&eql->queue.lock); + if (eql_is_slave(slave_dev)) { + slave = __eql_find_slave_dev(&eql->queue, slave_dev); + if (slave) { slave->priority = sc.priority; slave->priority_bps = sc.priority; slave->priority_Bps = sc.priority / 8; - return 0; + ret = 0; } } - return -EINVAL; -} + spin_unlock_bh(&eql->queue.lock); + return ret; +} static int eql_g_master_cfg(struct net_device *dev, master_config_t *mcp) { equalizer_t *eql; master_config_t mc; -#if EQL_DEBUG - if (eql_debug >= 20) - printk ("%s: get master config\n", dev->name); -#endif - - if ( eql_is_master (dev) ) - { - eql = (equalizer_t *) dev->priv; + if (eql_is_master(dev)) { + eql = dev->priv; mc.max_slaves = eql->max_slaves; mc.min_slaves = eql->min_slaves; - if (copy_to_user (mcp, &mc, sizeof (master_config_t))) + if (copy_to_user(mcp, &mc, sizeof (master_config_t))) return -EFAULT; return 0; } return -EINVAL; } - static int eql_s_master_cfg(struct net_device *dev, master_config_t *mcp) { equalizer_t *eql; master_config_t mc; - if (copy_from_user (&mc, mcp, sizeof (master_config_t))) + if (copy_from_user(&mc, mcp, sizeof (master_config_t))) return -EFAULT; -#if EQL_DEBUG - if (eql_debug >= 20) - printk ("%s: set master config\n", dev->name); -#endif - if ( eql_is_master (dev) ) - { - eql = (equalizer_t *) 
dev->priv; + + if (eql_is_master(dev)) { + eql = dev->priv; eql->max_slaves = mc.max_slaves; eql->min_slaves = mc.min_slaves; return 0; @@ -585,415 +598,6 @@ static int eql_s_master_cfg(struct net_device *dev, master_config_t *mcp) return -EINVAL; } -/* - * Private device support functions - */ - -static inline int eql_is_slave(struct net_device *dev) -{ - if (dev) - { - if ((dev->flags & IFF_SLAVE) == IFF_SLAVE) - return 1; - } - return 0; -} - - -static inline int eql_is_master(struct net_device *dev) -{ - if (dev) - { - if ((dev->flags & IFF_MASTER) == IFF_MASTER) - return 1; - } - return 0; -} - - -static slave_t *eql_new_slave(void) -{ - slave_t *slave; - - slave = (slave_t *) kmalloc (sizeof (slave_t), GFP_KERNEL); - if (slave) - memset(slave, 0, sizeof (slave_t)); - return slave; -} - - -static void eql_delete_slave(slave_t *slave) -{ - kfree (slave); -} - - -#if 0 /* not currently used, will be used - when we really use a priority queue */ -static long slave_Bps(slave_t *slave) -{ - return (slave->priority_Bps); -} - -static long slave_bps(slave_t *slave) -{ - return (slave->priority_bps); -} - -#endif - -static inline int eql_number_slaves(slave_queue_t *queue) -{ - return queue->num_slaves; -} - -static inline int eql_is_empty(slave_queue_t *queue) -{ - if (eql_number_slaves (queue) == 0) - return 1; - return 0; -} - -static inline int eql_is_full(slave_queue_t *queue) -{ - equalizer_t *eql = (equalizer_t *) queue->master_dev->priv; - - if (eql_number_slaves (queue) == eql->max_slaves) - return 1; - return 0; -} - -static slave_queue_t *eql_new_slave_queue(struct net_device *dev) -{ - slave_queue_t *queue; - slave_t *head_slave; - slave_t *tail_slave; - - queue = (slave_queue_t *) kmalloc (sizeof (slave_queue_t), GFP_KERNEL); - if (!queue) - goto err_out; - - head_slave = eql_new_slave (); - if (!head_slave) - goto err_out_queue; - - tail_slave = eql_new_slave (); - if (!tail_slave) - goto err_out_hs; - - memset (queue, 0, sizeof (slave_queue_t)); - - 
head_slave->next = tail_slave; - tail_slave->next = 0; - queue->head = head_slave; - queue->num_slaves = 0; - queue->master_dev = dev; - return queue; - -err_out_hs: - kfree (head_slave); -err_out_queue: - kfree (queue); -err_out: - return NULL; -} - - -static void eql_delete_slave_queue(slave_queue_t *queue) -{ - slave_t *zapped; - /* - * This should only be called when there isn't a - * timer running that scans the data periodically.. - * dev_close stops the timer... - */ - - while ( ! eql_is_empty (queue) ) - { - zapped = eql_remove_slave (queue, queue->head->next); - eql_delete_slave (zapped); - } - kfree (queue->head->next); - kfree (queue->head); - kfree (queue); -} - -static int eql_insert_slave(slave_queue_t *queue, slave_t *slave) -{ - unsigned long flags; - - save_flags(flags); - cli (); - - if ( ! eql_is_full (queue) ) - { - slave_t *duplicate_slave = 0; - duplicate_slave = eql_find_slave_dev (queue, slave->dev); - if (duplicate_slave != 0) - { -/* printk ("%s: found a duplicate, killing it and replacing\n", - queue->master_dev->name); */ - eql_delete_slave (eql_remove_slave (queue, duplicate_slave)); - } - slave->next = queue->head->next; - queue->head->next = slave; - queue->num_slaves++; - restore_flags(flags); - return 0; - } - restore_flags(flags); - return 1; -} - - -static slave_t *eql_remove_slave(slave_queue_t *queue, slave_t *slave) -{ - slave_t *prev; - slave_t *curr; - unsigned long flags; - - save_flags(flags); - cli (); - - prev = queue->head; - curr = queue->head->next; - while (curr != slave && - curr->dev != 0 ) - { -/* printk ("%s: remove_slave; searching...\n", queue->master_dev->name); */ - prev = curr; - curr = curr->next; - } - - if (curr == slave) - { - prev->next = curr->next; - queue->num_slaves--; - curr->dev->flags = curr->dev->flags & ~IFF_SLAVE; - restore_flags(flags); - return curr; - } - restore_flags(flags); - return 0; /* not found */ -} - - -static int eql_remove_slave_dev(slave_queue_t *queue, struct net_device *dev) -{ 
- slave_t *prev; - slave_t *curr; - slave_t *target; - - target = eql_find_slave_dev (queue, dev); - - if (target != 0) - { - unsigned long flags; - - save_flags(flags); - cli (); - prev = queue->head; - curr = prev->next; - while (curr != target) - { - prev = curr; - curr = curr->next; - } - prev->next = curr->next; - queue->num_slaves--; - restore_flags(flags); - eql_delete_slave (curr); - return 0; - } - return 1; -} - - -static inline struct net_device *eql_best_slave_dev(slave_queue_t *queue) -{ - if (queue->best_slave != 0) - { - if (queue->best_slave->dev != 0) - return queue->best_slave->dev; - else - return 0; - } - else - return 0; -} - - -static inline slave_t *eql_best_slave(slave_queue_t *queue) -{ - return queue->best_slave; -} - -static inline void eql_schedule_slaves(slave_queue_t *queue) -{ - struct net_device *master_dev = queue->master_dev; - slave_t *best_slave = 0; - slave_t *slave_corpse = 0; - -#ifdef EQL_DEBUG - if (eql_debug >= 100) - printk ("%s: schedule %d slaves\n", - master_dev->name, eql_number_slaves (queue)); -#endif - if ( eql_is_empty (queue) ) - { - /* - * No slaves to play with - */ - eql_set_best_slave (queue, (slave_t *) 0); - return; - } - else - { - /* - * Make a pass to set the best slave - */ - unsigned long best_load = (unsigned long) ULONG_MAX; - slave_t *slave = 0; - unsigned long flags; - int i; - - save_flags(flags); - cli (); - for (i = 1, slave = eql_first_slave (queue); - i <= eql_number_slaves (queue); - i++, slave = eql_next_slave (queue, slave)) - { - /* - * Go through the slave list once, updating best_slave - * whenever a new best_load is found, whenever a dead - * slave is found, it is marked to be pulled out of the - * queue - */ - - unsigned long slave_load; - unsigned long bytes_queued; - unsigned long priority_Bps; - - if (slave != 0) - { - bytes_queued = slave->bytes_queued; - priority_Bps = slave->priority_Bps; - if ( slave->dev != 0) - { - if ((slave->dev->flags & IFF_UP) == IFF_UP ) - { - slave_load = 
(ULONG_MAX - (ULONG_MAX / 2)) - - (priority_Bps) + bytes_queued * 8; - - if (slave_load < best_load) - { - best_load = slave_load; - best_slave = slave; - } - } - else /* we found a dead slave */ - { - /* - * We only bury one slave at a time, if more than - * one slave dies, we will bury him on the next - * reschedule. slaves don't die all at once that - * much anyway - */ - slave_corpse = slave; - } - } - } - } /* for */ - restore_flags(flags); - eql_set_best_slave (queue, best_slave); - } /* else */ - if (slave_corpse != 0) - { - printk ("eql: scheduler found dead slave, burying...\n"); - eql_delete_slave (eql_remove_slave (queue, slave_corpse)); - } - return; -} - - -static slave_t * eql_find_slave_dev(slave_queue_t *queue, struct net_device *dev) -{ - slave_t *slave = 0; - slave = eql_first_slave(queue); - - while (slave != 0 && slave->dev != dev && slave != 0) - { -#if 0 - if (slave->dev != 0) - printk ("eql: find_slave_dev; looked at '%s'...\n", slave->dev->name); - else - printk ("eql: find_slave_dev; looked at nothing...\n"); -#endif - slave = slave->next; - } - return slave; -} - - -static inline slave_t *eql_first_slave(slave_queue_t *queue) -{ - return queue->head->next; -} - - -static inline slave_t *eql_next_slave(slave_queue_t *queue, slave_t *slave) -{ - return slave->next; -} - -static inline void eql_set_best_slave(slave_queue_t *queue, slave_t *slave) -{ - queue->best_slave = slave; -} - -static void eql_timer(unsigned long param) -{ - equalizer_t *eql = (equalizer_t *) param; - slave_t *slave; - slave_t *slave_corpse = 0; - int i; - unsigned long flags; - - if ( ! 
eql_is_empty (eql->queue) ) - { - save_flags(flags); - cli (); - for (i = 1, slave = eql_first_slave (eql->queue); - i <= eql_number_slaves (eql->queue); - i++, slave = eql_next_slave (eql->queue, slave)) - { - if (slave != 0) - { - if ((slave->dev->flags & IFF_UP) == IFF_UP ) - { - slave->bytes_queued -= slave->priority_Bps; - if (slave->bytes_queued < 0) - slave->bytes_queued = 0; - } - else - slave_corpse = slave; - } - } - restore_flags(flags); - if (slave_corpse != 0) - { - printk ("eql: timer found dead slave, burying...\n"); - eql_delete_slave (eql_remove_slave (eql->queue, slave_corpse)); - } - } - - if (eql->timer_on != 0) - { - eql->timer.expires = jiffies+EQL_DEFAULT_RESCHED_IVAL; - add_timer (&eql->timer); - } -} - static struct net_device dev_eql; static int __init eql_init_module(void) @@ -1009,7 +613,6 @@ static int __init eql_init_module(void) static void __exit eql_cleanup_module(void) { - kfree(((equalizer_t *)dev_eql.priv)->stats ); kfree(dev_eql.priv); unregister_netdev(&dev_eql); } @@ -1017,11 +620,3 @@ static void __exit eql_cleanup_module(void) module_init(eql_init_module); module_exit(eql_cleanup_module); MODULE_LICENSE("GPL"); - -/* - * Local Variables: - * compile-command: "gcc -D__KERNEL__ -I/usr/src/linux/net/inet -Wall -Wstrict-prototypes -O6 -m486 -c eql.c" - * version-control: t - * kept-new-versions: 20 - * End: - */ diff --git a/include/linux/if_eql.h b/include/linux/if_eql.h index 112a3df19067..b68752fdc5c4 100644 --- a/include/linux/if_eql.h +++ b/include/linux/if_eql.h @@ -19,8 +19,6 @@ #ifndef _LINUX_IF_EQL_H #define _LINUX_IF_EQL_H -#include - #define EQL_DEFAULT_SLAVE_PRIORITY 28800 #define EQL_DEFAULT_MAX_SLAVES 4 #define EQL_DEFAULT_MTU 576 @@ -35,46 +33,51 @@ #define EQL_GETMASTRCFG (SIOCDEVPRIVATE + 4) #define EQL_SETMASTRCFG (SIOCDEVPRIVATE + 5) +#ifdef __KERNEL__ + +#include +#include + typedef struct slave { - struct net_device *dev; - long priority; - long priority_bps; - long priority_Bps; - long bytes_queued; - 
struct slave *next; + struct list_head list; + struct net_device *dev; + long priority; + long priority_bps; + long priority_Bps; + long bytes_queued; } slave_t; typedef struct slave_queue { - slave_t *head; - slave_t *best_slave; - int num_slaves; - struct net_device *master_dev; - char lock; + spinlock_t lock; + struct list_head all_slaves; + int num_slaves; + struct net_device *master_dev; } slave_queue_t; typedef struct equalizer { - slave_queue_t *queue; - int min_slaves; - int max_slaves; - struct net_device_stats *stats; - struct timer_list timer; - char timer_on; + slave_queue_t queue; + int min_slaves; + int max_slaves; + struct net_device_stats stats; + struct timer_list timer; } equalizer_t; +#endif /* __KERNEL__ */ + typedef struct master_config { - char master_name[16]; - int max_slaves; - int min_slaves; + char master_name[16]; + int max_slaves; + int min_slaves; } master_config_t; typedef struct slave_config { - char slave_name[16]; - long priority; + char slave_name[16]; + long priority; } slave_config_t; typedef struct slaving_request { - char slave_name[16]; - long priority; + char slave_name[16]; + long priority; } slaving_request_t; -- cgit v1.2.3 From 55b407322622e0d86acda07f756cb08879030a90 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Oct 2002 22:57:24 -0700 Subject: [PATCH] radix tree gang lookup Adds a gang lookup facility to radix trees. It provides an efficient means of locating a bunch of pages starting at a particular offset. The implementation is a bit dumb, but is efficient enough. And it is amenable to the `tagged lookup' extension which is proving tricky to write, but which will allow the dirty pages within a mapping to be located in pgoff_t order. Thanks are due to Huch Dickins for finding and fixing an unpleasant bug in here. 
--- fs/inode.c | 6 ++- include/linux/radix-tree.h | 3 ++ lib/radix-tree.c | 109 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 114 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/inode.c b/fs/inode.c index c07e1e7e1a35..866b27bd1b7d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -147,10 +147,12 @@ static void destroy_inode(struct inode *inode) if (inode_has_buffers(inode)) BUG(); security_ops->inode_free_security(inode); - if (inode->i_sb->s_op->destroy_inode) + if (inode->i_sb->s_op->destroy_inode) { inode->i_sb->s_op->destroy_inode(inode); - else + } else { + BUG_ON(inode->i_data.page_tree.rnode != NULL); kmem_cache_free(inode_cachep, (inode)); + } } diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index fb2e3f3350d3..56d1c668ff2e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -45,5 +45,8 @@ extern int radix_tree_reserve(struct radix_tree_root *, unsigned long, void ***) extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long); extern int radix_tree_delete(struct radix_tree_root *, unsigned long); +extern unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items); #endif /* _LINUX_RADIX_TREE_H */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index e17cd888fc3d..757e814bc8dd 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -43,6 +43,7 @@ struct radix_tree_path { }; #define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) +#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2) /* * Radix tree node cache. 
@@ -218,9 +219,113 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) return (void *) *slot; } - EXPORT_SYMBOL(radix_tree_lookup); +static /* inline */ unsigned int +__lookup(struct radix_tree_root *root, void **results, unsigned long index, + unsigned int max_items, unsigned long *next_index, + unsigned long max_index) +{ + unsigned int nr_found = 0; + unsigned int shift; + unsigned int height = root->height; + struct radix_tree_node *slot; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + while (height > 0) { + unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK; + for ( ; i < RADIX_TREE_MAP_SIZE; i++) { + if (slot->slots[i] != NULL) + break; + index &= ~((1 << shift) - 1); + index += 1 << shift; + } + if (i == RADIX_TREE_MAP_SIZE) + goto out; + height--; + shift -= RADIX_TREE_MAP_SHIFT; + if (height == 0) { + /* Bottom level: grab some items */ + unsigned long j; + + BUG_ON((shift + RADIX_TREE_MAP_SHIFT) != 0); + + j = index & RADIX_TREE_MAP_MASK; + for ( ; j < RADIX_TREE_MAP_SIZE; j++) { + index++; + if (slot->slots[j]) { + results[nr_found++] = slot->slots[j]; + if (nr_found == max_items) + goto out; + } + } + } + slot = slot->slots[i]; + } +out: + *next_index = index; + return nr_found; + +} +/** + * radix_tree_gang_lookup - perform multiple lookup on a radix tree + * @root: radix tree root + * @results: where the results of the lookup are placed + * @first_index: start the lookup from this key + * @max_items: place up to this many items at *results + * + * Performs an index-ascending scan of the tree for present items. Places + * them at *@results and returns the number of items which were placed at + * *@results. + * + * The implementation is naive. 
+ */ +unsigned int +radix_tree_gang_lookup(struct radix_tree_root *root, void **results, + unsigned long first_index, unsigned int max_items) +{ + const unsigned long max_index = radix_tree_maxindex(root->height); + unsigned long cur_index = first_index; + unsigned int ret = 0; + + if (root->rnode == NULL) + goto out; + if (max_index == 0) { /* Bah. Special case */ + if (first_index == 0) { + if (max_items > 0) { + *results = root->rnode; + ret = 1; + } + } + goto out; + } + while (ret < max_items) { + unsigned int nr_found; + unsigned long next_index; /* Index of next search */ + + if (cur_index > max_index) + break; + nr_found = __lookup(root, results + ret, cur_index, + max_items - ret, &next_index, max_index); + if (nr_found == 0) { + if (!(cur_index & RADIX_TREE_MAP_MASK)) + break; + /* + * It could be that there simply were no items to the + * right of `cur_index' in the leaf node. So we still + * need to search for additional nodes to the right of + * this one. + */ + } + ret += nr_found; + cur_index = next_index; + } +out: + return ret; +} +EXPORT_SYMBOL(radix_tree_gang_lookup); /** * radix_tree_delete - delete an item from a radix tree @@ -231,7 +336,7 @@ EXPORT_SYMBOL(radix_tree_lookup); */ int radix_tree_delete(struct radix_tree_root *root, unsigned long index) { - struct radix_tree_path path[RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2], *pathp = path; + struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path; unsigned int height, shift; height = root->height; -- cgit v1.2.3 From 735a257344c83ebe06e2b4df1d4b3e5769704e19 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Oct 2002 22:57:40 -0700 Subject: [PATCH] truncate/invalidate_inode_pages rewrite Rewrite these functions to use gang lookup. - This probably has similar performance to the old code in the common case. - It will be vastly quicker than current code for the worst case (single-page truncate). - invalidate_inode_pages() has been changed. 
It used to use page_count(page) as the "is it mapped into pagetables" heuristic. It now uses the (page->pte.direct != 0) heuristic. - Removes the worst cause of scheduling latency in the kernel. - It's a big code cleanup. - invalidate_inode_pages() has been changed to take an address_space *, not an inode *. - the maximum hold times for mapping->page_lock are enormously reduced, making it quite feasible to turn this into an irq-safe lock. Which, it seems, is a requirement for sane AIO<->direct-io integration, as well as possibly other AIO things. (Thanks Hugh for fixing a bug in this one as well). (Christoph added some stuff too) --- fs/buffer.c | 2 +- fs/jffs/inode-v23.c | 4 +- fs/nfs/dir.c | 4 +- fs/nfs/inode.c | 4 +- fs/smbfs/inode.c | 4 +- include/linux/fs.h | 2 +- include/linux/pagemap.h | 3 + include/linux/pagevec.h | 3 + mm/Makefile | 3 +- mm/filemap.c | 366 ++++-------------------------------------------- mm/swap.c | 23 +++ mm/truncate.c | 204 +++++++++++++++++++++++++++ 12 files changed, 276 insertions(+), 346 deletions(-) create mode 100644 mm/truncate.c (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 4f1c380230be..d1da2c0ffac8 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -443,7 +443,7 @@ void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) * We really want to use invalidate_inode_pages2() for * that, but not until that's cleaned up. 
*/ - invalidate_inode_pages(bdev->bd_inode); + invalidate_inode_pages(bdev->bd_inode->i_mapping); } void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) diff --git a/fs/jffs/inode-v23.c b/fs/jffs/inode-v23.c index a9711a7fefae..ae299f9c1f33 100644 --- a/fs/jffs/inode-v23.c +++ b/fs/jffs/inode-v23.c @@ -301,7 +301,7 @@ jffs_setattr(struct dentry *dentry, struct iattr *iattr) inode->i_blocks = (inode->i_size + 511) >> 9; if (len) { - invalidate_inode_pages(inode); + invalidate_inode_pages(inode->i_mapping); } inode->i_ctime = CURRENT_TIME; inode->i_mtime = inode->i_ctime; @@ -1520,7 +1520,7 @@ jffs_file_write(struct file *filp, const char *buf, size_t count, } inode->i_ctime = inode->i_mtime = CURRENT_TIME; mark_inode_dirty(inode); - invalidate_inode_pages(inode); + invalidate_inode_pages(inode->i_mapping); out_isem: return err; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3eb3c79eb210..5bb31a8ee6f0 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -125,14 +125,14 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) * throught inode->i_sem or some other mechanism. 
*/ if (page->index == 0) - invalidate_inode_pages(inode); + invalidate_inode_pages(inode->i_mapping); unlock_page(page); return 0; error: SetPageError(page); kunmap(page); unlock_page(page); - invalidate_inode_pages(inode); + invalidate_inode_pages(inode->i_mapping); desc->error = error; return -EIO; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3870a98c55fd..d5c144efe016 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -564,7 +564,7 @@ nfs_zap_caches(struct inode *inode) NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - invalidate_inode_pages(inode); + invalidate_inode_pages(inode->i_mapping); memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); NFS_CACHEINV(inode); @@ -1130,7 +1130,7 @@ __nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) if (invalid) { NFS_ATTRTIMEO(inode) = NFS_MINATTRTIMEO(inode); NFS_ATTRTIMEO_UPDATE(inode) = jiffies; - invalidate_inode_pages(inode); + invalidate_inode_pages(inode->i_mapping); memset(NFS_COOKIEVERF(inode), 0, sizeof(NFS_COOKIEVERF(inode))); } else if (time_after(jiffies, NFS_ATTRTIMEO_UPDATE(inode)+NFS_ATTRTIMEO(inode))) { if ((NFS_ATTRTIMEO(inode) <<= 1) > NFS_MAXATTRTIMEO(inode)) diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 1164428753c0..aea30ce323f7 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -210,7 +210,7 @@ smb_set_inode_attr(struct inode *inode, struct smb_fattr *fattr) (long) last_sz, (long) inode->i_size); if (!S_ISDIR(inode->i_mode)) - invalidate_inode_pages(inode); + invalidate_inode_pages(inode->i_mapping); } } @@ -274,7 +274,7 @@ smb_refresh_inode(struct dentry *dentry) * But we do want to invalidate the caches ... 
*/ if (!S_ISDIR(inode->i_mode)) - invalidate_inode_pages(inode); + invalidate_inode_pages(inode->i_mapping); else smb_invalid_dir_cache(inode); error = -EIO; diff --git a/include/linux/fs.h b/include/linux/fs.h index 56f2bab87d7f..f0ba1e96325c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1140,7 +1140,7 @@ extern int full_check_disk_change(struct block_device *); extern int __check_disk_change(dev_t); extern int invalidate_inodes(struct super_block *); extern int invalidate_device(kdev_t, int); -extern void invalidate_inode_pages(struct inode *); +extern void invalidate_inode_pages(struct address_space *mapping); extern void invalidate_inode_pages2(struct address_space *mapping); extern void write_inode_now(struct inode *, int); extern int filemap_fdatawrite(struct address_space *); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index bfc986131fe6..1fe640eaf601 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -41,6 +41,9 @@ extern struct page * find_trylock_page(struct address_space *mapping, unsigned long index); extern struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask); +extern unsigned int find_get_pages(struct address_space *mapping, + pgoff_t start, unsigned int nr_pages, + struct page **pages); /* * Returns locked page at given index in given cache, creating it if needed. 
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 278689b2fb2a..0207270b0fe7 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -8,6 +8,7 @@ #define PAGEVEC_SIZE 16 struct page; +struct address_space; struct pagevec { unsigned nr; @@ -21,6 +22,8 @@ void __pagevec_lru_add(struct pagevec *pvec); void lru_add_drain(void); void pagevec_deactivate_inactive(struct pagevec *pvec); void pagevec_strip(struct pagevec *pvec); +unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned int nr_pages); static inline void pagevec_init(struct pagevec *pvec) { diff --git a/mm/Makefile b/mm/Makefile index 5f8ef6639971..6c09600e4376 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,6 +9,7 @@ obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \ - pdflush.o page-writeback.o rmap.o madvise.o vcache.o + pdflush.o page-writeback.o rmap.o madvise.o vcache.o \ + truncate.o include $(TOPDIR)/Rules.make diff --git a/mm/filemap.c b/mm/filemap.c index 27c4140e6f36..8dd115d5aacd 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -104,341 +104,6 @@ static inline int sync_page(struct page *page) return 0; } -/** - * invalidate_inode_pages - Invalidate all the unlocked pages of one inode - * @inode: the inode which pages we want to invalidate - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. 
- */ - -void invalidate_inode_pages(struct inode * inode) -{ - struct list_head *head, *curr; - struct page * page; - struct address_space *mapping = inode->i_mapping; - struct pagevec pvec; - - head = &mapping->clean_pages; - pagevec_init(&pvec); - write_lock(&mapping->page_lock); - curr = head->next; - - while (curr != head) { - page = list_entry(curr, struct page, list); - curr = curr->next; - - /* We cannot invalidate something in dirty.. */ - if (PageDirty(page)) - continue; - - /* ..or locked */ - if (TestSetPageLocked(page)) - continue; - - if (PagePrivate(page) && !try_to_release_page(page, 0)) - goto unlock; - - if (page_count(page) != 1) - goto unlock; - - __remove_from_page_cache(page); - unlock_page(page); - if (!pagevec_add(&pvec, page)) - __pagevec_release(&pvec); - continue; -unlock: - unlock_page(page); - continue; - } - - write_unlock(&mapping->page_lock); - pagevec_release(&pvec); -} - -static int do_invalidatepage(struct page *page, unsigned long offset) -{ - int (*invalidatepage)(struct page *, unsigned long); - invalidatepage = page->mapping->a_ops->invalidatepage; - if (invalidatepage) - return (*invalidatepage)(page, offset); - return block_invalidatepage(page, offset); -} - -static inline void truncate_partial_page(struct page *page, unsigned partial) -{ - memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); - if (PagePrivate(page)) - do_invalidatepage(page, partial); -} - -/* - * If truncate cannot remove the fs-private metadata from the page, the page - * becomes anonymous. It will be left on the LRU and may even be mapped into - * user pagetables if we're racing with filemap_nopage(). 
- */ -static void truncate_complete_page(struct page *page) -{ - if (PagePrivate(page)) - do_invalidatepage(page, 0); - - clear_page_dirty(page); - ClearPageUptodate(page); - remove_from_page_cache(page); - page_cache_release(page); -} - -/* - * Writeback walks the page list in ->prev order, which is low-to-high file - * offsets in the common case where he file was written linearly. So truncate - * walks the page list in the opposite (->next) direction, to avoid getting - * into lockstep with writeback's cursor. To prune as many pages as possible - * before the truncate cursor collides with the writeback cursor. - */ -static int truncate_list_pages(struct address_space *mapping, - struct list_head *head, unsigned long start, unsigned *partial) -{ - struct list_head *curr; - struct page * page; - int unlocked = 0; - struct pagevec release_pvec; - - pagevec_init(&release_pvec); -restart: - curr = head->next; - while (curr != head) { - unsigned long offset; - - page = list_entry(curr, struct page, list); - offset = page->index; - - /* Is one of the pages to truncate? 
*/ - if ((offset >= start) || (*partial && (offset + 1) == start)) { - int failed; - - page_cache_get(page); - failed = TestSetPageLocked(page); - if (!failed && PageWriteback(page)) { - unlock_page(page); - list_del(head); - list_add_tail(head, curr); - write_unlock(&mapping->page_lock); - wait_on_page_writeback(page); - if (!pagevec_add(&release_pvec, page)) - __pagevec_release(&release_pvec); - unlocked = 1; - write_lock(&mapping->page_lock); - goto restart; - } - - list_del(head); - if (!failed) /* Restart after this page */ - list_add(head, curr); - else /* Restart on this page */ - list_add_tail(head, curr); - - write_unlock(&mapping->page_lock); - unlocked = 1; - - if (!failed) { - if (*partial && (offset + 1) == start) { - truncate_partial_page(page, *partial); - *partial = 0; - } else { - truncate_complete_page(page); - } - unlock_page(page); - } else { - wait_on_page_locked(page); - } - if (!pagevec_add(&release_pvec, page)) - __pagevec_release(&release_pvec); - cond_resched(); - write_lock(&mapping->page_lock); - goto restart; - } - curr = curr->next; - } - if (pagevec_count(&release_pvec)) { - write_unlock(&mapping->page_lock); - pagevec_release(&release_pvec); - write_lock(&mapping->page_lock); - unlocked = 1; - } - return unlocked; -} - -/* - * Unconditionally clean all pages outside `start'. The mapping lock - * must be held. - */ -static void clean_list_pages(struct address_space *mapping, - struct list_head *head, unsigned long start) -{ - struct page *page; - struct list_head *curr; - - for (curr = head->next; curr != head; curr = curr->next) { - page = list_entry(curr, struct page, list); - if (page->index > start) - clear_page_dirty(page); - } -} - -/** - * truncate_inode_pages - truncate *all* the pages from an offset - * @mapping: mapping to truncate - * @lstart: offset from with to truncate - * - * Truncate the page cache at a set offset, removing the pages - * that are beyond that offset (and zeroing out partial pages). 
- * If any page is locked we wait for it to become unlocked. - */ -void truncate_inode_pages(struct address_space * mapping, loff_t lstart) -{ - unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); - int unlocked; - - write_lock(&mapping->page_lock); - clean_list_pages(mapping, &mapping->io_pages, start); - clean_list_pages(mapping, &mapping->dirty_pages, start); - do { - unlocked = truncate_list_pages(mapping, - &mapping->io_pages, start, &partial); - unlocked |= truncate_list_pages(mapping, - &mapping->dirty_pages, start, &partial); - unlocked |= truncate_list_pages(mapping, - &mapping->clean_pages, start, &partial); - unlocked |= truncate_list_pages(mapping, - &mapping->locked_pages, start, &partial); - } while (unlocked); - /* Traversed all three lists without dropping the lock */ - write_unlock(&mapping->page_lock); -} - -static inline int invalidate_this_page2(struct address_space * mapping, - struct page * page, - struct list_head * curr, - struct list_head * head) -{ - int unlocked = 1; - - /* - * The page is locked and we hold the mapping lock as well - * so both page_count(page) and page_buffers stays constant here. - * AKPM: fixme: No global lock any more. Is this still OK? 
- */ - if (page_count(page) == 1 + !!page_has_buffers(page)) { - /* Restart after this page */ - list_del(head); - list_add_tail(head, curr); - - page_cache_get(page); - write_unlock(&mapping->page_lock); - truncate_complete_page(page); - } else { - if (page_has_buffers(page)) { - /* Restart after this page */ - list_del(head); - list_add_tail(head, curr); - - page_cache_get(page); - write_unlock(&mapping->page_lock); - do_invalidatepage(page, 0); - } else - unlocked = 0; - - clear_page_dirty(page); - ClearPageUptodate(page); - } - - return unlocked; -} - -static int invalidate_list_pages2(struct address_space * mapping, - struct list_head * head) -{ - struct list_head *curr; - struct page * page; - int unlocked = 0; - struct pagevec release_pvec; - - pagevec_init(&release_pvec); -restart: - curr = head->prev; - while (curr != head) { - page = list_entry(curr, struct page, list); - - if (!TestSetPageLocked(page)) { - int __unlocked; - - if (PageWriteback(page)) { - write_unlock(&mapping->page_lock); - wait_on_page_writeback(page); - unlocked = 1; - write_lock(&mapping->page_lock); - unlock_page(page); - goto restart; - } - - __unlocked = invalidate_this_page2(mapping, - page, curr, head); - unlock_page(page); - unlocked |= __unlocked; - if (!__unlocked) { - curr = curr->prev; - continue; - } - } else { - /* Restart on this page */ - list_del(head); - list_add(head, curr); - - page_cache_get(page); - write_unlock(&mapping->page_lock); - unlocked = 1; - wait_on_page_locked(page); - } - - if (!pagevec_add(&release_pvec, page)) - __pagevec_release(&release_pvec); - cond_resched(); - write_lock(&mapping->page_lock); - goto restart; - } - if (pagevec_count(&release_pvec)) { - write_unlock(&mapping->page_lock); - pagevec_release(&release_pvec); - write_lock(&mapping->page_lock); - unlocked = 1; - } - return unlocked; -} - -/** - * invalidate_inode_pages2 - Clear all the dirty bits around if it can't - * free the pages because they're mapped. 
- * @mapping: the address_space which pages we want to invalidate - */ -void invalidate_inode_pages2(struct address_space *mapping) -{ - int unlocked; - - write_lock(&mapping->page_lock); - do { - unlocked = invalidate_list_pages2(mapping, - &mapping->clean_pages); - unlocked |= invalidate_list_pages2(mapping, - &mapping->dirty_pages); - unlocked |= invalidate_list_pages2(mapping, - &mapping->io_pages); - unlocked |= invalidate_list_pages2(mapping, - &mapping->locked_pages); - } while (unlocked); - write_unlock(&mapping->page_lock); -} - /* * In-memory filesystems have to fail their * writepage function - and this has to be @@ -823,6 +488,37 @@ repeat: return page; } +/** + * find_get_pages - gang pagecache lookup + * @mapping: The address_space to search + * @start: The starting page index + * @nr_pages: The maximum number of pages + * @pages: Where the resulting pages are placed + * + * find_get_pages() will search for and return a group of up to + * @nr_pages pages in the mapping. The pages are placed at @pages. + * find_get_pages() takes a reference against the returned pages. + * + * The search returns a group of mapping-contiguous pages with ascending + * indexes. There may be holes in the indices due to not-present pages. + * + * find_get_pages() returns the number of pages which were found. + */ +unsigned int find_get_pages(struct address_space *mapping, pgoff_t start, + unsigned int nr_pages, struct page **pages) +{ + unsigned int i; + unsigned int ret; + + read_lock(&mapping->page_lock); + ret = radix_tree_gang_lookup(&mapping->page_tree, + (void **)pages, start, nr_pages); + for (i = 0; i < ret; i++) + page_cache_get(pages[i]); + read_unlock(&mapping->page_lock); + return ret; +} + /* * Same as grab_cache_page, but do not wait if the page is unavailable. 
* This is intended for speculative data generators, where the data can diff --git a/mm/swap.c b/mm/swap.c index 4528369df084..3142364de84b 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -238,6 +238,29 @@ void pagevec_strip(struct pagevec *pvec) } } +/** + * pagevec_lookup - gang pagecache lookup + * @pvec: Where the resulting pages are placed + * @mapping: The address_space to search + * @start: The starting page index + * @nr_pages: The maximum number of pages + * + * pagevec_lookup() will search for and return a group of up to @nr_pages pages + * in the mapping. The pages are placed in @pvec. pagevec_lookup() takes a + * reference against the pages in @pvec. + * + * The search returns a group of mapping-contiguous pages with ascending + * indexes. There may be holes in the indices due to not-present pages. + * + * pagevec_lookup() returns the number of pages which were found. + */ +unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, + pgoff_t start, unsigned int nr_pages) +{ + pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages); + return pagevec_count(pvec); +} + /* * Perform any setup for the swap system */ diff --git a/mm/truncate.c b/mm/truncate.c new file mode 100644 index 000000000000..00f489e1b1b6 --- /dev/null +++ b/mm/truncate.c @@ -0,0 +1,204 @@ +/* + * mm/truncate.c - code for taking down pages from address_spaces + * + * Copyright (C) 2002, Linus Torvalds + * + * 10Sep2002 akpm@zip.com.au + * Initial version. + */ + +#include +#include +#include +#include +#include /* grr. 
try_to_release_page, + block_invalidatepage */ + + +static int do_invalidatepage(struct page *page, unsigned long offset) +{ + int (*invalidatepage)(struct page *, unsigned long); + invalidatepage = page->mapping->a_ops->invalidatepage; + if (invalidatepage == NULL) + invalidatepage = block_invalidatepage; + return (*invalidatepage)(page, offset); +} + +static inline void truncate_partial_page(struct page *page, unsigned partial) +{ + memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); + if (PagePrivate(page)) + do_invalidatepage(page, partial); +} + +/* + * If truncate cannot remove the fs-private metadata from the page, the page + * becomes anonymous. It will be left on the LRU and may even be mapped into + * user pagetables if we're racing with filemap_nopage(). + */ +static void truncate_complete_page(struct page *page) +{ + if (PagePrivate(page)) + do_invalidatepage(page, 0); + + clear_page_dirty(page); + ClearPageUptodate(page); + remove_from_page_cache(page); + page_cache_release(page); +} + +/** + * truncate_inode_pages - truncate *all* the pages from an offset + * @mapping: mapping to truncate + * @lstart: offset from which to truncate + * + * Truncate the page cache at a set offset, removing the pages that are beyond + * that offset (and zeroing out partial pages). + * + * Truncate takes two passes - the first pass is nonblocking. It will not + * block on page locks and it will not block on writeback. The second pass + * will wait. This is to prevent as much IO as possible in the affected region. + * The first pass will remove most pages, so the search cost of the second pass + * is low. + * + * Called under (and serialised by) inode->i_sem. 
+ */ +void truncate_inode_pages(struct address_space *mapping, loff_t lstart) +{ + const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; + const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + struct pagevec pvec; + pgoff_t next; + int i; + + pagevec_init(&pvec); + next = start; + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + next = page->index + 1; + if (TestSetPageLocked(page)) + continue; + if (PageWriteback(page)) { + unlock_page(page); + continue; + } + truncate_complete_page(page); + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + + if (partial) { + struct page *page = find_lock_page(mapping, start - 1); + if (page) { + wait_on_page_writeback(page); + truncate_partial_page(page, partial); + unlock_page(page); + page_cache_release(page); + } + } + + next = start; + for ( ; ; ) { + if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + if (next == start) + break; + next = start; + continue; + } + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + wait_on_page_writeback(page); + next = page->index + 1; + truncate_complete_page(page); + unlock_page(page); + } + pagevec_release(&pvec); + } + if (lstart == 0 && mapping->nrpages) + printk("%s: I goofed!\n", __FUNCTION__); +} + +/** + * invalidate_inode_pages - Invalidate all the unlocked pages of one inode + * @inode: the inode which pages we want to invalidate + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_inode_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. 
+ */ +void invalidate_inode_pages(struct address_space *mapping) +{ + struct pagevec pvec; + pgoff_t next = 0; + int i; + + pagevec_init(&pvec); + while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + if (TestSetPageLocked(page)) { + next++; + continue; + } + next = page->index + 1; + if (PageDirty(page) || PageWriteback(page)) + goto unlock; + if (PagePrivate(page) && !try_to_release_page(page, 0)) + goto unlock; + if (page_mapped(page)) + goto unlock; + truncate_complete_page(page); +unlock: + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} + +/** + * invalidate_inode_pages2 - remove all unmapped pages from an address_space + * @mapping - the address_space + * + * invalidate_inode_pages2() is like truncate_inode_pages(), except for the case + * where the page is seen to be mapped into process pagetables. In that case, + * the page is marked clean but is left attached to its address_space. + * + * FIXME: invalidate_inode_pages2() is probably trivially livelockable. + */ +void invalidate_inode_pages2(struct address_space *mapping) +{ + struct pagevec pvec; + pgoff_t next = 0; + int i; + + pagevec_init(&pvec); + while (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + + lock_page(page); + if (page->mapping) { /* truncate race? */ + wait_on_page_writeback(page); + next = page->index + 1; + if (page_mapped(page)) + clear_page_dirty(page); + else + truncate_complete_page(page); + } + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } +} -- cgit v1.2.3 From 15e19695f630adab6b1988d21a069cb0a0bfe677 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Oct 2002 22:57:57 -0700 Subject: [PATCH] add /proc/vmstat (start of /proc/stat cleanup) Moves the VM accounting out of /proc/stat and into /proc/vmstat. The VM accounting is now per-cpu. 
It also moves kstat.pgpgin and kstat.pgpgout into /proc/vmstat. Which is a bit of a duplication of /proc/diskstats (SARD), but it's easy, super-cheap and makes life a lot easier for all the system monitoring applications which we just broke. We now require procps 2.0.9. Updated versions of top and vmstat are available at http://surriel.com and the Cygnus CVS is uptodate for these changes. (Rik has the CVS info at the above site). This tidies up kernel_stat quite a lot - it now only contains CPU things (interrupts and CPU loads) and disk things. So we now have: /proc/stat: CPU things and disk things /proc/vmstat: VM things (plus pgpgin, pgpgout) The SARD patch removes the disk things from /proc/stat as well. --- Documentation/Changes | 9 ++-- drivers/block/ll_rw_blk.c | 11 +---- fs/proc/proc_misc.c | 47 ++++++------------ include/linux/kernel_stat.h | 7 --- include/linux/page-flags.h | 24 ++++++++- init/main.c | 2 - mm/filemap.c | 2 +- mm/memory.c | 4 +- mm/page_alloc.c | 116 +++++++++++++++++++++++++++++++++++++------- mm/page_io.c | 4 +- mm/swap.c | 2 +- mm/vmscan.c | 12 ++--- 12 files changed, 156 insertions(+), 84 deletions(-) (limited to 'include/linux') diff --git a/Documentation/Changes b/Documentation/Changes index 9efd9ac3b3f8..52a91c6a37db 100644 --- a/Documentation/Changes +++ b/Documentation/Changes @@ -31,7 +31,7 @@ al espa Eine deutsche Version dieser Datei finden Sie unter . -Last updated: January 22, 2002 +Last updated: October 1st, 2002 Chris Ricker (kaboom@gatech.edu or chris.ricker@genetics.utah.edu). @@ -60,7 +60,8 @@ o xfsprogs 2.1.0 # xfs_db -V o pcmcia-cs 3.1.21 # cardmgr -V o PPP 2.4.0 # pppd --version o isdn4k-utils 3.1pre1 # isdnctrl 2>&1|grep version - +o procps 2.0.9 # ps --version + Kernel compilation ================== @@ -80,9 +81,7 @@ almost certainly bugs (mainly, but not exclusively, in the kernel) that will need to be fixed in order to use these compilers. In any case, using pgcc instead of plain gcc is just asking for trouble. 
-Note that gcc 2.7.2.3 and gcc 2.91.66 (egcs-1.1.2) are no longer supported -kernel compilers. The kernel no longer works around bugs in these versions, -and, in fact, will refuse to be compiled with it. +gcc 2.91.66 (egcs-1.1.2) continues to be supported for SPARC64 requirements. The Red Hat gcc 2.96 compiler subtree can also be used to build this tree. You should ensure you use gcc-2.96-74 or later. gcc-2.96-54 will not build diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 341b07f7b316..22af563f7461 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1856,21 +1856,14 @@ int submit_bio(int rw, struct bio *bio) { int count = bio_sectors(bio); - /* - * do some validity checks... - */ BUG_ON(!bio->bi_end_io); - BIO_BUG_ON(!bio->bi_size); BIO_BUG_ON(!bio->bi_io_vec); - bio->bi_rw = rw; - if (rw & WRITE) - kstat.pgpgout += count; + mod_page_state(pgpgout, count); else - kstat.pgpgin += count; - + mod_page_state(pgpgin, count); generic_make_request(bio); return 1; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 234119dddb5b..0b743d745096 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -252,6 +252,18 @@ static struct file_operations proc_cpuinfo_operations = { .release = seq_release, }; +extern struct seq_operations vmstat_op; +static int vmstat_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vmstat_op); +} +static struct file_operations proc_vmstat_file_operations = { + open: vmstat_open, + read: seq_read, + llseek: seq_lseek, + release: seq_release, +}; + #ifdef CONFIG_PROC_HARDWARE static int hardware_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -359,16 +371,8 @@ static int kstat_read_proc(char *page, char **start, off_t off, + kstat.per_cpu_nice[i] \ + kstat.per_cpu_system[i]))); } - len += sprintf(page + len, - "page %u %u\n" - "swap %u %u\n" - "intr %u", - kstat.pgpgin >> 1, - kstat.pgpgout >> 1, - kstat.pswpin, - 
kstat.pswpout, - sum - ); + len += sprintf(page + len, "intr %u", sum); + #if !defined(CONFIG_ARCH_S390) for (i = 0 ; i < NR_IRQS ; i++) len += sprintf(page + len, " %u", kstat_irqs(i)); @@ -395,29 +399,9 @@ static int kstat_read_proc(char *page, char **start, off_t off, } len += sprintf(page + len, - "\npageallocs %u\n" - "pagefrees %u\n" - "pageactiv %u\n" - "pagedeact %u\n" - "pagefault %u\n" - "majorfault %u\n" - "pagescan %u\n" - "pagesteal %u\n" - "pageoutrun %u\n" - "allocstall %u\n" - "ctxt %lu\n" + "\nctxt %lu\n" "btime %lu\n" "processes %lu\n", - kstat.pgalloc, - kstat.pgfree, - kstat.pgactivate, - kstat.pgdeactivate, - kstat.pgfault, - kstat.pgmajfault, - kstat.pgscan, - kstat.pgsteal, - kstat.pageoutrun, - kstat.allocstall, nr_context_switches(), xtime.tv_sec - jif / HZ, total_forks); @@ -646,6 +630,7 @@ void __init proc_misc_init(void) create_seq_entry("interrupts", 0, &proc_interrupts_operations); create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); + create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); #ifdef CONFIG_MODULES create_seq_entry("modules", 0, &proc_modules_operations); create_seq_entry("ksyms", 0, &proc_ksyms_operations); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 75533ee86b73..38ec5b8edc2e 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -24,13 +24,6 @@ struct kernel_stat { unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK]; unsigned int dk_drive_rblk[DK_MAX_MAJOR][DK_MAX_DISK]; unsigned int dk_drive_wblk[DK_MAX_MAJOR][DK_MAX_DISK]; - unsigned int pgpgin, pgpgout; - unsigned int pswpin, pswpout; - unsigned int pgalloc, pgfree; - unsigned int pgactivate, pgdeactivate; - unsigned int pgfault, pgmajfault; - unsigned int pgscan, pgsteal; - unsigned int pageoutrun, allocstall; #if !defined(CONFIG_ARCH_S390) unsigned int irqs[NR_CPUS][NR_IRQS]; #endif diff --git 
a/include/linux/page-flags.h b/include/linux/page-flags.h index 0e32a1b9dd5e..62bb6403c160 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -70,7 +70,8 @@ #define PG_direct 16 /* ->pte_chain points directly at pte */ /* - * Global page accounting. One instance per CPU. + * Global page accounting. One instance per CPU. Only unsigned longs are + * allowed. */ extern struct page_state { unsigned long nr_dirty; @@ -80,9 +81,30 @@ extern struct page_state { unsigned long nr_reverse_maps; unsigned long nr_mapped; unsigned long nr_slab; +#define GET_PAGE_STATE_LAST nr_slab + + /* + * The below are zeroed by get_page_state(). Use get_full_page_state() + * to add up all these. + */ + unsigned long pgpgin; + unsigned long pgpgout; + unsigned long pswpin; + unsigned long pswpout; + unsigned long pgalloc; + unsigned long pgfree; + unsigned long pgactivate; + unsigned long pgdeactivate; + unsigned long pgfault; + unsigned long pgmajfault; + unsigned long pgscan; + unsigned long pgsteal; + unsigned long pageoutrun; + unsigned long allocstall; } ____cacheline_aligned_in_smp page_states[NR_CPUS]; extern void get_page_state(struct page_state *ret); +extern void get_full_page_state(struct page_state *ret); #define mod_page_state(member, delta) \ do { \ diff --git a/init/main.c b/init/main.c index b47b623aa6a0..4484dc869b6b 100644 --- a/init/main.c +++ b/init/main.c @@ -554,8 +554,6 @@ static int init(void * unused) unlock_kernel(); system_running = 1; - kstat.pgfree = 0; - if (open("/dev/console", O_RDWR, 0) < 0) printk("Warning: unable to open an initial console.\n"); diff --git a/mm/filemap.c b/mm/filemap.c index 8dd115d5aacd..e9918139934d 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1098,7 +1098,7 @@ no_cached_page: return NULL; page_not_uptodate: - KERNEL_STAT_INC(pgmajfault); + inc_page_state(pgmajfault); lock_page(page); /* Did it get unhashed while we waited for it? 
*/ diff --git a/mm/memory.c b/mm/memory.c index 2217d546758d..91a971df3e71 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1216,7 +1216,7 @@ static int do_swap_page(struct mm_struct * mm, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; - KERNEL_STAT_INC(pgmajfault); + inc_page_state(pgmajfault); } mark_page_accessed(page); @@ -1461,7 +1461,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, current->state = TASK_RUNNING; pgd = pgd_offset(mm, address); - KERNEL_STAT_INC(pgfault); + inc_page_state(pgfault); /* * We need the page table lock to synchronize with kswapd * and the SMP-safe atomic PTE updates. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 86b534cef5f8..1daea542a882 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -13,7 +13,7 @@ */ #include -#include +#include #include #include #include @@ -24,6 +24,7 @@ #include #include #include +#include unsigned long totalram_pages; unsigned long totalhigh_pages; @@ -86,7 +87,7 @@ void __free_pages_ok (struct page *page, unsigned int order) struct page *base; struct zone *zone; - KERNEL_STAT_ADD(pgfree, 1<zones; /* the list of zones suitable for gfp_mask */ classzone = zones[0]; @@ -397,7 +398,7 @@ nopage: if (!(gfp_mask & __GFP_WAIT)) goto nopage; - KERNEL_STAT_INC(allocstall); + inc_page_state(allocstall); page = balance_classzone(classzone, gfp_mask, order, &freed); if (page) return page; @@ -555,28 +556,39 @@ unsigned int nr_free_highpages (void) struct page_state page_states[NR_CPUS] __cacheline_aligned; EXPORT_SYMBOL(page_states); -void get_page_state(struct page_state *ret) +void __get_page_state(struct page_state *ret, int nr) { - int pcpu; + int cpu; memset(ret, 0, sizeof(*ret)); - for (pcpu = 0; pcpu < NR_CPUS; pcpu++) { - struct page_state *ps; + for (cpu = 0; cpu < NR_CPUS; cpu++) { + unsigned long *in, *out, off; - if (!cpu_online(pcpu)) + if (!cpu_online(cpu)) continue; - ps = &page_states[pcpu]; - ret->nr_dirty += ps->nr_dirty; - 
ret->nr_writeback += ps->nr_writeback; - ret->nr_pagecache += ps->nr_pagecache; - ret->nr_page_table_pages += ps->nr_page_table_pages; - ret->nr_reverse_maps += ps->nr_reverse_maps; - ret->nr_mapped += ps->nr_mapped; - ret->nr_slab += ps->nr_slab; + in = (unsigned long *)(page_states + cpu); + out = (unsigned long *)ret; + for (off = 0; off < nr; off++) + *out++ += *in++; } } +void get_page_state(struct page_state *ret) +{ + int nr; + + nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); + nr /= sizeof(unsigned long); + + __get_page_state(ret, nr + 1); +} + +void get_full_page_state(struct page_state *ret) +{ + __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); +} + void get_zone_counts(unsigned long *active, unsigned long *inactive) { struct zone *zone; @@ -1048,4 +1060,74 @@ struct seq_operations fragmentation_op = { .show = frag_show, }; +static char *vmstat_text[] = { + "nr_dirty", + "nr_writeback", + "nr_pagecache", + "nr_page_table_pages", + "nr_reverse_maps", + "nr_mapped", + "nr_slab", + + "pgpgin", + "pgpgout", + "pswpin", + "pswpout", + "pgalloc", + "pgfree", + "pgactivate", + "pgdeactivate", + "pgfault", + "pgmajfault", + "pgscan", + "pgsteal", + "pageoutrun", + "allocstall", +}; + +static void *vmstat_start(struct seq_file *m, loff_t *pos) +{ + struct page_state *ps; + + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + + ps = kmalloc(sizeof(*ps), GFP_KERNEL); + m->private = ps; + if (!ps) + return ERR_PTR(-ENOMEM); + get_full_page_state(ps); + return (unsigned long *)ps + *pos; +} + +static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) +{ + (*pos)++; + if (*pos >= ARRAY_SIZE(vmstat_text)) + return NULL; + return (unsigned long *)m->private + *pos; +} + +static int vmstat_show(struct seq_file *m, void *arg) +{ + unsigned long *l = arg; + unsigned long off = l - (unsigned long *)m->private; + + seq_printf(m, "%s %lu\n", vmstat_text[off], *l); + return 0; +} + +static void vmstat_stop(struct seq_file *m, void *arg) +{ + 
kfree(m->private); + m->private = NULL; +} + +struct seq_operations vmstat_op = { + .start = vmstat_start, + .next = vmstat_next, + .stop = vmstat_stop, + .show = vmstat_show, +}; + #endif /* CONFIG_PROC_FS */ diff --git a/mm/page_io.c b/mm/page_io.c index 47de394d5576..50d137ca40fa 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -100,7 +100,7 @@ int swap_writepage(struct page *page) ret = -ENOMEM; goto out; } - kstat.pswpout++; + inc_page_state(pswpout); SetPageWriteback(page); unlock_page(page); submit_bio(WRITE, bio); @@ -119,7 +119,7 @@ int swap_readpage(struct file *file, struct page *page) ret = -ENOMEM; goto out; } - kstat.pswpin++; + inc_page_state(pswpin); submit_bio(READ, bio); out: return ret; diff --git a/mm/swap.c b/mm/swap.c index 3142364de84b..f0cd9260bb29 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -38,7 +38,7 @@ void activate_page(struct page *page) del_page_from_inactive_list(zone, page); SetPageActive(page); add_page_to_active_list(zone, page); - KERNEL_STAT_INC(pgactivate); + inc_page_state(pgactivate); } spin_unlock_irq(&zone->lru_lock); } diff --git a/mm/vmscan.c b/mm/vmscan.c index be8e9481c947..c366e57a38e8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -312,8 +312,8 @@ keep: list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); - KERNEL_STAT_ADD(pgsteal, nr_pages_in - nr_pages); - KERNEL_STAT_ADD(pgactivate, pgactivate); + mod_page_state(pgsteal, nr_pages_in - nr_pages); + mod_page_state(pgactivate, pgactivate); return nr_pages; } @@ -380,7 +380,7 @@ shrink_cache(int nr_pages, struct zone *zone, goto done; max_scan -= nr_scan; - KERNEL_STAT_ADD(pgscan, nr_scan); + mod_page_state(pgscan, nr_scan); nr_pages = shrink_list(&page_list, nr_pages, gfp_mask, &max_scan, nr_mapped); @@ -527,8 +527,8 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); - KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages); - 
KERNEL_STAT_ADD(pgdeactivate, pgdeactivate); + mod_page_state(pgscan, nr_pages_in - nr_pages); + mod_page_state(pgdeactivate, pgdeactivate); } static /* inline */ int @@ -641,7 +641,7 @@ try_to_free_pages(struct zone *classzone, int priority = DEF_PRIORITY; int nr_pages = SWAP_CLUSTER_MAX; - KERNEL_STAT_INC(pageoutrun); + inc_page_state(pageoutrun); for (priority = DEF_PRIORITY; priority; priority--) { int total_scanned = 0; -- cgit v1.2.3 From 7e96bae145f0f6bf287e5ba11679c114ed76f5d7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Oct 2002 22:58:11 -0700 Subject: [PATCH] add kswapd success accounting to /proc/vmstat Tells us how many pages were reclaimed by kswapd. The `pgsteal' statistic tells us how many pages were reclaimed altogether. So kswapd_steal - pgsteal is the number of pages which were directly reclaimed by page allocating processes. Also, the `pgscan' data is currently counting the number of pages scanned in shrink_cache() plus the number of pages scanned in refill_inactive_zone(). These are rather separate concepts, so I created the new `pgrefill' counter for refill_inactive_zone(). `pgscan' is now just the number of pages scanned in shrink_cache(). 
--- include/linux/page-flags.h | 2 ++ include/linux/sched.h | 1 + mm/page_alloc.c | 2 ++ mm/vmscan.c | 6 ++++-- 4 files changed, 9 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 62bb6403c160..5c770f49787a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -98,7 +98,9 @@ extern struct page_state { unsigned long pgfault; unsigned long pgmajfault; unsigned long pgscan; + unsigned long pgrefill; unsigned long pgsteal; + unsigned long kswapd_steal; unsigned long pageoutrun; unsigned long allocstall; } ____cacheline_aligned_in_smp page_states[NR_CPUS]; diff --git a/include/linux/sched.h b/include/linux/sched.h index 8a361b76cf43..26b85f1661c4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -433,6 +433,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_FROZEN 0x00040000 /* frozen for system suspend */ #define PF_SYNC 0x00080000 /* performing fsync(), etc */ #define PF_FSTRANS 0x00100000 /* inside a filesystem transaction */ +#define PF_KSWAPD 0x00200000 /* I am kswapd */ /* * Ptrace flags diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1daea542a882..b9cced8d19a0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1080,7 +1080,9 @@ static char *vmstat_text[] = { "pgfault", "pgmajfault", "pgscan", + "pgrefill", "pgsteal", + "kswapd_steal", "pageoutrun", "allocstall", }; diff --git a/mm/vmscan.c b/mm/vmscan.c index c366e57a38e8..f8b879a35775 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -313,6 +313,8 @@ keep: if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); mod_page_state(pgsteal, nr_pages_in - nr_pages); + if (current->flags & PF_KSWAPD) + mod_page_state(kswapd_steal, nr_pages_in - nr_pages); mod_page_state(pgactivate, pgactivate); return nr_pages; } @@ -527,7 +529,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) spin_unlock_irq(&zone->lru_lock); 
pagevec_release(&pvec); - mod_page_state(pgscan, nr_pages_in - nr_pages); + mod_page_state(pgrefill, nr_pages_in - nr_pages); mod_page_state(pgdeactivate, pgdeactivate); } @@ -757,7 +759,7 @@ int kswapd(void *p) * us from recursively trying to free more memory as we're * trying to free the first piece of memory in the first place). */ - tsk->flags |= PF_MEMALLOC; + tsk->flags |= PF_MEMALLOC|PF_KSWAPD; /* * Kswapd main loop. -- cgit v1.2.3 From 7b88e5e0bdf25a3c7d7b6efd5caa54cbcdfec861 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 2 Oct 2002 22:58:25 -0700 Subject: [PATCH] "io wait" process accounting Patch from Rik adds "I/O wait" statistics to /proc/stat. This allows us to determine how much system time is being spent awaiting IO completion. This is an important statistic, as it tends to directly subtract from job completion time. procps-2.0.9 is OK with this, but doesn't report it. --- drivers/block/ll_rw_blk.c | 26 ++++++++++++++++++++++++-- fs/direct-io.c | 3 ++- fs/proc/proc_misc.c | 16 +++++++++------- include/linux/blkdev.h | 4 ++++ include/linux/kernel_stat.h | 4 +++- kernel/sched.c | 5 +++++ mm/filemap.c | 6 +++--- 7 files changed, 50 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 22af563f7461..01720fdccb1c 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -58,6 +58,7 @@ static int queue_nr_requests; static int batch_requests; unsigned long blk_max_low_pfn, blk_max_pfn; +atomic_t nr_iowait_tasks = ATOMIC_INIT(0); int blk_nohighio = 0; static struct congestion_state { @@ -116,6 +117,27 @@ static void set_queue_congested(request_queue_t *q, int rw) atomic_inc(&congestion_states[rw].nr_congested_queues); } +/* + * This task is about to go to sleep on IO. Increment nr_iowait_tasks so + * that process accounting knows that this is a task in IO wait state. 
+ * + * But don't do that if it is a deliberate, throttling IO wait (this task + * has set its backing_dev_info: the queue against which it should throttle) + */ +void io_schedule(void) +{ + atomic_inc(&nr_iowait_tasks); + schedule(); + atomic_dec(&nr_iowait_tasks); +} + +void io_schedule_timeout(long timeout) +{ + atomic_inc(&nr_iowait_tasks); + schedule_timeout(timeout); + atomic_dec(&nr_iowait_tasks); +} + /** * bdev_get_queue: - return the queue that matches the given device * @bdev: device @@ -1274,7 +1296,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw) prepare_to_wait_exclusive(&rl->wait, &wait, TASK_UNINTERRUPTIBLE); if (!rl->count) - schedule(); + io_schedule(); finish_wait(&rl->wait, &wait); spin_lock_irq(q->queue_lock); rq = get_request(q, rw); @@ -1497,7 +1519,7 @@ void blk_congestion_wait(int rw, long timeout) blk_run_queues(); prepare_to_wait(&cs->wqh, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&cs->nr_congested_queues) != 0) - schedule_timeout(timeout); + io_schedule_timeout(timeout); finish_wait(&cs->wqh, &wait); } diff --git a/fs/direct-io.c b/fs/direct-io.c index 9fd5459ca757..4126f259cc3d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -230,7 +231,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_list_lock, flags); blk_run_queues(); - schedule(); + io_schedule(); spin_lock_irqsave(&dio->bio_list_lock, flags); dio->waiter = NULL; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 0b743d745096..ae949a9b1722 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -339,7 +339,7 @@ static int kstat_read_proc(char *page, char **start, off_t off, int i, len; extern unsigned long total_forks; unsigned long jif = jiffies; - unsigned int sum = 0, user = 0, nice = 0, system = 0; + unsigned int sum = 0, user = 0, nice = 0, system = 0, idle = 0, iowait = 0; int major, 
disk; for (i = 0 ; i < NR_CPUS; i++) { @@ -349,27 +349,29 @@ static int kstat_read_proc(char *page, char **start, off_t off, user += kstat.per_cpu_user[i]; nice += kstat.per_cpu_nice[i]; system += kstat.per_cpu_system[i]; + idle += kstat.per_cpu_idle[i]; + iowait += kstat.per_cpu_iowait[i]; #if !defined(CONFIG_ARCH_S390) for (j = 0 ; j < NR_IRQS ; j++) sum += kstat.irqs[i][j]; #endif } - len = sprintf(page, "cpu %u %u %u %lu\n", + len = sprintf(page, "cpu %u %u %u %u %u\n", jiffies_to_clock_t(user), jiffies_to_clock_t(nice), jiffies_to_clock_t(system), - jiffies_to_clock_t(jif * num_online_cpus() - (user + nice + system))); + jiffies_to_clock_t(idle), + jiffies_to_clock_t(iowait)); for (i = 0 ; i < NR_CPUS; i++){ if (!cpu_online(i)) continue; - len += sprintf(page + len, "cpu%d %u %u %u %lu\n", + len += sprintf(page + len, "cpu%d %u %u %u %u %u\n", i, jiffies_to_clock_t(kstat.per_cpu_user[i]), jiffies_to_clock_t(kstat.per_cpu_nice[i]), jiffies_to_clock_t(kstat.per_cpu_system[i]), - jiffies_to_clock_t(jif - ( kstat.per_cpu_user[i] \ - + kstat.per_cpu_nice[i] \ - + kstat.per_cpu_system[i]))); + jiffies_to_clock_t(kstat.per_cpu_idle[i]), + jiffies_to_clock_t(kstat.per_cpu_iowait[i])); } len += sprintf(page + len, "intr %u", sum); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ee1171a3ae67..187213441ee7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -401,4 +401,8 @@ static inline void put_dev_sector(Sector p) page_cache_release(p.v); } +extern atomic_t nr_iowait_tasks; +void io_schedule(void); +void io_schedule_timeout(long timeout); + #endif diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 38ec5b8edc2e..a687f5b224e9 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -18,7 +18,9 @@ struct kernel_stat { unsigned int per_cpu_user[NR_CPUS], per_cpu_nice[NR_CPUS], - per_cpu_system[NR_CPUS]; + per_cpu_system[NR_CPUS], + per_cpu_idle[NR_CPUS], + per_cpu_iowait[NR_CPUS]; unsigned 
int dk_drive[DK_MAX_MAJOR][DK_MAX_DISK]; unsigned int dk_drive_rio[DK_MAX_MAJOR][DK_MAX_DISK]; unsigned int dk_drive_wio[DK_MAX_MAJOR][DK_MAX_DISK]; diff --git a/kernel/sched.c b/kernel/sched.c index aa13d6a55721..d28180b44322 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -866,6 +867,10 @@ void scheduler_tick(int user_ticks, int sys_ticks) /* note: this timer irq context must be accounted for as well */ if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET) kstat.per_cpu_system[cpu] += sys_ticks; + else if (atomic_read(&nr_iowait_tasks) > 0) + kstat.per_cpu_iowait[cpu] += sys_ticks; + else + kstat.per_cpu_idle[cpu] += sys_ticks; #if CONFIG_SMP idle_tick(rq); #endif diff --git a/mm/filemap.c b/mm/filemap.c index e9918139934d..f8587c5e5487 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include /* * This is needed for the following functions: @@ -51,7 +52,6 @@ * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli */ - /* * Lock ordering: * @@ -302,7 +302,7 @@ void wait_on_page_bit(struct page *page, int bit_nr) prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE); sync_page(page); if (test_bit(bit_nr, &page->flags)) - schedule(); + io_schedule(); } while (test_bit(bit_nr, &page->flags)); finish_wait(waitqueue, &wait); } @@ -366,7 +366,7 @@ void __lock_page(struct page *page) prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); sync_page(page); if (PageLocked(page)) - schedule(); + io_schedule(); } finish_wait(wqh, &wait); } -- cgit v1.2.3 From a2495207a528e2ec61f6c07acbfa7d4fb7cac8f0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Oct 2002 22:58:55 -0700 Subject: [PATCH] tmpfs swapoff deadlock tmpfs 1/5 swapoff deadlock: my igrab/iput around the yield in shmem_unuse_inode was rubbish, seems my testing never really hit the case until last week, when truncation of course deadlocked on the page held locked across the iput (at least I had 
the foresight to say "ugh!" there). Don't yield here, switch over to the simple backoff I'd been using for months in the loopable tmpfs patch (yes, it could loop indefinitely for memory, that's already an issue to be dealt with later). The return convention from shmem_unuse to try_to_unuse is inelegant (commented at both ends), but effective. --- include/linux/swap.h | 2 +- mm/shmem.c | 50 +++++++++++++++++++------------------------------- mm/swapfile.c | 23 ++++++++++++++++++----- 3 files changed, 38 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index f4acbd1e9b46..4ec8559245c6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -218,7 +218,7 @@ extern spinlock_t swaplock; #define swap_device_lock(p) spin_lock(&p->sdev_lock) #define swap_device_unlock(p) spin_unlock(&p->sdev_lock) -extern void shmem_unuse(swp_entry_t entry, struct page *page); +extern int shmem_unuse(swp_entry_t entry, struct page *page); #endif /* __KERNEL__*/ diff --git a/mm/shmem.c b/mm/shmem.c index 496659e341f4..229ec6cde77c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -430,7 +430,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s swp_entry_t *ptr; unsigned long idx; int offset; - struct inode *inode; idx = 0; ptr = info->i_direct; @@ -457,54 +456,43 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s spin_unlock (&info->lock); return 0; found: - idx += offset; - inode = igrab(&info->vfs_inode); - /* move head to start search for next from here */ - list_move_tail(&shmem_inodes, &info->list); - spin_unlock(&shmem_ilock); - swap_free(entry); - ptr[offset] = (swp_entry_t) {0}; - - while (inode && move_from_swap_cache(page, idx, inode->i_mapping)) { - /* - * Yield for kswapd, and try again - but we're still - * holding the page lock - ugh! fix this up later on. 
- * Beware of inode being unlinked or truncated: just - * leave try_to_unuse to delete_from_swap_cache if so. - */ - spin_unlock(&info->lock); - yield(); - spin_lock(&info->lock); - ptr = shmem_swp_entry(info, idx, 0); - if (IS_ERR(ptr)) - break; + if (move_from_swap_cache(page, idx + offset, + info->vfs_inode.i_mapping) == 0) { + ptr[offset] = (swp_entry_t) {0}; + info->swapped--; } - - info->swapped--; - SetPageUptodate(page); spin_unlock(&info->lock); - if (inode) - iput(inode); + SetPageUptodate(page); + /* + * Decrement swap count even when the entry is left behind: + * try_to_unuse will skip over mms, then reincrement count. + */ + swap_free(entry); return 1; } /* * shmem_unuse() search for an eventually swapped out shmem page. - * Note shmem_unuse_inode drops shmem_ilock itself if successful. */ -void shmem_unuse(swp_entry_t entry, struct page *page) +int shmem_unuse(swp_entry_t entry, struct page *page) { struct list_head *p; struct shmem_inode_info * info; + int found = 0; spin_lock (&shmem_ilock); list_for_each(p, &shmem_inodes) { info = list_entry(p, struct shmem_inode_info, list); - if (info->swapped && shmem_unuse_inode(info, entry, page)) - return; + if (info->swapped && shmem_unuse_inode(info, entry, page)) { + /* move head to start search for next from here */ + list_move_tail(&shmem_inodes, &info->list); + found = 1; + break; + } } spin_unlock (&shmem_ilock); + return found; } /* diff --git a/mm/swapfile.c b/mm/swapfile.c index d6349e43ea67..bd308fe5991f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -531,6 +531,7 @@ static int try_to_unuse(unsigned int type) int i = 0; int retval = 0; int reset_overflow = 0; + int shmem; /* * When searching mms for an entry, a good strategy is to @@ -611,11 +612,12 @@ static int try_to_unuse(unsigned int type) * Whenever we reach init_mm, there's no address space * to search, but use it as a reminder to search shmem. 
*/ + shmem = 0; swcount = *swap_map; if (swcount > 1) { flush_page_to_ram(page); if (start_mm == &init_mm) - shmem_unuse(entry, page); + shmem = shmem_unuse(entry, page); else unuse_process(start_mm, entry, page); } @@ -632,7 +634,9 @@ static int try_to_unuse(unsigned int type) swcount = *swap_map; if (mm == &init_mm) { set_start_mm = 1; - shmem_unuse(entry, page); + spin_unlock(&mmlist_lock); + shmem = shmem_unuse(entry, page); + spin_lock(&mmlist_lock); } else unuse_process(mm, entry, page); if (set_start_mm && *swap_map < swcount) { @@ -681,15 +685,24 @@ static int try_to_unuse(unsigned int type) * read from disk into another page. Splitting into two * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. - * Note shmem_unuse already deleted its from swap cache. + * + * Note shmem_unuse already deleted a swappage from + * the swap cache, unless the move to filepage failed: + * in which case it left swappage in cache, lowered its + * swap count to pass quickly through the loops above, + * and now we must reincrement count to try again later. */ if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { swap_writepage(page); lock_page(page); wait_on_page_writeback(page); } - if (PageSwapCache(page)) - delete_from_swap_cache(page); + if (PageSwapCache(page)) { + if (shmem) + swap_duplicate(entry); + else + delete_from_swap_cache(page); + } /* * So we could skip searching mms once swap count went -- cgit v1.2.3 From cd7fef3d792cb28eacc97f318937e50abb1cd8d0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Oct 2002 23:00:32 -0700 Subject: [PATCH] shmem: remove info->sem Between inode->i_sem and info->lock comes info->sem; but it doesn't guard thoroughly against the difficult races (truncate during read), and serializes reads from tmpfs unlike other filesystems. 
I'd prefer to work with just i_sem and info->lock, backtracking when necessary (when another task allocates block or metablock at the same time). (I am not satisfied with the locked setting of next_index at the start of shmem_getpage_locked: it's one lock hold too many, and it doesn't really fix races against truncate better than before: another patch in a later batch will resolve that.) --- include/linux/shmem_fs.h | 1 - mm/shmem.c | 284 +++++++++++++++++++++++++---------------------- 2 files changed, 149 insertions(+), 136 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 9d39c403c0a0..81c150bb8081 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -11,7 +11,6 @@ extern atomic_t shmem_nrpages; struct shmem_inode_info { spinlock_t lock; - struct semaphore sem; unsigned long next_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ void **i_indirect; /* indirect blocks */ diff --git a/mm/shmem.c b/mm/shmem.c index 07328dd61d19..68ba9c2f8f8e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -111,11 +111,9 @@ static void shmem_recalc_inode(struct inode * inode) * @page: optional page to add to the structure. Has to be preset to * all zeros * - * If there is no space allocated yet it will return -ENOMEM when - * page == 0 else it will use the page for the needed block. - * - * returns -EFBIG if the index is too big. - * + * If there is no space allocated yet it will return NULL when + * page is 0, else it will use the page for the needed block, + * setting it to 0 on return to indicate that it has been used. 
* * The swap vector is organized the following way: * @@ -143,70 +141,80 @@ static void shmem_recalc_inode(struct inode * inode) * +-> 48-51 * +-> 52-55 */ -static swp_entry_t * shmem_swp_entry (struct shmem_inode_info *info, unsigned long index, unsigned long page) +static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, unsigned long *page) { unsigned long offset; void **dir; + if (index >= info->next_index) + return NULL; if (index < SHMEM_NR_DIRECT) return info->i_direct+index; + if (!info->i_indirect) { + if (page) { + info->i_indirect = (void *) *page; + *page = 0; + } + return NULL; /* need another page */ + } index -= SHMEM_NR_DIRECT; offset = index % ENTRIES_PER_PAGE; index /= ENTRIES_PER_PAGE; - - if (!info->i_indirect) { - info->i_indirect = (void *) page; - return ERR_PTR(-ENOMEM); - } - dir = info->i_indirect + index; + if (index >= ENTRIES_PER_PAGE/2) { index -= ENTRIES_PER_PAGE/2; dir = info->i_indirect + ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; index %= ENTRIES_PER_PAGE; - - if(!*dir) { - *dir = (void *) page; - /* We return since we will need another page - in the next step */ - return ERR_PTR(-ENOMEM); + if (!*dir) { + if (page) { + *dir = (void *) *page; + *page = 0; + } + return NULL; /* need another page */ } dir = ((void **)*dir) + index; } + if (!*dir) { - if (!page) - return ERR_PTR(-ENOMEM); - *dir = (void *)page; + if (!page || !*page) + return NULL; /* need a page */ + *dir = (void *) *page; + *page = 0; } return ((swp_entry_t *)*dir) + offset; } /* - * shmem_alloc_entry - get the position of the swap entry for the - * page. If it does not exist allocate the entry + * shmem_swp_alloc - get the position of the swap entry for the page. + * If it does not exist allocate the entry. 
* * @info: info structure for the inode * @index: index of the page to find */ -static inline swp_entry_t * shmem_alloc_entry (struct shmem_inode_info *info, unsigned long index) +static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index) { unsigned long page = 0; - swp_entry_t * res; - - if (index >= SHMEM_MAX_INDEX) - return ERR_PTR(-EFBIG); - - if (info->next_index <= index) - info->next_index = index + 1; + swp_entry_t *entry; - while ((res = shmem_swp_entry(info,index,page)) == ERR_PTR(-ENOMEM)) { + while (!(entry = shmem_swp_entry(info, index, &page))) { + if (index >= info->next_index) { + entry = ERR_PTR(-EFAULT); + break; + } + spin_unlock(&info->lock); page = get_zeroed_page(GFP_USER); + spin_lock(&info->lock); if (!page) - break; + return ERR_PTR(-ENOMEM); } - return res; + if (page) { + /* another task gave its page, or truncated the file */ + free_page(page); + } + return entry; } /* @@ -330,17 +338,15 @@ static void shmem_truncate (struct inode * inode) unsigned long freed = 0; struct shmem_inode_info * info = SHMEM_I(inode); - down(&info->sem); inode->i_ctime = inode->i_mtime = CURRENT_TIME; - spin_lock (&info->lock); index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + spin_lock (&info->lock); while (index < info->next_index) freed += shmem_truncate_indirect(info, index); info->swapped -= freed; shmem_recalc_inode(inode); spin_unlock (&info->lock); - up(&info->sem); } static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) @@ -436,8 +442,8 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s for (idx = SHMEM_NR_DIRECT; idx < info->next_index; idx += ENTRIES_PER_PAGE) { - ptr = shmem_swp_entry(info, idx, 0); - if (IS_ERR(ptr)) + ptr = shmem_swp_entry(info, idx, NULL); + if (!ptr) continue; offset = info->next_index - idx; if (offset > ENTRIES_PER_PAGE) @@ -519,10 +525,10 @@ static int shmem_writepage(struct page * page) return fail_writepage(page); 
spin_lock(&info->lock); - entry = shmem_swp_entry(info, index, 0); - if (IS_ERR(entry)) /* this had been allocated on page allocation */ - BUG(); shmem_recalc_inode(inode); + entry = shmem_swp_entry(info, index, NULL); + if (!entry) + BUG(); if (entry->val) BUG(); @@ -570,61 +576,68 @@ static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct struct shmem_sb_info *sbinfo; struct page *page; swp_entry_t *entry; - int error; + swp_entry_t swap; + int error = 0; + + if (idx >= SHMEM_MAX_INDEX) + return ERR_PTR(-EFBIG); + + /* + * When writing, i_sem is held against truncation and other + * writing, so next_index will remain as set here; but when + * reading, idx must always be checked against next_index + * after sleeping, lest truncation occurred meanwhile. + */ + spin_lock(&info->lock); + if (info->next_index <= idx) + info->next_index = idx + 1; + spin_unlock(&info->lock); repeat: page = find_lock_page(mapping, idx); if (page) return page; - entry = shmem_alloc_entry (info, idx); - if (IS_ERR(entry)) + spin_lock(&info->lock); + shmem_recalc_inode(inode); + entry = shmem_swp_alloc(info, idx); + if (IS_ERR(entry)) { + spin_unlock(&info->lock); return (void *)entry; - - spin_lock (&info->lock); - - /* The shmem_alloc_entry() call may have blocked, and - * shmem_writepage may have been moving a page between the page - * cache and swap cache. We need to recheck the page cache - * under the protection of the info->lock spinlock. */ - - page = find_get_page(mapping, idx); - if (page) { - if (TestSetPageLocked(page)) - goto wait_retry; - spin_unlock (&info->lock); - return page; } - - shmem_recalc_inode(inode); - if (entry->val) { + swap = *entry; + + if (swap.val) { /* Look it up and read it in.. 
*/ - page = lookup_swap_cache(*entry); + page = lookup_swap_cache(swap); if (!page) { - swp_entry_t swap = *entry; - spin_unlock (&info->lock); - swapin_readahead(*entry); - page = read_swap_cache_async(*entry); + spin_unlock(&info->lock); + swapin_readahead(swap); + page = read_swap_cache_async(swap); if (!page) { - if (entry->val != swap.val) - goto repeat; - return ERR_PTR(-ENOMEM); + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + else if (entry->val == swap.val) + error = -ENOMEM; + spin_unlock(&info->lock); + if (error) + return ERR_PTR(error); + goto repeat; } wait_on_page_locked(page); - if (!PageUptodate(page) && entry->val == swap.val) { - page_cache_release(page); - return ERR_PTR(-EIO); - } - - /* Too bad we can't trust this page, because we - * dropped the info->lock spinlock */ page_cache_release(page); goto repeat; } /* We have to do this with page locked to prevent races */ - if (TestSetPageLocked(page)) - goto wait_retry; + if (TestSetPageLocked(page)) { + spin_unlock(&info->lock); + wait_on_page_locked(page); + page_cache_release(page); + goto repeat; + } if (PageWriteback(page)) { spin_unlock(&info->lock); wait_on_page_writeback(page); @@ -632,42 +645,55 @@ repeat: page_cache_release(page); goto repeat; } - error = move_from_swap_cache(page, idx, mapping); - if (error < 0) { + + error = PageUptodate(page)? 
+ move_from_swap_cache(page, idx, mapping): -EIO; + if (error) { spin_unlock(&info->lock); unlock_page(page); page_cache_release(page); return ERR_PTR(error); } - swap_free(*entry); *entry = (swp_entry_t) {0}; info->swapped--; spin_unlock (&info->lock); + swap_free(swap); } else { + spin_unlock(&info->lock); sbinfo = SHMEM_SB(inode->i_sb); - spin_unlock (&info->lock); - spin_lock (&sbinfo->stat_lock); - if (sbinfo->free_blocks == 0) - goto no_space; + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks == 0) { + spin_unlock(&sbinfo->stat_lock); + return ERR_PTR(-ENOSPC); + } sbinfo->free_blocks--; - spin_unlock (&sbinfo->stat_lock); + spin_unlock(&sbinfo->stat_lock); - /* Ok, get a new page. We don't have to worry about the - * info->lock spinlock here: we cannot race against - * shm_writepage because we have already verified that - * there is no page present either in memory or in the - * swap cache, so we are guaranteed to be populating a - * new shm entry. The inode semaphore we already hold - * is enough to make this atomic. 
*/ page = page_cache_alloc(mapping); - if (!page) - goto no_mem; - error = add_to_page_cache_lru(page, mapping, idx); - if (error < 0) { + if (!page) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks++; + spin_unlock(&sbinfo->stat_lock); + return ERR_PTR(-ENOMEM); + } + + spin_lock(&info->lock); + entry = shmem_swp_alloc(info, idx); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + if (error || entry->val || + add_to_page_cache_lru(page, mapping, idx) < 0) { + spin_unlock(&info->lock); page_cache_release(page); - goto no_mem; + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks++; + spin_unlock(&sbinfo->stat_lock); + if (error) + return ERR_PTR(error); + goto repeat; } + spin_unlock(&info->lock); clear_highpage(page); inode->i_blocks += BLOCKS_PER_PAGE; } @@ -675,22 +701,6 @@ repeat: /* We have the page */ SetPageUptodate(page); return page; - -no_mem: - spin_lock(&sbinfo->stat_lock); - sbinfo->free_blocks++; - spin_unlock(&sbinfo->stat_lock); - return ERR_PTR(-ENOMEM); - -no_space: - spin_unlock (&sbinfo->stat_lock); - return ERR_PTR(-ENOSPC); - -wait_retry: - spin_unlock(&info->lock); - wait_on_page_locked(page); - page_cache_release(page); - goto repeat; } static int shmem_getpage(struct inode * inode, unsigned long idx, struct page **ptr) @@ -698,7 +708,6 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page ** struct shmem_inode_info *info = SHMEM_I(inode); int error; - down (&info->sem); *ptr = ERR_PTR(-EFAULT); if (inode->i_size <= (loff_t) idx * PAGE_CACHE_SIZE) goto failed; @@ -708,10 +717,8 @@ static int shmem_getpage(struct inode * inode, unsigned long idx, struct page ** goto failed; unlock_page(*ptr); - up (&info->sem); return 0; failed: - up (&info->sem); error = PTR_ERR(*ptr); *ptr = NOPAGE_SIGBUS; if (error == -ENOMEM) @@ -734,8 +741,8 @@ static struct page *shmem_holdpage(struct inode *inode, unsigned long idx) spin_lock(&info->lock); page = find_get_page(inode->i_mapping, idx); if (!page) { - entry = 
shmem_swp_entry(info, idx, 0); - if (!IS_ERR(entry)) + entry = shmem_swp_entry(info, idx, NULL); + if (entry) swap = *entry; } spin_unlock(&info->lock); @@ -814,12 +821,8 @@ struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev) inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; info = SHMEM_I(inode); - spin_lock_init (&info->lock); - sema_init (&info->sem, 1); - info->next_index = 0; - memset (info->i_direct, 0, sizeof(info->i_direct)); - info->i_indirect = NULL; - info->swapped = 0; + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); info->flags = VM_ACCOUNT; switch (mode & S_IFMT) { default: @@ -971,9 +974,7 @@ shmem_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos) } info = SHMEM_I(inode); - down (&info->sem); page = shmem_getpage_locked(info, inode, index); - up (&info->sem); status = PTR_ERR(page); if (IS_ERR(page)) @@ -1041,17 +1042,33 @@ static void do_shmem_file_read(struct file * filp, loff_t *ppos, read_descriptor end_index = inode->i_size >> PAGE_CACHE_SHIFT; if (index > end_index) break; - nr = PAGE_CACHE_SIZE; if (index == end_index) { nr = inode->i_size & ~PAGE_CACHE_MASK; if (nr <= offset) break; } - nr = nr - offset; - - if ((desc->error = shmem_getpage(inode, index, &page))) + desc->error = shmem_getpage(inode, index, &page); + if (desc->error) { + if (desc->error == -EFAULT) + desc->error = 0; break; + } + + /* + * We must evaluate after, since reads (unlike writes) + * are called without i_sem protection against truncate + */ + nr = PAGE_CACHE_SIZE; + end_index = inode->i_size >> PAGE_CACHE_SHIFT; + if (index == end_index) { + nr = inode->i_size & ~PAGE_CACHE_MASK; + if (nr <= offset) { + page_cache_release(page); + break; + } + } + nr -= offset; if (!list_empty(&mapping->i_mmap_shared)) flush_dcache_page(page); @@ -1279,10 +1296,8 @@ static int shmem_symlink(struct inode * dir, struct dentry *dentry, 
const char * iput(inode); return -ENOMEM; } - down(&info->sem); page = shmem_getpage_locked(info, inode, 0); if (IS_ERR(page)) { - up(&info->sem); vm_unacct_memory(VM_ACCT(1)); iput(inode); return PTR_ERR(page); @@ -1297,7 +1312,6 @@ static int shmem_symlink(struct inode * dir, struct dentry *dentry, const char * set_page_dirty(page); unlock_page(page); page_cache_release(page); - up(&info->sem); } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; -- cgit v1.2.3 From 03844e4b25f5993847fea8f2936eee540167cd41 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Oct 2002 23:00:59 -0700 Subject: [PATCH] shmem: avoid metadata leakiness akpm and wli each discovered unfortunate behaviour of dbench on tmpfs: after tmpfs has reached its data memory limit, dbench continues to lseek and write, and tmpfs carries on allocating unlimited metadata blocks to accommodate the data it then refuses. That particular behaviour could be simply fixed by checking earlier; but I think tmpfs metablocks should be subject to the memory limit, and included in df and du accounting. Also, manipulate inode->i_blocks under lock, was missed before. 
--- include/linux/shmem_fs.h | 3 +- mm/shmem.c | 136 ++++++++++++++++++++++++++++------------------- 2 files changed, 83 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 81c150bb8081..c7a4cdf6009a 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -14,7 +14,8 @@ struct shmem_inode_info { unsigned long next_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ void **i_indirect; /* indirect blocks */ - unsigned long swapped; + unsigned long alloced; /* data pages allocated to file */ + unsigned long swapped; /* subtotal assigned to swap */ unsigned long flags; struct list_head list; struct inode vfs_inode; diff --git a/mm/shmem.c b/mm/shmem.c index 39a721ed9fdd..f0ebe583c8cf 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -68,38 +68,42 @@ LIST_HEAD (shmem_inodes); static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED; atomic_t shmem_nrpages = ATOMIC_INIT(0); /* Not used right now */ +static void shmem_free_block(struct inode *inode) +{ + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks++; + inode->i_blocks -= BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); +} + /* * shmem_recalc_inode - recalculate the size of an inode * * @inode: inode to recalc - * @swap: additional swap pages freed externally * - * We have to calculate the free blocks since the mm can drop pages - * behind our back + * We have to calculate the free blocks since the mm can drop + * undirtied hole pages behind our back. Later we should be + * able to use the releasepage method to handle this better. 
* - * But we know that normally - * inodes->i_blocks/BLOCKS_PER_PAGE == - * inode->i_mapping->nrpages + info->swapped - * - * So the mm freed - * inodes->i_blocks/BLOCKS_PER_PAGE - - * (inode->i_mapping->nrpages + info->swapped) + * But normally info->alloced == inode->i_mapping->nrpages + info->swapped + * So mm freed is info->alloced - (inode->i_mapping->nrpages + info->swapped) * * It has to be called with the spinlock held. */ - static void shmem_recalc_inode(struct inode * inode) { - unsigned long freed; + struct shmem_inode_info *info = SHMEM_I(inode); + long freed; - freed = (inode->i_blocks/BLOCKS_PER_PAGE) - - (inode->i_mapping->nrpages + SHMEM_I(inode)->swapped); - if (freed){ - struct shmem_sb_info * sbinfo = SHMEM_SB(inode->i_sb); - inode->i_blocks -= freed*BLOCKS_PER_PAGE; - spin_lock (&sbinfo->stat_lock); + freed = info->alloced - info->swapped - inode->i_mapping->nrpages; + if (freed > 0) { + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); + info->alloced -= freed; + spin_lock(&sbinfo->stat_lock); sbinfo->free_blocks += freed; - spin_unlock (&sbinfo->stat_lock); + inode->i_blocks -= freed*BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); } } @@ -196,6 +200,8 @@ static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long */ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index) { + struct inode *inode = &info->vfs_inode; + struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); unsigned long page = 0; swp_entry_t *entry; @@ -204,14 +210,33 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long entry = ERR_PTR(-EFAULT); break; } + + /* + * Test free_blocks against 1 not 0, since we have 1 data + * page (and perhaps indirect index pages) yet to allocate: + * a waste to allocate index if we cannot allocate data. 
+ */ + spin_lock(&sbinfo->stat_lock); + if (sbinfo->free_blocks <= 1) { + spin_unlock(&sbinfo->stat_lock); + return ERR_PTR(-ENOSPC); + } + sbinfo->free_blocks--; + inode->i_blocks += BLOCKS_PER_PAGE; + spin_unlock(&sbinfo->stat_lock); + spin_unlock(&info->lock); page = get_zeroed_page(GFP_USER); spin_lock(&info->lock); - if (!page) + + if (!page) { + shmem_free_block(inode); return ERR_PTR(-ENOMEM); + } } if (page) { /* another task gave its page, or truncated the file */ + shmem_free_block(inode); free_page(page); } return entry; @@ -243,41 +268,42 @@ static int shmem_free_swp(swp_entry_t *dir, unsigned int count) * shmem_truncate_direct - free the swap entries of a whole doubly * indirect block * + * @info: the info structure of the inode * @dir: pointer to the pointer to the block * @start: offset to start from (in pages) * @len: how many pages are stored in this block * * Returns the number of freed swap entries. */ - -static inline unsigned long -shmem_truncate_direct(swp_entry_t *** dir, unsigned long start, unsigned long len) { +static inline unsigned long +shmem_truncate_direct(struct shmem_inode_info *info, swp_entry_t ***dir, unsigned long start, unsigned long len) +{ swp_entry_t **last, **ptr; - unsigned long off, freed = 0; - - if (!*dir) - return 0; + unsigned long off, freed_swp, freed = 0; last = *dir + (len + ENTRIES_PER_PAGE-1) / ENTRIES_PER_PAGE; off = start % ENTRIES_PER_PAGE; - for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++) { - if (!*ptr) { - off = 0; + for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++, off = 0) { + if (!*ptr) continue; + + if (info->swapped) { + freed_swp = shmem_free_swp(*ptr + off, + ENTRIES_PER_PAGE - off); + info->swapped -= freed_swp; + freed += freed_swp; } if (!off) { - freed += shmem_free_swp(*ptr, ENTRIES_PER_PAGE); - free_page ((unsigned long) *ptr); + info->alloced++; + free_page((unsigned long) *ptr); *ptr = 0; - } else { - freed += shmem_free_swp(*ptr+off,ENTRIES_PER_PAGE-off); - off = 0; } 
} - + if (!start) { + info->alloced++; free_page((unsigned long) *dir); *dir = 0; } @@ -299,11 +325,16 @@ shmem_truncate_indirect(struct shmem_inode_info *info, unsigned long index) swp_entry_t ***base; unsigned long baseidx, len, start; unsigned long max = info->next_index-1; + unsigned long freed; if (max < SHMEM_NR_DIRECT) { info->next_index = index; - return shmem_free_swp(info->i_direct + index, - SHMEM_NR_DIRECT - index); + if (!info->swapped) + return 0; + freed = shmem_free_swp(info->i_direct + index, + SHMEM_NR_DIRECT - index); + info->swapped -= freed; + return freed; } if (max < ENTRIES_PER_PAGE * ENTRIES_PER_PAGE/2 + SHMEM_NR_DIRECT) { @@ -329,24 +360,21 @@ shmem_truncate_indirect(struct shmem_inode_info *info, unsigned long index) info->next_index = baseidx; start = 0; } - return shmem_truncate_direct(base, start, len); + return *base? shmem_truncate_direct(info, base, start, len): 0; } -static void shmem_truncate (struct inode * inode) +static void shmem_truncate(struct inode *inode) { + struct shmem_inode_info *info = SHMEM_I(inode); unsigned long index; - unsigned long freed = 0; - struct shmem_inode_info * info = SHMEM_I(inode); inode->i_ctime = inode->i_mtime = CURRENT_TIME; index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - spin_lock (&info->lock); - while (index < info->next_index) - freed += shmem_truncate_indirect(info, index); - - info->swapped -= freed; + spin_lock(&info->lock); + while (index < info->next_index) + (void) shmem_truncate_indirect(info, index); shmem_recalc_inode(inode); - spin_unlock (&info->lock); + spin_unlock(&info->lock); } static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) @@ -407,6 +435,7 @@ static void shmem_delete_inode(struct inode * inode) inode->i_size = 0; shmem_truncate (inode); } + BUG_ON(inode->i_blocks); spin_lock (&sbinfo->stat_lock); sbinfo->free_inodes++; spin_unlock (&sbinfo->stat_lock); @@ -663,13 +692,12 @@ repeat: return -ENOSPC; } sbinfo->free_blocks--; + 
inode->i_blocks += BLOCKS_PER_PAGE; spin_unlock(&sbinfo->stat_lock); page = page_cache_alloc(mapping); if (!page) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_blocks++; - spin_unlock(&sbinfo->stat_lock); + shmem_free_block(inode); return -ENOMEM; } @@ -681,16 +709,14 @@ repeat: add_to_page_cache_lru(page, mapping, idx) < 0) { spin_unlock(&info->lock); page_cache_release(page); - spin_lock(&sbinfo->stat_lock); - sbinfo->free_blocks++; - spin_unlock(&sbinfo->stat_lock); + shmem_free_block(inode); if (error) return error; goto repeat; } + info->alloced++; spin_unlock(&info->lock); clear_highpage(page); - inode->i_blocks += BLOCKS_PER_PAGE; } /* We have the page */ -- cgit v1.2.3 From 2729b9afe1ce6308481e677ab06b0b6bfccff082 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 2 Oct 2002 23:01:12 -0700 Subject: [PATCH] put shmem metadata in highmem wli suffered OOMs because tmpfs was allocating GFP_USER, for its metadata pages. This patch allocates them GFP_HIGHUSER (default mapping->gfp_mask) and uses atomic kmaps to access (KM_USER0 for upper levels, KM_USER1 for lowest level). shmem_unuse_inode and shmem_truncate rewritten alike to avoid repeated maps and unmaps of the same page: cr's truncate was much more elegant, but I couldn't quite see how to convert it. I do wonder whether this patch is a bloat too far for tmpfs, and even non-highmem configs will be penalised by page_address overhead (perhaps a further patch could get over that). There is an attractive alternative (keep swp_entry_ts in the existing radix-tree, no metadata pages at all), but we haven't worked out an unhacky interface to that. For now at least, let's give tmpfs highmem metadata a spin. 
--- include/linux/shmem_fs.h | 2 +- mm/shmem.c | 432 ++++++++++++++++++++++++++++++----------------- 2 files changed, 274 insertions(+), 160 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index c7a4cdf6009a..dce3d7f9c252 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -13,7 +13,7 @@ struct shmem_inode_info { spinlock_t lock; unsigned long next_index; swp_entry_t i_direct[SHMEM_NR_DIRECT]; /* for the first blocks */ - void **i_indirect; /* indirect blocks */ + struct page *i_indirect; /* indirect blocks */ unsigned long alloced; /* data pages allocated to file */ unsigned long swapped; /* subtotal assigned to swap */ unsigned long flags; diff --git a/mm/shmem.c b/mm/shmem.c index f0ebe583c8cf..ddabcc5da82e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -37,9 +37,10 @@ #define TMPFS_MAGIC 0x01021994 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) +#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) -#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + ENTRIES_PER_PAGE * (ENTRIES_PER_PAGE/2) * (ENTRIES_PER_PAGE+1)) +#define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1)) #define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT) #define VM_ACCT(size) (((size) + PAGE_CACHE_SIZE - 1) >> PAGE_SHIFT) @@ -47,6 +48,51 @@ /* Pretend that each entry is of this size in directory's i_size */ #define BOGO_DIRENT_SIZE 20 +static inline struct page *shmem_dir_alloc(unsigned int gfp_mask) +{ + /* + * The above definition of ENTRIES_PER_PAGE, and the use of + * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE: + * might be reconsidered if it ever diverges from PAGE_SIZE. 
+ */ + return alloc_pages(gfp_mask, PAGE_CACHE_SHIFT-PAGE_SHIFT); +} + +static inline void shmem_dir_free(struct page *page) +{ + __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT); +} + +static struct page **shmem_dir_map(struct page *page) +{ + return (struct page **)kmap_atomic(page, KM_USER0); +} + +static inline void shmem_dir_unmap(struct page **dir) +{ + kunmap_atomic(dir, KM_USER0); +} + +static swp_entry_t *shmem_swp_map(struct page *page) +{ + /* + * We have to avoid the unconditional inc_preempt_count() + * in kmap_atomic(), since shmem_swp_unmap() will also be + * applied to the low memory addresses within i_direct[]. + * PageHighMem and high_memory tests are good for all arches + * and configs: highmem_start_page and FIXADDR_START are not. + */ + return PageHighMem(page)? + (swp_entry_t *)kmap_atomic(page, KM_USER1): + (swp_entry_t *)page_address(page); +} + +static inline void shmem_swp_unmap(swp_entry_t *entry) +{ + if (entry >= (swp_entry_t *)high_memory) + kunmap_atomic(entry, KM_USER1); +} + static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { return sb->u.generic_sbp; @@ -116,8 +162,8 @@ static void shmem_recalc_inode(struct inode * inode) * all zeros * * If there is no space allocated yet it will return NULL when - * page is 0, else it will use the page for the needed block, - * setting it to 0 on return to indicate that it has been used. + * page is NULL, else it will use the page for the needed block, + * setting it to NULL on return to indicate that it has been used. 
* * The swap vector is organized the following way: * @@ -145,10 +191,11 @@ static void shmem_recalc_inode(struct inode * inode) * +-> 48-51 * +-> 52-55 */ -static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, unsigned long *page) +static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page) { unsigned long offset; - void **dir; + struct page **dir; + struct page *subdir; if (index >= info->next_index) return NULL; @@ -156,8 +203,8 @@ static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long return info->i_direct+index; if (!info->i_indirect) { if (page) { - info->i_indirect = (void *) *page; - *page = 0; + info->i_indirect = *page; + *page = NULL; } return NULL; /* need another page */ } @@ -165,30 +212,37 @@ static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index -= SHMEM_NR_DIRECT; offset = index % ENTRIES_PER_PAGE; index /= ENTRIES_PER_PAGE; - dir = info->i_indirect + index; + dir = shmem_dir_map(info->i_indirect); if (index >= ENTRIES_PER_PAGE/2) { index -= ENTRIES_PER_PAGE/2; - dir = info->i_indirect + ENTRIES_PER_PAGE/2 - + index/ENTRIES_PER_PAGE; + dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE; index %= ENTRIES_PER_PAGE; - if (!*dir) { + subdir = *dir; + if (!subdir) { if (page) { - *dir = (void *) *page; - *page = 0; + *dir = *page; + *page = NULL; } + shmem_dir_unmap(dir); return NULL; /* need another page */ } - dir = ((void **)*dir) + index; + shmem_dir_unmap(dir); + dir = shmem_dir_map(subdir); } - if (!*dir) { - if (!page || !*page) + dir += index; + subdir = *dir; + if (!subdir) { + if (!page || !(subdir = *page)) { + shmem_dir_unmap(dir); return NULL; /* need a page */ - *dir = (void *) *page; - *page = 0; + } + *dir = subdir; + *page = NULL; } - return ((swp_entry_t *)*dir) + offset; + shmem_dir_unmap(dir); + return shmem_swp_map(subdir) + offset; } /* @@ -202,7 +256,7 @@ static swp_entry_t 
*shmem_swp_alloc(struct shmem_inode_info *info, unsigned long { struct inode *inode = &info->vfs_inode; struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - unsigned long page = 0; + struct page *page = NULL; swp_entry_t *entry; while (!(entry = shmem_swp_entry(info, index, &page))) { @@ -226,7 +280,9 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long spin_unlock(&sbinfo->stat_lock); spin_unlock(&info->lock); - page = get_zeroed_page(GFP_USER); + page = shmem_dir_alloc(inode->i_mapping->gfp_mask); + if (page) + clear_highpage(page); spin_lock(&info->lock); if (!page) { @@ -237,7 +293,7 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long if (page) { /* another task gave its page, or truncated the file */ shmem_free_block(inode); - free_page(page); + shmem_dir_free(page); } return entry; } @@ -246,133 +302,138 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long * shmem_free_swp - free some swap entries in a directory * * @dir: pointer to the directory - * @count: number of entries to scan + * @edir: pointer after last entry of the directory */ -static int shmem_free_swp(swp_entry_t *dir, unsigned int count) +static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir) { - swp_entry_t *ptr, entry; + swp_entry_t *ptr; int freed = 0; - for (ptr = dir; ptr < dir + count; ptr++) { - if (!ptr->val) - continue; - entry = *ptr; - *ptr = (swp_entry_t){0}; - freed++; - free_swap_and_cache(entry); + for (ptr = dir; ptr < edir; ptr++) { + if (ptr->val) { + free_swap_and_cache(*ptr); + *ptr = (swp_entry_t){0}; + freed++; + } } return freed; } -/* - * shmem_truncate_direct - free the swap entries of a whole doubly - * indirect block - * - * @info: the info structure of the inode - * @dir: pointer to the pointer to the block - * @start: offset to start from (in pages) - * @len: how many pages are stored in this block - * - * Returns the number of freed swap entries. 
- */ -static inline unsigned long -shmem_truncate_direct(struct shmem_inode_info *info, swp_entry_t ***dir, unsigned long start, unsigned long len) +static void shmem_truncate(struct inode *inode) { - swp_entry_t **last, **ptr; - unsigned long off, freed_swp, freed = 0; - - last = *dir + (len + ENTRIES_PER_PAGE-1) / ENTRIES_PER_PAGE; - off = start % ENTRIES_PER_PAGE; + struct shmem_inode_info *info = SHMEM_I(inode); + unsigned long idx; + unsigned long size; + unsigned long limit; + unsigned long stage; + struct page **dir; + struct page *subdir; + struct page *empty; + swp_entry_t *ptr; + int offset; - for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++, off = 0) { - if (!*ptr) - continue; + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + idx = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + if (idx >= info->next_index) + return; - if (info->swapped) { - freed_swp = shmem_free_swp(*ptr + off, - ENTRIES_PER_PAGE - off); - info->swapped -= freed_swp; - freed += freed_swp; + spin_lock(&info->lock); + limit = info->next_index; + info->next_index = idx; + if (info->swapped && idx < SHMEM_NR_DIRECT) { + ptr = info->i_direct; + size = limit; + if (size > SHMEM_NR_DIRECT) + size = SHMEM_NR_DIRECT; + info->swapped -= shmem_free_swp(ptr+idx, ptr+size); + } + if (!info->i_indirect) + goto done2; + + BUG_ON(limit <= SHMEM_NR_DIRECT); + limit -= SHMEM_NR_DIRECT; + idx = (idx > SHMEM_NR_DIRECT)? 
(idx - SHMEM_NR_DIRECT): 0; + offset = idx % ENTRIES_PER_PAGE; + idx -= offset; + + empty = NULL; + dir = shmem_dir_map(info->i_indirect); + stage = ENTRIES_PER_PAGEPAGE/2; + if (idx < ENTRIES_PER_PAGEPAGE/2) + dir += idx/ENTRIES_PER_PAGE; + else { + dir += ENTRIES_PER_PAGE/2; + dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE; + while (stage <= idx) + stage += ENTRIES_PER_PAGEPAGE; + if (*dir) { + subdir = *dir; + size = ((idx - ENTRIES_PER_PAGEPAGE/2) % + ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE; + if (!size && !offset) { + empty = subdir; + *dir = NULL; + } + shmem_dir_unmap(dir); + dir = shmem_dir_map(subdir) + size; + } else { + offset = 0; + idx = stage; } + } - if (!off) { + for (; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { + if (unlikely(idx == stage)) { + shmem_dir_unmap(dir-1); + dir = shmem_dir_map(info->i_indirect) + + ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; + while (!*dir) { + dir++; + idx += ENTRIES_PER_PAGEPAGE; + if (idx >= limit) + goto done1; + } + stage = idx + ENTRIES_PER_PAGEPAGE; + subdir = *dir; + *dir = NULL; + shmem_dir_unmap(dir); + if (empty) { + shmem_dir_free(empty); + info->alloced++; + } + empty = subdir; + dir = shmem_dir_map(subdir); + } + subdir = *dir; + if (subdir) { + ptr = shmem_swp_map(subdir); + size = limit - idx; + if (size > ENTRIES_PER_PAGE) + size = ENTRIES_PER_PAGE; + info->swapped -= shmem_free_swp(ptr+offset, ptr+size); + shmem_swp_unmap(ptr); + } + if (offset) + offset = 0; + else if (subdir) { + *dir = NULL; + shmem_dir_free(subdir); info->alloced++; - free_page((unsigned long) *ptr); - *ptr = 0; } } - - if (!start) { +done1: + shmem_dir_unmap(dir-1); + if (empty) { + shmem_dir_free(empty); info->alloced++; - free_page((unsigned long) *dir); - *dir = 0; - } - return freed; -} - -/* - * shmem_truncate_indirect - truncate an inode - * - * @info: the info structure of the inode - * @index: the index to truncate - * - * This function locates the last doubly indirect block and calls - * then 
shmem_truncate_direct to do the real work - */ -static inline unsigned long -shmem_truncate_indirect(struct shmem_inode_info *info, unsigned long index) -{ - swp_entry_t ***base; - unsigned long baseidx, len, start; - unsigned long max = info->next_index-1; - unsigned long freed; - - if (max < SHMEM_NR_DIRECT) { - info->next_index = index; - if (!info->swapped) - return 0; - freed = shmem_free_swp(info->i_direct + index, - SHMEM_NR_DIRECT - index); - info->swapped -= freed; - return freed; - } - - if (max < ENTRIES_PER_PAGE * ENTRIES_PER_PAGE/2 + SHMEM_NR_DIRECT) { - max -= SHMEM_NR_DIRECT; - base = (swp_entry_t ***) &info->i_indirect; - baseidx = SHMEM_NR_DIRECT; - len = max+1; - } else { - max -= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE/2+SHMEM_NR_DIRECT; - if (max >= ENTRIES_PER_PAGE*ENTRIES_PER_PAGE*ENTRIES_PER_PAGE/2) - BUG(); - - baseidx = max & ~(ENTRIES_PER_PAGE*ENTRIES_PER_PAGE-1); - base = (swp_entry_t ***) info->i_indirect + ENTRIES_PER_PAGE/2 + baseidx/ENTRIES_PER_PAGE/ENTRIES_PER_PAGE ; - len = max - baseidx + 1; - baseidx += ENTRIES_PER_PAGE*ENTRIES_PER_PAGE/2+SHMEM_NR_DIRECT; } - - if (index > baseidx) { - info->next_index = index; - start = index - baseidx; - } else { - info->next_index = baseidx; - start = 0; + if (info->next_index <= SHMEM_NR_DIRECT) { + shmem_dir_free(info->i_indirect); + info->i_indirect = NULL; + info->alloced++; } - return *base? 
shmem_truncate_direct(info, base, start, len): 0; -} - -static void shmem_truncate(struct inode *inode) -{ - struct shmem_inode_info *info = SHMEM_I(inode); - unsigned long index; - - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - spin_lock(&info->lock); - while (index < info->next_index) - (void) shmem_truncate_indirect(info, index); +done2: + BUG_ON(info->swapped > info->next_index); shmem_recalc_inode(inode); spin_unlock(&info->lock); } @@ -442,46 +503,81 @@ static void shmem_delete_inode(struct inode * inode) clear_inode(inode); } -static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *ptr, swp_entry_t *eptr) +static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) { - swp_entry_t *test; + swp_entry_t *ptr; - for (test = ptr; test < eptr; test++) { - if (test->val == entry.val) - return test - ptr; + for (ptr = dir; ptr < edir; ptr++) { + if (ptr->val == entry.val) + return ptr - dir; } return -1; } static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page) { - swp_entry_t *ptr; unsigned long idx; + unsigned long size; + unsigned long limit; + unsigned long stage; + struct page **dir; + struct page *subdir; + swp_entry_t *ptr; int offset; idx = 0; ptr = info->i_direct; - spin_lock (&info->lock); - offset = info->next_index; - if (offset > SHMEM_NR_DIRECT) - offset = SHMEM_NR_DIRECT; - offset = shmem_find_swp(entry, ptr, ptr + offset); + spin_lock(&info->lock); + limit = info->next_index; + size = limit; + if (size > SHMEM_NR_DIRECT) + size = SHMEM_NR_DIRECT; + offset = shmem_find_swp(entry, ptr, ptr+size); if (offset >= 0) goto found; - - for (idx = SHMEM_NR_DIRECT; idx < info->next_index; - idx += ENTRIES_PER_PAGE) { - ptr = shmem_swp_entry(info, idx, NULL); - if (!ptr) - continue; - offset = info->next_index - idx; - if (offset > ENTRIES_PER_PAGE) - offset = ENTRIES_PER_PAGE; - offset = shmem_find_swp(entry, 
ptr, ptr + offset); - if (offset >= 0) - goto found; + if (!info->i_indirect) + goto lost2; + /* we might be racing with shmem_truncate */ + if (limit <= SHMEM_NR_DIRECT) + goto lost2; + + dir = shmem_dir_map(info->i_indirect); + stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2; + + for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) { + if (unlikely(idx == stage)) { + shmem_dir_unmap(dir-1); + dir = shmem_dir_map(info->i_indirect) + + ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE; + while (!*dir) { + dir++; + idx += ENTRIES_PER_PAGEPAGE; + if (idx >= limit) + goto lost1; + } + stage = idx + ENTRIES_PER_PAGEPAGE; + subdir = *dir; + shmem_dir_unmap(dir); + dir = shmem_dir_map(subdir); + } + subdir = *dir; + if (subdir) { + ptr = shmem_swp_map(subdir); + size = limit - idx; + if (size > ENTRIES_PER_PAGE) + size = ENTRIES_PER_PAGE; + offset = shmem_find_swp(entry, ptr, ptr+size); + if (offset >= 0) { + shmem_dir_unmap(dir); + goto found; + } + shmem_swp_unmap(ptr); + } } - spin_unlock (&info->lock); +lost1: + shmem_dir_unmap(dir-1); +lost2: + spin_unlock(&info->lock); return 0; found: if (move_from_swap_cache(page, idx + offset, @@ -489,6 +585,7 @@ found: ptr[offset] = (swp_entry_t) {0}; info->swapped--; } + shmem_swp_unmap(ptr); spin_unlock(&info->lock); /* * Decrement swap count even when the entry is left behind: @@ -561,11 +658,13 @@ static int shmem_writepage(struct page * page) if (!err) { *entry = swap; info->swapped++; + shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(page); return 0; } + shmem_swp_unmap(entry); spin_unlock(&info->lock); swap_free(swap); return fail_writepage(page); @@ -635,6 +734,7 @@ repeat: /* Look it up and read it in.. 
*/ page = lookup_swap_cache(swap); if (!page) { + shmem_swp_unmap(entry); spin_unlock(&info->lock); swapin_readahead(swap); page = read_swap_cache_async(swap); @@ -643,8 +743,11 @@ repeat: entry = shmem_swp_alloc(info, idx); if (IS_ERR(entry)) error = PTR_ERR(entry); - else if (entry->val == swap.val) - error = -ENOMEM; + else { + if (entry->val == swap.val) + error = -ENOMEM; + shmem_swp_unmap(entry); + } spin_unlock(&info->lock); if (error) return error; @@ -657,12 +760,14 @@ repeat: /* We have to do this with page locked to prevent races */ if (TestSetPageLocked(page)) { + shmem_swp_unmap(entry); spin_unlock(&info->lock); wait_on_page_locked(page); page_cache_release(page); goto repeat; } if (PageWriteback(page)) { + shmem_swp_unmap(entry); spin_unlock(&info->lock); wait_on_page_writeback(page); unlock_page(page); @@ -673,6 +778,7 @@ repeat: error = PageUptodate(page)? move_from_swap_cache(page, idx, mapping): -EIO; if (error) { + shmem_swp_unmap(entry); spin_unlock(&info->lock); unlock_page(page); page_cache_release(page); @@ -681,9 +787,11 @@ repeat: *entry = (swp_entry_t) {0}; info->swapped--; - spin_unlock (&info->lock); + shmem_swp_unmap(entry); + spin_unlock(&info->lock); swap_free(swap); } else { + shmem_swp_unmap(entry); spin_unlock(&info->lock); sbinfo = SHMEM_SB(inode->i_sb); spin_lock(&sbinfo->stat_lock); @@ -705,7 +813,11 @@ repeat: entry = shmem_swp_alloc(info, idx); if (IS_ERR(entry)) error = PTR_ERR(entry); - if (error || entry->val || + else { + swap = *entry; + shmem_swp_unmap(entry); + } + if (error || swap.val || add_to_page_cache_lru(page, mapping, idx) < 0) { spin_unlock(&info->lock); page_cache_release(page); @@ -741,8 +853,10 @@ static struct page *shmem_holdpage(struct inode *inode, unsigned long idx) page = find_get_page(inode->i_mapping, idx); if (!page) { entry = shmem_swp_entry(info, idx, NULL); - if (entry) + if (entry) { swap = *entry; + shmem_swp_unmap(entry); + } } spin_unlock(&info->lock); if (swap.val) { -- cgit v1.2.3 From 
794aa320b79d2cb8643ecb6058f0f3fadd51955d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 Oct 2002 23:32:38 -0700 Subject: [PATCH] sigfix-2.5.40-D6 This fixes all known signal semantics problems. sigwait() is really evil - i had to re-introduce ->real_blocked. When a signal has no handler defined then the actual action taken by the kernel depends on whether the sigwait()-ing thread was blocking the signal originally or not. If the signal was blocked => specific delivery to the thread, if the signal was not blocked => kill-all. fortunately this meant that PF_SIGWAIT could be killed - the real_blocked field contains all the necessary information to make the right decision at signal-sending time. i've also cleaned up and made the shared-pending code more robust: now there's a single central dequeue_signal() function that handles all the details. Plus upon unqueueing a shared-pending signal we now re-queue the signal to the current thread, which this time around is not going to end up in the shared-pending queue. This change handles the following case correctly: a signal was blocked in every thread, then one thread unblocks it and gets the signal delivered - but there's no handler for the signal => the correct action is to do a kill-all. i removed the unused shared_unblocked field as well, reported by Oleg Nesterov. now we pass both signal-tst1 and signal-tst2, so i'm confident that we got most of the details right.
--- include/linux/sched.h | 4 ++-- kernel/signal.c | 53 +++++++++++++++++++++++++++++++-------------------- 2 files changed, 34 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 59dcfad4667e..89c4ead4cf4b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -378,7 +378,7 @@ struct task_struct { /* signal handlers */ struct signal_struct *sig; - sigset_t blocked, real_blocked, shared_unblocked; + sigset_t blocked, real_blocked; struct sigpending pending; unsigned long sas_ss_sp; @@ -530,7 +530,7 @@ extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *); extern void sig_exit(int, int, struct siginfo *); -extern int dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info); +extern int dequeue_signal(sigset_t *mask, siginfo_t *info); extern void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask); extern void unblock_all_signals(void); diff --git a/kernel/signal.c b/kernel/signal.c index c7701f42a932..b037b12ce04b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -405,14 +405,8 @@ found_another: return 0; } -/* - * Dequeue a signal and return the element to the caller, which is - * expected to free it. - * - * All callers have to hold the siglock. - */ - -int dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info) +static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, + siginfo_t *info) { int sig = 0; @@ -438,6 +432,27 @@ int dequeue_signal(struct sigpending *pending, sigset_t *mask, siginfo_t *info) return sig; } +/* + * Dequeue a signal and return the element to the caller, which is + * expected to free it. + * + * All callers have to hold the siglock. + */ +int dequeue_signal(sigset_t *mask, siginfo_t *info) +{ + /* + * Here we handle shared pending signals. 
To implement the full + * semantics we need to unqueue and resend them. It will likely + * get into our own pending queue. + */ + if (current->sig->shared_pending.head) { + int signr = __dequeue_signal(¤t->sig->shared_pending, mask, info); + if (signr) + __send_sig_info(signr, info, current); + } + return __dequeue_signal(¤t->pending, mask, info); +} + static int rm_from_queue(int sig, struct sigpending *s) { struct sigqueue *q, **pp; @@ -843,8 +858,7 @@ struct task_struct * find_unblocked_thread(struct task_struct *p, int signr) struct pid *pid; for_each_task_pid(p->tgid, PIDTYPE_TGID, tmp, l, pid) - if (!sigismember(&tmp->blocked, signr) && - !sigismember(&tmp->real_blocked, signr)) + if (!sigismember(&tmp->blocked, signr)) return tmp; return NULL; } @@ -887,6 +901,10 @@ __send_sig_info(int sig, struct siginfo *info, struct task_struct *p) ret = specific_send_sig_info(sig, info, p, 1); goto out_unlock; } + if (sigismember(&t->real_blocked,sig)) { + ret = specific_send_sig_info(sig, info, t, 0); + goto out_unlock; + } if (sig_kernel_broadcast(sig) || sig_kernel_coredump(sig)) { ret = __broadcast_thread_group(p, sig); goto out_unlock; @@ -1169,10 +1187,7 @@ int get_signal_to_deliver(siginfo_t *info, struct pt_regs *regs) struct k_sigaction *ka; spin_lock_irq(¤t->sig->siglock); - if (current->sig->shared_pending.head) - signr = dequeue_signal(¤t->sig->shared_pending, mask, info); - if (!signr) - signr = dequeue_signal(¤t->pending, mask, info); + signr = dequeue_signal(mask, info); spin_unlock_irq(¤t->sig->siglock); if (!signr) @@ -1268,7 +1283,7 @@ int get_signal_to_deliver(siginfo_t *info, struct pt_regs *regs) #endif EXPORT_SYMBOL(recalc_sigpending); -EXPORT_SYMBOL(dequeue_signal); +EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(force_sig_info); @@ -1469,9 +1484,7 @@ sys_rt_sigtimedwait(const sigset_t *uthese, siginfo_t *uinfo, } spin_lock_irq(¤t->sig->siglock); - sig = 
dequeue_signal(¤t->sig->shared_pending, &these, &info); - if (!sig) - sig = dequeue_signal(¤t->pending, &these, &info); + sig = dequeue_signal(&these, &info); if (!sig) { timeout = MAX_SCHEDULE_TIMEOUT; if (uts) @@ -1491,9 +1504,7 @@ sys_rt_sigtimedwait(const sigset_t *uthese, siginfo_t *uinfo, timeout = schedule_timeout(timeout); spin_lock_irq(¤t->sig->siglock); - sig = dequeue_signal(¤t->sig->shared_pending, &these, &info); - if (!sig) - sig = dequeue_signal(¤t->pending, &these, &info); + sig = dequeue_signal(&these, &info); current->blocked = current->real_blocked; siginitset(¤t->real_blocked, 0); recalc_sigpending(); -- cgit v1.2.3 From afc141063a5b0f4e5a45e7e68060918daa69a859 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 2 Oct 2002 23:33:06 -0700 Subject: [PATCH] timer-2.5.40-F7 This does a number of timer subsystem enhancements: - simplified timer initialization, now it's the cheapest possible thing: static inline void init_timer(struct timer_list * timer) { timer->base = NULL; } since the timer functions already did a !timer->base check this did not have any effect on their fastpath. - the rule from now on is that timer->base is set upon activation of the timer, and cleared upon deactivation. This also made it possible to: - reorganize all the timer handling code to not assume anything about timer->entry.next and timer->entry.prev - this also removed lots of unnecessery cleaning of these fields. Removed lots of unnecessary list operations from the fastpath. - simplified del_timer_sync(): it now uses del_timer() plus some simple synchronization code. Note that this also fixes a bug: if mod_timer (or add_timer) moves a currently executing timer to another CPU's timer vector, then del_timer_sync() does not synchronize with the handler properly. - bugfix: moved run_local_timers() from scheduler_tick() into update_process_times() .. scheduler_tick() might be called from the fork code which will not quite have the intended effect ... 
- removed the APIC-timer-IRQ shifting done on SMP, Dipankar Sarma's testing shows no negative effects. - cleaned up include/linux/timer.h: - removed the timer_t typedef, and fixed up kernel/workqueue.c to use the 'struct timer_list' name instead. - removed unnecessary includes - renamed the 'list' field to 'entry' (it's an entry not a list head) - exchanged the 'function' and 'data' fields. This, besides being more logical, also unearthed the last few remaining places that initialized timers by assuming some given field ordering, the patch also fixes these places. (fs/xfs/pagebuf/page_buf.c, net/core/profile.c and net/ipv4/inetpeer.c) - removed the defunct sync_timers(), timer_enter() and timer_exit() prototypes. - added docbook-style comments. - other kernel/timer.c changes: - base->running_timer does not have to be volatile ... - added consistent comments to all the important functions. - made the sync-waiting in del_timer_sync preempt- and lowpower- friendly. i've compiled, booted & tested the patched kernel on x86 UP and SMP. I have tried moderately high networking load as well, to make sure the timer changes are correct - they appear to be.
--- arch/i386/kernel/apic.c | 33 +------- fs/xfs/pagebuf/page_buf.c | 2 +- include/linux/timer.h | 89 ++++++++++----------- include/linux/workqueue.h | 2 +- kernel/sched.c | 6 +- kernel/timer.c | 193 +++++++++++++++++++++++++++++----------------- kernel/workqueue.c | 2 +- net/core/profile.c | 4 +- net/ipv4/inetpeer.c | 2 +- 9 files changed, 172 insertions(+), 161 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 012fdf1e87c2..a9943eb75f96 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -813,24 +813,9 @@ void __setup_APIC_LVTT(unsigned int clocks) static void setup_APIC_timer(unsigned int clocks) { - unsigned int slice, t0, t1; unsigned long flags; - int delta; - local_save_flags(flags); - local_irq_enable(); - /* - * ok, Intel has some smart code in their APIC that knows - * if a CPU was in 'hlt' lowpower mode, and this increases - * its APIC arbitration priority. To avoid the external timer - * IRQ APIC event being in synchron with the APIC clock we - * introduce an interrupt skew to spread out timer events. - * - * The number of slices within a 'big' timeslice is NR_CPUS+1 - */ - - slice = clocks / (NR_CPUS+1); - printk("cpu: %d, clocks: %d, slice: %d\n", smp_processor_id(), clocks, slice); + local_irq_save(flags); /* * Wait for IRQ0's slice: @@ -839,22 +824,6 @@ static void setup_APIC_timer(unsigned int clocks) __setup_APIC_LVTT(clocks); - t0 = apic_read(APIC_TMICT)*APIC_DIVISOR; - /* Wait till TMCCT gets reloaded from TMICT... */ - do { - t1 = apic_read(APIC_TMCCT)*APIC_DIVISOR; - delta = (int)(t0 - t1 - slice*(smp_processor_id()+1)); - } while (delta >= 0); - /* Now wait for our slice for real. 
*/ - do { - t1 = apic_read(APIC_TMCCT)*APIC_DIVISOR; - delta = (int)(t0 - t1 - slice*(smp_processor_id()+1)); - } while (delta < 0); - - __setup_APIC_LVTT(clocks); - - printk("CPU%d\n", smp_processor_id(), t0, t1, delta, slice, clocks); - local_irq_restore(flags); } diff --git a/fs/xfs/pagebuf/page_buf.c b/fs/xfs/pagebuf/page_buf.c index c79804b4bad0..c98dc4637050 100644 --- a/fs/xfs/pagebuf/page_buf.c +++ b/fs/xfs/pagebuf/page_buf.c @@ -1680,7 +1680,7 @@ pagebuf_daemon( page_buf_t *pb; struct list_head *curr, *next, tmp; struct timer_list pb_daemon_timer = - { {NULL, NULL}, 0, 0, (timeout_fn)pagebuf_daemon_wakeup }; + { .function = (timeout_fn)pagebuf_daemon_wakeup }; /* Set up the thread */ daemonize(); diff --git a/include/linux/timer.h b/include/linux/timer.h index f890f4f3d668..cfedb5e8bb07 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -2,70 +2,59 @@ #define _LINUX_TIMER_H #include -#include -#include #include -#include -#include struct tvec_t_base_s; -/* - * Timers may be dynamically created and destroyed, and should be initialized - * by a call to init_timer() upon creation. - * - * The "data" field enables use of a common timeout function for several - * timeouts. You can use this field to distinguish between the different - * invocations. 
- */ -typedef struct timer_list { - struct list_head list; +struct timer_list { + struct list_head entry; unsigned long expires; - unsigned long data; + void (*function)(unsigned long); + unsigned long data; + struct tvec_t_base_s *base; -} timer_t; +}; -extern void add_timer(timer_t * timer); -extern int del_timer(timer_t * timer); - -#ifdef CONFIG_SMP -extern int del_timer_sync(timer_t * timer); -extern void sync_timers(void); -#define timer_enter(base, t) do { base->running_timer = t; mb(); } while (0) -#define timer_exit(base) do { base->running_timer = NULL; } while (0) -#define timer_is_running(base,t) (base->running_timer == t) -#define timer_synchronize(base,t) while (timer_is_running(base,t)) barrier() -#else -#define del_timer_sync(t) del_timer(t) -#define sync_timers() do { } while (0) -#define timer_enter(base,t) do { } while (0) -#define timer_exit(base) do { } while (0) -#endif - -/* - * mod_timer is a more efficient way to update the expire field of an - * active timer (if the timer is inactive it will be activated) - * mod_timer(a,b) is equivalent to del_timer(a); a->expires = b; add_timer(a). - * If the timer is known to be not pending (ie, in the handler), mod_timer - * is less efficient than a->expires = b; add_timer(a). +/*** + * init_timer - initialize a timer. + * @timer: the timer to be initialized + * + * init_timer() must be done to a timer prior calling *any* of the + * other timer functions. */ -int mod_timer(timer_t *timer, unsigned long expires); - -extern void it_real_fn(unsigned long); - -extern void init_timers(void); -extern void run_local_timers(void); - -static inline void init_timer(timer_t * timer) +static inline void init_timer(struct timer_list * timer) { - timer->list.next = timer->list.prev = NULL; timer->base = NULL; } -static inline int timer_pending(const timer_t * timer) +/*** + * timer_pending - is a timer pending? 
+ * @timer: the timer in question + * + * timer_pending will tell whether a given timer is currently pending, + * or not. Callers must ensure serialization wrt. other operations done + * to this timer, eg. interrupt contexts, or other CPUs on SMP. + * + * return value: 1 if the timer is pending, 0 if not. + */ +static inline int timer_pending(const struct timer_list * timer) { - return timer->list.next != NULL; + return timer->base != NULL; } +extern void add_timer(struct timer_list * timer); +extern int del_timer(struct timer_list * timer); +extern int mod_timer(struct timer_list *timer, unsigned long expires); + +#if CONFIG_SMP + extern int del_timer_sync(struct timer_list * timer); +#else +# define del_timer_sync(t) del_timer(t) +#endif + +extern void init_timers(void); +extern void run_local_timers(void); +extern void it_real_fn(unsigned long); + #endif diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 7828c7bef55f..3e466894179f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -15,7 +15,7 @@ struct work_struct { void (*func)(void *); void *data; void *wq_data; - timer_t timer; + struct timer_list timer; }; #define __WORK_INITIALIZER(n, f, d) { \ diff --git a/kernel/sched.c b/kernel/sched.c index d28180b44322..0d8095a9be9b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -855,6 +855,9 @@ static inline void idle_tick(runqueue_t *rq) /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. 
*/ void scheduler_tick(int user_ticks, int sys_ticks) { @@ -862,7 +865,6 @@ void scheduler_tick(int user_ticks, int sys_ticks) runqueue_t *rq = this_rq(); task_t *p = current; - run_local_timers(); if (p == rq->idle) { /* note: this timer irq context must be accounted for as well */ if (irq_count() - HARDIRQ_OFFSET >= SOFTIRQ_OFFSET) @@ -2109,8 +2111,6 @@ __init int migration_init(void) spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; #endif -extern void init_timers(void); - void __init sched_init(void) { runqueue_t *rq; diff --git a/kernel/timer.c b/kernel/timer.c index 01ee9d3103b4..96113a11d875 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -47,10 +47,12 @@ typedef struct tvec_root_s { struct list_head vec[TVR_SIZE]; } tvec_root_t; +typedef struct timer_list timer_t; + struct tvec_t_base_s { spinlock_t lock; unsigned long timer_jiffies; - volatile timer_t * volatile running_timer; + timer_t *running_timer; tvec_root_t tv1; tvec_t tv2; tvec_t tv3; @@ -69,7 +71,7 @@ static inline void internal_add_timer(tvec_base_t *base, timer_t *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; - struct list_head * vec; + struct list_head *vec; if (idx < TVR_SIZE) { int i = expires & TVR_MASK; @@ -92,24 +94,36 @@ static inline void internal_add_timer(tvec_base_t *base, timer_t *timer) } else if (idx <= 0xffffffffUL) { int i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; vec = base->tv5.vec + i; - } else { + } else /* Can only get here on architectures with 64-bit jiffies */ - INIT_LIST_HEAD(&timer->list); return; - } /* * Timers are FIFO: */ - list_add_tail(&timer->list, vec); + list_add_tail(&timer->entry, vec); } +/*** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expired point in the future. The + * current time is 'jiffies'. 
+ * + * The timer's ->expired, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expired field in the past will be executed in the next + * timer tick. It's illegal to add an already pending timer. + */ void add_timer(timer_t *timer) { int cpu = get_cpu(); tvec_base_t *base = tvec_bases + cpu; unsigned long flags; - BUG_ON(timer_pending(timer)); + BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); internal_add_timer(base, timer); @@ -118,25 +132,38 @@ void add_timer(timer_t *timer) put_cpu(); } -static inline int detach_timer (timer_t *timer) -{ - if (!timer_pending(timer)) - return 0; - list_del(&timer->list); - return 1; -} - -/* - * mod_timer() has subtle locking semantics because parallel - * calls to it must happen serialized. +/*** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified + * + * mod_timer is a more efficient way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * The function returns whether it has modified a pending timer or not. + * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an + * active timer returns 1.) 
*/ int mod_timer(timer_t *timer, unsigned long expires) { tvec_base_t *old_base, *new_base; unsigned long flags; - int ret; + int ret = 0; - if (timer_pending(timer) && timer->expires == expires) + BUG_ON(!timer->function); + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer->expires == expires && timer_pending(timer)) return 1; local_irq_save(flags); @@ -156,8 +183,8 @@ repeat: spin_lock(&new_base->lock); } /* - * Subtle, we rely on timer->base being always - * valid and being updated atomically. + * The timer base might have changed while we were + * trying to take the lock(s): */ if (timer->base != old_base) { spin_unlock(&new_base->lock); @@ -167,8 +194,15 @@ repeat: } else spin_lock(&new_base->lock); + /* + * Delete the previous timeout (if there was any), and install + * the new one: + */ + if (old_base) { + list_del(&timer->entry); + ret = 1; + } timer->expires = expires; - ret = detach_timer(timer); internal_add_timer(new_base, timer); timer->base = new_base; @@ -179,66 +213,74 @@ repeat: return ret; } -int del_timer(timer_t * timer) +/*** + * del_timer - deactive a timer. + * @timer: the timer to be deactivated + * + * del_timer() deactivates a timer - this works on both active and inactive + * timers. + * + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) 
+ */ +int del_timer(timer_t *timer) { unsigned long flags; - tvec_base_t * base; - int ret; + tvec_base_t *base; - if (!timer->base) - return 0; repeat: base = timer->base; + if (!base) + return 0; spin_lock_irqsave(&base->lock, flags); if (base != timer->base) { spin_unlock_irqrestore(&base->lock, flags); goto repeat; } - ret = detach_timer(timer); - timer->list.next = timer->list.prev = NULL; + list_del(&timer->entry); + timer->base = NULL; spin_unlock_irqrestore(&base->lock, flags); - return ret; + return 1; } #ifdef CONFIG_SMP -/* - * SMP specific function to delete periodic timer. - * Caller must disable by some means restarting the timer - * for new. Upon exit the timer is not queued and handler is not running - * on any CPU. It returns number of times, which timer was deleted - * (for reference counting). +/*** + * del_timer_sync - deactivate a timer and wait for the handler to finish. + * @timer: the timer to be deactivated + * + * This function only differs from del_timer() on SMP: besides deactivating + * the timer it also makes sure the handler has finished executing on other + * CPUs. + * + * Synchronization rules: callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts. Upon exit the timer is not queued and the handler + * is not running on any CPU. + * + * The function returns whether it has deactivated a pending timer or not. 
*/ - -int del_timer_sync(timer_t * timer) +int del_timer_sync(timer_t *timer) { - tvec_base_t * base; - int ret = 0; + tvec_base_t *base = tvec_bases; + int i, ret; - if (!timer->base) - return 0; - for (;;) { - unsigned long flags; - int running; - -repeat: - base = timer->base; - spin_lock_irqsave(&base->lock, flags); - if (base != timer->base) { - spin_unlock_irqrestore(&base->lock, flags); - goto repeat; - } - ret += detach_timer(timer); - timer->list.next = timer->list.prev = 0; - running = timer_is_running(base, timer); - spin_unlock_irqrestore(&base->lock, flags); + ret = del_timer(timer); - if (!running) + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + if (base->running_timer == timer) { + while (base->running_timer == timer) { + cpu_relax(); + preempt_disable(); + preempt_enable(); + } break; - - timer_synchronize(base, timer); + } + base++; } - return ret; } #endif @@ -258,11 +300,10 @@ static void cascade(tvec_base_t *base, tvec_t *tv) while (curr != head) { timer_t *tmp; - tmp = list_entry(curr, timer_t, list); + tmp = list_entry(curr, timer_t, entry); if (tmp->base != base) BUG(); next = curr->next; - list_del(curr); // not needed internal_add_timer(base, tmp); curr = next; } @@ -270,7 +311,14 @@ static void cascade(tvec_base_t *base, tvec_t *tv) tv->index = (tv->index + 1) & TVN_MASK; } -static void __run_timers(tvec_base_t *base) +/*** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + * + * This function cascades all vectors and executes all expired timer + * vectors. 
+ */ +static inline void __run_timers(tvec_base_t *base) { unsigned long flags; @@ -300,22 +348,26 @@ repeat: unsigned long data; timer_t *timer; - timer = list_entry(curr, timer_t, list); + timer = list_entry(curr, timer_t, entry); fn = timer->function; data = timer->data; - detach_timer(timer); - timer->list.next = timer->list.prev = NULL; - timer_enter(base, timer); + list_del(&timer->entry); + timer->base = NULL; +#if CONFIG_SMP + base->running_timer = timer; +#endif spin_unlock_irq(&base->lock); fn(data); spin_lock_irq(&base->lock); - timer_exit(base); goto repeat; } ++base->timer_jiffies; base->tv1.index = (base->tv1.index + 1) & TVR_MASK; } +#if CONFIG_SMP + base->running_timer = NULL; +#endif spin_unlock_irqrestore(&base->lock, flags); } @@ -607,6 +659,7 @@ void update_process_times(int user_tick) int cpu = smp_processor_id(), system = user_tick ^ 1; update_one_process(p, user_tick, system, cpu); + run_local_timers(); scheduler_tick(user_tick, system); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7adce746f426..156583c7dbf7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -100,7 +100,7 @@ static void delayed_work_timer_fn(unsigned long __data) int queue_delayed_work(struct workqueue_struct *wq, struct work_struct *work, unsigned long delay) { int ret = 0, cpu = get_cpu(); - timer_t *timer = &work->timer; + struct timer_list *timer = &work->timer; struct cpu_workqueue_struct *cwq = wq->cpu_wq + cpu; if (!test_and_set_bit(0, &work->pending)) { diff --git a/net/core/profile.c b/net/core/profile.c index 403556b6c745..33bfdd4260a5 100644 --- a/net/core/profile.c +++ b/net/core/profile.c @@ -35,7 +35,7 @@ long alpha_hi; static void alpha_tick(unsigned long); static struct timer_list alpha_timer = - { NULL, NULL, 0, 0L, alpha_tick }; + { .function = alpha_tick }; void alpha_tick(unsigned long dummy) { @@ -158,7 +158,7 @@ static void whitehole_inject(unsigned long); int whitehole_init(struct net_device *dev); static struct timer_list 
whitehole_timer = - { NULL, NULL, 0, 0L, whitehole_inject }; + { .function = whitehole_inject }; static struct net_device whitehole_dev = { "whitehole", 0x0, 0x0, 0x0, 0x0, 0, 0, 0, 0, 0, NULL, whitehole_init, }; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bc3af5838e7b..c326f9d36cc1 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -98,7 +98,7 @@ spinlock_t inet_peer_unused_lock = SPIN_LOCK_UNLOCKED; static void peer_check_expire(unsigned long dummy); static struct timer_list peer_periodic_timer = - { { NULL, NULL }, 0, 0, &peer_check_expire }; + { .function = &peer_check_expire }; /* Exported for sysctl_net_ipv4. */ int inet_peer_gc_mintime = 10 * HZ, -- cgit v1.2.3 From c33585c5a5458799af1de8009b71ffecb62371fa Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Wed, 2 Oct 2002 23:34:28 -0700 Subject: [PATCH] pipe bugfix /cleanup pipe_write contains a wakeup storm, 2 writers that write into the same fifo can wake each other up, and spend 100% cpu time with wakeup/schedule, without making any progress. The only regression I'm aware of is that $ dd if=/dev/zero | grep not_there will fail due to OOM, because grep does something like for(;;) { rlen = read(fd, buf, len); if (rlen == len) { len *= 2; buf = realloc(buf, len); } } if it operates on pipes, and due to the improved syscall merging, read will always return the maximum possible amount of data. But that's a grep bug, not a kernel problem. --- fs/pipe.c | 315 ++++++++++++++++++++-------------------------- include/linux/pipe_fs_i.h | 2 - 2 files changed, 138 insertions(+), 179 deletions(-) (limited to 'include/linux') diff --git a/fs/pipe.c b/fs/pipe.c index ced3acc411e9..20bba0702007 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -25,6 +25,9 @@ * * FIFOs and Pipes now generate SIGIO for both readers and writers. 
* -- Jeremy Elson 2001-08-16 + * + * pipe_read & write cleanup + * -- Manfred Spraul 2002-05-09 */ /* Drop the inode semaphore and wait for a pipe event, atomically */ @@ -44,97 +47,81 @@ static ssize_t pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - ssize_t size, read, ret; - - /* Seeks are not allowed on pipes. */ - ret = -ESPIPE; - read = 0; - if (ppos != &filp->f_pos) - goto out_nolock; - - /* Always return 0 on null read. */ + int do_wakeup; + ssize_t ret; + + /* pread is not allowed on pipes. */ + if (unlikely(ppos != &filp->f_pos)) + return -ESPIPE; + + /* Null read succeeds. */ + if (unlikely(count == 0)) + return 0; + + do_wakeup = 0; ret = 0; - if (count == 0) - goto out_nolock; + down(PIPE_SEM(*inode)); + for (;;) { + int size = PIPE_LEN(*inode); + if (size) { + char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode); + ssize_t chars = PIPE_MAX_RCHUNK(*inode); - /* Get the pipe semaphore */ - ret = -ERESTARTSYS; - if (down_interruptible(PIPE_SEM(*inode))) - goto out_nolock; + if (chars > count) + chars = count; + if (chars > size) + chars = size; + + if (copy_to_user(buf, pipebuf, chars)) { + if (!ret) ret = -EFAULT; + break; + } + ret += chars; - if (PIPE_EMPTY(*inode)) { -do_more_read: - ret = 0; + PIPE_START(*inode) += chars; + PIPE_START(*inode) &= (PIPE_SIZE - 1); + PIPE_LEN(*inode) -= chars; + count -= chars; + buf += chars; + do_wakeup = 1; + } + if (!count) + break; /* common path: read succeeded */ + if (PIPE_LEN(*inode)) /* test for cyclic buffers */ + continue; if (!PIPE_WRITERS(*inode)) - goto out; - - ret = -EAGAIN; - if (filp->f_flags & O_NONBLOCK) - goto out; - - for (;;) { - PIPE_WAITING_READERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_READERS(*inode)--; - ret = -ERESTARTSYS; - if (signal_pending(current)) - goto out; - ret = 0; - if (!PIPE_EMPTY(*inode)) + break; + if (!PIPE_WAITING_WRITERS(*inode)) { + /* syscall merging: Usually we must not sleep + * if 
O_NONBLOCK is set, or if we got some data. + * But if a writer sleeps in kernel space, then + * we can wait for that data without violating POSIX. + */ + if (ret) break; - if (!PIPE_WRITERS(*inode)) - goto out; + if (filp->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + break; + } } + if (signal_pending(current)) { + if (!ret) ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + wake_up_interruptible(PIPE_WAIT(*inode)); + kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + } + pipe_wait(inode); } - - /* Read what data is available. */ - ret = -EFAULT; - while (count > 0 && (size = PIPE_LEN(*inode))) { - char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode); - ssize_t chars = PIPE_MAX_RCHUNK(*inode); - - if (chars > count) - chars = count; - if (chars > size) - chars = size; - - if (copy_to_user(buf, pipebuf, chars)) - goto out; - - read += chars; - PIPE_START(*inode) += chars; - PIPE_START(*inode) &= (PIPE_SIZE - 1); - PIPE_LEN(*inode) -= chars; - count -= chars; - buf += chars; - } - - /* Cache behaviour optimization */ - if (!PIPE_LEN(*inode)) - PIPE_START(*inode) = 0; - - if (count && PIPE_WAITING_WRITERS(*inode) && !(filp->f_flags & O_NONBLOCK)) { - /* - * We know that we are going to sleep: signal - * writers synchronously that there is more - * room. - */ + up(PIPE_SEM(*inode)); + /* Signal writers asynchronously that there is more room. */ + if (do_wakeup) { wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); - if (!PIPE_EMPTY(*inode)) - BUG(); - goto do_more_read; + kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } - /* Signal writers asynchronously that there is more room. 
*/ - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); - - ret = read; -out: - up(PIPE_SEM(*inode)); -out_nolock: - if (read) - ret = read; + if (ret > 0) + UPDATE_ATIME(inode); return ret; } @@ -142,116 +129,90 @@ static ssize_t pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - ssize_t free, written, ret; - - /* Seeks are not allowed on pipes. */ - ret = -ESPIPE; - written = 0; - if (ppos != &filp->f_pos) - goto out_nolock; - - /* Null write succeeds. */ + ssize_t ret; + size_t min; + int do_wakeup; + + /* pwrite is not allowed on pipes. */ + if (unlikely(ppos != &filp->f_pos)) + return -ESPIPE; + + /* Null write succeeds. */ + if (unlikely(count == 0)) + return 0; + + do_wakeup = 0; ret = 0; - if (count == 0) - goto out_nolock; - - ret = -ERESTARTSYS; - if (down_interruptible(PIPE_SEM(*inode))) - goto out_nolock; - - /* No readers yields SIGPIPE. */ - if (!PIPE_READERS(*inode)) - goto sigpipe; - - /* If count <= PIPE_BUF, we have to make it atomic. */ - free = (count <= PIPE_BUF ? count : 1); - - /* Wait, or check for, available space. */ - if (filp->f_flags & O_NONBLOCK) { - ret = -EAGAIN; - if (PIPE_FREE(*inode) < free) - goto out; - } else { - while (PIPE_FREE(*inode) < free) { - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; - ret = -ERESTARTSYS; - if (signal_pending(current)) - goto out; - - if (!PIPE_READERS(*inode)) - goto sigpipe; + min = count; + if (min > PIPE_BUF) + min = 1; + down(PIPE_SEM(*inode)); + for (;;) { + int free; + if (!PIPE_READERS(*inode)) { + send_sig(SIGPIPE, current, 0); + if (!ret) ret = -EPIPE; + break; } - } - - /* Copy into available space. 
*/ - ret = -EFAULT; - while (count > 0) { - int space; - char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode); - ssize_t chars = PIPE_MAX_WCHUNK(*inode); - - if ((space = PIPE_FREE(*inode)) != 0) { + free = PIPE_FREE(*inode); + if (free >= min) { + /* transfer data */ + ssize_t chars = PIPE_MAX_WCHUNK(*inode); + char *pipebuf = PIPE_BASE(*inode) + PIPE_END(*inode); + /* Always wakeup, even if the copy fails. Otherwise + * we lock up (O_NONBLOCK-)readers that sleep due to + * syscall merging. + */ + do_wakeup = 1; if (chars > count) chars = count; - if (chars > space) - chars = space; + if (chars > free) + chars = free; - if (copy_from_user(pipebuf, buf, chars)) - goto out; + if (copy_from_user(pipebuf, buf, chars)) { + if (!ret) ret = -EFAULT; + break; + } - written += chars; + ret += chars; PIPE_LEN(*inode) += chars; count -= chars; buf += chars; - space = PIPE_FREE(*inode); + } + if (!count) + break; + if (PIPE_FREE(*inode) && ret) { + /* handle cyclic data buffers */ + min = 1; continue; } - - ret = written; - if (filp->f_flags & O_NONBLOCK) + if (filp->f_flags & O_NONBLOCK) { + if (!ret) ret = -EAGAIN; break; - - do { - /* - * Synchronous wake-up: it knows that this process - * is going to give up this CPU, so it doesn't have - * to do idle reschedules. - */ + } + if (signal_pending(current)) { + if (!ret) ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; - if (signal_pending(current)) - goto out; - if (!PIPE_READERS(*inode)) - goto sigpipe; - } while (!PIPE_FREE(*inode)); - ret = -EFAULT; + kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + do_wakeup = 0; + } + PIPE_WAITING_WRITERS(*inode)++; + pipe_wait(inode); + PIPE_WAITING_WRITERS(*inode)--; } - - /* Signal readers asynchronously that there is more data. 
*/ - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); - - inode->i_ctime = inode->i_mtime = CURRENT_TIME; - mark_inode_dirty(inode); - -out: up(PIPE_SEM(*inode)); -out_nolock: - if (written) - ret = written; + if (do_wakeup) { + wake_up_interruptible(PIPE_WAIT(*inode)); + kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + } + if (ret > 0) { + inode->i_ctime = inode->i_mtime = CURRENT_TIME; + mark_inode_dirty(inode); + } return ret; - -sigpipe: - if (written) - goto out; - up(PIPE_SEM(*inode)); - send_sig(SIGPIPE, current, 0); - return -EPIPE; } static ssize_t @@ -525,7 +486,7 @@ struct inode* pipe_new(struct inode* inode) PIPE_BASE(*inode) = (char*) page; PIPE_START(*inode) = PIPE_LEN(*inode) = 0; PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0; - PIPE_WAITING_READERS(*inode) = PIPE_WAITING_WRITERS(*inode) = 0; + PIPE_WAITING_WRITERS(*inode) = 0; PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; *PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL; diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index d6a7928a2866..407c0e0b3e84 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -9,7 +9,6 @@ struct pipe_inode_info { unsigned int start; unsigned int readers; unsigned int writers; - unsigned int waiting_readers; unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; @@ -28,7 +27,6 @@ struct pipe_inode_info { #define PIPE_LEN(inode) ((inode).i_pipe->len) #define PIPE_READERS(inode) ((inode).i_pipe->readers) #define PIPE_WRITERS(inode) ((inode).i_pipe->writers) -#define PIPE_WAITING_READERS(inode) ((inode).i_pipe->waiting_readers) #define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers) #define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter) #define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter) -- cgit v1.2.3 From c2c0b5d0011d37771b34bd661a0e7509807ad79d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 
2 Oct 2002 23:36:51 -0700 Subject: PCI: remove pcibios_find_class() --- drivers/pci/compat.c | 17 ----------------- include/linux/pci.h | 3 --- 2 files changed, 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/compat.c b/drivers/pci/compat.c index b355f674cba7..4082c8f087b2 100644 --- a/drivers/pci/compat.c +++ b/drivers/pci/compat.c @@ -19,22 +19,6 @@ pcibios_present(void) return !list_empty(&pci_devices); } -int -pcibios_find_class(unsigned int class, unsigned short index, unsigned char *bus, unsigned char *devfn) -{ - const struct pci_dev *dev = NULL; - int cnt = 0; - - while ((dev = pci_find_class(class, dev))) - if (index == cnt++) { - *bus = dev->bus->number; - *devfn = dev->devfn; - return PCIBIOS_SUCCESSFUL; - } - return PCIBIOS_DEVICE_NOT_FOUND; -} - - int pcibios_find_device(unsigned short vendor, unsigned short device, unsigned short index, unsigned char *bus, unsigned char *devfn) @@ -75,5 +59,4 @@ EXPORT_SYMBOL(pcibios_read_config_dword); EXPORT_SYMBOL(pcibios_write_config_byte); EXPORT_SYMBOL(pcibios_write_config_word); EXPORT_SYMBOL(pcibios_write_config_dword); -EXPORT_SYMBOL(pcibios_find_class); EXPORT_SYMBOL(pcibios_find_device); diff --git a/include/linux/pci.h b/include/linux/pci.h index ba2e997304fd..ec8f0686e24a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -533,7 +533,6 @@ int pcibios_write_config_word (unsigned char bus, unsigned char dev_fn, unsigned char where, unsigned short val); int pcibios_write_config_dword (unsigned char bus, unsigned char dev_fn, unsigned char where, unsigned int val); -int pcibios_find_class (unsigned int class_code, unsigned short index, unsigned char *bus, unsigned char *dev_fn); int pcibios_find_device (unsigned short vendor, unsigned short dev_id, unsigned short index, unsigned char *bus, unsigned char *dev_fn); @@ -661,8 +660,6 @@ void pci_pool_free (struct pci_pool *pool, void *vaddr, dma_addr_t addr); #ifndef CONFIG_PCI static inline int pcibios_present(void) { return 0; } 
-static inline int pcibios_find_class (unsigned int class_code, unsigned short index, unsigned char *bus, unsigned char *dev_fn) -{ return PCIBIOS_DEVICE_NOT_FOUND; } #define _PCI_NOP(o,s,t) \ static inline int pcibios_##o##_config_##s (u8 bus, u8 dfn, u8 where, t val) \ -- cgit v1.2.3 From 4a66ae8251604a7c8a262e8d2302a26de62a2b16 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 2 Oct 2002 23:45:53 -0700 Subject: PCI: remove pci_find_device() --- drivers/pci/compat.c | 17 ----------------- include/linux/pci.h | 3 --- 2 files changed, 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/pci/compat.c b/drivers/pci/compat.c index 4082c8f087b2..51e9b0828d08 100644 --- a/drivers/pci/compat.c +++ b/drivers/pci/compat.c @@ -19,22 +19,6 @@ pcibios_present(void) return !list_empty(&pci_devices); } -int -pcibios_find_device(unsigned short vendor, unsigned short device, unsigned short index, - unsigned char *bus, unsigned char *devfn) -{ - const struct pci_dev *dev = NULL; - int cnt = 0; - - while ((dev = pci_find_device(vendor, device, dev))) - if (index == cnt++) { - *bus = dev->bus->number; - *devfn = dev->devfn; - return PCIBIOS_SUCCESSFUL; - } - return PCIBIOS_DEVICE_NOT_FOUND; -} - #define PCI_OP(rw,size,type) \ int pcibios_##rw##_config_##size (unsigned char bus, unsigned char dev_fn, \ unsigned char where, unsigned type val) \ @@ -59,4 +43,3 @@ EXPORT_SYMBOL(pcibios_read_config_dword); EXPORT_SYMBOL(pcibios_write_config_byte); EXPORT_SYMBOL(pcibios_write_config_word); EXPORT_SYMBOL(pcibios_write_config_dword); -EXPORT_SYMBOL(pcibios_find_device); diff --git a/include/linux/pci.h b/include/linux/pci.h index ec8f0686e24a..933e62a6813c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -533,9 +533,6 @@ int pcibios_write_config_word (unsigned char bus, unsigned char dev_fn, unsigned char where, unsigned short val); int pcibios_write_config_dword (unsigned char bus, unsigned char dev_fn, unsigned char where, unsigned int val); -int 
pcibios_find_device (unsigned short vendor, unsigned short dev_id, - unsigned short index, unsigned char *bus, - unsigned char *dev_fn); /* Generic PCI functions used internally */ -- cgit v1.2.3 From 36be8435c1ad70461e93840606a1b4ef2c9e7f5f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 3 Oct 2002 00:06:24 -0700 Subject: PCI: removed pcibios_present() --- drivers/net/hp100.c | 4 ++-- drivers/net/tulip/de4x5.c | 4 ++-- drivers/pci/compat.c | 8 -------- drivers/pci/syscall.c | 2 +- include/linux/pci.h | 21 ++++++++++----------- 5 files changed, 15 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c index 7a23fe6bd93b..de55257a8c7f 100644 --- a/drivers/net/hp100.c +++ b/drivers/net/hp100.c @@ -412,7 +412,7 @@ int __init hp100_probe(struct net_device *dev) /* First: scan PCI bus(es) */ #ifdef CONFIG_PCI - if (pcibios_present()) { + if (pci_present()) { int pci_index; struct pci_dev *pci_dev = NULL; int pci_id_index; @@ -2960,7 +2960,7 @@ static int __init hp100_module_init(void) { int i, cards; - if (hp100_port == 0 && !EISA_bus && !pcibios_present()) + if (hp100_port == 0 && !EISA_bus && !pci_present()) printk("hp100: You should not use auto-probing with insmod!\n"); /* Loop on all possible base addresses */ diff --git a/drivers/net/tulip/de4x5.c b/drivers/net/tulip/de4x5.c index 93c863801086..e4cc1f9eaf1f 100644 --- a/drivers/net/tulip/de4x5.c +++ b/drivers/net/tulip/de4x5.c @@ -2190,7 +2190,7 @@ pci_probe(struct net_device *dev, u_long ioaddr) if (lastPCI == NO_MORE_PCI) return; - if (!pcibios_present()) { + if (!pci_present()) { lastPCI = NO_MORE_PCI; return; /* No PCI bus in this machine! 
*/ } @@ -5872,7 +5872,7 @@ count_adapters(void) if (EISA_signature(name, EISA_ID)) j++; } #endif - if (!pcibios_present()) return j; + if (!pci_present()) return j; for (i=0; (pdev=pci_find_class(class, pdev))!= NULL; i++) { vendor = pdev->vendor; diff --git a/drivers/pci/compat.c b/drivers/pci/compat.c index 51e9b0828d08..048c610a9e62 100644 --- a/drivers/pci/compat.c +++ b/drivers/pci/compat.c @@ -13,12 +13,6 @@ /* Obsolete functions, these will be going away... */ -int -pcibios_present(void) -{ - return !list_empty(&pci_devices); -} - #define PCI_OP(rw,size,type) \ int pcibios_##rw##_config_##size (unsigned char bus, unsigned char dev_fn, \ unsigned char where, unsigned type val) \ @@ -35,8 +29,6 @@ PCI_OP(write, byte, char) PCI_OP(write, word, short) PCI_OP(write, dword, int) - -EXPORT_SYMBOL(pcibios_present); EXPORT_SYMBOL(pcibios_read_config_byte); EXPORT_SYMBOL(pcibios_read_config_word); EXPORT_SYMBOL(pcibios_read_config_dword); diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index c935efd9a933..dd39d23d51ed 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -98,7 +98,7 @@ sys_pciconfig_write(unsigned long bus, unsigned long dfn, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!pcibios_present()) + if (!pci_present()) return -ENOSYS; dev = pci_find_slot(bus, dfn); diff --git a/include/linux/pci.h b/include/linux/pci.h index 933e62a6813c..9c12d53f9dc7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -322,15 +322,6 @@ enum pci_mmap_state { #define PCI_ANY_ID (~0) -#define pci_present pcibios_present - - -#define pci_for_each_dev_reverse(dev) \ - for(dev = pci_dev_g(pci_devices.prev); dev != pci_dev_g(&pci_devices); dev = pci_dev_g(dev->global_list.prev)) - -#define pci_for_each_bus(bus) \ -for(bus = pci_bus_b(pci_root_buses.next); bus != pci_bus_b(&pci_root_buses); bus = pci_bus_b(bus->node.next)) - /* * The pci_dev structure is used to describe both PCI and ISAPnP devices. 
*/ @@ -503,8 +494,17 @@ struct pci_driver { /* these external functions are only available when PCI support is enabled */ #ifdef CONFIG_PCI +static inline int pci_present(void) +{ + return !list_empty(&pci_devices); +} + #define pci_for_each_dev(dev) \ for(dev = pci_dev_g(pci_devices.next); dev != pci_dev_g(&pci_devices); dev = pci_dev_g(dev->global_list.next)) +#define pci_for_each_dev_reverse(dev) \ + for(dev = pci_dev_g(pci_devices.prev); dev != pci_dev_g(&pci_devices); dev = pci_dev_g(dev->global_list.prev)) +#define pci_for_each_bus(bus) \ + for(bus = pci_bus_b(pci_root_buses.next); bus != pci_bus_b(&pci_root_buses); bus = pci_bus_b(bus->node.next)) void pcibios_fixup_bus(struct pci_bus *); int pcibios_enable_device(struct pci_dev *, int mask); @@ -520,7 +520,6 @@ void pcibios_fixup_pbus_ranges(struct pci_bus *, struct pbus_set_ranges_data *); /* Backward compatibility, don't use in new code! */ -int pcibios_present(void); int pcibios_read_config_byte (unsigned char bus, unsigned char dev_fn, unsigned char where, unsigned char *val); int pcibios_read_config_word (unsigned char bus, unsigned char dev_fn, @@ -656,7 +655,7 @@ void pci_pool_free (struct pci_pool *pool, void *vaddr, dma_addr_t addr); */ #ifndef CONFIG_PCI -static inline int pcibios_present(void) { return 0; } +static inline int pci_present(void) { return 0; } #define _PCI_NOP(o,s,t) \ static inline int pcibios_##o##_config_##s (u8 bus, u8 dfn, u8 where, t val) \ -- cgit v1.2.3 From b1c725a713176833cb0c27189bbae26986443750 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 3 Oct 2002 00:30:05 -0700 Subject: Add include to get FASTCALL() define. 
--- include/linux/workqueue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 3e466894179f..8fb06d58739f 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -6,6 +6,7 @@ #define _LINUX_WORKQUEUE_H #include +#include struct workqueue_struct; -- cgit v1.2.3 From 20ddfc0047bd762ed32ec42e50b5a14bad02728d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 3 Oct 2002 20:43:42 -0700 Subject: [PATCH] remove _P/_p delaying iops Lets kill these off for good. o Remove OUT_BYTE/IN_BYTE and variants. We defaulted to the fast ones even before o Add read barrier for ppc, it needs it --- drivers/ide/ide-iops.c | 182 ++++++++++++++++++++++++------------------------- drivers/ide/ide.c | 10 --- include/linux/ide.h | 55 --------------- 3 files changed, 90 insertions(+), 157 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index 19304b007aad..353e71f6eef7 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -32,135 +32,156 @@ #include -static inline u8 ide_inb (u32 port) +static u8 ide_inb (u32 port) { - return (u8) IN_BYTE(port); + return (u8) inb(port); } -static inline u8 ide_inb_p (u32 port) +static u16 ide_inw (u32 port) { - return (u8) IN_BYTE_P(port); + return (u16) inw(port); } -static inline u16 ide_inw (u32 port) +static void ide_insw (u32 port, void *addr, u32 count) { - return (u16) IN_WORD(port); + return insw(port, addr, count); } -static inline u16 ide_inw_p (u32 port) +static u32 ide_inl (u32 port) { - return (u16) IN_WORD_P(port); + return (u32) inl(port); } -static inline void ide_insw (u32 port, void *addr, u32 count) +static void ide_insl (u32 port, void *addr, u32 count) { - while (count--) { *(u16 *)addr = IN_WORD(port); addr += 2; } + insl(port, addr, count); } -static inline void ide_insw_p (u32 port, void *addr, u32 count) +static void ide_outb (u8 addr, u32 port) { - while 
(count--) { *(u16 *)addr = IN_WORD_P(port); addr += 2; } + outb(addr, port); } -static inline u32 ide_inl (u32 port) +static void ide_outw (u16 addr, u32 port) { - return (u32) IN_LONG(port); + outw(addr, port); } -static inline u32 ide_inl_p (u32 port) +static void ide_outsw (u32 port, void *addr, u32 count) { - return (u32) IN_LONG_P(port); + outsw(port, addr, count); } -static inline void ide_insl (u32 port, void *addr, u32 count) +static void ide_outl (u32 addr, u32 port) { - ide_insw(port, addr, (count)<<1); -// while (count--) { *(u32 *)addr = IN_LONG(port); addr += 4; } + outl(addr, port); } -static inline void ide_insl_p (u32 port, void *addr, u32 count) +static void ide_outsl (u32 port, void *addr, u32 count) { - ide_insw_p(port, addr, (count)<<1); -// while (count--) { *(u32 *)addr = IN_LONG(port); addr += 4; } + return outsl(port, addr, count); } -static inline void ide_outb (u8 addr, u32 port) +void default_hwif_iops (ide_hwif_t *hwif) { - OUT_BYTE(addr, port); + hwif->OUTB = ide_outb; + hwif->OUTW = ide_outw; + hwif->OUTL = ide_outl; + hwif->OUTSW = ide_outsw; + hwif->OUTSL = ide_outsl; + hwif->INB = ide_inb; + hwif->INW = ide_inw; + hwif->INL = ide_inl; + hwif->INSW = ide_insw; + hwif->INSL = ide_insl; } -static inline void ide_outb_p (u8 addr, u32 port) +EXPORT_SYMBOL(default_hwif_iops); + +static u8 ide_mm_inb (u32 port) { - OUT_BYTE_P(addr, port); + return (u8) readb(port); } -static inline void ide_outw (u16 addr, u32 port) +static u16 ide_mm_inw (u32 port) { - OUT_WORD(addr, port); + return (u16) readw(port); } -static inline void ide_outw_p (u16 addr, u32 port) +static void ide_mm_insw (u32 port, void *addr, u32 count) { - OUT_WORD_P(addr, port); +#ifdef CONFIG_PPC + /* Can we move the barrier out of the loop ? 
*/ + while (count--) { *(u16 *)addr = __raw_readw(port); iobarrier_r(); addr += 2; } +#else /* everything else is sane benh */ + while (count--) { *(u16 *)addr = readw(port); addr += 2; } +#endif } -static inline void ide_outsw (u32 port, void *addr, u32 count) +static u32 ide_mm_inl (u32 port) { - while (count--) { OUT_WORD(*(u16 *)addr, port); addr += 2; } + return (u32) readl(port); } -static inline void ide_outsw_p (u32 port, void *addr, u32 count) +static void ide_mm_insl (u32 port, void *addr, u32 count) { - while (count--) { OUT_WORD_P(*(u16 *)addr, port); addr += 2; } +#ifdef CONFIG_PPC + /* Can we move the barrier out of the loop ? */ + while (count--) { *(u32 *)addr = __raw_readl(port); iobarrier_r(); addr += 4; } +#else /* everything else is sane benh */ + while (count--) { *(u32 *)addr = readl(port); addr += 4; } +#endif } -static inline void ide_outl (u32 addr, u32 port) +static void ide_mm_outb (u8 value, u32 port) { - OUT_LONG(addr, port); + writeb(value, port); } -static inline void ide_outl_p (u32 addr, u32 port) +static void ide_mm_outw (u16 value, u32 port) { - OUT_LONG_P(addr, port); + writew(value, port); } -static inline void ide_outsl (u32 port, void *addr, u32 count) +static void ide_mm_outsw (u32 port, void *addr, u32 count) { - ide_outsw(port, addr, (count)<<1); -// while (count--) { OUT_LONG(*(u32 *)addr, port); addr += 4; } +#ifdef CONFIG_PPC + /* Can we move the barrier out of the loop ? 
*/ + while (count--) { __raw_writew(*(u16 *)addr, port); iobarrier_w(); addr += 2; } +#else /* everything else is sane benh */ + while (count--) { writew(*(u16 *)addr, port); addr += 2; } +#endif } -static inline void ide_outsl_p (u32 port, void *addr, u32 count) +static void ide_mm_outl (u32 value, u32 port) { - ide_outsw_p(port, addr, (count)<<1); -// while (count--) { OUT_LONG_P(*(u32 *)addr, port); addr += 4; } + writel(value, port); } -void default_hwif_iops (ide_hwif_t *hwif) +static void ide_mm_outsl (u32 port, void *addr, u32 count) { - hwif->OUTB = ide_outb; - hwif->OUTBP = ide_outb_p; - hwif->OUTW = ide_outw; - hwif->OUTWP = ide_outw_p; - hwif->OUTL = ide_outl; - hwif->OUTLP = ide_outl_p; - hwif->OUTSW = ide_outsw; - hwif->OUTSWP = ide_outsw_p; - hwif->OUTSL = ide_outsl; - hwif->OUTSLP = ide_outsl_p; - hwif->INB = ide_inb; - hwif->INBP = ide_inb_p; - hwif->INW = ide_inw; - hwif->INWP = ide_inw_p; - hwif->INL = ide_inl; - hwif->INLP = ide_inl_p; - hwif->INSW = ide_insw; - hwif->INSWP = ide_insw_p; - hwif->INSL = ide_insl; - hwif->INSLP = ide_insl_p; +#ifdef CONFIG_PPC + while (count--) { __raw_writel(*(u32 *)addr, port); iobarrier_w(); addr += 4; } +#else /* everything else is sane benh */ + while (count--) { writel(*(u32 *)addr, port); addr += 4; } +#endif } -EXPORT_SYMBOL(default_hwif_iops); +void default_hwif_mmiops (ide_hwif_t *hwif) +{ + hwif->OUTB = ide_mm_outb; + hwif->OUTW = ide_mm_outw; + hwif->OUTL = ide_mm_outl; + hwif->OUTSW = ide_mm_outsw; + hwif->OUTSL = ide_mm_outsl; + hwif->INB = ide_mm_inb; + hwif->INW = ide_mm_inw; + hwif->INL = ide_mm_inl; + hwif->INSW = ide_mm_insw; + hwif->INSL = ide_mm_insl; +} + +EXPORT_SYMBOL(default_hwif_mmiops); void default_hwif_transport (ide_hwif_t *hwif) { @@ -217,7 +238,6 @@ void QUIRK_LIST (ide_drive_t *drive) EXPORT_SYMBOL(QUIRK_LIST); -#if SUPPORT_VLB_SYNC /* * Some localbus EIDE interfaces require a special access sequence * when using 32-bit I/O instructions to transfer data. 
We call this @@ -233,7 +253,6 @@ void ata_vlb_sync (ide_drive_t *drive, ide_ioreg_t port) } EXPORT_SYMBOL(ata_vlb_sync); -#endif /* SUPPORT_VLB_SYNC */ /* * This is used for most PIO data transfers *from* the IDE interface @@ -244,7 +263,6 @@ void ata_input_data (ide_drive_t *drive, void *buffer, u32 wcount) u8 io_32bit = drive->io_32bit; if (io_32bit) { -#if SUPPORT_VLB_SYNC if (io_32bit & 2) { unsigned long flags; local_irq_save(flags); @@ -252,19 +270,9 @@ void ata_input_data (ide_drive_t *drive, void *buffer, u32 wcount) hwif->INSL(IDE_DATA_REG, buffer, wcount); local_irq_restore(flags); } else -#endif /* SUPPORT_VLB_SYNC */ hwif->INSL(IDE_DATA_REG, buffer, wcount); } else { -#if SUPPORT_SLOW_DATA_PORTS - if (drive->slow) { - u16 *ptr = (u16 *) buffer; - while (wcount--) { - *ptr++ = hwif->INWP(IDE_DATA_REG); - *ptr++ = hwif->INWP(IDE_DATA_REG); - } - } else -#endif /* SUPPORT_SLOW_DATA_PORTS */ - hwif->INSW(IDE_DATA_REG, buffer, wcount<<1); + hwif->INSW(IDE_DATA_REG, buffer, wcount<<1); } } @@ -279,7 +287,6 @@ void ata_output_data (ide_drive_t *drive, void *buffer, u32 wcount) u8 io_32bit = drive->io_32bit; if (io_32bit) { -#if SUPPORT_VLB_SYNC if (io_32bit & 2) { unsigned long flags; local_irq_save(flags); @@ -287,19 +294,9 @@ void ata_output_data (ide_drive_t *drive, void *buffer, u32 wcount) hwif->OUTSL(IDE_DATA_REG, buffer, wcount); local_irq_restore(flags); } else -#endif /* SUPPORT_VLB_SYNC */ hwif->OUTSL(IDE_DATA_REG, buffer, wcount); } else { -#if SUPPORT_SLOW_DATA_PORTS - if (drive->slow) { - u16 *ptr = (u16 *) buffer; - while (wcount--) { - hwif->OUTWP(*ptr++, IDE_DATA_REG); - hwif->OUTWP(*ptr++, IDE_DATA_REG); - } - } else -#endif /* SUPPORT_SLOW_DATA_PORTS */ - hwif->OUTSW(IDE_DATA_REG, buffer, wcount<<1); + hwif->OUTSW(IDE_DATA_REG, buffer, wcount<<1); } } @@ -312,6 +309,7 @@ EXPORT_SYMBOL(ata_output_data); * so if an odd bytecount is specified, be sure that there's at least one * extra byte allocated for the buffer. 
*/ + void atapi_input_bytes (ide_drive_t *drive, void *buffer, u32 bytecount) { ide_hwif_t *hwif = HWIF(drive); diff --git a/drivers/ide/ide.c b/drivers/ide/ide.c index f70e0f363410..acc3d795340e 100644 --- a/drivers/ide/ide.c +++ b/drivers/ide/ide.c @@ -1945,24 +1945,14 @@ void ide_unregister (unsigned int index) hwif->OUTB = old_hwif.OUTB; hwif->OUTW = old_hwif.OUTW; hwif->OUTL = old_hwif.OUTL; - hwif->OUTBP = old_hwif.OUTBP; - hwif->OUTWP = old_hwif.OUTWP; - hwif->OUTLP = old_hwif.OUTLP; hwif->OUTSW = old_hwif.OUTSW; - hwif->OUTSWP = old_hwif.OUTSWP; hwif->OUTSL = old_hwif.OUTSL; - hwif->OUTSLP = old_hwif.OUTSLP; hwif->INB = old_hwif.INB; hwif->INW = old_hwif.INW; hwif->INL = old_hwif.INL; - hwif->INBP = old_hwif.INBP; - hwif->INWP = old_hwif.INWP; - hwif->INLP = old_hwif.INLP; hwif->INSW = old_hwif.INSW; - hwif->INSWP = old_hwif.INSWP; hwif->INSL = old_hwif.INSL; - hwif->INSLP = old_hwif.INSLP; #endif hwif->mmio = old_hwif.mmio; diff --git a/include/linux/ide.h b/include/linux/ide.h index 74fff672eaef..7b8ebfe2c8dd 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -302,24 +302,14 @@ typedef struct ide_io_ops_s { void (*OUTB)(u8 addr, u32 port); void (*OUTW)(u16 addr, u32 port); void (*OUTL)(u32 addr, u32 port); - void (*OUTBP)(u8 addr, u32 port); - void (*OUTWP)(u16 addr, u32 port); - void (*OUTLP)(u32 addr, u32 port); void (*OUTSW)(u32 port, void *addr, u32 count); - void (*OUTSWP)(u32 port, void *addr, u32 count); void (*OUTSL)(u32 port, void *addr, u32 count); - void (*OUTSLP)(u32 port, void *addr, u32 count); u8 (*INB)(u32 port); u16 (*INW)(u32 port); u32 (*INL)(u32 port); - u8 (*INBP)(u32 port); - u16 (*INWP)(u32 port); - u32 (*INLP)(u32 port); void (*INSW)(u32 port, void *addr, u32 count); - void (*INSWP)(u32 port, void *addr, u32 count); void (*INSL)(u32 port, void *addr, u32 count); - void (*INSLP)(u32 port, void *addr, u32 count); } ide_io_ops_t; /* @@ -374,41 +364,6 @@ extern int ide_irq_lock; # define ide_get_lock(lock, hdlr, data) do {} 
while (0) #endif /* IDE_ARCH_LOCK */ -/* - * If the arch-dependant ide.h did not declare/define any OUT_BYTE - * or IN_BYTE functions, we make some defaults here. - */ - -#ifndef HAVE_ARCH_OUT_BYTE -# ifdef REALLY_FAST_IO -# define OUT_BYTE(b,p) outb((b),(p)) -# define OUT_WORD(w,p) outw((w),(p)) -# define OUT_LONG(l,p) outl((l),(p)) -# else -# define OUT_BYTE(b,p) outb_p((b),(p)) -# define OUT_WORD(w,p) outw_p((w),(p)) -# define OUT_LONG(l,p) outl_p((l),(p)) -# endif -# define OUT_BYTE_P(b,p) outb_p((b),(p)) -# define OUT_WORD_P(w,p) outw_p((w),(p)) -# define OUT_LONG_P(l,p) outl_p((l),(p)) -#endif - -#ifndef HAVE_ARCH_IN_BYTE -# ifdef REALLY_FAST_IO -# define IN_BYTE(p) (u8) inb(p) -# define IN_WORD(p) (u16) inw(p) -# define IN_LONG(p) (u32) inl(p) -# else -# define IN_BYTE(p) (u8) inb_p(p) -# define IN_WORD(p) (u16) inw_p(p) -# define IN_LONG(p) (u32) inl_p(p) -# endif -# define IN_BYTE_P(p) (u8) inb_p(p) -# define IN_WORD_P(p) (u16) inw_p(p) -# define IN_LONG_P(p) (u32) inl_p(p) -#endif - /* * Now for the data we need to maintain per-drive: ide_drive_t */ @@ -1011,24 +966,14 @@ typedef struct hwif_s { void (*OUTB)(u8 addr, u32 port); void (*OUTW)(u16 addr, u32 port); void (*OUTL)(u32 addr, u32 port); - void (*OUTBP)(u8 addr, u32 port); - void (*OUTWP)(u16 addr, u32 port); - void (*OUTLP)(u32 addr, u32 port); void (*OUTSW)(u32 port, void *addr, u32 count); - void (*OUTSWP)(u32 port, void *addr, u32 count); void (*OUTSL)(u32 port, void *addr, u32 count); - void (*OUTSLP)(u32 port, void *addr, u32 count); u8 (*INB)(u32 port); u16 (*INW)(u32 port); u32 (*INL)(u32 port); - u8 (*INBP)(u32 port); - u16 (*INWP)(u32 port); - u32 (*INLP)(u32 port); void (*INSW)(u32 port, void *addr, u32 count); - void (*INSWP)(u32 port, void *addr, u32 count); void (*INSL)(u32 port, void *addr, u32 count); - void (*INSLP)(u32 port, void *addr, u32 count); #endif /* dma physical region descriptor table (cpu view) */ -- cgit v1.2.3 From 39ae1835eb59ce0a69cc7994e00a43655cfc47d6 Mon Sep 17 
00:00:00 2001 From: Jens Axboe Date: Thu, 3 Oct 2002 20:44:29 -0700 Subject: [PATCH] pass elevator type by reference, not value Ingo spotted this one too, it's a leftover from when the elevator type wasn't a variable. Also don't pass in &q->elevator, it can always be deduced from queue itself of course. --- drivers/block/elevator.c | 10 +++++++--- drivers/block/ll_rw_blk.c | 4 ++-- drivers/s390/block/dasd.c | 6 ++---- include/linux/elevator.h | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 9237eec87ba4..0b1517f93501 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -217,9 +217,11 @@ struct request *elevator_noop_next_request(request_queue_t *q) /* * general block -> elevator interface starts here */ -int elevator_init(request_queue_t *q, elevator_t *e, elevator_t type) +int elevator_init(request_queue_t *q, elevator_t *type) { - *e = type; + elevator_t *e = &q->elevator; + + memcpy(e, type, sizeof(*e)); INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; @@ -230,8 +232,10 @@ int elevator_init(request_queue_t *q, elevator_t *e, elevator_t type) return 0; } -void elevator_exit(request_queue_t *q, elevator_t *e) +void elevator_exit(request_queue_t *q) { + elevator_t *e = &q->elevator; + if (e->elevator_exit_fn) e->elevator_exit_fn(q, e); } diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 01720fdccb1c..e5a0f98423d2 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1145,7 +1145,7 @@ void blk_cleanup_queue(request_queue_t * q) if (blk_queue_tagged(q)) blk_queue_free_tags(q); - elevator_exit(q, &q->elevator); + elevator_exit(q); memset(q, 0, sizeof(*q)); } @@ -1227,7 +1227,7 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock) if (blk_init_free_list(q)) return -ENOMEM; - if ((ret = elevator_init(q, &q->elevator, iosched_deadline))) { + if ((ret = 
elevator_init(q, &iosched_deadline))) { blk_cleanup_queue(q); return ret; } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index e9278eec247e..6ae8a0053b0a 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -2046,10 +2046,8 @@ dasd_setup_blkdev(dasd_device_t * device) &device->request_queue_lock); if (rc) return rc; - elevator_exit(device->request_queue, &device->request_queue->elevator); - rc = elevator_init(device->request_queue, - &device->request_queue->elevator, - elevator_noop); + elevator_exit(device->request_queue); + rc = elevator_init(device->request_queue, &elevator_noop); if (rc) { blk_cleanup_queue(device->request_queue); return rc; diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 5c6c7db6e97e..8a54a1c5e5af 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -73,8 +73,8 @@ typedef struct blkelv_ioctl_arg_s { #define BLKELVGET _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t)) #define BLKELVSET _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t)) -extern int elevator_init(request_queue_t *, elevator_t *, elevator_t); -extern void elevator_exit(request_queue_t *, elevator_t *); +extern int elevator_init(request_queue_t *, elevator_t *); +extern void elevator_exit(request_queue_t *); extern inline int bio_rq_in_between(struct bio *, struct request *, struct list_head *); extern inline int elv_rq_merge_ok(struct request *, struct bio *); extern inline int elv_try_merge(struct request *, struct bio *); -- cgit v1.2.3 From 3900abd5bb7557e1f8463f6aa6981b1475ec89d2 Mon Sep 17 00:00:00 2001 From: Christer Weinigel Date: Fri, 4 Oct 2002 20:13:10 -0700 Subject: [PATCH] Updated NatSemi SCx200 patches for Linux-2.5 This patch adds support for the National Semiconductor SCx200 processor family to Linux 2.5. 
The patch consists of the following drivers: arch/i386/kernel/scx200.c -- give kernel access to the GPIO pins drivers/chars/scx200_gpio.c -- give userspace access to the GPIO pins drivers/chars/scx200_wdt.c -- watchdog timer driver drivers/i2c/scx200_i2c.c -- use any two GPIO pins as an I2C bus drivers/i2c/scx200_acb.c -- driver for the Access.BUS hardware drivers/mtd/maps/scx200_docflash.c -- driver for a CFI flash connected to the DOCCS pin --- MAINTAINERS | 6 + arch/i386/Config.help | 9 + arch/i386/config.in | 2 + arch/i386/kernel/Makefile | 3 + arch/i386/kernel/scx200.c | 128 ++++++++ drivers/char/Config.help | 12 + drivers/char/Config.in | 3 + drivers/char/Makefile | 2 + drivers/char/scx200_gpio.c | 154 ++++++++++ drivers/char/scx200_wdt.c | 277 ++++++++++++++++++ drivers/i2c/Config.help | 22 ++ drivers/i2c/Config.in | 6 + drivers/i2c/Makefile | 2 + drivers/i2c/scx200_acb.c | 578 +++++++++++++++++++++++++++++++++++++ drivers/i2c/scx200_i2c.c | 156 ++++++++++ drivers/mtd/maps/Config.help | 8 + drivers/mtd/maps/Config.in | 1 + drivers/mtd/maps/Makefile | 1 + drivers/mtd/maps/scx200_docflash.c | 268 +++++++++++++++++ include/linux/pci_ids.h | 6 + include/linux/scx200.h | 56 ++++ include/linux/scx200_gpio.h | 98 +++++++ 22 files changed, 1798 insertions(+) create mode 100644 arch/i386/kernel/scx200.c create mode 100644 drivers/char/scx200_gpio.c create mode 100644 drivers/char/scx200_wdt.c create mode 100644 drivers/i2c/scx200_acb.c create mode 100644 drivers/i2c/scx200_i2c.c create mode 100644 drivers/mtd/maps/scx200_docflash.c create mode 100644 include/linux/scx200.h create mode 100644 include/linux/scx200_gpio.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 3a9119b564ae..db808a97de6f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1433,6 +1433,12 @@ M: Kai.Makisara@metla.fi L: linux-scsi@vger.kernel.org S: Maintained +SCx200 CPU SUPPORT +P: Christer Weinigel +M: christer@weinigel.se +W: http://www.weinigel.se +S: Supported + SGI 
VISUAL WORKSTATION 320 AND 540 P: Bent Hagemark M: bh@sgi.com diff --git a/arch/i386/Config.help b/arch/i386/Config.help index 407fb1a2233c..cb2550c38a97 100644 --- a/arch/i386/Config.help +++ b/arch/i386/Config.help @@ -1058,3 +1058,12 @@ CONFIG_SOFTWARE_SUSPEND absence of features. For more information take a look at Documentation/swsusp.txt. + +CONFIG_SCx200 + This provides basic support for the National Semiconductor SCx200 + processor. Right now this is just a driver for the GPIO pins. + + If you don't know what to do here, say N. + + This support is also available as a module. If compiled as a + module, it will be called scx200.o. diff --git a/arch/i386/config.in b/arch/i386/config.in index 566971ae7bd6..d1efcd695fbd 100644 --- a/arch/i386/config.in +++ b/arch/i386/config.in @@ -293,6 +293,8 @@ else fi fi +tristate 'NatSemi SCx200 support' CONFIG_SCx200 + source drivers/pci/Config.in bool 'EISA support' CONFIG_EISA diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index 735c11356773..3bdabc3f8f14 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -29,4 +29,7 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o EXTRA_AFLAGS := -traditional +export-objs += scx200.o +obj-$(CONFIG_SCx200) += scx200.o + include $(TOPDIR)/Rules.make diff --git a/arch/i386/kernel/scx200.c b/arch/i386/kernel/scx200.c new file mode 100644 index 000000000000..340e7f39a718 --- /dev/null +++ b/arch/i386/kernel/scx200.c @@ -0,0 +1,128 @@ +/* linux/arch/i386/kernel/scx200.c + + Copyright (c) 2001,2002 Christer Weinigel + + National Semiconductor SCx200 support. 
*/ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define NAME "scx200" + +MODULE_AUTHOR("Christer Weinigel "); +MODULE_DESCRIPTION("NatSemi SCx200 Driver"); +MODULE_LICENSE("GPL"); + +unsigned scx200_gpio_base = 0; +long scx200_gpio_shadow[2]; + +spinlock_t scx200_gpio_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t scx200_gpio_config_lock = SPIN_LOCK_UNLOCKED; + +u32 scx200_gpio_configure(int index, u32 mask, u32 bits) +{ + u32 config, new_config; + unsigned long flags; + + spin_lock_irqsave(&scx200_gpio_config_lock, flags); + + outl(index, scx200_gpio_base + 0x20); + config = inl(scx200_gpio_base + 0x24); + + new_config = (config & mask) | bits; + outl(new_config, scx200_gpio_base + 0x24); + + spin_unlock_irqrestore(&scx200_gpio_config_lock, flags); + + return config; +} + +void scx200_gpio_dump(unsigned index) +{ + u32 config = scx200_gpio_configure(index, ~0, 0); + printk(KERN_DEBUG "GPIO%02u: 0x%08lx", index, (unsigned long)config); + + if (config & 1) + printk(" OE"); /* output enabled */ + else + printk(" TS"); /* tristate */ + if (config & 2) + printk(" PP"); /* push pull */ + else + printk(" OD"); /* open drain */ + if (config & 4) + printk(" PUE"); /* pull up enabled */ + else + printk(" PUD"); /* pull up disabled */ + if (config & 8) + printk(" LOCKED"); /* locked */ + if (config & 16) + printk(" LEVEL"); /* level input */ + else + printk(" EDGE"); /* edge input */ + if (config & 32) + printk(" HI"); /* trigger on rising edge */ + else + printk(" LO"); /* trigger on falling edge */ + if (config & 64) + printk(" DEBOUNCE"); /* debounce */ + printk("\n"); +} + +int __init scx200_init(void) +{ + struct pci_dev *bridge; + int bank; + unsigned base; + + printk(KERN_INFO NAME ": NatSemi SCx200 Driver\n"); + + if ((bridge = pci_find_device(PCI_VENDOR_ID_NS, + PCI_DEVICE_ID_NS_SCx200_BRIDGE, + NULL)) == NULL) + return -ENODEV; + + base = pci_resource_start(bridge, 0); + printk(KERN_INFO NAME ": GPIO base 0x%x\n", base); + + 
if (request_region(base, SCx200_GPIO_SIZE, "NatSemi SCx200 GPIO") == 0) { + printk(KERN_ERR NAME ": can't allocate I/O for GPIOs\n"); + return -EBUSY; + } + + scx200_gpio_base = base; + + /* read the current values driven on the GPIO signals */ + for (bank = 0; bank < 2; ++bank) + scx200_gpio_shadow[bank] = inl(scx200_gpio_base + 0x10 * bank); + + return 0; +} + +void __exit scx200_cleanup(void) +{ + release_region(scx200_gpio_base, SCx200_GPIO_SIZE); +} + +module_init(scx200_init); +module_exit(scx200_cleanup); + +EXPORT_SYMBOL(scx200_gpio_base); +EXPORT_SYMBOL(scx200_gpio_shadow); +EXPORT_SYMBOL(scx200_gpio_lock); +EXPORT_SYMBOL(scx200_gpio_configure); +EXPORT_SYMBOL(scx200_gpio_dump); + +/* + Local variables: + compile-command: "make -k -C ../../.. SUBDIRS=arch/i386/kernel modules" + c-basic-offset: 8 + End: +*/ diff --git a/drivers/char/Config.help b/drivers/char/Config.help index c5272c99c910..e808a660614a 100644 --- a/drivers/char/Config.help +++ b/drivers/char/Config.help @@ -818,6 +818,12 @@ CONFIG_MIXCOMWD module, say M here and read . Most people will say N. +CONFIG_SCx200_WDT + Enable the built-in watchdog timer support on the National + Semiconductor SCx200 processors. + + If compiled as a module, it will be called scx200_wdt.o. + CONFIG_MACHZ_WDT If you are using a ZF Micro MachZ processor, say Y here, otherwise N. This is the driver for the watchdog timer builtin on that @@ -1021,3 +1027,9 @@ CONFIG_RAW_DRIVER Once bound, I/O against /dev/raw/rawN uses efficient zero-copy I/O. See the raw(8) manpage for more details. +CONFIG_SCx200_GPIO + Give userspace access to the GPIO pins on the National + Semiconductor SCx200 processors. + + If compiled as a module, it will be called scx200_gpio.o.
+ diff --git a/drivers/char/Config.in b/drivers/char/Config.in index fec2100978dd..680bafe4ca4a 100644 --- a/drivers/char/Config.in +++ b/drivers/char/Config.in @@ -131,6 +131,7 @@ if [ "$CONFIG_WATCHDOG" != "n" ]; then tristate ' IB700 SBC Watchdog Timer' CONFIG_IB700_WDT tristate ' Intel i810 TCO timer / Watchdog' CONFIG_I810_TCO tristate ' Mixcom Watchdog' CONFIG_MIXCOMWD + tristate ' NatSemi SCx200 Watchdog' CONFIG_SCx200_WDT tristate ' SBC-60XX Watchdog Timer' CONFIG_60XX_WDT tristate ' W83877F (EMACS) Watchdog Timer' CONFIG_W83877F_WDT tristate ' ZF MachZ Watchdog' CONFIG_MACHZ_WDT @@ -193,6 +194,8 @@ if [ "$CONFIG_X86" = "y" ]; then tristate 'ACP Modem (Mwave) support' CONFIG_MWAVE fi +dep_tristate 'NatSemi SCx200 GPIO Support' CONFIG_SCx200_GPIO $CONFIG_SCx200 + tristate ' RAW driver (/dev/raw/rawN)' CONFIG_RAW_DRIVER endmenu diff --git a/drivers/char/Makefile b/drivers/char/Makefile index f2dac15888bc..80a28ced908c 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -75,6 +75,7 @@ obj-$(CONFIG_PPDEV) += ppdev.o obj-$(CONFIG_DZ) += dz.o obj-$(CONFIG_NWBUTTON) += nwbutton.o obj-$(CONFIG_NWFLASH) += nwflash.o +obj-$(CONFIG_SCx200_GPIO) += scx200_gpio.o # Only one watchdog can succeed. We probe the hardware watchdog # drivers first, then the softdog driver. This means if your hardware @@ -86,6 +87,7 @@ obj-$(CONFIG_ACQUIRE_WDT) += acquirewdt.o obj-$(CONFIG_ADVANTECH_WDT) += advantechwdt.o obj-$(CONFIG_IB700_WDT) += ib700wdt.o obj-$(CONFIG_MIXCOMWD) += mixcomwd.o +obj-$(CONFIG_SCx200_WDT) += scx200_wdt.o obj-$(CONFIG_60XX_WDT) += sbc60xxwdt.o obj-$(CONFIG_WDT) += wdt.o obj-$(CONFIG_WDTPCI) += wdt_pci.o diff --git a/drivers/char/scx200_gpio.c b/drivers/char/scx200_gpio.c new file mode 100644 index 000000000000..812bfb4bf50b --- /dev/null +++ b/drivers/char/scx200_gpio.c @@ -0,0 +1,154 @@ +/* linux/drivers/char/scx200_gpio.c + + National Semiconductor SCx200 GPIO driver. Allows a user space + process to play with the GPIO pins. 
+ + Copyright (c) 2001,2002 Christer Weinigel */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NAME "scx200_gpio" + +MODULE_AUTHOR("Christer Weinigel "); +MODULE_DESCRIPTION("NatSemi SCx200 GPIO Pin Driver"); +MODULE_LICENSE("GPL"); + +static int major = 0; /* default to dynamic major */ +MODULE_PARM(major, "i"); +MODULE_PARM_DESC(major, "Major device number"); + +static ssize_t scx200_gpio_write(struct file *file, const char *data, + size_t len, loff_t *ppos) +{ + unsigned m = minor(file->f_dentry->d_inode->i_rdev); + size_t i; + + if (ppos != &file->f_pos) + return -ESPIPE; + + for (i = 0; i < len; ++i) { + char c; + if (get_user(c, data+i)) + return -EFAULT; + switch (c) + { + case '0': + scx200_gpio_set(m, 0); + break; + case '1': + scx200_gpio_set(m, 1); + break; + case 'O': + printk(KERN_INFO NAME ": GPIO%d output enabled\n", m); + scx200_gpio_configure(m, ~1, 1); + break; + case 'o': + printk(KERN_INFO NAME ": GPIO%d output disabled\n", m); + scx200_gpio_configure(m, ~1, 0); + break; + case 'T': + printk(KERN_INFO NAME ": GPIO%d output is push pull\n", m); + scx200_gpio_configure(m, ~2, 2); + break; + case 't': + printk(KERN_INFO NAME ": GPIO%d output is open drain\n", m); + scx200_gpio_configure(m, ~2, 0); + break; + case 'P': + printk(KERN_INFO NAME ": GPIO%d pull up enabled\n", m); + scx200_gpio_configure(m, ~4, 4); + break; + case 'p': + printk(KERN_INFO NAME ": GPIO%d pull up disabled\n", m); + scx200_gpio_configure(m, ~4, 0); + break; + } + } + + return len; +} + +static ssize_t scx200_gpio_read(struct file *file, char *buf, + size_t len, loff_t *ppos) +{ + unsigned m = minor(file->f_dentry->d_inode->i_rdev); + int value; + + if (ppos != &file->f_pos) + return -ESPIPE; + + value = scx200_gpio_get(m); + if (put_user(value ? 
'1' : '0', buf)) + return -EFAULT; + + return 1; +} + +static int scx200_gpio_open(struct inode *inode, struct file *file) +{ + unsigned m = minor(inode->i_rdev); + if (m > 63) + return -EINVAL; + return 0; +} + +static int scx200_gpio_release(struct inode *inode, struct file *file) +{ + return 0; +} + + +static struct file_operations scx200_gpio_fops = { + .owner = THIS_MODULE, + .write = scx200_gpio_write, + .read = scx200_gpio_read, + .open = scx200_gpio_open, + .release = scx200_gpio_release, +}; + +static int __init scx200_gpio_init(void) +{ + int r; + + printk(KERN_DEBUG NAME ": NatSemi SCx200 GPIO Driver\n"); + + if (!scx200_gpio_present()) { + printk(KERN_ERR NAME ": no SCx200 gpio pins available\n"); + return -ENODEV; + } + + r = register_chrdev(major, NAME, &scx200_gpio_fops); + if (r < 0) { + printk(KERN_ERR NAME ": unable to register character device\n"); + return r; + } + if (!major) { + major = r; + printk(KERN_DEBUG NAME ": got dynamic major %d\n", major); + } + + return 0; +} + +static void __exit scx200_gpio_cleanup(void) +{ + unregister_chrdev(major, NAME); +} + +module_init(scx200_gpio_init); +module_exit(scx200_gpio_cleanup); + +/* + Local variables: + compile-command: "make -k -C ../.. SUBDIRS=drivers/char modules" + c-basic-offset: 8 + End: +*/ diff --git a/drivers/char/scx200_wdt.c b/drivers/char/scx200_wdt.c new file mode 100644 index 000000000000..c02aa8b609e2 --- /dev/null +++ b/drivers/char/scx200_wdt.c @@ -0,0 +1,277 @@ +/* linux/drivers/char/scx200_wdt.c + + National Semiconductor SCx200 Watchdog support + + Copyright (c) 2001,2002 Christer Weinigel + + Some code taken from: + National Semiconductor PC87307/PC97307 (ala SC1200) WDT driver + (c) Copyright 2002 Zwane Mwaikambo + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version.
+ + The author(s) of this software shall not be held liable for damages + of any nature resulting due to the use of this software. This + software is provided AS-IS with no warranties. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NAME "scx200_wdt" + +MODULE_AUTHOR("Christer Weinigel "); +MODULE_DESCRIPTION("NatSemi SCx200 Watchdog Driver"); +MODULE_LICENSE("GPL"); + +#ifndef CONFIG_WATCHDOG_NOWAYOUT +#define CONFIG_WATCHDOG_NOWAYOUT 0 +#endif + +static int margin = 60; /* in seconds */ +MODULE_PARM(margin, "i"); +MODULE_PARM_DESC(margin, "Watchdog margin in seconds"); + +static int nowayout = CONFIG_WATCHDOG_NOWAYOUT; +MODULE_PARM(nowayout, "i"); +MODULE_PARM_DESC(nowayout, "Disable watchdog shutdown on close"); + +static u16 wdto_restart; +static struct semaphore open_semaphore; +static unsigned expect_close; + +/* Bits of the WDCNFG register */ +#define W_ENABLE 0x00fa /* Enable watchdog */ +#define W_DISABLE 0x0000 /* Disable watchdog */ + +/* The scaling factor for the timer, this depends on the value of W_ENABLE */ +#define W_SCALE (32768/1024) + +static void scx200_wdt_ping(void) +{ + outw(wdto_restart, SCx200_CB_BASE + SCx200_WDT_WDTO); +} + +static void scx200_wdt_update_margin(void) +{ + printk(KERN_INFO NAME ": timer margin %d seconds\n", margin); + wdto_restart = margin * W_SCALE; +} + +static void scx200_wdt_enable(void) +{ + printk(KERN_DEBUG NAME ": enabling watchdog timer, wdto_restart = %d\n", + wdto_restart); + + outw(0, SCx200_CB_BASE + SCx200_WDT_WDTO); + outb(SCx200_WDT_WDSTS_WDOVF, SCx200_CB_BASE + SCx200_WDT_WDSTS); + outw(W_ENABLE, SCx200_CB_BASE + SCx200_WDT_WDCNFG); + + scx200_wdt_ping(); +} + +static void scx200_wdt_disable(void) +{ + printk(KERN_DEBUG NAME ": disabling watchdog timer\n"); + + outw(0, SCx200_CB_BASE + SCx200_WDT_WDTO); + outb(SCx200_WDT_WDSTS_WDOVF, SCx200_CB_BASE + SCx200_WDT_WDSTS); + outw(W_DISABLE, SCx200_CB_BASE + 
SCx200_WDT_WDCNFG); +} + +static int scx200_wdt_open(struct inode *inode, struct file *file) +{ + /* only allow one at a time */ + if (down_trylock(&open_semaphore)) + return -EBUSY; + scx200_wdt_enable(); + expect_close = 0; + + return 0; +} + +static int scx200_wdt_release(struct inode *inode, struct file *file) +{ + if (!expect_close) { + printk(KERN_WARNING NAME ": watchdog device closed unexpectedly, will not disable the watchdog timer\n"); + } else if (!nowayout) { + scx200_wdt_disable(); + } + up(&open_semaphore); + + return 0; +} + +static int scx200_wdt_notify_sys(struct notifier_block *this, + unsigned long code, void *unused) +{ + if (code == SYS_HALT || code == SYS_POWER_OFF) + if (!nowayout) + scx200_wdt_disable(); + + return NOTIFY_DONE; +} + +static struct notifier_block scx200_wdt_notifier = +{ + notifier_call: scx200_wdt_notify_sys +}; + +static ssize_t scx200_wdt_write(struct file *file, const char *data, + size_t len, loff_t *ppos) +{ + if (ppos != &file->f_pos) + return -ESPIPE; + + /* check for a magic close character */ + if (len) + { + size_t i; + + scx200_wdt_ping(); + + expect_close = 0; + for (i = 0; i < len; ++i) { + char c; + if (get_user(c, data+i)) + return -EFAULT; + if (c == 'V') + expect_close = 1; + } + + return len; + } + + return 0; +} + +static int scx200_wdt_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + static struct watchdog_info ident = { + .identity = "NatSemi SCx200 Watchdog", + .firmware_version = 1, + .options = (WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING), + }; + int new_margin; + + switch (cmd) { + default: + return -ENOTTY; + case WDIOC_GETSUPPORT: + if(copy_to_user((struct watchdog_info *)arg, &ident, + sizeof(ident))) + return -EFAULT; + return 0; + case WDIOC_GETSTATUS: + case WDIOC_GETBOOTSTATUS: + if (put_user(0, (int *)arg)) + return -EFAULT; + return 0; + case WDIOC_KEEPALIVE: + scx200_wdt_ping(); + return 0; + case WDIOC_SETTIMEOUT: + if (get_user(new_margin, (int *)arg)) 
+ return -EFAULT; + if (new_margin < 1) + return -EINVAL; + margin = new_margin; + scx200_wdt_update_margin(); + scx200_wdt_ping(); + case WDIOC_GETTIMEOUT: + if (put_user(margin, (int *)arg)) + return -EFAULT; + return 0; + } +} + +static struct file_operations scx200_wdt_fops = { + .owner = THIS_MODULE, + .write = scx200_wdt_write, + .ioctl = scx200_wdt_ioctl, + .open = scx200_wdt_open, + .release = scx200_wdt_release, +}; + +static struct miscdevice scx200_wdt_miscdev = { + .minor = WATCHDOG_MINOR, + .name = NAME, + .fops = &scx200_wdt_fops, +}; + +static int __init scx200_wdt_init(void) +{ + int r; + + printk(KERN_DEBUG NAME ": NatSemi SCx200 Watchdog Driver\n"); + + /* First check that this really is a NatSemi SCx200 CPU */ + if ((pci_find_device(PCI_VENDOR_ID_NS, + PCI_DEVICE_ID_NS_SCx200_BRIDGE, + NULL)) == NULL) + return -ENODEV; + + /* More sanity checks, verify that the configuration block is there */ + if (!scx200_cb_probe(SCx200_CB_BASE)) { + printk(KERN_WARNING NAME ": no configuration block found\n"); + return -ENODEV; + } + + if (!request_region(SCx200_CB_BASE + SCx200_WDT_OFFSET, + SCx200_WDT_SIZE, + "NatSemi SCx200 Watchdog")) { + printk(KERN_WARNING NAME ": watchdog I/O region busy\n"); + return -EBUSY; + } + + scx200_wdt_update_margin(); + scx200_wdt_disable(); + + sema_init(&open_semaphore, 1); + + r = misc_register(&scx200_wdt_miscdev); + if (r) + return r; + + r = register_reboot_notifier(&scx200_wdt_notifier); + if (r) { + printk(KERN_ERR NAME ": unable to register reboot notifier"); + misc_deregister(&scx200_wdt_miscdev); + return r; + } + + return 0; +} + +static void __exit scx200_wdt_cleanup(void) +{ + unregister_reboot_notifier(&scx200_wdt_notifier); + misc_deregister(&scx200_wdt_miscdev); + release_region(SCx200_CB_BASE + SCx200_WDT_OFFSET, + SCx200_WDT_SIZE); +} + +module_init(scx200_wdt_init); +module_exit(scx200_wdt_cleanup); + +/* + Local variables: + compile-command: "make -k -C ../.. 
SUBDIRS=drivers/char modules" + c-basic-offset: 8 + End: +*/ diff --git a/drivers/i2c/Config.help b/drivers/i2c/Config.help index 371162eb2ad5..b21fce5bd96a 100644 --- a/drivers/i2c/Config.help +++ b/drivers/i2c/Config.help @@ -91,6 +91,28 @@ CONFIG_I2C_CHARDEV . The module will be called i2c-dev.o. +CONFIG_SCx200_I2C + Enable the use of two GPIO pins of a SCx200 processor as an I2C bus. + + If you don't know what to do here, say N. + + If compiled as a module, it will be called scx200_i2c.o. + +CONFIG_SCx200_I2C_SCL + Enter the GPIO pin number used for the SCL signal. This value can + also be specified with a module parameter. + +CONFIG_SCx200_I2C_SDA + Enter the GPIO pin number used for the SDA signal. This value can + also be specified with a module parameter. + +CONFIG_SCx200_ACB + Enable the use of the ACCESS.bus controllers of a SCx200 processor. + + If you don't know what to do here, say N. + + If compiled as a module, it will be called scx200_acb.o. + CONFIG_I2C_PROC This provides support for i2c device entries in the /proc filesystem. The entries will be found in /proc/sys/dev/sensors.
diff --git a/drivers/i2c/Config.in b/drivers/i2c/Config.in index af9fd0f1f444..6054a16568bc 100644 --- a/drivers/i2c/Config.in +++ b/drivers/i2c/Config.in @@ -13,6 +13,12 @@ if [ "$CONFIG_I2C" != "n" ]; then dep_tristate ' Philips style parallel port adapter' CONFIG_I2C_PHILIPSPAR $CONFIG_I2C_ALGOBIT $CONFIG_PARPORT dep_tristate ' ELV adapter' CONFIG_I2C_ELV $CONFIG_I2C_ALGOBIT dep_tristate ' Velleman K9000 adapter' CONFIG_I2C_VELLEMAN $CONFIG_I2C_ALGOBIT + dep_tristate ' NatSemi SCx200 I2C using GPIO pins' CONFIG_SCx200_I2C $CONFIG_SCx200 $CONFIG_I2C_ALGOBIT + if [ "$CONFIG_SCx200_I2C" != "n" ]; then + int ' GPIO pin used for SCL' CONFIG_SCx200_I2C_SCL 12 + int ' GPIO pin used for SDA' CONFIG_SCx200_I2C_SDA 13 + fi + dep_tristate ' NatSemi SCx200 ACCESS.bus' CONFIG_SCx200_ACB $CONFIG_I2C fi dep_tristate 'I2C PCF 8584 interfaces' CONFIG_I2C_ALGOPCF $CONFIG_I2C diff --git a/drivers/i2c/Makefile b/drivers/i2c/Makefile index 33ef7be099e9..56c6474c5611 100644 --- a/drivers/i2c/Makefile +++ b/drivers/i2c/Makefile @@ -15,6 +15,8 @@ obj-$(CONFIG_I2C_ALGOPCF) += i2c-algo-pcf.o obj-$(CONFIG_I2C_ELEKTOR) += i2c-elektor.o obj-$(CONFIG_ITE_I2C_ALGO) += i2c-algo-ite.o obj-$(CONFIG_ITE_I2C_ADAP) += i2c-adap-ite.o +obj-$(CONFIG_SCx200_I2C) += scx200_i2c.o +obj-$(CONFIG_SCx200_ACB) += scx200_acb.o obj-$(CONFIG_I2C_PROC) += i2c-proc.o # This is needed for automatic patch generation: sensors code starts here diff --git a/drivers/i2c/scx200_acb.c b/drivers/i2c/scx200_acb.c new file mode 100644 index 000000000000..1b29f806a830 --- /dev/null +++ b/drivers/i2c/scx200_acb.c @@ -0,0 +1,578 @@ +/* linux/drivers/i2c/scx200_acb.c + + Copyright (c) 2001,2002 Christer Weinigel + + National Semiconductor SCx200 ACCESS.bus support + + Based on i2c-keywest.c which is: + Copyright (c) 2001 Benjamin Herrenschmidt + Copyright (c) 2000 Philip Edelbrock + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License as + published by the 
Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NAME "scx200_acb" + +MODULE_AUTHOR("Christer Weinigel "); +MODULE_DESCRIPTION("NatSemi SCx200 ACCESS.bus Driver"); +MODULE_LICENSE("GPL"); + +#define MAX_DEVICES 4 +static int base[MAX_DEVICES] = { 0x840 }; +MODULE_PARM(base, "1-4i"); +MODULE_PARM_DESC(base, "Base addresses for the ACCESS.bus controllers"); + +#define DEBUG 0 + +#if DEBUG +#define DBG(x...) printk(KERN_DEBUG NAME ": " x) +#else +#define DBG(x...) +#endif + +/* The hardware supports interrupt driven mode too, but I haven't + implemented that. 
*/ +#define POLLED_MODE 1 +#define POLL_TIMEOUT (HZ) + +enum scx200_acb_state { + state_idle, + state_address, + state_command, + state_repeat_start, + state_quick, + state_read, + state_write, +}; + +static const char *scx200_acb_state_name[] = { + "idle", + "address", + "command", + "repeat_start", + "quick", + "read", + "write", +}; + +/* Physical interface */ +struct scx200_acb_iface +{ + struct scx200_acb_iface *next; + struct i2c_adapter adapter; + unsigned base; + struct semaphore sem; + + /* State machine data */ + enum scx200_acb_state state; + int result; + u8 address_byte; + u8 command; + u8 *ptr; + char needs_reset; + unsigned len; +}; + +/* Register Definitions */ +#define ACBSDA (iface->base + 0) +#define ACBST (iface->base + 1) +#define ACBST_SDAST 0x40 /* SDA Status */ +#define ACBST_BER 0x20 +#define ACBST_NEGACK 0x10 /* Negative Acknowledge */ +#define ACBST_STASTR 0x08 /* Stall After Start */ +#define ACBST_MASTER 0x02 +#define ACBCST (iface->base + 2) +#define ACBCST_BB 0x02 +#define ACBCTL1 (iface->base + 3) +#define ACBCTL1_STASTRE 0x80 +#define ACBCTL1_NMINTE 0x40 +#define ACBCTL1_ACK 0x10 +#define ACBCTL1_STOP 0x02 +#define ACBCTL1_START 0x01 +#define ACBADDR (iface->base + 4) +#define ACBCTL2 (iface->base + 5) +#define ACBCTL2_ENABLE 0x01 + +/************************************************************************/ + +static void scx200_acb_machine(struct scx200_acb_iface *iface, u8 status) +{ + const char *errmsg; + + DBG("state %s, status = 0x%02x\n", + scx200_acb_state_name[iface->state], status); + + if (status & ACBST_BER) { + errmsg = "bus error"; + goto error; + } + if (!(status & ACBST_MASTER)) { + errmsg = "not master"; + goto error; + } + if (status & ACBST_NEGACK) + goto negack; + + switch (iface->state) { + case state_idle: + printk(KERN_WARNING NAME ": %s, interrupt in idle state\n", + iface->adapter.name); + break; + + case state_address: + /* Do a pointer write first */ + outb(iface->address_byte & ~1, ACBSDA); + + 
iface->state = state_command; + break; + + case state_command: + outb(iface->command, ACBSDA); + + if (iface->address_byte & 1) + iface->state = state_repeat_start; + else + iface->state = state_write; + break; + + case state_repeat_start: + outb(inb(ACBCTL1) | ACBCTL1_START, ACBCTL1); + /* fallthrough */ + + case state_quick: + if (iface->address_byte & 1) { + if (iface->len == 1) + outb(inb(ACBCTL1) | ACBCTL1_ACK, ACBCTL1); + else + outb(inb(ACBCTL1) & ~ACBCTL1_ACK, ACBCTL1); + outb(iface->address_byte, ACBSDA); + + iface->state = state_read; + } else { + outb(iface->address_byte, ACBSDA); + + iface->state = state_write; + } + break; + + case state_read: + /* Set ACK if receiving the last byte */ + if (iface->len == 1) + outb(inb(ACBCTL1) | ACBCTL1_ACK, ACBCTL1); + else + outb(inb(ACBCTL1) & ~ACBCTL1_ACK, ACBCTL1); + + *iface->ptr++ = inb(ACBSDA); + --iface->len; + + if (iface->len == 0) { + iface->result = 0; + iface->state = state_idle; + outb(inb(ACBCTL1) | ACBCTL1_STOP, ACBCTL1); + } + + break; + + case state_write: + if (iface->len == 0) { + iface->result = 0; + iface->state = state_idle; + outb(inb(ACBCTL1) | ACBCTL1_STOP, ACBCTL1); + break; + } + + outb(*iface->ptr++, ACBSDA); + --iface->len; + + break; + } + + return; + + negack: + DBG("negative acknowledge in state %s\n", + scx200_acb_state_name[iface->state]); + + iface->state = state_idle; + iface->result = -ENXIO; + + outb(inb(ACBCTL1) | ACBCTL1_STOP, ACBCTL1); + outb(ACBST_STASTR | ACBST_NEGACK, ACBST); + return; + + error: + printk(KERN_ERR NAME ": %s, %s in state %s\n", iface->adapter.name, + errmsg, scx200_acb_state_name[iface->state]); + + iface->state = state_idle; + iface->result = -EIO; + iface->needs_reset = 1; +} + +static void scx200_acb_timeout(struct scx200_acb_iface *iface) +{ + printk(KERN_ERR NAME ": %s, timeout in state %s\n", + iface->adapter.name, scx200_acb_state_name[iface->state]); + + iface->state = state_idle; + iface->result = -EIO; + iface->needs_reset = 1; +} + +#ifdef 
POLLED_MODE +static void scx200_acb_poll(struct scx200_acb_iface *iface) +{ + u8 status = 0; + unsigned long timeout; + + timeout = jiffies + POLL_TIMEOUT; + while (time_before(jiffies, timeout)) { + status = inb(ACBST); + if ((status & (ACBST_SDAST|ACBST_BER|ACBST_NEGACK)) != 0) { + scx200_acb_machine(iface, status); + return; + } + schedule_timeout(HZ/100+1); + } + + scx200_acb_timeout(iface); +} +#endif /* POLLED_MODE */ + +static void scx200_acb_reset(struct scx200_acb_iface *iface) +{ + /* Disable the ACCESS.bus device and Configure the SCL + frequency: 16 clock cycles */ + outb(0x70, ACBCTL2); + /* Polling mode */ + outb(0, ACBCTL1); + /* Disable slave address */ + outb(0, ACBADDR); + /* Enable the ACCESS.bus device */ + outb(inb(ACBCTL2) | ACBCTL2_ENABLE, ACBCTL2); + /* Free STALL after START */ + outb(inb(ACBCTL1) & ~(ACBCTL1_STASTRE | ACBCTL1_NMINTE), ACBCTL1); + /* Send a STOP */ + outb(inb(ACBCTL1) | ACBCTL1_STOP, ACBCTL1); + /* Clear BER, NEGACK and STASTR bits */ + outb(ACBST_BER | ACBST_NEGACK | ACBST_STASTR, ACBST); + /* Clear BB bit */ + outb(inb(ACBCST) | ACBCST_BB, ACBCST); +} + +static s32 scx200_acb_smbus_xfer(struct i2c_adapter *adapter, + u16 address, unsigned short flags, + char rw, u8 command, int size, + union i2c_smbus_data *data) +{ + struct scx200_acb_iface *iface = adapter->data; + int len; + u8 *buffer; + u16 cur_word; + int rc; + + switch (size) { + case I2C_SMBUS_QUICK: + len = 0; + buffer = NULL; + break; + case I2C_SMBUS_BYTE: + if (rw == I2C_SMBUS_READ) { + len = 1; + buffer = &data->byte; + } else { + len = 1; + buffer = &command; + } + break; + case I2C_SMBUS_BYTE_DATA: + len = 1; + buffer = &data->byte; + break; + case I2C_SMBUS_WORD_DATA: + len = 2; + cur_word = cpu_to_le16(data->word); + buffer = (u8 *)&cur_word; + break; + case I2C_SMBUS_BLOCK_DATA: + len = data->block[0]; + buffer = &data->block[1]; + break; + default: + return -EINVAL; + } + + DBG("size=%d, address=0x%x, command=0x%x, len=%d, read=%d\n", + size, address, 
command, len, rw == I2C_SMBUS_READ); + + if (!len && rw == I2C_SMBUS_READ) { + printk(KERN_WARNING NAME ": %s, zero length read\n", + adapter->name); + return -EINVAL; + } + + if (len && !buffer) { + printk(KERN_WARNING NAME ": %s, nonzero length but no buffer\n", adapter->name); + return -EFAULT; + } + + down(&iface->sem); + + iface->address_byte = address<<1; + if (rw == I2C_SMBUS_READ) + iface->address_byte |= 1; + iface->command = command; + iface->ptr = buffer; + iface->len = len; + iface->result = -EINVAL; + iface->needs_reset = 0; + + outb(inb(ACBCTL1) | ACBCTL1_START, ACBCTL1); + + if (size == I2C_SMBUS_QUICK || size == I2C_SMBUS_BYTE) + iface->state = state_quick; + else + iface->state = state_address; + +#ifdef POLLED_MODE + while (iface->state != state_idle) + scx200_acb_poll(iface); +#else /* POLLED_MODE */ +#error Interrupt driven mode not implemented +#endif /* POLLED_MODE */ + + if (iface->needs_reset) + scx200_acb_reset(iface); + + rc = iface->result; + + up(&iface->sem); + + if (rc == 0 && size == I2C_SMBUS_WORD_DATA && rw == I2C_SMBUS_READ) + data->word = le16_to_cpu(cur_word); + +#if DEBUG + printk(KERN_DEBUG NAME ": transfer done, result: %d", rc); + if (buffer) { + int i; + printk(" data:"); + for (i = 0; i < len; ++i) + printk(" %02x", buffer[i]); + } + printk("\n"); +#endif + + return rc; +} + +static u32 scx200_acb_func(struct i2c_adapter *adapter) +{ + return I2C_FUNC_SMBUS_QUICK | I2C_FUNC_SMBUS_BYTE | + I2C_FUNC_SMBUS_BYTE_DATA | I2C_FUNC_SMBUS_WORD_DATA | + I2C_FUNC_SMBUS_BLOCK_DATA; +} + +static int scx200_acb_reg(struct i2c_client *client) +{ + return 0; +} + +static int scx200_acb_unreg(struct i2c_client *client) +{ + return 0; +} + +static void scx200_acb_inc_use(struct i2c_adapter *adapter) +{ + MOD_INC_USE_COUNT; +} + +static void scx200_acb_dec_use(struct i2c_adapter *adapter) +{ + MOD_DEC_USE_COUNT; +} + +/* For now, we only handle combined mode (smbus) */ +static struct i2c_algorithm scx200_acb_algorithm = { + name: "NatSemi 
SCx200 ACCESS.bus", + id: I2C_ALGO_SMBUS, + smbus_xfer: scx200_acb_smbus_xfer, + functionality: scx200_acb_func, +}; + +struct scx200_acb_iface *scx200_acb_list; + +int scx200_acb_probe(struct scx200_acb_iface *iface) +{ + u8 val; + + /* Disable the ACCESS.bus device and Configure the SCL + frequency: 16 clock cycles */ + outb(0x70, ACBCTL2); + + if (inb(ACBCTL2) != 0x70) { + DBG("ACBCTL2 readback failed\n"); + return -ENXIO; + } + + outb(inb(ACBCTL1) | ACBCTL1_NMINTE, ACBCTL1); + + val = inb(ACBCTL1); + if (val) { + DBG("disabled, but ACBCTL1=0x%02x\n", val); + return -ENXIO; + } + + outb(inb(ACBCTL2) | ACBCTL2_ENABLE, ACBCTL2); + + outb(inb(ACBCTL1) | ACBCTL1_NMINTE, ACBCTL1); + + val = inb(ACBCTL1); + if ((val & ACBCTL1_NMINTE) != ACBCTL1_NMINTE) { + DBG("enabled, but NMINTE won't be set, ACBCTL1=0x%02x\n", val); + return -ENXIO; + } + + return 0; +} + +static int __init scx200_acb_create(int base, int index) +{ + struct scx200_acb_iface *iface; + struct i2c_adapter *adapter; + int rc = 0; + char description[64]; + + iface = kmalloc(sizeof(*iface), GFP_KERNEL); + if (!iface) { + printk(KERN_ERR NAME ": can't allocate memory\n"); + rc = -ENOMEM; + goto errout; + } + + memset(iface, 0, sizeof(*iface)); + adapter = &iface->adapter; + adapter->data = iface; + sprintf(adapter->name, "SCx200 ACB%d", index); + adapter->id = I2C_ALGO_SMBUS; + adapter->algo = &scx200_acb_algorithm; + adapter->inc_use = scx200_acb_inc_use; + adapter->dec_use = scx200_acb_dec_use; + adapter->client_register = scx200_acb_reg; + adapter->client_unregister = scx200_acb_unreg; + + init_MUTEX(&iface->sem); + + sprintf(description, "NatSemi SCx200 ACCESS.bus [%s]", adapter->name); + if (request_region(base, 8, description) == 0) { + printk(KERN_ERR NAME ": %s, can't allocate io 0x%x-0x%x\n", + adapter->name, base, base + 8-1); + rc = -EBUSY; + goto errout; + } + iface->base = base; + + rc = scx200_acb_probe(iface); + if (rc) { + printk(KERN_WARNING NAME ": %s, probe failed\n", adapter->name); + 
goto errout; + } + + scx200_acb_reset(iface); + + if (i2c_add_adapter(adapter) < 0) { + printk(KERN_ERR NAME ": %s, failed to register\n", adapter->name); + rc = -ENODEV; + goto errout; + } + + lock_kernel(); + iface->next = scx200_acb_list; + scx200_acb_list = iface; + unlock_kernel(); + + return 0; + + errout: + if (iface) { + if (iface->base) + release_region(iface->base, 8); + kfree(iface); + } + return rc; +} + +static int __init scx200_acb_init(void) +{ + int i; + int rc; + + printk(KERN_DEBUG NAME ": NatSemi SCx200 ACCESS.bus Driver\n"); + + /* Verify that this really is a SCx200 processor */ + if (pci_find_device(PCI_VENDOR_ID_NS, + PCI_DEVICE_ID_NS_SCx200_BRIDGE, + NULL) == NULL) + return -ENODEV; + + rc = -ENXIO; + for (i = 0; i < MAX_DEVICES; ++i) { + if (base[i] > 0) + rc = scx200_acb_create(base[i], i); + } + if (scx200_acb_list) + return 0; + return rc; +} + +static void __exit scx200_acb_cleanup(void) +{ + struct scx200_acb_iface *iface; + lock_kernel(); + while ((iface = scx200_acb_list) != NULL) { + scx200_acb_list = iface->next; + unlock_kernel(); + + i2c_del_adapter(&iface->adapter); + release_region(iface->base, 8); + kfree(iface); + lock_kernel(); + } + unlock_kernel(); +} + +module_init(scx200_acb_init); +module_exit(scx200_acb_cleanup); + +/* + Local variables: + compile-command: "make -k -C ../.. SUBDIRS=drivers/i2c modules" + c-basic-offset: 8 + End: +*/ + diff --git a/drivers/i2c/scx200_i2c.c b/drivers/i2c/scx200_i2c.c new file mode 100644 index 000000000000..515e0c73d523 --- /dev/null +++ b/drivers/i2c/scx200_i2c.c @@ -0,0 +1,156 @@ +/* linux/drivers/i2c/scx200_i2c.c + + Copyright (c) 2001,2002 Christer Weinigel + + National Semiconductor SCx200 I2C bus on GPIO pins + + Based on i2c-velleman.c Copyright (C) 1995-96, 2000 Simon G. 
Vogl + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define NAME "scx200_i2c" + +MODULE_AUTHOR("Christer Weinigel "); +MODULE_DESCRIPTION("NatSemi SCx200 I2C Driver"); +MODULE_LICENSE("GPL"); + +MODULE_PARM(scl, "i"); +MODULE_PARM_DESC(scl, "GPIO line for SCL"); +MODULE_PARM(sda, "i"); +MODULE_PARM_DESC(sda, "GPIO line for SDA"); + +static int scl = CONFIG_SCx200_I2C_SCL; +static int sda = CONFIG_SCx200_I2C_SDA; + +static void scx200_i2c_setscl(void *data, int state) +{ + scx200_gpio_set(scl, state); +} + +static void scx200_i2c_setsda(void *data, int state) +{ + scx200_gpio_set(sda, state); +} + +static int scx200_i2c_getscl(void *data) +{ + return scx200_gpio_get(scl); +} + +static int scx200_i2c_getsda(void *data) +{ + return scx200_gpio_get(sda); +} + +static int scx200_i2c_reg(struct i2c_client *client) +{ + return 0; +} + +static int scx200_i2c_unreg(struct i2c_client *client) +{ + return 0; +} + +static void scx200_i2c_inc_use(struct i2c_adapter *adap) +{ + MOD_INC_USE_COUNT; +} + +static void scx200_i2c_dec_use(struct i2c_adapter *adap) +{ + MOD_DEC_USE_COUNT; +} + +/* ------------------------------------------------------------------------ + * Encapsulate the above functions in the correct operations structure. 
+ * This is only done when more than one hardware adapter is supported. + */ + +static struct i2c_algo_bit_data scx200_i2c_data = { + NULL, + scx200_i2c_setsda, + scx200_i2c_setscl, + scx200_i2c_getsda, + scx200_i2c_getscl, + 10, 10, 100, /* waits, timeout */ +}; + +static struct i2c_adapter scx200_i2c_ops = { + .name = "NatSemi SCx200 I2C", + .id = I2C_HW_B_VELLE, + .algo_data = &scx200_i2c_data, + .inc_use = scx200_i2c_inc_use, + .dec_use = scx200_i2c_dec_use, + .client_register = scx200_i2c_reg, + .client_unregister = scx200_i2c_unreg, +}; + +int scx200_i2c_init(void) +{ + printk(KERN_DEBUG NAME ": NatSemi SCx200 I2C Driver\n"); + + if (!scx200_gpio_present()) { + printk(KERN_ERR NAME ": no SCx200 gpio pins available\n"); + return -ENODEV; + } + + printk(KERN_DEBUG NAME ": SCL=GPIO%02u, SDA=GPIO%02u\n", + scl, sda); + + if (scl == -1 || sda == -1 || scl == sda) { + printk(KERN_ERR NAME ": scl and sda must be specified\n"); + return -EINVAL; + } + + /* Configure GPIOs as open collector outputs */ + scx200_gpio_configure(scl, ~2, 5); + scx200_gpio_configure(sda, ~2, 5); + + if (i2c_bit_add_bus(&scx200_i2c_ops) < 0) { + printk(KERN_ERR NAME ": adapter %s registration failed\n", + scx200_i2c_ops.name); + return -ENODEV; + } + + return 0; +} + +void scx200_i2c_cleanup(void) +{ + i2c_bit_del_bus(&scx200_i2c_ops); +} + +module_init(scx200_i2c_init); +module_exit(scx200_i2c_cleanup); + +/* + Local variables: + compile-command: "make -k -C ../.. SUBDIRS=drivers/i2c modules" + c-basic-offset: 8 + End: +*/ diff --git a/drivers/mtd/maps/Config.help b/drivers/mtd/maps/Config.help index bee634915694..aaf3a1aa894e 100644 --- a/drivers/mtd/maps/Config.help +++ b/drivers/mtd/maps/Config.help @@ -127,6 +127,14 @@ CONFIG_MTD_MIXMEM you probably want to enable this mapping driver. More info is at . +CONFIG_MTD_SCx200_DOCFLASH + Enable support for a flash chip mapped using the DOCCS signal on a + National Semiconductor SCx200 processor. + + If you don't know what to do here, say N. 
+ + If compiled as a module, it will be called scx200_docflash.o. + CONFIG_MTD_OCTAGON This provides a 'mapping' driver which supports the way in which the flash chips are connected in the Octagon-5066 Single Board diff --git a/drivers/mtd/maps/Config.in b/drivers/mtd/maps/Config.in index dc9b5069cc4e..7b4cbd4eda85 100644 --- a/drivers/mtd/maps/Config.in +++ b/drivers/mtd/maps/Config.in @@ -26,6 +26,7 @@ if [ "$CONFIG_X86" = "y" ]; then dep_tristate ' JEDEC Flash device mapped on Mixcom piggyback card' CONFIG_MTD_MIXMEM $CONFIG_MTD_JEDEC dep_tristate ' JEDEC Flash device mapped on Octagon 5066 SBC' CONFIG_MTD_OCTAGON $CONFIG_MTD_JEDEC dep_tristate ' JEDEC Flash device mapped on Tempustech VMAX SBC301' CONFIG_MTD_VMAX $CONFIG_MTD_JEDEC + dep_tristate ' Flash device mapped with DOCCS on NatSemi SCx200' CONFIG_MTD_SCx200_DOCFLASH $CONFIG_MTD_CFI dep_tristate ' BIOS flash chip on Intel L440GX boards' CONFIG_MTD_L440GX $CONFIG_MTD_JEDEC fi diff --git a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile index d6cd7af1b203..c0bdc2fa8f23 100644 --- a/drivers/mtd/maps/Makefile +++ b/drivers/mtd/maps/Makefile @@ -24,6 +24,7 @@ obj-$(CONFIG_MTD_SC520CDP) += sc520cdp.o obj-$(CONFIG_MTD_NETSC520) += netsc520.o obj-$(CONFIG_MTD_SUN_UFLASH) += sun_uflash.o obj-$(CONFIG_MTD_VMAX) += vmax301.o +obj-$(CONFIG_MTD_SCx200_DOCFLASH)+= scx200_docflash.o obj-$(CONFIG_MTD_DBOX2) += dbox2-flash.o obj-$(CONFIG_MTD_OCELOT) += ocelot.o obj-$(CONFIG_MTD_SOLUTIONENGINE)+= solutionengine.o diff --git a/drivers/mtd/maps/scx200_docflash.c b/drivers/mtd/maps/scx200_docflash.c new file mode 100644 index 000000000000..64583df88707 --- /dev/null +++ b/drivers/mtd/maps/scx200_docflash.c @@ -0,0 +1,268 @@ +/* linux/drivers/mtd/maps/scx200_docflash.c + + Copyright (c) 2001,2002 Christer Weinigel + + National Semiconductor SCx200 flash mapped with DOCCS +*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define NAME "scx200_docflash" + 
+MODULE_AUTHOR("Christer Weinigel "); +MODULE_DESCRIPTION("NatSemi SCx200 DOCCS Flash Driver"); +MODULE_LICENSE("GPL"); + +/* Set this to one if you want to partition the flash */ +#define PARTITION 1 + +MODULE_PARM(probe, "i"); +MODULE_PARM_DESC(probe, "Probe for a BIOS mapping"); +MODULE_PARM(size, "i"); +MODULE_PARM_DESC(size, "Size of the flash mapping"); +MODULE_PARM(width, "i"); +MODULE_PARM_DESC(width, "Data width of the flash mapping (8/16)"); +MODULE_PARM(flashtype, "s"); +MODULE_PARM_DESC(flashtype, "Type of MTD probe to do"); + +static int probe = 0; /* Don't autoprobe */ +static unsigned size = 0x1000000; /* 16 MB the whole ISA address space */ +static unsigned width = 8; /* Default to 8 bits wide */ +static char *flashtype = "cfi_probe"; + +static struct resource docmem = { + .flags = IORESOURCE_MEM, + .name = "NatSemi SCx200 DOCCS Flash", +}; + +static struct mtd_info *mymtd; + +#if PARTITION +static struct mtd_partition partition_info[] = { + { + .name = "DOCCS Boot kernel", + .offset = 0, + .size = 0xc0000 + }, + { + .name = "DOCCS Low BIOS", + .offset = 0xc0000, + .size = 0x40000 + }, + { + .name = "DOCCS File system", + .offset = 0x100000, + .size = ~0 /* calculate from flash size */ + }, + { + .name = "DOCCS High BIOS", + .offset = ~0, /* calculate from flash size */ + .size = 0x80000 + }, +}; +#define NUM_PARTITIONS (sizeof(partition_info)/sizeof(partition_info[0])) +#endif + +static __u8 scx200_docflash_read8(struct map_info *map, unsigned long ofs) +{ + return __raw_readb(map->map_priv_1 + ofs); +} + +static __u16 scx200_docflash_read16(struct map_info *map, unsigned long ofs) +{ + return __raw_readw(map->map_priv_1 + ofs); +} + +static void scx200_docflash_copy_from(struct map_info *map, void *to, unsigned long from, ssize_t len) +{ + memcpy_fromio(to, map->map_priv_1 + from, len); +} + +static void scx200_docflash_write8(struct map_info *map, __u8 d, unsigned long adr) +{ + __raw_writeb(d, map->map_priv_1 + adr); + mb(); +} + +static void 
scx200_docflash_write16(struct map_info *map, __u16 d, unsigned long adr) +{ + __raw_writew(d, map->map_priv_1 + adr); + mb(); +} + +static void scx200_docflash_copy_to(struct map_info *map, unsigned long to, const void *from, ssize_t len) +{ + memcpy_toio(map->map_priv_1 + to, from, len); +} + +static struct map_info scx200_docflash_map = { + .name = "NatSemi SCx200 DOCCS Flash", + .read8 = scx200_docflash_read8, + .read16 = scx200_docflash_read16, + .copy_from = scx200_docflash_copy_from, + .write8 = scx200_docflash_write8, + .write16 = scx200_docflash_write16, + .copy_to = scx200_docflash_copy_to +}; + +int __init init_scx200_docflash(void) +{ + unsigned u; + unsigned base; + unsigned ctrl; + unsigned pmr; + struct pci_dev *bridge; + + printk(KERN_DEBUG NAME ": NatSemi SCx200 DOCCS Flash Driver\n"); + + if ((bridge = pci_find_device(PCI_VENDOR_ID_NS, + PCI_DEVICE_ID_NS_SCx200_BRIDGE, + NULL)) == NULL) + return -ENODEV; + + if (!scx200_cb_probe(SCx200_CB_BASE)) { + printk(KERN_WARNING NAME ": no configuration block found\n"); + return -ENODEV; + } + + if (probe) { + /* Try to use the present flash mapping if any */ + pci_read_config_dword(bridge, SCx200_DOCCS_BASE, &base); + pci_read_config_dword(bridge, SCx200_DOCCS_CTRL, &ctrl); + pmr = inl(SCx200_CB_BASE + SCx200_PMR); + + if (base == 0 + || (ctrl & 0x07000000) != 0x07000000 + || (ctrl & 0x0007ffff) == 0) + return -ENODEV; + + size = ((ctrl&0x1fff)<<13) + (1<<13); + + for (u = size; u > 1; u >>= 1) + ; + if (u != 1) + return -ENODEV; + + if (pmr & (1<<6)) + width = 16; + else + width = 8; + + docmem.start = base; + docmem.end = base + size; + + if (request_resource(&iomem_resource, &docmem)) { + printk(KERN_ERR NAME ": unable to allocate memory for flash mapping\n"); + return -ENOMEM; + } + } else { + for (u = size; u > 1; u >>= 1) + ; + if (u != 1) { + printk(KERN_ERR NAME ": invalid size for flash mapping\n"); + return -EINVAL; + } + + if (width != 8 && width != 16) { + printk(KERN_ERR NAME ": invalid bus 
width for flash mapping\n"); + return -EINVAL; + } + + if (allocate_resource(&iomem_resource, &docmem, + size, + 0xc0000000, 0xffffffff, + size, NULL, NULL)) { + printk(KERN_ERR NAME ": unable to allocate memory for flash mapping\n"); + return -ENOMEM; + } + + ctrl = 0x07000000 | ((size-1) >> 13); + + printk(KERN_INFO "DOCCS BASE=0x%08lx, CTRL=0x%08lx\n", (long)docmem.start, (long)ctrl); + + pci_write_config_dword(bridge, SCx200_DOCCS_BASE, docmem.start); + pci_write_config_dword(bridge, SCx200_DOCCS_CTRL, ctrl); + pmr = inl(SCx200_CB_BASE + SCx200_PMR); + + if (width == 8) { + pmr &= ~(1<<6); + } else { + pmr |= (1<<6); + } + outl(pmr, SCx200_CB_BASE + SCx200_PMR); + } + + printk(KERN_INFO NAME ": DOCCS mapped at 0x%lx-0x%lx, width %d\n", + docmem.start, docmem.end, width); + + scx200_docflash_map.size = size; + if (width == 8) + scx200_docflash_map.buswidth = 1; + else + scx200_docflash_map.buswidth = 2; + + scx200_docflash_map.map_priv_1 = (unsigned long)ioremap(docmem.start, scx200_docflash_map.size); + if (!scx200_docflash_map.map_priv_1) { + printk(KERN_ERR NAME ": failed to ioremap the flash\n"); + release_resource(&docmem); + return -EIO; + } + + mymtd = do_map_probe(flashtype, &scx200_docflash_map); + if (!mymtd) { + printk(KERN_ERR NAME ": unable to detect flash\n"); + iounmap((void *)scx200_docflash_map.map_priv_1); + release_resource(&docmem); + return -ENXIO; + } + + if (size < mymtd->size) + printk(KERN_WARNING NAME ": warning, flash mapping is smaller than flash size\n"); + + mymtd->module = THIS_MODULE; + +#if PARTITION + partition_info[3].offset = mymtd->size-partition_info[3].size; + partition_info[2].size = partition_info[3].offset-partition_info[2].offset; + add_mtd_partitions(mymtd, partition_info, NUM_PARTITIONS); +#else + add_mtd_device(mymtd); +#endif + return 0; +} + +static void __exit cleanup_scx200_docflash(void) +{ + if (mymtd) { +#if PARTITION + del_mtd_partitions(mymtd); +#else + del_mtd_device(mymtd); +#endif + map_destroy(mymtd); + 
} + if (scx200_docflash_map.map_priv_1) { + iounmap((void *)scx200_docflash_map.map_priv_1); + release_resource(&docmem); + } +} + +module_init(init_scx200_docflash); +module_exit(cleanup_scx200_docflash); + +/* + Local variables: + compile-command: "make -k -C ../../.. SUBDIRS=drivers/mtd/maps modules" + c-basic-offset: 8 + End: +*/ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index be4d21956fc5..a652bbf6682d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -288,6 +288,12 @@ #define PCI_DEVICE_ID_NS_87560_USB 0x0012 #define PCI_DEVICE_ID_NS_83815 0x0020 #define PCI_DEVICE_ID_NS_83820 0x0022 +#define PCI_DEVICE_ID_NS_SCx200_BRIDGE 0x0500 +#define PCI_DEVICE_ID_NS_SCx200_SMI 0x0501 +#define PCI_DEVICE_ID_NS_SCx200_IDE 0x0502 +#define PCI_DEVICE_ID_NS_SCx200_AUDIO 0x0503 +#define PCI_DEVICE_ID_NS_SCx200_VIDEO 0x0504 +#define PCI_DEVICE_ID_NS_SCx200_XBUS 0x0505 #define PCI_DEVICE_ID_NS_87410 0xd001 #define PCI_VENDOR_ID_TSENG 0x100c diff --git a/include/linux/scx200.h b/include/linux/scx200.h new file mode 100644 index 000000000000..af7d53acad99 --- /dev/null +++ b/include/linux/scx200.h @@ -0,0 +1,56 @@ +/* linux/include/linux/scx200.h + + Copyright (c) 2001,2002 Christer Weinigel + + Defines for the National Semiconductor SCx200 Processors +*/ + +/* Interesting stuff for the National Semiconductor SCx200 CPU */ + +/* F0 PCI Header/Bridge Configuration Registers */ +#define SCx200_DOCCS_BASE 0x78 /* DOCCS Base Address Register */ +#define SCx200_DOCCS_CTRL 0x7c /* DOCCS Control Register */ + +/* GPIO Register Block */ +#define SCx200_GPIO_SIZE 0x2c /* Size of GPIO register block */ + +/* General Configuration Block */ +#define SCx200_CB_BASE 0x9000 /* Base fixed at 0x9000 according to errata */ + +/* Watchdog Timer */ +#define SCx200_WDT_OFFSET 0x00 /* offset within configuration block */ +#define SCx200_WDT_SIZE 0x05 /* size */ + +#define SCx200_WDT_WDTO 0x00 /* Time-Out Register */ +#define SCx200_WDT_WDCNFG 0x02 /* 
Configuration Register */ +#define SCx200_WDT_WDSTS 0x04 /* Status Register */ +#define SCx200_WDT_WDSTS_WDOVF (1<<0) /* Overflow bit */ + +/* High Resolution Timer */ +#define SCx200_TIMER_OFFSET 0x08 +#define SCx200_TIMER_SIZE 0x05 + +/* Clock Generators */ +#define SCx200_CLOCKGEN_OFFSET 0x10 +#define SCx200_CLOCKGEN_SIZE 0x10 + +/* Pin Multiplexing and Miscellaneous Configuration Registers */ +#define SCx200_MISC_OFFSET 0x30 +#define SCx200_MISC_SIZE 0x10 + +#define SCx200_PMR 0x30 /* Pin Multiplexing Register */ +#define SCx200_MCR 0x34 /* Miscellaneous Configuration Register */ +#define SCx200_INTSEL 0x38 /* Interrupt Selection Register */ +#define SCx200_IID 0x3c /* IA On a Chip Identification Number Reg */ +#define SCx200_REV 0x3d /* Revision Register */ +#define SCx200_CBA 0x3e /* Configuration Base Address Register */ + +/* Verify that the configuration block really is there */ +#define scx200_cb_probe(base) (inw((base) + SCx200_CBA) == (base)) + +/* + Local variables: + compile-command: "make -C ../.. 
bzImage modules" + c-basic-offset: 8 + End: +*/ diff --git a/include/linux/scx200_gpio.h b/include/linux/scx200_gpio.h new file mode 100644 index 000000000000..2c6d739706b6 --- /dev/null +++ b/include/linux/scx200_gpio.h @@ -0,0 +1,98 @@ +#include + +u32 scx200_gpio_configure(int index, u32 set, u32 clear); +void scx200_gpio_dump(unsigned index); + +extern unsigned scx200_gpio_base; +extern spinlock_t scx200_gpio_lock; +extern long scx200_gpio_shadow[2]; + +#define scx200_gpio_present() (scx200_gpio_base!=0) + +/* Definitions to make sure I do the same thing in all functions */ +#define __SCx200_GPIO_BANK unsigned bank = index>>5 +#define __SCx200_GPIO_IOADDR unsigned short ioaddr = scx200_gpio_base+0x10*bank +#define __SCx200_GPIO_SHADOW long *shadow = scx200_gpio_shadow+bank +#define __SCx200_GPIO_INDEX index &= 31 + +#define __SCx200_GPIO_OUT __asm__ __volatile__("outsl":"=mS" (shadow):"d" (ioaddr), "0" (shadow)) + +/* returns the value of the GPIO pin */ + +static inline int scx200_gpio_get(int index) { + __SCx200_GPIO_BANK; + __SCx200_GPIO_IOADDR + 0x04; + __SCx200_GPIO_INDEX; + + return (inl(ioaddr) & (1< Date: Fri, 4 Oct 2002 20:15:55 -0700 Subject: [PATCH] struct super_block cleanup - hpfs Remove hpfs_sb from struct super_block. 
--- fs/hpfs/alloc.c | 54 +++++++++-------- fs/hpfs/anode.c | 18 +++--- fs/hpfs/buffer.c | 12 ++-- fs/hpfs/dentry.c | 2 +- fs/hpfs/dir.c | 12 ++-- fs/hpfs/dnode.c | 34 +++++------ fs/hpfs/hpfs_fn.h | 39 +++++++----- fs/hpfs/inode.c | 38 ++++++------ fs/hpfs/map.c | 24 ++++---- fs/hpfs/name.c | 8 +-- fs/hpfs/namei.c | 6 +- fs/hpfs/super.c | 148 +++++++++++++++++++++++++-------------------- include/linux/fs.h | 2 - include/linux/hpfs_fs_sb.h | 27 --------- 14 files changed, 209 insertions(+), 215 deletions(-) (limited to 'include/linux') diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index 2a9e1c3da6d8..511616f1eb18 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -24,8 +24,8 @@ static int chk_if_allocated(struct super_block *s, secno sec, char *msg) goto fail1; } hpfs_brelse4(&qbh); - if (sec >= s->s_hpfs_dirband_start && sec < s->s_hpfs_dirband_start + s->s_hpfs_dirband_size) { - unsigned ssec = (sec - s->s_hpfs_dirband_start) / 4; + if (sec >= hpfs_sb(s)->sb_dirband_start && sec < hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { + unsigned ssec = (sec - hpfs_sb(s)->sb_dirband_start) / 4; if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) goto fail; if ((bmp[ssec >> 5] >> (ssec & 0x1f)) & 1) { hpfs_error(s, "sector '%s' - %08x not allocated in directory bitmap", msg, sec); @@ -48,11 +48,11 @@ static int chk_if_allocated(struct super_block *s, secno sec, char *msg) int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg) { if (start + len < start || start < 0x12 || - start + len > s->s_hpfs_fs_size) { + start + len > hpfs_sb(s)->sb_fs_size) { hpfs_error(s, "sector(s) '%s' badly placed at %08x", msg, start); return 1; } - if (s->s_hpfs_chk>=2) { + if (hpfs_sb(s)->sb_chk>=2) { int i; for (i = 0; i < len; i++) if (chk_if_allocated(s, start + i, msg)) return 1; @@ -127,7 +127,7 @@ static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigne } rt: if (ret) { - if (s->s_hpfs_chk && ((ret >> 14) != (bs >> 14) || (bmp[(ret 
& 0x3fff) >> 5] | ~(((1 << n) - 1) << (ret & 0x1f))) != 0xffffffff)) { + if (hpfs_sb(s)->sb_chk && ((ret >> 14) != (bs >> 14) || (bmp[(ret & 0x3fff) >> 5] | ~(((1 << n) - 1) << (ret & 0x1f))) != 0xffffffff)) { hpfs_error(s, "Allocation doesn't work! Wanted %d, allocated at %08x", n, ret); ret = 0; goto b; @@ -155,14 +155,15 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa secno sec; unsigned i; unsigned n_bmps; - int b = s->s_hpfs_c_bitmap; + struct hpfs_sb_info *sbi = hpfs_sb(s); + int b = sbi->sb_c_bitmap; int f_p = 0; if (forward < 0) { forward = -forward; f_p = 1; } if (lock) hpfs_lock_creation(s); - if (near && near < s->s_hpfs_fs_size) + if (near && near < sbi->sb_fs_size) if ((sec = alloc_in_bmp(s, near, n, f_p ? forward : forward/4))) goto ret; if (b != -1) { if ((sec = alloc_in_bmp(s, b<<14, n, f_p ? forward : forward/2))) { @@ -171,25 +172,25 @@ secno hpfs_alloc_sector(struct super_block *s, secno near, unsigned n, int forwa } if (b > 0x10000000) if ((sec = alloc_in_bmp(s, (b&0xfffffff)<<14, n, f_p ? 
forward : 0))) goto ret; } - n_bmps = (s->s_hpfs_fs_size + 0x4000 - 1) >> 14; + n_bmps = (sbi->sb_fs_size + 0x4000 - 1) >> 14; for (i = 0; i < n_bmps / 2; i++) { if ((sec = alloc_in_bmp(s, (n_bmps/2+i) << 14, n, forward))) { - s->s_hpfs_c_bitmap = n_bmps/2+i; + sbi->sb_c_bitmap = n_bmps/2+i; goto ret; } if ((sec = alloc_in_bmp(s, (n_bmps/2-i-1) << 14, n, forward))) { - s->s_hpfs_c_bitmap = n_bmps/2-i-1; + sbi->sb_c_bitmap = n_bmps/2-i-1; goto ret; } } if ((sec = alloc_in_bmp(s, (n_bmps-1) << 14, n, forward))) { - s->s_hpfs_c_bitmap = n_bmps-1; + sbi->sb_c_bitmap = n_bmps-1; goto ret; } if (!f_p) { for (i = 0; i < n_bmps; i++) if ((sec = alloc_in_bmp(s, i << 14, n, 0))) { - s->s_hpfs_c_bitmap = 0x10000000 + i; + sbi->sb_c_bitmap = 0x10000000 + i; goto ret; } } @@ -212,17 +213,18 @@ static secno alloc_in_dirband(struct super_block *s, secno near, int lock) { unsigned nr = near; secno sec; - if (nr < s->s_hpfs_dirband_start) - nr = s->s_hpfs_dirband_start; - if (nr >= s->s_hpfs_dirband_start + s->s_hpfs_dirband_size) - nr = s->s_hpfs_dirband_start + s->s_hpfs_dirband_size - 4; - nr -= s->s_hpfs_dirband_start; + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (nr < sbi->sb_dirband_start) + nr = sbi->sb_dirband_start; + if (nr >= sbi->sb_dirband_start + sbi->sb_dirband_size) + nr = sbi->sb_dirband_start + sbi->sb_dirband_size - 4; + nr -= sbi->sb_dirband_start; nr >>= 2; if (lock) hpfs_lock_creation(s); sec = alloc_in_bmp(s, (~0x3fff) | nr, 1, 0); if (lock) hpfs_unlock_creation(s); if (!sec) return 0; - return ((sec & 0x3fff) << 2) + s->s_hpfs_dirband_start; + return ((sec & 0x3fff) << 2) + sbi->sb_dirband_start; } /* Alloc sector if it's free */ @@ -303,8 +305,8 @@ void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n) int hpfs_check_free_dnodes(struct super_block *s, int n) { - int n_bmps = (s->s_hpfs_fs_size + 0x4000 - 1) >> 14; - int b = s->s_hpfs_c_bitmap & 0x0fffffff; + int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14; + int b = 
hpfs_sb(s)->sb_c_bitmap & 0x0fffffff; int i, j; unsigned *bmp; struct quad_buffer_head qbh; @@ -320,7 +322,7 @@ int hpfs_check_free_dnodes(struct super_block *s, int n) } hpfs_brelse4(&qbh); i = 0; - if (s->s_hpfs_c_bitmap != -1 ) { + if (hpfs_sb(s)->sb_c_bitmap != -1 ) { bmp = hpfs_map_bitmap(s, b, &qbh, "chkdn1"); goto chk_bmp; } @@ -349,17 +351,17 @@ int hpfs_check_free_dnodes(struct super_block *s, int n) void hpfs_free_dnode(struct super_block *s, dnode_secno dno) { - if (s->s_hpfs_chk) if (dno & 3) { + if (hpfs_sb(s)->sb_chk) if (dno & 3) { hpfs_error(s, "hpfs_free_dnode: dnode %08x not aligned", dno); return; } - if (dno < s->s_hpfs_dirband_start || - dno >= s->s_hpfs_dirband_start + s->s_hpfs_dirband_size) { + if (dno < hpfs_sb(s)->sb_dirband_start || + dno >= hpfs_sb(s)->sb_dirband_start + hpfs_sb(s)->sb_dirband_size) { hpfs_free_sectors(s, dno, 4); } else { struct quad_buffer_head qbh; unsigned *bmp; - unsigned ssec = (dno - s->s_hpfs_dirband_start) / 4; + unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4; lock_super(s); if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { unlock_super(s); @@ -377,7 +379,7 @@ struct dnode *hpfs_alloc_dnode(struct super_block *s, secno near, int lock) { struct dnode *d; - if (hpfs_count_one_bitmap(s, s->s_hpfs_dmap) > FREE_DNODES_ADD) { + if (hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_dmap) > FREE_DNODES_ADD) { if (!(*dno = alloc_in_dirband(s, near, lock))) if (!(*dno = hpfs_alloc_sector(s, near, 4, 0, lock))) return NULL; } else { diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c index 100efe503f6d..4135a8c8368f 100644 --- a/fs/hpfs/anode.c +++ b/fs/hpfs/anode.c @@ -20,7 +20,7 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode, int i; int c1, c2 = 0; go_down: - if (s->s_hpfs_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1; + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1; if (btree->internal) { for (i = 0; i < btree->n_used_nodes; i++) 
if (btree->u.internal[i].file_secno > sec) { @@ -38,7 +38,7 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode, if (btree->u.external[i].file_secno <= sec && btree->u.external[i].file_secno + btree->u.external[i].length > sec) { a = btree->u.external[i].disk_secno + sec - btree->u.external[i].file_secno; - if (s->s_hpfs_chk) if (hpfs_chk_sectors(s, a, 1, "data")) { + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, a, 1, "data")) { brelse(bh); return -1; } @@ -88,7 +88,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi btree->u.internal[n].file_secno = -1; mark_buffer_dirty(bh); brelse(bh); - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_add_sector_to_btree #1")) return -1; if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; btree = &anode->btree; @@ -164,7 +164,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi c2 = 0; while (up != -1) { struct anode *new_anode; - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, up, &c1, &c2, "hpfs_add_sector_to_btree #2")) return -1; if (up != node || !fnod) { if (!(anode = hpfs_map_anode(s, up, &bh))) return -1; @@ -283,7 +283,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) while (btree1->internal) { ano = btree1->u.internal[pos].down; if (level) brelse(bh); - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, ano, &d1, &d2, "hpfs_remove_btree #1")) return; if (!(anode = hpfs_map_anode(s, ano, &bh))) return; @@ -296,7 +296,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) go_up: if (!level) return; brelse(bh); - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, ano, &c1, &c2, "hpfs_remove_btree #2")) return; hpfs_free_sectors(s, ano, 1); oano = ano; @@ -343,7 +343,7 @@ int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos, if ((sec = anode_lookup(s, a, pos >> 9)) == 
-1) return -1; } else sec = a + (pos >> 9); - if (s->s_hpfs_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #1")) return -1; + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #1")) return -1; if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9))) return -1; l = 0x200 - (pos & 0x1ff); if (l > len) l = len; @@ -366,7 +366,7 @@ int hpfs_ea_write(struct super_block *s, secno a, int ano, unsigned pos, if ((sec = anode_lookup(s, a, pos >> 9)) == -1) return -1; } else sec = a + (pos >> 9); - if (s->s_hpfs_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #2")) return -1; + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, sec, 1, "ea #2")) return -1; if (!(data = hpfs_map_sector(s, sec, &bh, (len - 1) >> 9))) return -1; l = 0x200 - (pos & 0x1ff); if (l > len) l = len; @@ -440,7 +440,7 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) } node = btree->u.internal[i].down; brelse(bh); - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, node, &c1, &c2, "hpfs_truncate_btree")) return; if (!(anode = hpfs_map_anode(s, node, &bh))) return; diff --git a/fs/hpfs/buffer.c b/fs/hpfs/buffer.c index d1ce3afadcd4..36f463947a20 100644 --- a/fs/hpfs/buffer.c +++ b/fs/hpfs/buffer.c @@ -15,7 +15,7 @@ void hpfs_lock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("lock creation\n"); #endif - down(&s->u.hpfs_sb.hpfs_creation_de); + down(&hpfs_sb(s)->hpfs_creation_de); } void hpfs_unlock_creation(struct super_block *s) @@ -23,7 +23,7 @@ void hpfs_unlock_creation(struct super_block *s) #ifdef DEBUG_LOCKS printk("unlock creation\n"); #endif - up(&s->u.hpfs_sb.hpfs_creation_de); + up(&hpfs_sb(s)->hpfs_creation_de); } void hpfs_lock_iget(struct super_block *s, int mode) @@ -31,8 +31,8 @@ void hpfs_lock_iget(struct super_block *s, int mode) #ifdef DEBUG_LOCKS printk("lock iget\n"); #endif - while (s->s_hpfs_rd_inode) sleep_on(&s->s_hpfs_iget_q); - s->s_hpfs_rd_inode = mode; + while (hpfs_sb(s)->sb_rd_inode) 
sleep_on(&hpfs_sb(s)->sb_iget_q); + hpfs_sb(s)->sb_rd_inode = mode; } void hpfs_unlock_iget(struct super_block *s) @@ -40,8 +40,8 @@ void hpfs_unlock_iget(struct super_block *s) #ifdef DEBUG_LOCKS printk("unlock iget\n"); #endif - s->s_hpfs_rd_inode = 0; - wake_up(&s->s_hpfs_iget_q); + hpfs_sb(s)->sb_rd_inode = 0; + wake_up(&hpfs_sb(s)->sb_iget_q); } void hpfs_lock_inode(struct inode *i) diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c index 3aad4ba50ce2..e572d3e0d6f3 100644 --- a/fs/hpfs/dentry.c +++ b/fs/hpfs/dentry.c @@ -28,7 +28,7 @@ int hpfs_hash_dentry(struct dentry *dentry, struct qstr *qstr) hash = init_name_hash(); for (i = 0; i < l; i++) - hash = partial_name_hash(hpfs_upcase(dentry->d_sb->s_hpfs_cp_table,qstr->name[i]), hash); + hash = partial_name_hash(hpfs_upcase(hpfs_sb(dentry->d_sb)->sb_cp_table,qstr->name[i]), hash); qstr->hash = end_name_hash(hash); return 0; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 2a7574901f8a..ff8039bc4fe8 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -65,7 +65,7 @@ int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) int c1, c2 = 0; int ret = 0; - if (inode->i_sb->s_hpfs_chk) { + if (hpfs_sb(inode->i_sb)->sb_chk) { if (hpfs_chk_sectors(inode->i_sb, inode->i_ino, 1, "dir_fnode")) { ret = -EFSERROR; goto out; @@ -75,7 +75,7 @@ int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; } } - if (inode->i_sb->s_hpfs_chk >= 2) { + if (hpfs_sb(inode->i_sb)->sb_chk >= 2) { struct buffer_head *bh; struct fnode *fno; int e = 0; @@ -97,7 +97,7 @@ int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; } } - lc = inode->i_sb->s_hpfs_lowercase; + lc = hpfs_sb(inode->i_sb)->sb_lowercase; if (filp->f_pos == 12) { /* diff -r requires this (note, that diff -r */ filp->f_pos = 13; /* also fails on msdos filesystem in 2.0) */ goto out; @@ -114,7 +114,7 @@ int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) /* This won't work when cycle is longer than number 
of dirents accepted by filldir, but what can I do? maybe killall -9 ls helps */ - if (inode->i_sb->s_hpfs_chk) + if (hpfs_sb(inode->i_sb)->sb_chk) if (hpfs_stop_cycles(inode->i_sb, filp->f_pos, &c1, &c2, "hpfs_readdir")) { hpfs_unlock_inode(inode); ret = -EFSERROR; @@ -160,7 +160,7 @@ int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto out; } if (de->first || de->last) { - if (inode->i_sb->s_hpfs_chk) { + if (hpfs_sb(inode->i_sb)->sb_chk) { if (de->first && !de->last && (de->namelen != 2 || de ->name[0] != 1 || de->name[1] != 1)) hpfs_error(inode->i_sb, "hpfs_readdir: bad ^A^A entry; pos = %08x", old_pos); if (de->last && (de->namelen != 1 || de ->name[0] != 255)) hpfs_error(inode->i_sb, "hpfs_readdir: bad \\377 entry; pos = %08x", old_pos); } @@ -241,7 +241,7 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry) * Go find or make an inode. */ - hpfs_lock_iget(dir->i_sb, de->directory || (de->ea_size && dir->i_sb->s_hpfs_eas) ? 1 : 2); + hpfs_lock_iget(dir->i_sb, de->directory || (de->ea_size && hpfs_sb(dir->i_sb)->sb_eas) ? 
1 : 2); if (!(result = iget(dir->i_sb, ino))) { hpfs_unlock_iget(dir->i_sb); hpfs_error(dir->i_sb, "hpfs_lookup: can't get inode"); diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index 37971b77c751..72b3be59c1f1 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -134,7 +134,7 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno hpfs_error(s, "set_last_pointer: empty dnode %08x", d->self); return; } - if (s->s_hpfs_chk) { + if (hpfs_sb(s)->sb_chk) { if (de->down) { hpfs_error(s, "set_last_pointer: dnode %08x has already last pointer %08x", d->self, de_down_pointer(de)); @@ -253,7 +253,7 @@ int hpfs_add_to_dnode(struct inode *i, dnode_secno dno, unsigned char *name, uns return 1; } go_up_a: - if (i->i_sb->s_hpfs_chk) + if (hpfs_sb(i->i_sb)->sb_chk) if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_to_dnode")) { hpfs_brelse4(&qbh); if (nd) kfree(nd); @@ -379,7 +379,7 @@ int hpfs_add_dirent(struct inode *i, unsigned char *name, unsigned namelen, int c1, c2 = 0; dno = hpfs_inode->i_dno; down: - if (i->i_sb->s_hpfs_chk) + if (hpfs_sb(i->i_sb)->sb_chk) if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "hpfs_add_dirent")) return 1; if (!(d = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 1; de_end = dnode_end_de(d); @@ -427,11 +427,11 @@ static secno move_to_top(struct inode *i, dnode_secno from, dnode_secno to) int c1, c2 = 0; dno = from; while (1) { - if (i->i_sb->s_hpfs_chk) + if (hpfs_sb(i->i_sb)->sb_chk) if (hpfs_stop_cycles(i->i_sb, dno, &c1, &c2, "move_to_top")) return 0; if (!(dnode = hpfs_map_dnode(i->i_sb, dno, &qbh))) return 0; - if (i->i_sb->s_hpfs_chk) { + if (hpfs_sb(i->i_sb)->sb_chk) { if (dnode->up != chk_up) { hpfs_error(i->i_sb, "move_to_top: up pointer from %08x should be %08x, is %08x", dno, chk_up, dnode->up); @@ -519,7 +519,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) up = dnode->up; de = dnode_first_de(dnode); down = de->down ? 
de_down_pointer(de) : 0; - if (i->i_sb->s_hpfs_chk) if (root && !down) { + if (hpfs_sb(i->i_sb)->sb_chk) if (root && !down) { hpfs_error(i->i_sb, "delete_empty_dnode: root dnode %08x is empty", dno); goto end; } @@ -532,7 +532,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) struct buffer_head *bh; struct dnode *d1; struct quad_buffer_head qbh1; - if (i->i_sb->s_hpfs_chk) if (up != i->i_ino) { + if (hpfs_sb(i->i_sb)->sb_chk) if (up != i->i_ino) { hpfs_error(i->i_sb, "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08x", dno, up, i->i_ino); return; } @@ -628,14 +628,14 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) dlp = del->down ? de_down_pointer(del) : 0; if (!dlp && down) { if (d1->first_free > 2044) { - if (i->i_sb->s_hpfs_chk >= 2) { + if (hpfs_sb(i->i_sb)->sb_chk >= 2) { printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); printk("HPFS: warning: terminating balancing operation\n"); } hpfs_brelse4(&qbh1); goto endm; } - if (i->i_sb->s_hpfs_chk >= 2) { + if (hpfs_sb(i->i_sb)->sb_chk >= 2) { printk("HPFS: warning: unbalanced dnode tree, see hpfs.txt 4 more info\n"); printk("HPFS: warning: goin'on\n"); } @@ -738,12 +738,12 @@ void hpfs_count_dnodes(struct super_block *s, dnode_secno dno, int *n_dnodes, int d1, d2 = 0; go_down: if (n_dnodes) (*n_dnodes)++; - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, dno, &c1, &c2, "hpfs_count_dnodes #1")) return; ptr = 0; go_up: if (!(dnode = hpfs_map_dnode(s, dno, &qbh))) return; - if (s->s_hpfs_chk) if (odno && odno != -1 && dnode->up != odno) + if (hpfs_sb(s)->sb_chk) if (odno && odno != -1 && dnode->up != odno) hpfs_error(s, "hpfs_count_dnodes: bad up pointer; dnode %08x, down %08x points to %08x", odno, dno, dnode->up); de = dnode_first_de(dnode); if (ptr) while(1) { @@ -774,7 +774,7 @@ void hpfs_count_dnodes(struct super_block *s, dnode_secno dno, int *n_dnodes, return; } hpfs_brelse4(&qbh); - if (s->s_hpfs_chk) + if 
(hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, ptr, &d1, &d2, "hpfs_count_dnodes #2")) return; odno = -1; goto go_up; @@ -811,11 +811,11 @@ dnode_secno hpfs_de_as_down_as_possible(struct super_block *s, dnode_secno dno) int c1, c2 = 0; again: - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, d, &c1, &c2, "hpfs_de_as_down_as_possible")) return d; if (!(de = map_nth_dirent(s, d, 1, &qbh, NULL))) return dno; - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (up && ((struct dnode *)qbh.data)->up != up) hpfs_error(s, "hpfs_de_as_down_as_possible: bad up pointer; dnode %08x, down %08x points to %08x", up, d, ((struct dnode *)qbh.data)->up); if (!de->down) { @@ -901,7 +901,7 @@ struct hpfs_dirent *map_dirent(struct inode *inode, dnode_secno dno, char *name, if (!S_ISDIR(inode->i_mode)) hpfs_error(inode->i_sb, "map_dirent: not a directory\n"); again: - if (inode->i_sb->s_hpfs_chk) + if (hpfs_sb(inode->i_sb)->sb_chk) if (hpfs_stop_cycles(inode->i_sb, dno, &c1, &c2, "map_dirent")) return NULL; if (!(dnode = hpfs_map_dnode(inode->i_sb, dno, qbh))) return NULL; @@ -1046,7 +1046,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, if (c < 0 && de->down) { dno = de_down_pointer(de); hpfs_brelse4(qbh); - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { kfree(name2); return NULL; @@ -1065,7 +1065,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, downd = dno; dno = d->up; hpfs_brelse4(qbh); - if (s->s_hpfs_chk) + if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, downd, &d1, &d2, "map_fnode_dirent #2")) { kfree(name2); return NULL; diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 775cc92a77ee..237e55f6ee42 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -57,22 +58,6 @@ typedef void nonconst; /* What this is for ? 
*/ -/* - * local time (HPFS) to GMT (Unix) - */ - -extern inline time_t local_to_gmt(struct super_block *s, time_t t) -{ - extern struct timezone sys_tz; - return t + sys_tz.tz_minuteswest * 60 + s->s_hpfs_timeshift; -} - -extern inline time_t gmt_to_local(struct super_block *s, time_t t) -{ - extern struct timezone sys_tz; - return t - sys_tz.tz_minuteswest * 60 - s->s_hpfs_timeshift; -} - /* * conv= options */ @@ -309,6 +294,11 @@ static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) return list_entry(inode, struct hpfs_inode_info, vfs_inode); } +static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) +{ + return sb->u.generic_sbp; +} + /* super.c */ void hpfs_error(struct super_block *, char *, ...); @@ -319,3 +309,20 @@ unsigned hpfs_count_one_bitmap(struct super_block *, secno); int hpfs_statfs(struct super_block *, struct statfs *); extern struct address_space_operations hpfs_aops; + +/* + * local time (HPFS) to GMT (Unix) + */ + +extern inline time_t local_to_gmt(struct super_block *s, time_t t) +{ + extern struct timezone sys_tz; + return t + sys_tz.tz_minuteswest * 60 + hpfs_sb(s)->sb_timeshift; +} + +extern inline time_t gmt_to_local(struct super_block *s, time_t t) +{ + extern struct timezone sys_tz; + return t - sys_tz.tz_minuteswest * 60 - hpfs_sb(s)->sb_timeshift; +} + diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index cdcfb294fc49..407acc268f7e 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -66,10 +66,10 @@ void hpfs_read_inode(struct inode *i) unsigned char *ea; int ea_size; - i->i_uid = sb->s_hpfs_uid; - i->i_gid = sb->s_hpfs_gid; - i->i_mode = sb->s_hpfs_mode; - hpfs_inode->i_conv = sb->s_hpfs_conv; + i->i_uid = hpfs_sb(sb)->sb_uid; + i->i_gid = hpfs_sb(sb)->sb_gid; + i->i_mode = hpfs_sb(sb)->sb_mode; + hpfs_inode->i_conv = hpfs_sb(sb)->sb_conv; i->i_blksize = 512; i->i_size = -1; i->i_blocks = -1; @@ -93,9 +93,9 @@ void hpfs_read_inode(struct inode *i) i->i_mtime = 0; i->i_ctime = 0; - if 
(!i->i_sb->s_hpfs_rd_inode) - hpfs_error(i->i_sb, "read_inode: s_hpfs_rd_inode == 0"); - if (i->i_sb->s_hpfs_rd_inode == 2) { + if (!hpfs_sb(i->i_sb)->sb_rd_inode) + hpfs_error(i->i_sb, "read_inode: sb_rd_inode == 0"); + if (hpfs_sb(i->i_sb)->sb_rd_inode == 2) { i->i_mode |= S_IFREG; i->i_mode &= ~0111; i->i_op = &hpfs_file_iops; @@ -112,7 +112,7 @@ void hpfs_read_inode(struct inode *i) make_bad_inode(i); return; } - if (i->i_sb->s_hpfs_eas) { + if (hpfs_sb(i->i_sb)->sb_eas) { if ((ea = hpfs_get_ea(i->i_sb, fnode, "UID", &ea_size))) { if (ea_size == 2) { i->i_uid = ea[0] + (ea[1] << 8); @@ -140,7 +140,7 @@ void hpfs_read_inode(struct inode *i) } if ((ea = hpfs_get_ea(i->i_sb, fnode, "MODE", &ea_size))) { int rdev = 0; - umode_t mode = sb->s_hpfs_mode; + umode_t mode = hpfs_sb(sb)->sb_mode; if (ea_size == 2) { mode = ea[0] + (ea[1] << 8); hpfs_inode->i_ea_mode = 1; @@ -171,7 +171,7 @@ void hpfs_read_inode(struct inode *i) i->i_fop = &hpfs_dir_ops; hpfs_inode->i_parent_dir = fnode->up; hpfs_inode->i_dno = fnode->u.external[0].disk_secno; - if (sb->s_hpfs_chk >= 2) { + if (hpfs_sb(sb)->sb_chk >= 2) { struct buffer_head *bh0; if (hpfs_map_fnode(sb, hpfs_inode->i_parent_dir, &bh0)) brelse(bh0); } @@ -201,24 +201,24 @@ void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode) /* Some unknown structures like ACL may be in fnode, we'd better not overwrite them */ hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); - } else if (i->i_sb->s_hpfs_eas >= 2) { + } else if (hpfs_sb(i->i_sb)->sb_eas >= 2) { unsigned char ea[4]; - if ((i->i_uid != i->i_sb->s_hpfs_uid) || hpfs_inode->i_ea_uid) { + if ((i->i_uid != hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { ea[0] = i->i_uid & 0xff; ea[1] = i->i_uid >> 8; hpfs_set_ea(i, fnode, "UID", ea, 2); hpfs_inode->i_ea_uid = 1; } - if ((i->i_gid != i->i_sb->s_hpfs_gid) || hpfs_inode->i_ea_gid) { + if ((i->i_gid != hpfs_sb(i->i_sb)->sb_gid) || hpfs_inode->i_ea_gid) { ea[0] = i->i_gid & 0xff; ea[1] = 
i->i_gid >> 8; hpfs_set_ea(i, fnode, "GID", ea, 2); hpfs_inode->i_ea_gid = 1; } if (!S_ISLNK(i->i_mode)) - if ((i->i_mode != ((i->i_sb->s_hpfs_mode & ~(S_ISDIR(i->i_mode) ? 0 : 0111)) + if ((i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0 : 0111)) | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG)) - && i->i_mode != ((i->i_sb->s_hpfs_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333)) + && i->i_mode != ((hpfs_sb(i->i_sb)->sb_mode & ~(S_ISDIR(i->i_mode) ? 0222 : 0333)) | (S_ISDIR(i->i_mode) ? S_IFDIR : S_IFREG))) || hpfs_inode->i_ea_mode) { ea[0] = i->i_mode & 0xff; ea[1] = i->i_mode >> 8; @@ -241,7 +241,7 @@ void hpfs_write_inode(struct inode *i) struct hpfs_inode_info *hpfs_inode = hpfs_i(i); struct inode *parent; if (!i->i_nlink) return; - if (i->i_ino == i->i_sb->s_hpfs_root) return; + if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; if (hpfs_inode->i_rddir_off && !atomic_read(&i->i_count)) { if (*hpfs_inode->i_rddir_off) printk("HPFS: write_inode: some position still there\n"); kfree(hpfs_inode->i_rddir_off); @@ -264,9 +264,9 @@ void hpfs_write_inode_nolock(struct inode *i) struct fnode *fnode; struct quad_buffer_head qbh; struct hpfs_dirent *de; - if (i->i_ino == i->i_sb->s_hpfs_root) return; + if (i->i_ino == hpfs_sb(i->i_sb)->sb_root) return; if (!(fnode = hpfs_map_fnode(i->i_sb, i->i_ino, &bh))) return; - if (i->i_ino != i->i_sb->s_hpfs_root) { + if (i->i_ino != hpfs_sb(i->i_sb)->sb_root) { if (!(de = map_fnode_dirent(i->i_sb, i->i_ino, fnode, &qbh))) { brelse(bh); return; @@ -309,7 +309,7 @@ int hpfs_notify_change(struct dentry *dentry, struct iattr *attr) int error=0; lock_kernel(); if ( ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) || - (inode->i_sb->s_hpfs_root == inode->i_ino) ) { + (hpfs_sb(inode->i_sb)->sb_root == inode->i_ino) ) { error = -EINVAL; } else if ((error = inode_change_ok(inode, attr))) { } else if ((error = inode_setattr(inode, attr))) { diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index 762522742a6a..f5774801b67f 
100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -11,19 +11,19 @@ unsigned *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh) { - return hpfs_map_4sectors(s, s->s_hpfs_dmap, qbh, 0); + return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0); } unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, struct quad_buffer_head *qbh, char *id) { secno sec; - if (s->s_hpfs_chk) if (bmp_block * 16384 > s->s_hpfs_fs_size) { + if (hpfs_sb(s)->sb_chk) if (bmp_block * 16384 > hpfs_sb(s)->sb_fs_size) { hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); return NULL; } - sec = s->s_hpfs_bmp_dir[bmp_block]; - if (!sec || sec > s->s_hpfs_fs_size-4) { + sec = hpfs_sb(s)->sb_bmp_dir[bmp_block]; + if (!sec || sec > hpfs_sb(s)->sb_fs_size-4) { hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); return NULL; } @@ -93,7 +93,7 @@ char *hpfs_load_code_page(struct super_block *s, secno cps) secno *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) { struct buffer_head *bh; - int n = (s->s_hpfs_fs_size + 0x200000 - 1) >> 21; + int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21; int i; secno *b; if (!(b = kmalloc(n * 512, GFP_KERNEL))) { @@ -119,11 +119,11 @@ secno *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_head **bhp) { struct fnode *fnode; - if (s->s_hpfs_chk) if (hpfs_chk_sectors(s, ino, 1, "fnode")) { + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ino, 1, "fnode")) { return NULL; } if ((fnode = hpfs_map_sector(s, ino, bhp, FNODE_RD_AHEAD))) { - if (s->s_hpfs_chk) { + if (hpfs_sb(s)->sb_chk) { struct extended_attribute *ea; struct extended_attribute *ea_end; if (fnode->magic != FNODE_MAGIC) { @@ -168,9 +168,9 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct 
buffer_head **bhp) { struct anode *anode; - if (s->s_hpfs_chk) if (hpfs_chk_sectors(s, ano, 1, "anode")) return NULL; + if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ano, 1, "anode")) return NULL; if ((anode = hpfs_map_sector(s, ano, bhp, ANODE_RD_AHEAD))) - if (s->s_hpfs_chk) { + if (hpfs_sb(s)->sb_chk) { if (anode->magic != ANODE_MAGIC || anode->self != ano) { hpfs_error(s, "bad magic on anode %08x", ano); goto bail; @@ -200,7 +200,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh) { struct dnode *dnode; - if (s->s_hpfs_chk) { + if (hpfs_sb(s)->sb_chk) { if (hpfs_chk_sectors(s, secno, 4, "dnode")) return NULL; if (secno & 3) { hpfs_error(s, "dnode %08x not byte-aligned", secno); @@ -208,7 +208,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, } } if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) - if (s->s_hpfs_chk) { + if (hpfs_sb(s)->sb_chk) { unsigned p, pp = 0; unsigned char *d = (char *)dnode; int b = 0; @@ -234,7 +234,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } - if (s->s_hpfs_chk >= 2) b |= 1 << de->down; + if (hpfs_sb(s)->sb_chk >= 2) b |= 1 << de->down; if (de->down) if (de_down_pointer(de) < 0x10) { hpfs_error(s, "bad down pointer in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; diff --git a/fs/hpfs/name.c b/fs/hpfs/name.c index a50b0b004c5b..6dca89a70590 100644 --- a/fs/hpfs/name.c +++ b/fs/hpfs/name.c @@ -89,7 +89,7 @@ char *hpfs_translate_name(struct super_block *s, unsigned char *from, { char *to; int i; - if (s->s_hpfs_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { + if (hpfs_sb(s)->sb_chk >= 2) if (hpfs_is_name_long(from, len) != lng) { printk("HPFS: Long name flag mismatch - name "); for (i=0; is_hpfs_cp_table,from[i]); + for (i = 0; i < len; i++) to[i] = 
locase(hpfs_sb(s)->sb_cp_table,from[i]); return to; } @@ -111,8 +111,8 @@ int hpfs_compare_names(struct super_block *s, unsigned char *n1, unsigned l1, unsigned i; if (last) return -1; for (i = 0; i < l; i++) { - unsigned char c1 = upcase(s->s_hpfs_cp_table,n1[i]); - unsigned char c2 = upcase(s->s_hpfs_cp_table,n2[i]); + unsigned char c1 = upcase(hpfs_sb(s)->sb_cp_table,n1[i]); + unsigned char c2 = upcase(hpfs_sb(s)->sb_cp_table,n2[i]); if (c1 < c2) return -1; if (c1 > c2) return 1; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index ce4040c730c9..080470e3b38e 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -187,7 +187,7 @@ int hpfs_mknod(struct inode *dir, struct dentry *dentry, int mode, int rdev) struct inode *result = NULL; int err; if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? -EINVAL : err; - if (dir->i_sb->s_hpfs_eas < 2) return -EPERM; + if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM; lock_kernel(); if (!(fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh))) goto bail; memset(&dee, 0, sizeof dee); @@ -255,7 +255,7 @@ int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) int err; if ((err = hpfs_chk_name((char *)name, &len))) return err==-ENOENT ? 
-EINVAL : err; lock_kernel(); - if (dir->i_sb->s_hpfs_eas < 2) { + if (hpfs_sb(dir->i_sb)->sb_eas < 2) { unlock_kernel(); return -EPERM; } @@ -559,7 +559,7 @@ int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, mark_buffer_dirty(bh); brelse(bh); } - hpfs_i(i)->i_conv = i->i_sb->s_hpfs_conv; + hpfs_i(i)->i_conv = hpfs_sb(i->i_sb)->sb_conv; hpfs_decide_conv(i, (char *)new_name, new_len); end1: hpfs_unlock_3inodes(old_dir, new_dir, i); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index b8306f5f5db9..4d7ae1631844 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -16,7 +16,7 @@ static void mark_dirty(struct super_block *s) { - if (s->s_hpfs_chkdsk && !(s->s_flags & MS_RDONLY)) { + if (hpfs_sb(s)->sb_chkdsk && !(s->s_flags & MS_RDONLY)) { struct buffer_head *bh; struct hpfs_spare_block *sb; if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { @@ -37,8 +37,8 @@ static void unmark_dirty(struct super_block *s) struct hpfs_spare_block *sb; if (s->s_flags & MS_RDONLY) return; if ((sb = hpfs_map_sector(s, 17, &bh, 0))) { - sb->dirty = s->s_hpfs_chkdsk > 1 - s->s_hpfs_was_error; - sb->old_wrote = s->s_hpfs_chkdsk >= 2 && !s->s_hpfs_was_error; + sb->dirty = hpfs_sb(s)->sb_chkdsk > 1 - hpfs_sb(s)->sb_was_error; + sb->old_wrote = hpfs_sb(s)->sb_chkdsk >= 2 && !hpfs_sb(s)->sb_was_error; mark_buffer_dirty(bh); brelse(bh); } @@ -60,12 +60,12 @@ void hpfs_error(struct super_block *s, char *m,...) printk("HPFS: filesystem error: "); if (buf) printk("%s", buf); else printk("%s\n",m); - if (!s->s_hpfs_was_error) { - if (s->s_hpfs_err == 2) { + if (!hpfs_sb(s)->sb_was_error) { + if (hpfs_sb(s)->sb_err == 2) { printk("; crashing the system because you wanted it\n"); mark_dirty(s); panic("HPFS panic"); - } else if (s->s_hpfs_err == 1) { + } else if (hpfs_sb(s)->sb_err == 1) { if (s->s_flags & MS_RDONLY) printk("; already mounted read-only\n"); else { printk("; remounting read-only\n"); @@ -76,7 +76,7 @@ void hpfs_error(struct super_block *s, char *m,...) 
else printk("; corrupted filesystem mounted read/write - your computer will explode within 20 seconds ... but you wanted it so!\n"); } else printk("\n"); if (buf) kfree(buf); - s->s_hpfs_was_error = 1; + hpfs_sb(s)->sb_was_error = 1; } /* @@ -101,9 +101,12 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2, void hpfs_put_super(struct super_block *s) { - if (s->s_hpfs_cp_table) kfree(s->s_hpfs_cp_table); - if (s->s_hpfs_bmp_dir) kfree(s->s_hpfs_bmp_dir); + struct hpfs_sb_info *sbi = hpfs_sb(s); + if (sbi->sb_cp_table) kfree(sbi->sb_cp_table); + if (sbi->sb_bmp_dir) kfree(sbi->sb_bmp_dir); unmark_dirty(s); + s->u.generic_sbp = NULL; + kfree(sbi); } unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) @@ -125,28 +128,29 @@ unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno) static unsigned count_bitmaps(struct super_block *s) { unsigned n, count, n_bands; - n_bands = (s->s_hpfs_fs_size + 0x3fff) >> 14; + n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; count = 0; for (n = 0; n < n_bands; n++) - count += hpfs_count_one_bitmap(s, s->s_hpfs_bmp_dir[n]); + count += hpfs_count_one_bitmap(s, hpfs_sb(s)->sb_bmp_dir[n]); return count; } int hpfs_statfs(struct super_block *s, struct statfs *buf) { + struct hpfs_sb_info *sbi = hpfs_sb(s); lock_kernel(); - /*if (s->s_hpfs_n_free == -1) {*/ - s->s_hpfs_n_free = count_bitmaps(s); - s->s_hpfs_n_free_dnodes = hpfs_count_one_bitmap(s, s->s_hpfs_dmap); + /*if (sbi->sb_n_free == -1) {*/ + sbi->sb_n_free = count_bitmaps(s); + sbi->sb_n_free_dnodes = hpfs_count_one_bitmap(s, sbi->sb_dmap); /*}*/ buf->f_type = s->s_magic; buf->f_bsize = 512; - buf->f_blocks = s->s_hpfs_fs_size; - buf->f_bfree = s->s_hpfs_n_free; - buf->f_bavail = s->s_hpfs_n_free; - buf->f_files = s->s_hpfs_dirband_size / 4; - buf->f_ffree = s->s_hpfs_n_free_dnodes; + buf->f_blocks = sbi->sb_fs_size; + buf->f_bfree = sbi->sb_n_free; + buf->f_bavail = sbi->sb_n_free; + buf->f_files = sbi->sb_dirband_size / 4; + 
buf->f_ffree = sbi->sb_n_free_dnodes; buf->f_namelen = 254; unlock_kernel(); @@ -377,14 +381,15 @@ int hpfs_remount_fs(struct super_block *s, int *flags, char *data) umode_t umask; int lowercase, conv, eas, chk, errs, chkdsk, timeshift; int o; + struct hpfs_sb_info *sbi = hpfs_sb(s); *flags |= MS_NOATIME; - uid = s->s_hpfs_uid; gid = s->s_hpfs_gid; - umask = 0777 & ~s->s_hpfs_mode; - lowercase = s->s_hpfs_lowercase; conv = s->s_hpfs_conv; - eas = s->s_hpfs_eas; chk = s->s_hpfs_chk; chkdsk = s->s_hpfs_chkdsk; - errs = s->s_hpfs_err; timeshift = s->s_hpfs_timeshift; + uid = sbi->sb_uid; gid = sbi->sb_gid; + umask = 0777 & ~sbi->sb_mode; + lowercase = sbi->sb_lowercase; conv = sbi->sb_conv; + eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk; + errs = sbi->sb_err; timeshift = sbi->sb_timeshift; if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase, &conv, &eas, &chk, &errs, &chkdsk, ×hift))) { @@ -395,18 +400,18 @@ int hpfs_remount_fs(struct super_block *s, int *flags, char *data) hpfs_help(); return 1; } - if (timeshift != s->s_hpfs_timeshift) { + if (timeshift != sbi->sb_timeshift) { printk("HPFS: timeshift can't be changed using remount.\n"); return 1; } unmark_dirty(s); - s->s_hpfs_uid = uid; s->s_hpfs_gid = gid; - s->s_hpfs_mode = 0777 & ~umask; - s->s_hpfs_lowercase = lowercase; s->s_hpfs_conv = conv; - s->s_hpfs_eas = eas; s->s_hpfs_chk = chk; s->s_hpfs_chkdsk = chkdsk; - s->s_hpfs_err = errs; s->s_hpfs_timeshift = timeshift; + sbi->sb_uid = uid; sbi->sb_gid = gid; + sbi->sb_mode = 0777 & ~umask; + sbi->sb_lowercase = lowercase; sbi->sb_conv = conv; + sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; + sbi->sb_err = errs; sbi->sb_timeshift = timeshift; if (!(*flags & MS_RDONLY)) mark_dirty(s); @@ -419,6 +424,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) struct hpfs_boot_block *bootblock; struct hpfs_super_block *superblock; struct hpfs_spare_block *spareblock; + struct hpfs_sb_info *sbi; uid_t 
uid; gid_t gid; @@ -431,12 +437,18 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) int o; - s->s_hpfs_bmp_dir = NULL; - s->s_hpfs_cp_table = NULL; + sbi = kmalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + s->u.generic_sbp = sbi; + memset(sbi, 0, sizeof(*sbi)); + + sbi->sb_bmp_dir = NULL; + sbi->sb_cp_table = NULL; - s->s_hpfs_rd_inode = 0; - init_MUTEX(&s->u.hpfs_sb.hpfs_creation_de); - init_waitqueue_head(&s->s_hpfs_iget_q); + sbi->sb_rd_inode = 0; + init_MUTEX(&sbi->hpfs_creation_de); + init_waitqueue_head(&sbi->sb_iget_q); uid = current->uid; gid = current->gid; @@ -459,9 +471,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) goto bail0; } - /*s->s_hpfs_mounting = 1;*/ + /*sbi->sb_mounting = 1;*/ sb_set_blocksize(s, 512); - s->s_hpfs_fs_size = -1; + sbi->sb_fs_size = -1; if (!(bootblock = hpfs_map_sector(s, 0, &bh0, 0))) goto bail1; if (!(superblock = hpfs_map_sector(s, 16, &bh1, 1))) goto bail2; if (!(spareblock = hpfs_map_sector(s, 17, &bh2, 0))) goto bail3; @@ -489,30 +501,30 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) s->s_magic = HPFS_SUPER_MAGIC; s->s_op = &hpfs_sops; - s->s_hpfs_root = superblock->root; - s->s_hpfs_fs_size = superblock->n_sectors; - s->s_hpfs_bitmaps = superblock->bitmaps; - s->s_hpfs_dirband_start = superblock->dir_band_start; - s->s_hpfs_dirband_size = superblock->n_dir_band; - s->s_hpfs_dmap = superblock->dir_band_bitmap; - s->s_hpfs_uid = uid; - s->s_hpfs_gid = gid; - s->s_hpfs_mode = 0777 & ~umask; - s->s_hpfs_n_free = -1; - s->s_hpfs_n_free_dnodes = -1; - s->s_hpfs_lowercase = lowercase; - s->s_hpfs_conv = conv; - s->s_hpfs_eas = eas; - s->s_hpfs_chk = chk; - s->s_hpfs_chkdsk = chkdsk; - s->s_hpfs_err = errs; - s->s_hpfs_timeshift = timeshift; - s->s_hpfs_was_error = 0; - s->s_hpfs_cp_table = NULL; - s->s_hpfs_c_bitmap = -1; + sbi->sb_root = superblock->root; + sbi->sb_fs_size = superblock->n_sectors; + 
sbi->sb_bitmaps = superblock->bitmaps; + sbi->sb_dirband_start = superblock->dir_band_start; + sbi->sb_dirband_size = superblock->n_dir_band; + sbi->sb_dmap = superblock->dir_band_bitmap; + sbi->sb_uid = uid; + sbi->sb_gid = gid; + sbi->sb_mode = 0777 & ~umask; + sbi->sb_n_free = -1; + sbi->sb_n_free_dnodes = -1; + sbi->sb_lowercase = lowercase; + sbi->sb_conv = conv; + sbi->sb_eas = eas; + sbi->sb_chk = chk; + sbi->sb_chkdsk = chkdsk; + sbi->sb_err = errs; + sbi->sb_timeshift = timeshift; + sbi->sb_was_error = 0; + sbi->sb_cp_table = NULL; + sbi->sb_c_bitmap = -1; /* Load bitmap directory */ - if (!(s->s_hpfs_bmp_dir = hpfs_load_bitmap_directory(s, superblock->bitmaps))) + if (!(sbi->sb_bmp_dir = hpfs_load_bitmap_directory(s, superblock->bitmaps))) goto bail4; /* Check for general fs errors*/ @@ -557,20 +569,20 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) superblock->dir_band_start, superblock->dir_band_end, superblock->n_dir_band); goto bail4; } - a = s->s_hpfs_dirband_size; - s->s_hpfs_dirband_size = 0; + a = sbi->sb_dirband_size; + sbi->sb_dirband_size = 0; if (hpfs_chk_sectors(s, superblock->dir_band_start, superblock->n_dir_band, "dir_band") || hpfs_chk_sectors(s, superblock->dir_band_bitmap, 4, "dir_band_bitmap") || hpfs_chk_sectors(s, superblock->bitmaps, 4, "bitmaps")) { mark_dirty(s); goto bail4; } - s->s_hpfs_dirband_size = a; + sbi->sb_dirband_size = a; } else printk("HPFS: You really don't want any checks? 
You are crazy...\n"); /* Load code page table */ if (spareblock->n_code_pages) - if (!(s->s_hpfs_cp_table = hpfs_load_code_page(s, spareblock->code_page_dir))) + if (!(sbi->sb_cp_table = hpfs_load_code_page(s, spareblock->code_page_dir))) printk("HPFS: Warning: code page support is disabled\n"); brelse(bh2); @@ -578,7 +590,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) brelse(bh0); hpfs_lock_iget(s, 1); - s->s_root = d_alloc_root(iget(s, s->s_hpfs_root)); + s->s_root = d_alloc_root(iget(s, sbi->sb_root)); hpfs_unlock_iget(s); if (!s->s_root || !s->s_root->d_inode) { printk("HPFS: iget failed. Why???\n"); @@ -590,7 +602,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) * find the root directory's . pointer & finish filling in the inode */ - root_dno = hpfs_fnode_dno(s, s->s_hpfs_root); + root_dno = hpfs_fnode_dno(s, sbi->sb_root); if (root_dno) de = map_dirent(s->s_root->d_inode, root_dno, "\001\001", 2, NULL, &qbh); if (!root_dno || !de) hpfs_error(s, "unable to find root dir"); @@ -612,8 +624,10 @@ bail3: brelse(bh1); bail2: brelse(bh0); bail1: bail0: - if (s->s_hpfs_bmp_dir) kfree(s->s_hpfs_bmp_dir); - if (s->s_hpfs_cp_table) kfree(s->s_hpfs_cp_table); + if (sbi->sb_bmp_dir) kfree(sbi->sb_bmp_dir); + if (sbi->sb_cp_table) kfree(sbi->sb_cp_table); + s->u.generic_sbp = NULL; + kfree(sbi); return -EINVAL; } diff --git a/include/linux/fs.h b/include/linux/fs.h index f0ba1e96325c..1157bf24d307 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -628,7 +628,6 @@ extern int send_sigurg(struct fown_struct *fown); #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #include -#include extern struct list_head super_blocks; extern spinlock_t sb_lock; @@ -671,7 +670,6 @@ struct super_block { union { struct ext3_sb_info ext3_sb; - struct hpfs_sb_info hpfs_sb; void *generic_sbp; } u; /* diff --git a/include/linux/hpfs_fs_sb.h b/include/linux/hpfs_fs_sb.h index 068035e9a4bd..c6355adaf6f1 100644 
--- a/include/linux/hpfs_fs_sb.h +++ b/include/linux/hpfs_fs_sb.h @@ -36,31 +36,4 @@ struct hpfs_sb_info { int sb_timeshift; }; -#define s_hpfs_root u.hpfs_sb.sb_root -#define s_hpfs_fs_size u.hpfs_sb.sb_fs_size -#define s_hpfs_bitmaps u.hpfs_sb.sb_bitmaps -#define s_hpfs_dirband_start u.hpfs_sb.sb_dirband_start -#define s_hpfs_dirband_size u.hpfs_sb.sb_dirband_size -#define s_hpfs_dmap u.hpfs_sb.sb_dmap -#define s_hpfs_uid u.hpfs_sb.sb_uid -#define s_hpfs_gid u.hpfs_sb.sb_gid -#define s_hpfs_mode u.hpfs_sb.sb_mode -#define s_hpfs_n_free u.hpfs_sb.sb_n_free -#define s_hpfs_n_free_dnodes u.hpfs_sb.sb_n_free_dnodes -#define s_hpfs_lowercase u.hpfs_sb.sb_lowercase -#define s_hpfs_conv u.hpfs_sb.sb_conv -#define s_hpfs_eas u.hpfs_sb.sb_eas -#define s_hpfs_err u.hpfs_sb.sb_err -#define s_hpfs_chk u.hpfs_sb.sb_chk -#define s_hpfs_was_error u.hpfs_sb.sb_was_error -#define s_hpfs_chkdsk u.hpfs_sb.sb_chkdsk -/*#define s_hpfs_rd_fnode u.hpfs_sb.sb_rd_fnode*/ -#define s_hpfs_rd_inode u.hpfs_sb.sb_rd_inode -#define s_hpfs_cp_table u.hpfs_sb.sb_cp_table -#define s_hpfs_bmp_dir u.hpfs_sb.sb_bmp_dir -#define s_hpfs_c_bitmap u.hpfs_sb.sb_c_bitmap -#define s_hpfs_iget_q u.hpfs_sb.sb_iget_q -/*#define s_hpfs_mounting u.hpfs_sb.sb_mounting*/ -#define s_hpfs_timeshift u.hpfs_sb.sb_timeshift - #endif -- cgit v1.2.3 From 5868a4993b5d980a5486e4b2bec11b88d8e016e6 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 4 Oct 2002 20:16:00 -0700 Subject: [PATCH] struct super_block cleanup - ext3 Removes the last member of the union, ext3. 
--- include/linux/ext3_fs.h | 1 + include/linux/fs.h | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index d4550e28f37e..3f370ab642dd 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -17,6 +17,7 @@ #define _LINUX_EXT3_FS_H #include +#include /* * The second extended filesystem constants/structures diff --git a/include/linux/fs.h b/include/linux/fs.h index 1157bf24d307..ad6648e3fd9c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -627,8 +627,6 @@ extern int send_sigurg(struct fown_struct *fown); #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ -#include - extern struct list_head super_blocks; extern spinlock_t sb_lock; @@ -669,7 +667,6 @@ struct super_block { char s_id[32]; /* Informational name */ union { - struct ext3_sb_info ext3_sb; void *generic_sbp; } u; /* -- cgit v1.2.3 From 9d66d9e91730e97f653c3143b637f1d63605f074 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Oct 2002 20:34:40 -0700 Subject: [PATCH] remove debug code from list_del() It hasn't caught any bugs, and it is causing confusion over whether this is a permanent part of list_del() behaviour. --- include/linux/list.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index bd6f0ac3fb6b..634aab6c4c94 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -86,13 +86,12 @@ static inline void __list_del(struct list_head * prev, struct list_head * next) /** * list_del - deletes entry from list. * @entry: the element to delete from the list. - * Note: list_empty on entry does not return true after this, the entry is in an undefined state. + * Note: list_empty on entry does not return true after this, the entry is + * in an undefined state. 
*/ static inline void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); - entry->next = (void *) 0; - entry->prev = (void *) 0; } /** -- cgit v1.2.3 From d39755802e430876d612bc792e4c29652ed8b99b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Oct 2002 20:34:45 -0700 Subject: [PATCH] distinguish between address span of a zone and the number From David Mosberger The patch below fixes a bug in nr_free_zone_pages() which shows when a zone has hole. The problem is due to the fact that "struct zone" didn't keep track of the amount of real memory in a zone. Because of this, nr_free_zone_pages() simply assumed that a zone consists entirely of real memory. On machines with large holes, this has catastrophic effects on VM performance, because the VM system ends up thinking that there is plenty of memory left over in a zone, when in fact it may be completely full. The patch below fixes the problem by replacing the "size" member in "struct zone" with "spanned_pages" and "present_pages" and updating page_alloc.c. 
--- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index d7d12a69f505..dab0f76cfb20 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -120,7 +120,8 @@ struct zone { * rarely used fields: */ char *name; - unsigned long size; + unsigned long spanned_pages; /* total size, including holes */ + unsigned long present_pages; /* amount of memory (excluding holes) */ } ____cacheline_maxaligned_in_smp; #define ZONE_DMA 0 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b9cced8d19a0..9694db4322b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -48,7 +48,7 @@ static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255 , 255, 255, }; */ static inline int bad_range(struct zone *zone, struct page *page) { - if (page_to_pfn(page) >= zone->zone_start_pfn + zone->size) + if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages) return 1; if (page_to_pfn(page) < zone->zone_start_pfn) return 1; @@ -509,7 +509,7 @@ static unsigned int nr_free_zone_pages(int offset) struct zone *zone; for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->size; + unsigned long size = zone->present_pages; unsigned long high = zone->pages_high; if (size > high) sum += size - high; @@ -681,7 +681,7 @@ void show_free_areas(void) struct zone *zone = &pgdat->node_zones[type]; unsigned long nr, flags, order, total = 0; - if (!zone->size) + if (!zone->present_pages) continue; spin_lock_irqsave(&zone->lock, flags); @@ -710,7 +710,7 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli BUG(); case ZONE_HIGHMEM: zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->size) { + if (zone->present_pages) { #ifndef CONFIG_HIGHMEM BUG(); #endif @@ -718,11 +718,11 @@ static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zoneli } case ZONE_NORMAL: zone = 
pgdat->node_zones + ZONE_NORMAL; - if (zone->size) + if (zone->present_pages) zonelist->zones[j++] = zone; case ZONE_DMA: zone = pgdat->node_zones + ZONE_DMA; - if (zone->size) + if (zone->present_pages) zonelist->zones[j++] = zone; } @@ -866,7 +866,8 @@ void __init free_area_init_core(pg_data_t *pgdat, realsize -= zholes_size[j]; printk(" %s zone: %lu pages\n", zone_names[j], realsize); - zone->size = size; + zone->spanned_pages = size; + zone->present_pages = realsize; zone->name = zone_names[j]; spin_lock_init(&zone->lock); spin_lock_init(&zone->lru_lock); @@ -1034,7 +1035,7 @@ static int frag_show(struct seq_file *m, void *arg) int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!zone->size) + if (!zone->present_pages) continue; spin_lock_irqsave(&zone->lock, flags); -- cgit v1.2.3 From bf3f607a57d27cab30d5ddfd203d873856bc22b7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Oct 2002 20:35:08 -0700 Subject: [PATCH] separation of direct-reclaim and kswapd functions There is some lack of clarity in what kswapd does and what direct-reclaim tasks do; try_to_free_pages() tries to service both functions, and they are different. - kswapd's role is to keep all zones on its node at zone->free_pages >= zone->pages_high. and to never stop as long as any zones do not meet that condition. - A direct reclaimer's role is to try to free some pages from the zones which are suitable for this particular allocation request, and to return when that has been achieved, or when all the relevant zones are at zone->free_pages >= zone->pages_high. The patch explicitly separates these two code paths; kswapd does not run try_to_free_pages() any more. kswapd should not be aware of zone fallbacks. 
--- include/linux/mmzone.h | 1 - mm/page_alloc.c | 3 - mm/vmscan.c | 230 +++++++++++++++++++++++-------------------------- 3 files changed, 110 insertions(+), 124 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dab0f76cfb20..2b83fca9e08d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -62,7 +62,6 @@ struct zone { spinlock_t lock; unsigned long free_pages; unsigned long pages_min, pages_low, pages_high; - int need_balance; ZONE_PADDING(_pad1_) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9694db4322b0..7763adf4073e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -346,8 +346,6 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, } } - classzone->need_balance = 1; - mb(); /* we're somewhat low on memory, failed to find what we needed */ for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; @@ -873,7 +871,6 @@ void __init free_area_init_core(pg_data_t *pgdat, spin_lock_init(&zone->lru_lock); zone->zone_pgdat = pgdat; zone->free_pages = 0; - zone->need_balance = 0; INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); atomic_set(&zone->refill_counter, 0); diff --git a/mm/vmscan.c b/mm/vmscan.c index f8b879a35775..e97711f8127d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -101,15 +101,19 @@ static inline int is_page_cache_freeable(struct page *page) return page_count(page) - !!PagePrivate(page) == 2; } + +/* + * shrink_list returns the number of reclaimed pages + */ static /* inline */ int -shrink_list(struct list_head *page_list, int nr_pages, - unsigned int gfp_mask, int *max_scan, int *nr_mapped) +shrink_list(struct list_head *page_list, unsigned int gfp_mask, + int *max_scan, int *nr_mapped) { struct address_space *mapping; LIST_HEAD(ret_pages); struct pagevec freed_pvec; - const int nr_pages_in = nr_pages; int pgactivate = 0; + int ret = 0; pagevec_init(&freed_pvec); while (!list_empty(page_list)) { @@ -295,7 +299,7 @@ shrink_list(struct 
list_head *page_list, int nr_pages, __put_page(page); /* The pagecache ref */ free_it: unlock_page(page); - nr_pages--; + ret++; if (!pagevec_add(&freed_pvec, page)) __pagevec_release_nonlru(&freed_pvec); continue; @@ -312,11 +316,11 @@ keep: list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); - mod_page_state(pgsteal, nr_pages_in - nr_pages); + mod_page_state(pgsteal, ret); if (current->flags & PF_KSWAPD) - mod_page_state(kswapd_steal, nr_pages_in - nr_pages); + mod_page_state(kswapd_steal, ret); mod_page_state(pgactivate, pgactivate); - return nr_pages; + return ret; } /* @@ -325,18 +329,19 @@ keep: * not freed will be added back to the LRU. * * shrink_cache() is passed the number of pages to try to free, and returns - * the number which are yet-to-free. + * the number of pages which were reclaimed. * * For pagecache intensive workloads, the first loop here is the hottest spot * in the kernel (apart from the copy_*_user functions). */ static /* inline */ int -shrink_cache(int nr_pages, struct zone *zone, +shrink_cache(const int nr_pages, struct zone *zone, unsigned int gfp_mask, int max_scan, int *nr_mapped) { LIST_HEAD(page_list); struct pagevec pvec; int nr_to_process; + int ret = 0; /* * Try to ensure that we free `nr_pages' pages in one pass of the loop. 
@@ -349,10 +354,11 @@ shrink_cache(int nr_pages, struct zone *zone, lru_add_drain(); spin_lock_irq(&zone->lru_lock); - while (max_scan > 0 && nr_pages > 0) { + while (max_scan > 0 && ret < nr_pages) { struct page *page; int nr_taken = 0; int nr_scan = 0; + int nr_freed; while (nr_scan++ < nr_to_process && !list_empty(&zone->inactive_list)) { @@ -383,10 +389,10 @@ shrink_cache(int nr_pages, struct zone *zone, max_scan -= nr_scan; mod_page_state(pgscan, nr_scan); - nr_pages = shrink_list(&page_list, nr_pages, - gfp_mask, &max_scan, nr_mapped); - - if (nr_pages <= 0 && list_empty(&page_list)) + nr_freed = shrink_list(&page_list, gfp_mask, + &max_scan, nr_mapped); + ret += nr_freed; + if (nr_freed <= 0 && list_empty(&page_list)) goto done; spin_lock_irq(&zone->lru_lock); @@ -412,7 +418,7 @@ shrink_cache(int nr_pages, struct zone *zone, spin_unlock_irq(&zone->lru_lock); done: pagevec_release(&pvec); - return nr_pages; + return ret; } /* @@ -533,9 +539,14 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) mod_page_state(pgdeactivate, pgdeactivate); } +/* + * Try to reclaim `nr_pages' from this zone. Returns the number of reclaimed + * pages. This is a basic per-zone page freer. Used by both kswapd and + * direct reclaim. 
+ */ static /* inline */ int -shrink_zone(struct zone *zone, int max_scan, - unsigned int gfp_mask, int nr_pages, int *nr_mapped) +shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask, + const int nr_pages, int *nr_mapped) { unsigned long ratio; @@ -556,36 +567,60 @@ shrink_zone(struct zone *zone, int max_scan, atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter); refill_inactive_zone(zone, SWAP_CLUSTER_MAX); } - nr_pages = shrink_cache(nr_pages, zone, gfp_mask, - max_scan, nr_mapped); - return nr_pages; + return shrink_cache(nr_pages, zone, gfp_mask, max_scan, nr_mapped); +} + +/* + * FIXME: don't do this for ZONE_HIGHMEM + */ +/* + * Here we assume it costs one seek to replace a lru page and that it also + * takes a seek to recreate a cache object. With this in mind we age equal + * percentages of the lru and ageable caches. This should balance the seeks + * generated by these structures. + * + * NOTE: for now I do this for all zones. If we find this is too aggressive + * on large boxes we may want to exclude ZONE_HIGHMEM. + * + * If we're encountering mapped pages on the LRU then increase the pressure on + * slab to avoid swapping. + */ +static void shrink_slab(int total_scanned, int gfp_mask) +{ + int shrink_ratio; + int pages = nr_used_zone_pages(); + + shrink_ratio = (pages / (total_scanned + 1)) + 1; + shrink_dcache_memory(shrink_ratio, gfp_mask); + shrink_icache_memory(shrink_ratio, gfp_mask); + shrink_dqcache_memory(shrink_ratio, gfp_mask); } +/* + * This is the direct reclaim path, for page-allocating processes. We only + * try to reclaim pages from zones which will satisfy the caller's allocation + * request. 
+ */ static int shrink_caches(struct zone *classzone, int priority, - int *total_scanned, int gfp_mask, int nr_pages) + int *total_scanned, int gfp_mask, const int nr_pages) { struct zone *first_classzone; struct zone *zone; - int ratio; int nr_mapped = 0; - int pages = nr_used_zone_pages(); + int ret = 0; first_classzone = classzone->zone_pgdat->node_zones; for (zone = classzone; zone >= first_classzone; zone--) { int max_scan; int to_reclaim; - int unreclaimed; to_reclaim = zone->pages_high - zone->free_pages; if (to_reclaim < 0) continue; /* zone has enough memory */ - if (to_reclaim > SWAP_CLUSTER_MAX) - to_reclaim = SWAP_CLUSTER_MAX; - - if (to_reclaim < nr_pages) - to_reclaim = nr_pages; + to_reclaim = min(to_reclaim, SWAP_CLUSTER_MAX); + to_reclaim = max(to_reclaim, nr_pages); /* * If we cannot reclaim `nr_pages' pages by scanning twice @@ -594,33 +629,18 @@ shrink_caches(struct zone *classzone, int priority, max_scan = zone->nr_inactive >> priority; if (max_scan < to_reclaim * 2) max_scan = to_reclaim * 2; - unreclaimed = shrink_zone(zone, max_scan, - gfp_mask, to_reclaim, &nr_mapped); - nr_pages -= to_reclaim - unreclaimed; + ret += shrink_zone(zone, max_scan, gfp_mask, + to_reclaim, &nr_mapped); *total_scanned += max_scan; + *total_scanned += nr_mapped; + if (ret >= nr_pages) + break; } - - /* - * Here we assume it costs one seek to replace a lru page and that - * it also takes a seek to recreate a cache object. With this in - * mind we age equal percentages of the lru and ageable caches. - * This should balance the seeks generated by these structures. - * - * NOTE: for now I do this for all zones. If we find this is too - * aggressive on large boxes we may want to exclude ZONE_HIGHMEM - * - * If we're encountering mapped pages on the LRU then increase the - * pressure on slab to avoid swapping. 
- */ - ratio = (pages / (*total_scanned + nr_mapped + 1)) + 1; - shrink_dcache_memory(ratio, gfp_mask); - shrink_icache_memory(ratio, gfp_mask); - shrink_dqcache_memory(ratio, gfp_mask); - return nr_pages; + return ret; } /* - * This is the main entry point to page reclaim. + * This is the main entry point to direct page reclaim. * * If a full scan of the inactive list fails to free enough memory then we * are "out of memory" and something needs to be killed. @@ -640,17 +660,18 @@ int try_to_free_pages(struct zone *classzone, unsigned int gfp_mask, unsigned int order) { - int priority = DEF_PRIORITY; - int nr_pages = SWAP_CLUSTER_MAX; + int priority; + const int nr_pages = SWAP_CLUSTER_MAX; + int nr_reclaimed = 0; inc_page_state(pageoutrun); for (priority = DEF_PRIORITY; priority; priority--) { int total_scanned = 0; - nr_pages = shrink_caches(classzone, priority, &total_scanned, - gfp_mask, nr_pages); - if (nr_pages <= 0) + nr_reclaimed += shrink_caches(classzone, priority, + &total_scanned, gfp_mask, nr_pages); + if (nr_reclaimed >= nr_pages) return 1; if (total_scanned == 0) return 1; /* All zones had enough free memory */ @@ -665,62 +686,46 @@ try_to_free_pages(struct zone *classzone, /* Take a nap, wait for some writeback to complete */ blk_congestion_wait(WRITE, HZ/4); + shrink_slab(total_scanned, gfp_mask); } if (gfp_mask & __GFP_FS) out_of_memory(); return 0; } -static int check_classzone_need_balance(struct zone *classzone) +/* + * kswapd will work across all this node's zones until they are all at + * pages_high. 
+ */ +static void kswapd_balance_pgdat(pg_data_t *pgdat) { - struct zone *first_classzone; + int priority = DEF_PRIORITY; + int i; - first_classzone = classzone->zone_pgdat->node_zones; - while (classzone >= first_classzone) { - if (classzone->free_pages > classzone->pages_high) - return 0; - classzone--; - } - return 1; -} + for (priority = DEF_PRIORITY; priority; priority--) { + int success = 1; -static int kswapd_balance_pgdat(pg_data_t * pgdat) -{ - int need_more_balance = 0, i; - struct zone *zone; + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + int nr_mapped = 0; + int max_scan; + int to_reclaim; - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - cond_resched(); - if (!zone->need_balance) - continue; - if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { - zone->need_balance = 0; - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - continue; + to_reclaim = zone->pages_high - zone->free_pages; + if (to_reclaim <= 0) + continue; + success = 0; + max_scan = zone->nr_inactive >> priority; + if (max_scan < to_reclaim * 2) + max_scan = to_reclaim * 2; + shrink_zone(zone, max_scan, GFP_KSWAPD, + to_reclaim, &nr_mapped); + shrink_slab(max_scan + nr_mapped, GFP_KSWAPD); } - if (check_classzone_need_balance(zone)) - need_more_balance = 1; - else - zone->need_balance = 0; - } - - return need_more_balance; -} - -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) -{ - struct zone *zone; - int i; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (zone->need_balance) - return 0; + if (success) + break; /* All zones are at pages_high */ + blk_congestion_wait(WRITE, HZ/4); } - - return 1; } /* @@ -740,7 +745,7 @@ int kswapd(void *p) { pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DEFINE_WAIT(wait); daemonize(); set_cpus_allowed(tsk, __node_to_cpu_mask(pgdat->node_id)); @@ -761,27 +766,12 @@ int kswapd(void 
*p) */ tsk->flags |= PF_MEMALLOC|PF_KSWAPD; - /* - * Kswapd main loop. - */ - for (;;) { + for ( ; ; ) { if (current->flags & PF_FREEZE) refrigerator(PF_IOTHREAD); - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&pgdat->kswapd_wait, &wait); - - mb(); - if (kswapd_can_sleep_pgdat(pgdat)) - schedule(); - - __set_current_state(TASK_RUNNING); - remove_wait_queue(&pgdat->kswapd_wait, &wait); - - /* - * If we actually get into a low-memory situation, - * the processes needing more memory will wake us - * up on a more timely basis. - */ + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + schedule(); + finish_wait(&pgdat->kswapd_wait, &wait); kswapd_balance_pgdat(pgdat); blk_run_queues(); } -- cgit v1.2.3 From 4ac833da2fec12985c33f4f23a446ff09950dd1f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Oct 2002 20:35:37 -0700 Subject: [PATCH] remove write_mapping_buffers() When the global buffer LRU was present, dirty ext2 indirect blocks were automatically scheduled for writeback alongside their data. I added write_mapping_buffers() to replace this - the idea was to schedule the indirects close in time to the scheduling of their data. It works OK for small-to-medium sized files but for large, linear writes it doesn't work: the request queue is completely full of file data and when we later come to scheduling the indirects, their neighbouring data has already been written. So writeback of really huge files tends to be a bit seeky. So. Kill it. Will fix this problem by other means. 
--- fs/buffer.c | 75 --------------------------------------------- fs/ext2/inode.c | 9 +----- fs/ext3/inode.c | 9 +----- include/linux/buffer_head.h | 1 - 4 files changed, 2 insertions(+), 92 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index d1da2c0ffac8..b00fb755781d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -735,81 +735,6 @@ int sync_mapping_buffers(struct address_space *mapping) } EXPORT_SYMBOL(sync_mapping_buffers); -/** - * write_mapping_buffers - Start writeout of a mapping's "associated" buffers. - * @mapping - the mapping which wants those buffers written. - * - * Starts I/O against dirty buffers which are on @mapping->private_list. - * Those buffers must be backed by @mapping->assoc_mapping. - * - * The private_list buffers generally contain filesystem indirect blocks. - * The idea is that the filesystem can start I/O against the indirects at - * the same time as running generic_writepages(), so the indirect's - * I/O will be merged with the data. - * - * We sneakliy write the buffers in probable tail-to-head order. This is - * because generic_writepages() writes in probable head-to-tail - * order. If the file is so huge that the data or the indirects overflow - * the request queue we will at least get some merging this way. - * - * Any clean+unlocked buffers are de-listed. clean/locked buffers must be - * left on the list for an fsync() to wait on. - * - * Couldn't think of a smart way of avoiding livelock, so chose the dumb - * way instead. - * - * FIXME: duplicates fsync_inode_buffers() functionality a bit. 
- */ -int write_mapping_buffers(struct address_space *mapping) -{ - spinlock_t *lock; - struct address_space *buffer_mapping; - unsigned nr_to_write; /* livelock avoidance */ - struct list_head *lh; - int ret = 0; - - if (list_empty(&mapping->private_list)) - goto out; - - buffer_mapping = mapping->assoc_mapping; - lock = &buffer_mapping->private_lock; - spin_lock(lock); - nr_to_write = 0; - lh = mapping->private_list.next; - while (lh != &mapping->private_list) { - lh = lh->next; - nr_to_write++; - } - nr_to_write *= 2; /* Allow for some late additions */ - - while (nr_to_write-- && !list_empty(&mapping->private_list)) { - struct buffer_head *bh; - - bh = BH_ENTRY(mapping->private_list.prev); - list_del_init(&bh->b_assoc_buffers); - if (!buffer_dirty(bh) && !buffer_locked(bh)) - continue; - /* Stick it on the far end of the list. Order is preserved. */ - list_add(&bh->b_assoc_buffers, &mapping->private_list); - if (test_set_buffer_locked(bh)) - continue; - get_bh(bh); - spin_unlock(lock); - if (test_clear_buffer_dirty(bh)) { - bh->b_end_io = end_buffer_io_sync; - submit_bh(WRITE, bh); - } else { - unlock_buffer(bh); - put_bh(bh); - } - spin_lock(lock); - } - spin_unlock(lock); -out: - return ret; -} -EXPORT_SYMBOL(write_mapping_buffers); - void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 99627183120e..d27313d1dd10 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -629,14 +629,7 @@ ext2_direct_IO(int rw, struct inode *inode, const struct iovec *iov, static int ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { - int ret; - int err; - - ret = write_mapping_buffers(mapping); - err = mpage_writepages(mapping, wbc, ext2_get_block); - if (!ret) - ret = err; - return ret; + return mpage_writepages(mapping, wbc, ext2_get_block); } struct address_space_operations ext2_aops = { diff --git a/fs/ext3/inode.c 
b/fs/ext3/inode.c index 978e9e60d070..5b2c49a9b34e 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1477,14 +1477,7 @@ struct address_space_operations ext3_aops = { static int ext3_writepages(struct address_space *mapping, struct writeback_control *wbc) { - int ret; - int err; - - ret = write_mapping_buffers(mapping); - err = mpage_writepages(mapping, wbc, ext3_get_block); - if (!ret) - ret = err; - return ret; + return mpage_writepages(mapping, wbc, ext3_get_block); } #endif diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index dd8f1bfb150e..7ad9fafbea0b 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -140,7 +140,6 @@ void end_buffer_io_sync(struct buffer_head *bh, int uptodate); void buffer_insert_list(spinlock_t *lock, struct buffer_head *, struct list_head *); void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode); -int write_mapping_buffers(struct address_space *mapping); int inode_has_buffers(struct inode *); void invalidate_inode_buffers(struct inode *); int fsync_buffers_list(spinlock_t *lock, struct list_head *); -- cgit v1.2.3 From 343893e647d27c96bf07e3f549b77b89aa9581ce Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Oct 2002 20:35:43 -0700 Subject: [PATCH] use buffer_boundary() for writeback scheduling hints This is the replacement for write_mapping_buffers(). Whenever the mpage code sees that it has just written a block which had buffer_boundary() set, it assumes that the next block is dirty filesystem metadata. (This is a good assumption - that's what buffer_boundary is for). So we do a lookup in the blockdev mapping for the next block and it if is present and dirty, then schedule it for IO. So the indirect blocks in the blockdev mapping get merged with the data blocks in the file mapping. This is a bit more general than the write_mapping_buffers() approach. 
write_mapping_buffers() required that the fs carefully maintain the correct buffers on the mapping->private_list, and that the fs call write_mapping_buffers(), and the implementation was generally rather yuk. This version will "just work" for filesystems which implement buffer_boundary correctly. Currently this is ext2, ext3 and some not-yet-merged reiserfs patches. JFS implements buffer_boundary() but does not use ext2-like layouts - so there will be no change there. Works nicely. --- fs/buffer.c | 17 +++++++++++++++++ fs/mpage.c | 19 +++++++++++++++++-- include/linux/buffer_head.h | 3 +++ 3 files changed, 37 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index b00fb755781d..f989f5fdd070 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -735,6 +735,23 @@ int sync_mapping_buffers(struct address_space *mapping) } EXPORT_SYMBOL(sync_mapping_buffers); +/* + * Called when we've recently written block `bblock', and it is known that + * `bblock' was for a buffer_boundary() buffer. This means that the block at + * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's + * dirty, schedule it for IO. So that indirects merge nicely with their data. 
+ */ +void write_boundary_block(struct block_device *bdev, + sector_t bblock, unsigned blocksize) +{ + struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + if (bh) { + if (buffer_dirty(bh)) + ll_rw_block(WRITE, 1, &bh); + put_bh(bh); + } +} + void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; diff --git a/fs/mpage.c b/fs/mpage.c index b082adc807ff..2cbd183dfd7c 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -331,6 +331,8 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, unsigned first_unmapped = blocks_per_page; struct block_device *bdev = NULL; int boundary = 0; + sector_t boundary_block = 0; + struct block_device *boundary_bdev = NULL; if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -363,6 +365,10 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, } blocks[page_block++] = bh->b_blocknr; boundary = buffer_boundary(bh); + if (boundary) { + boundary_block = bh->b_blocknr; + boundary_bdev = bh->b_bdev; + } bdev = bh->b_bdev; } while ((bh = bh->b_this_page) != head); @@ -393,6 +399,10 @@ mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, if (buffer_new(&map_bh)) unmap_underlying_metadata(map_bh.b_bdev, map_bh.b_blocknr); + if (buffer_boundary(&map_bh)) { + boundary_block = map_bh.b_blocknr; + boundary_bdev = map_bh.b_bdev; + } if (page_block) { if (map_bh.b_blocknr != blocks[page_block-1] + 1) goto confused; @@ -464,10 +474,15 @@ alloc_new: BUG_ON(PageWriteback(page)); SetPageWriteback(page); unlock_page(page); - if (boundary || (first_unmapped != blocks_per_page)) + if (boundary || (first_unmapped != blocks_per_page)) { bio = mpage_bio_submit(WRITE, bio); - else + if (boundary_block) { + write_boundary_block(boundary_bdev, + boundary_block, 1 << blkbits); + } + } else { *last_block_in_bio = blocks[blocks_per_page - 1]; + } goto out; confused: diff --git 
a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7ad9fafbea0b..71732e1216fc 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -167,6 +167,9 @@ void free_buffer_head(struct buffer_head * bh); void FASTCALL(unlock_buffer(struct buffer_head *bh)); void ll_rw_block(int, int, struct buffer_head * bh[]); int submit_bh(int, struct buffer_head *); +void write_boundary_block(struct block_device *bdev, + sector_t bblock, unsigned blocksize); + extern int buffer_heads_over_limit; /* -- cgit v1.2.3 From a27efcaff9ffd5ad05f4e111751da41a8820f7ab Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 4 Oct 2002 20:35:48 -0700 Subject: [PATCH] remove page->virtual The patch removes page->virtual for all architectures which do not define WANT_PAGE_VIRTUAL. Hash for it instead. Possibly we could define WANT_PAGE_VIRTUAL for CONFIG_HIGHMEM4G, but it seems unlikely. A lot of the pressure went off kmap() and page_address() as a result of the move to kmap_atomic(). That should be the preferred way to address CPU load in the set_page_address() and page_address() hashing and locking. If kmap_atomic is not usable then the next best approach is for users to cache the result of kmap() in a local rather than calling page_address() repeatedly. One heavy user of kmap() and page_address() is the ext2 directory code. On a 7G Quad PIII, running four concurrent instances of while true do find /usr/src/linux > /dev/null done on ext2 with everything cached, profiling shows that the new hashed set_page_address() and page_address() implementations consume 0.4% and 1.3% of CPU time respectively. I think that's OK. 
--- include/linux/mm.h | 48 +++++++++---------- init/main.c | 1 + kernel/ksyms.c | 3 ++ mm/highmem.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++--- mm/page_alloc.c | 5 +- 5 files changed, 156 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 4ae8eb10dcb2..a5107b5043f7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -176,7 +176,7 @@ struct page { * Architectures with slow multiplication can define * WANT_PAGE_VIRTUAL in asm/page.h */ -#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL) +#if defined(WANT_PAGE_VIRTUAL) void *virtual; /* Kernel virtual address (NULL if not kmapped, ie. highmem) */ #endif /* CONFIG_HIGMEM || WANT_PAGE_VIRTUAL */ @@ -289,38 +289,34 @@ static inline void set_page_zone(struct page *page, unsigned long zone_num) page->flags |= zone_num << ZONE_SHIFT; } -/* - * In order to avoid #ifdefs within C code itself, we define - * set_page_address to a noop for non-highmem machines, where - * the field isn't useful. - * The same is true for page_address() in arch-dependent code. - */ -#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL) +#define lowmem_page_address(page) \ + __va( ( ((page) - page_zone(page)->zone_mem_map) \ + + page_zone(page)->zone_start_pfn) << PAGE_SHIFT) + +#if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) +#define HASHED_PAGE_VIRTUAL +#endif +#if defined(WANT_PAGE_VIRTUAL) +#define page_address(page) ((page)->virtual) #define set_page_address(page, address) \ do { \ (page)->virtual = (address); \ } while(0) +#define page_address_init() do { } while(0) +#endif -#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ -#define set_page_address(page, address) do { } while(0) -#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ - -/* - * Permanent address of a page. Obviously must never be - * called on a highmem page. 
- */ -#if defined(CONFIG_HIGHMEM) || defined(WANT_PAGE_VIRTUAL) - -#define page_address(page) ((page)->virtual) - -#else /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ - -#define page_address(page) \ - __va( ( ((page) - page_zone(page)->zone_mem_map) \ - + page_zone(page)->zone_start_pfn) << PAGE_SHIFT) +#if defined(HASHED_PAGE_VIRTUAL) +void *page_address(struct page *page); +void set_page_address(struct page *page, void *virtual); +void page_address_init(void); +#endif -#endif /* CONFIG_HIGHMEM || WANT_PAGE_VIRTUAL */ +#if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) +#define page_address(page) lowmem_page_address(page) +#define set_page_address(page, address) do { } while(0) +#define page_address_init() do { } while(0) +#endif /* * Return true if this page is mapped into pagetables. Subtle: test pte.direct diff --git a/init/main.c b/init/main.c index 3cf336cf6f8d..f69c298b9a6f 100644 --- a/init/main.c +++ b/init/main.c @@ -433,6 +433,7 @@ asmlinkage void __init start_kernel(void) initrd_start = 0; } #endif + page_address_init(); mem_init(); kmem_cache_sizes_init(); pidhash_init(); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 4954fc80381f..56852f3be54c 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -132,6 +132,9 @@ EXPORT_SYMBOL(highmem_start_page); EXPORT_SYMBOL(kmap_prot); EXPORT_SYMBOL(kmap_pte); #endif +#ifdef HASHED_PAGE_VIRTUAL +EXPORT_SYMBOL(page_address); +#endif EXPORT_SYMBOL(get_user_pages); /* filesystem internal functions */ diff --git a/mm/highmem.c b/mm/highmem.c index 11d599cacbd0..68fedb0e559f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -22,6 +22,7 @@ #include #include #include +#include #include static mempool_t *page_pool, *isa_page_pool; @@ -88,7 +89,7 @@ static void flush_all_zero_pkmaps(void) page = pte_page(pkmap_page_table[i]); pte_clear(&pkmap_page_table[i]); - page->virtual = NULL; + set_page_address(page, NULL); } flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } @@ -126,8 +127,8 @@ start: 
spin_lock(&kmap_lock); /* Somebody else might have mapped it while we slept */ - if (page->virtual) - return (unsigned long) page->virtual; + if (page_address(page)) + return (unsigned long)page_address(page); /* Re-start */ goto start; @@ -137,7 +138,7 @@ start: set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot)); pkmap_count[last_pkmap_nr] = 1; - page->virtual = (void *) vaddr; + set_page_address(page, (void *)vaddr); return vaddr; } @@ -153,7 +154,7 @@ void *kmap_high(struct page *page) * We cannot call this from interrupts, as it may block */ spin_lock(&kmap_lock); - vaddr = (unsigned long) page->virtual; + vaddr = (unsigned long)page_address(page); if (!vaddr) vaddr = map_new_virtual(page); pkmap_count[PKMAP_NR(vaddr)]++; @@ -170,7 +171,7 @@ void kunmap_high(struct page *page) int need_wakeup; spin_lock(&kmap_lock); - vaddr = (unsigned long) page->virtual; + vaddr = (unsigned long)page_address(page); if (!vaddr) BUG(); nr = PKMAP_NR(vaddr); @@ -467,7 +468,7 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) *bio_orig = bio; } -#if CONFIG_DEBUG_HIGHMEM +#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_HIGHMEM) void check_highmem_ptes(void) { int idx, type; @@ -484,3 +485,121 @@ void check_highmem_ptes(void) } #endif +#if defined(HASHED_PAGE_VIRTUAL) + +#define PA_HASH_ORDER 7 + +/* + * Describes one page->virtual association + */ +struct page_address_map { + struct page *page; + void *virtual; + struct list_head list; +}; + +/* + * page_address_map freelist, allocated from page_address_maps. 
+ */ +static struct list_head page_address_pool; /* freelist */ +static spinlock_t pool_lock; /* protects page_address_pool */ + +/* + * Hash table bucket + */ +static struct page_address_slot { + struct list_head lh; /* List of page_address_maps */ + spinlock_t lock; /* Protect this bucket's list */ +} ____cacheline_aligned_in_smp page_address_htable[1<lock, flags); + if (!list_empty(&pas->lh)) { + struct page_address_map *pam; + + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + ret = pam->virtual; + goto done; + } + } + } +done: + spin_unlock_irqrestore(&pas->lock, flags); + return ret; +} + +void set_page_address(struct page *page, void *virtual) +{ + unsigned long flags; + struct page_address_slot *pas; + struct page_address_map *pam; + + BUG_ON(!PageHighMem(page)); + + pas = page_slot(page); + if (virtual) { /* Add */ + BUG_ON(list_empty(&page_address_pool)); + + spin_lock_irqsave(&pool_lock, flags); + pam = list_entry(page_address_pool.next, + struct page_address_map, list); + list_del(&pam->list); + spin_unlock_irqrestore(&pool_lock, flags); + + pam->page = page; + pam->virtual = virtual; + + spin_lock_irqsave(&pas->lock, flags); + list_add_tail(&pam->list, &pas->lh); + spin_unlock_irqrestore(&pas->lock, flags); + } else { /* Remove */ + spin_lock_irqsave(&pas->lock, flags); + list_for_each_entry(pam, &pas->lh, list) { + if (pam->page == page) { + list_del(&pam->list); + spin_unlock_irqrestore(&pas->lock, flags); + spin_lock_irqsave(&pool_lock, flags); + list_add_tail(&pam->list, &page_address_pool); + spin_unlock_irqrestore(&pool_lock, flags); + goto done; + } + } + spin_unlock_irqrestore(&pas->lock, flags); + } +done: + return; +} + +static struct page_address_map page_address_maps[LAST_PKMAP]; + +void __init page_address_init(void) +{ + int i; + + INIT_LIST_HEAD(&page_address_pool); + for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) + list_add(&page_address_maps[i].list, &page_address_pool); + for (i = 0; i < 
ARRAY_SIZE(page_address_htable); i++) { + INIT_LIST_HEAD(&page_address_htable[i].lh); + spin_lock_init(&page_address_htable[i].lock); + } + spin_lock_init(&pool_lock); +} + +#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7763adf4073e..4f892bb13250 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -921,12 +921,15 @@ void __init free_area_init_core(pg_data_t *pgdat, set_page_count(page, 0); SetPageReserved(page); INIT_LIST_HEAD(&page->list); +#ifdef WANT_PAGE_VIRTUAL if (j != ZONE_HIGHMEM) /* * The shift left won't overflow because the * ZONE_NORMAL is below 4G. */ - set_page_address(page, __va(zone_start_pfn << PAGE_SHIFT)); + set_page_address(page, + __va(zone_start_pfn << PAGE_SHIFT)); +#endif zone_start_pfn++; } -- cgit v1.2.3 From ae8172699dc084417d18f0f839e220cfd3363166 Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 6 Oct 2002 01:30:10 +0100 Subject: [SERIAL] Allow PCMCIA serial cards to work again. The PCMCIA layer claims the IO or memory regions for all cards. This means that any port registered via 8250_cs must not cause the 8250 code to claim the resources itself. We also add support for iomem-based ports at initialisation time for PPC. 
--- drivers/serial/8250.c | 53 +++++++++++++++++++++++++++------------------ drivers/serial/8250.h | 5 ++++- include/linux/serial_core.h | 2 ++ 3 files changed, 38 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c index 446243c8f4b9..ce9319f16132 100644 --- a/drivers/serial/8250.c +++ b/drivers/serial/8250.c @@ -1560,21 +1560,22 @@ static int serial8250_request_port(struct uart_port *port) { struct uart_8250_port *up = (struct uart_8250_port *)port; struct resource *res = NULL, *res_rsa = NULL; - int ret = -EBUSY; + int ret = 0; - if (up->port.type == PORT_RSA) { - ret = serial8250_request_rsa_resource(up, &res_rsa); - if (ret) - return ret; - } + if (up->port.flags & UPF_RESOURCES) { + if (up->port.type == PORT_RSA) { + ret = serial8250_request_rsa_resource(up, &res_rsa); + if (ret) + return ret; + } - ret = serial8250_request_std_resource(up, &res); + ret = serial8250_request_std_resource(up, &res); + } /* * If we have a mapbase, then request that as well. */ - if (res != NULL && up->port.iotype == SERIAL_IO_MEM && - up->port.mapbase) { + if (ret == 0 && up->port.flags & UPF_IOREMAP) { int size = res->end - res->start + 1; up->port.membase = ioremap(up->port.mapbase, size); @@ -1610,13 +1611,17 @@ static void serial8250_config_port(struct uart_port *port, int flags) * Find the region that we can probe for. This in turn * tells us whether we can probe for the type of port. 
*/ - ret = serial8250_request_std_resource(up, &res_std); - if (ret) - return; + if (up->port.flags & UPF_RESOURCES) { + ret = serial8250_request_std_resource(up, &res_std); + if (ret) + return; - ret = serial8250_request_rsa_resource(up, &res_rsa); - if (ret) + ret = serial8250_request_rsa_resource(up, &res_rsa); + if (ret) + probeflags &= ~PROBE_RSA; + } else { probeflags &= ~PROBE_RSA; + } if (flags & UART_CONFIG_TYPE) autoconfig(up, probeflags); @@ -1678,6 +1683,7 @@ static struct uart_8250_port serial8250_ports[UART_NR]; static void __init serial8250_isa_init_ports(void) { + struct uart_8250_port *up; static int first = 1; int i; @@ -1685,13 +1691,18 @@ static void __init serial8250_isa_init_ports(void) return; first = 0; - for (i = 0; i < ARRAY_SIZE(old_serial_port); i++) { - serial8250_ports[i].port.iobase = old_serial_port[i].port; - serial8250_ports[i].port.irq = irq_cannonicalize(old_serial_port[i].irq); - serial8250_ports[i].port.uartclk = old_serial_port[i].base_baud * 16; - serial8250_ports[i].port.flags = old_serial_port[i].flags; - serial8250_ports[i].port.hub6 = old_serial_port[i].hub6; - serial8250_ports[i].port.ops = &serial8250_pops; + for (i = 0, up = serial8250_ports; i < ARRAY_SIZE(old_serial_port); + i++, up++) { + up->port.iobase = old_serial_port[i].port; + up->port.irq = irq_cannonicalize(old_serial_port[i].irq); + up->port.uartclk = old_serial_port[i].baud_base * 16; + up->port.flags = old_serial_port[i].flags | + UPF_RESOURCES; + up->port.hub6 = old_serial_port[i].hub6; + up->port.membase = old_serial_port[i].iomem_base; + up->port.iotype = old_serial_port[i].io_type; + up->port.regshift = old_serial_port[i].iomem_reg_shift; + up->port.ops = &serial8250_pops; } } diff --git a/drivers/serial/8250.h b/drivers/serial/8250.h index 7e40a4f497e4..030116e6ebf6 100644 --- a/drivers/serial/8250.h +++ b/drivers/serial/8250.h @@ -30,11 +30,14 @@ void serial8250_get_irq_map(unsigned int *map); struct old_serial_port { unsigned int uart; - unsigned 
int base_baud; + unsigned int baud_base; unsigned int port; unsigned int irq; unsigned int flags; unsigned char hub6; + unsigned char io_type; + unsigned char *iomem_base; + unsigned short iomem_reg_shift; }; #undef SERIAL_DEBUG_PCI diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 8cf9eccab7c1..c38330747ee1 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -168,6 +168,8 @@ struct uart_port { #define UPF_BUGGY_UART (1 << 14) #define UPF_AUTOPROBE (1 << 15) #define UPF_BOOT_AUTOCONF (1 << 28) +#define UPF_RESOURCES (1 << 30) +#define UPF_IOREMAP (1 << 31) #define UPF_FLAGS (0x7fff) #define UPF_USR_MASK (UPF_SPD_MASK|UPF_LOW_LATENCY) -- cgit v1.2.3 From 529ba807382a8fb05b096d4194c7bd16377f34ed Mon Sep 17 00:00:00 2001 From: Alexander Viro Date: Sat, 5 Oct 2002 04:22:13 -0700 Subject: [PATCH] nbd switched to alloc_disk() --- drivers/block/nbd.c | 29 ++++++++++++++++++++++------- include/linux/nbd.h | 2 +- 2 files changed, 23 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index e554bfb4d331..be27027d32b8 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -444,15 +444,15 @@ static int nbd_ioctl(struct inode *inode, struct file *file, temp >>= 1; } nbd_bytesizes[dev] &= ~(nbd_blksizes[dev]-1); - set_capacity(&lo->disk, nbd_bytesizes[dev] >> 9); + set_capacity(lo->disk, nbd_bytesizes[dev] >> 9); return 0; case NBD_SET_SIZE: nbd_bytesizes[dev] = arg & ~(nbd_blksizes[dev]-1); - set_capacity(&lo->disk, nbd_bytesizes[dev] >> 9); + set_capacity(lo->disk, nbd_bytesizes[dev] >> 9); return 0; case NBD_SET_SIZE_BLOCKS: nbd_bytesizes[dev] = ((u64) arg) << nbd_blksize_bits[dev]; - set_capacity(&lo->disk, nbd_bytesizes[dev] >> 9); + set_capacity(lo->disk, nbd_bytesizes[dev] >> 9); return 0; case NBD_DO_IT: if (!lo->file) @@ -498,6 +498,7 @@ static struct block_device_operations nbd_fops = static int __init nbd_init(void) { + int err = -ENOMEM; int i; if 
(sizeof(struct nbd_request) != 28) { @@ -505,17 +506,25 @@ static int __init nbd_init(void) return -EIO; } + for (i = 0; i < MAX_NBD; i++) { + struct gendisk *disk = alloc_disk(); + if (!disk) + goto out; + nbd_dev[i].disk = disk; + } + if (register_blkdev(MAJOR_NR, "nbd", &nbd_fops)) { printk("Unable to get major number %d for NBD\n", MAJOR_NR); - return -EIO; + err = -EIO; + goto out; } #ifdef MODULE printk("nbd: registered device at major %d\n", MAJOR_NR); #endif blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), do_nbd_request, &nbd_lock); for (i = 0; i < MAX_NBD; i++) { - struct gendisk *disk = &nbd_dev[i].disk; + struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].refcnt = 0; nbd_dev[i].file = NULL; nbd_dev[i].magic = LO_MAGIC; @@ -541,13 +550,19 @@ static int __init nbd_init(void) &nbd_fops, NULL); return 0; +out: + while (i--) + put_disk(nbd_dev[i].disk); + return err; } static void __exit nbd_cleanup(void) { int i; - for (i = 0; i < MAX_NBD; i++) - del_gendisk(&nbd_dev[i].disk); + for (i = 0; i < MAX_NBD; i++) { + del_gendisk(nbd_dev[i].disk); + put_disk(nbd_dev[i].disk); + } devfs_unregister (devfs_handle); blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR)); diff --git a/include/linux/nbd.h b/include/linux/nbd.h index 8e4b8c236e8b..b20c045dbb79 100644 --- a/include/linux/nbd.h +++ b/include/linux/nbd.h @@ -79,7 +79,7 @@ struct nbd_device { spinlock_t queue_lock; struct list_head queue_head; /* Requests are added here... 
*/ struct semaphore tx_lock; - struct gendisk disk; + struct gendisk *disk; }; #endif -- cgit v1.2.3 From 4d1fc631c727b96c5fc7de3e7aa3bef70f57dddf Mon Sep 17 00:00:00 2001 From: Alexander Viro Date: Sat, 5 Oct 2002 04:22:53 -0700 Subject: [PATCH] amiga floppy switched to alloc_disk() --- drivers/block/amiflop.c | 47 +++++++++++++++++++++++++++-------------------- include/linux/amifd.h | 2 +- 2 files changed, 28 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 53c752eed240..22790c4145fe 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1643,7 +1643,7 @@ static int floppy_open(struct inode *inode, struct file *filp) unit[drive].dtype=&data_types[system]; unit[drive].blocks=unit[drive].type->heads*unit[drive].type->tracks* data_types[system].sects*unit[drive].type->sect_mult; - set_capacity(&unit[drive].disk, unit[drive].blocks); + set_capacity(unit[drive].gendisk, unit[drive].blocks); printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive, unit[drive].type->name, data_types[system].name); @@ -1731,25 +1731,31 @@ static int __init fd_probe_drives(void) drives=0; nomem=0; for(drive=0;drivecode != FD_NODRIVE) { - struct gendisk *disk = &unit[drive].disk; - drives++; - if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) { - printk("no mem for "); - unit[drive].type = &drive_types[num_dr_types - 1]; /* FD_NODRIVE */ - drives--; - nomem = 1; - } - printk("fd%d ",drive); - disk->major = MAJOR_NR; - disk->first_minor = drive; - disk->minor_shift = 0; - disk->fops = &floppy_fops; - sprintf(disk->disk_name, "fd%d", drive); - set_capacity(disk, 880*2); - add_disk(disk); + if (unit[drive].type->code == FD_NODRIVE) + continue; + disk = alloc_disk(); + if (!disk) { + unit[drive].type->code = FD_NODRIVE; + continue; + } + unit[drive].gendisk = disk; + drives++; + if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == 
NULL) { + printk("no mem for "); + unit[drive].type = &drive_types[num_dr_types - 1]; /* FD_NODRIVE */ + drives--; + nomem = 1; } + printk("fd%d ",drive); + disk->major = MAJOR_NR; + disk->first_minor = drive; + disk->minor_shift = 0; + disk->fops = &floppy_fops; + sprintf(disk->disk_name, "fd%d", drive); + set_capacity(disk, 880*2); + add_disk(disk); } if ((drives > 0) || (nomem == 0)) { if (drives == 0) @@ -1766,7 +1772,7 @@ static struct gendisk *floppy_find(int minor) int drive = minor & 3; if (unit[drive].type->code == FD_NODRIVE) return NULL; - return &unit[drive].disk; + return unit[drive].gendisk; } int __init amiga_floppy_init(void) @@ -1875,7 +1881,8 @@ void cleanup_module(void) for( i = 0; i < FD_MAX_UNITS; i++) { if (unit[i].type->code != FD_NODRIVE) { - del_gendisk(&unit[i].disk); + del_gendisk(unit[i].gendisk); + put_disk(unit[i].gendisk); kfree(unit[i].trackbuf); } } diff --git a/include/linux/amifd.h b/include/linux/amifd.h index eac36f77fa8f..346993268b45 100644 --- a/include/linux/amifd.h +++ b/include/linux/amifd.h @@ -55,7 +55,7 @@ struct amiga_floppy_struct { int busy; /* true when drive is active */ int dirty; /* true when trackbuf is not on disk */ int status; /* current error code for unit */ - struct gendisk disk; + struct gendisk *gendisk; }; #endif -- cgit v1.2.3