From 3dccf5d07f68e850b84daede79f3c8bc121f1546 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:39:25 -0700
Subject: [PATCH] x86-64 update

From: Andi Kleen <ak@muc.de>

Current x86-64 patchkit for 2.6.5.

- Add drivers/firmware/Kconfig

- Clarify description of CONFIG_IOMMU_DEBUG

- Use correct gcc option to optimize for Intel CPUs

- Add EDD support (Matt Domsch)

- Add workaround for broken IOMMU on VIA hardware.  Uses swiotlb there now.

- Handle more than 8 local APICs (Suresh B Siddha)

- Delete obsolete mtrr Makefile

- Add x86_cache_alignment and set it up properly for P4 (128 bytes instead
  of 64 bytes).  Also report it in /proc/cpuinfo.

- Minor cleanup in in_gate_area

- Make asm-generic/dma-mapping.h compile with !CONFIG_PCI.  Just stub out all
  functions in this case.  This is mainly to work around sysfs.

- More !CONFIG_PCI compile fixes

- Make u64 sector_t unconditional
---
 include/asm-generic/dma-mapping.h | 117 ++++++++++++++++++++++++++++++++++++++
 include/asm-x86_64/apicdef.h      |   2 +
 include/asm-x86_64/bootsetup.h    |   3 +
 include/asm-x86_64/pci.h          |   3 +-
 include/asm-x86_64/processor.h    |   3 +-
 include/asm-x86_64/proto.h        |   2 +
 include/asm-x86_64/smp.h          |  24 +++++++-
 include/asm-x86_64/types.h        |   4 --
 8 files changed, 150 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h
index 54b0f7b71e95..04a28e6dd366 100644
--- a/include/asm-generic/dma-mapping.h
+++ b/include/asm-generic/dma-mapping.h
@@ -7,6 +7,10 @@
 #ifndef _ASM_GENERIC_DMA_MAPPING_H
 #define _ASM_GENERIC_DMA_MAPPING_H
 
+#include <linux/config.h>
+
+#ifdef CONFIG_PCI
+
 /* we implement the API below in terms of the existing PCI one,
  * so include it */
 #include <linux/pci.h>
@@ -146,6 +150,119 @@ dma_mapping_error(dma_addr_t dma_addr)
 	return pci_dma_mapping_error(dma_addr);
 }
 
+
+#else
+
+static inline int
+dma_supported(struct device *dev, u64 mask)
+{
+	return 0;
+}
+
+static inline int
+dma_set_mask(struct device *dev, u64 dma_mask)
+{
+	BUG();
+	return 0;
+}
+
+static inline void *
+dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		   int flag)
+{
+	BUG();
+	return NULL;
+}
+
+static inline void
+dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
+		    dma_addr_t dma_handle)
+{
+	BUG();
+}
+
+static inline dma_addr_t
+dma_map_single(struct device *dev, void *cpu_addr, size_t size,
+	       enum dma_data_direction direction)
+{
+	BUG();
+	return 0;
+}
+
+static inline void
+dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
+		 enum dma_data_direction direction)
+{
+	BUG();
+}
+
+static inline dma_addr_t
+dma_map_page(struct device *dev, struct page *page,
+	     unsigned long offset, size_t size,
+	     enum dma_data_direction direction)
+{
+	BUG();
+	return 0;
+}
+
+static inline void
+dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
+	       enum dma_data_direction direction)
+{
+	BUG();
+}
+
+static inline int
+dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+	   enum dma_data_direction direction)
+{
+	BUG();
+	return 0;
+}
+
+static inline void
+dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
+	     enum dma_data_direction direction)
+{
+	BUG();
+}
+
+static inline void
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+			enum dma_data_direction direction)
+{
+	BUG();
+}
+
+static inline void
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
+			   enum dma_data_direction direction)
+{
+	BUG();
+}
+
+static inline void
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+		    enum dma_data_direction direction)
+{
+	BUG();
+}
+
+static inline void
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+		       enum dma_data_direction direction)
+{
+	BUG();
+}
+
+/* Named like the CONFIG_PCI variant so callers build either way; always returns 0 (no error). */
+static inline int dma_mapping_error(dma_addr_t dma_addr)
+{
+	return 0;
+}
+#define dma_error(dma_addr) dma_mapping_error(dma_addr)	/* original stub name, kept for compatibility */
+#endif
+
 /* Now for the API extensions over the pci_ one */
 
 #define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
diff --git a/include/asm-x86_64/apicdef.h b/include/asm-x86_64/apicdef.h
index 3a32c1452a89..8ba1d6ef13b2 100644
--- a/include/asm-x86_64/apicdef.h
+++ b/include/asm-x86_64/apicdef.h
@@ -370,4 +370,6 @@ struct local_apic {
 
 #undef u32
 
+#define BAD_APICID 0xFFu	/* returned by cpu_present_to_apicid() when the CPU index is >= NR_CPUS */
+
 #endif
diff --git a/include/asm-x86_64/bootsetup.h b/include/asm-x86_64/bootsetup.h
index b4644415575a..ee1557748b0e 100644
--- a/include/asm-x86_64/bootsetup.h
+++ b/include/asm-x86_64/bootsetup.h
@@ -26,6 +26,9 @@ extern char x86_boot_params[2048];
 #define INITRD_START (*(unsigned int *) (PARAM+0x218))
 #define INITRD_SIZE (*(unsigned int *) (PARAM+0x21c))
 #define EDID_INFO (*(struct edid_info *) (PARAM+0x440))
+#define DISK80_SIGNATURE (*(unsigned int*) (PARAM+DISK80_SIG_BUFFER))
+#define EDD_NR     (*(unsigned char *) (PARAM+EDDNR))
+#define EDD_BUF     ((struct edd_info *) (PARAM+EDDBUF))
 #define COMMAND_LINE saved_command_line
 #define COMMAND_LINE_SIZE 256
 
diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h
index 3e40884b0b19..ac9e9581d0a3 100644
--- a/include/asm-x86_64/pci.h
+++ b/include/asm-x86_64/pci.h
@@ -357,8 +357,9 @@ static inline void pcibios_add_platform_entries(struct pci_dev *dev)
 #endif /* __KERNEL__ */
 
 /* generic pci stuff */
+#ifdef CONFIG_PCI
 #include <asm-generic/pci.h>
-
 #include <linux/dma-mapping.h>
+#endif
 
 #endif /* __x8664_PCI_H */
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 14bab87d299b..a0ecd64a6a89 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -59,6 +59,7 @@ struct cpuinfo_x86 {
 	char	x86_model_id[64];
 	int 	x86_cache_size;  /* in KB */
 	int	x86_clflush_size;
+	int	x86_cache_alignment;
 	int	x86_tlbsize;	/* number of 4K pages in DTLB/ITLB combined(in pages)*/
         __u8    x86_virt_bits, x86_phys_bits;
         __u32   x86_power; 	
@@ -453,6 +454,6 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
 	ti->task;					\
 })
 
-#define cache_line_size() (boot_cpu_data.x86_clflush_size)
+#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
 
 #endif /* __ASM_X86_64_PROCESSOR_H */
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 693f1a022314..5b0c38182172 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -101,6 +101,8 @@ extern int acpi_disabled;
 extern int fallback_aper_order;
 extern int fallback_aper_force;
 extern int iommu_aperture;
+extern int iommu_aperture_disabled;
+extern int iommu_aperture_allowed;
 
 extern void smp_local_timer_interrupt(struct pt_regs * regs);
 
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 97a19c35864f..e82b6a9884fb 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -80,10 +80,30 @@ extern __inline int hard_smp_processor_id(void)
  * the real APIC ID <-> CPU # mapping.
  * AK: why is this volatile?
  */
-extern volatile char x86_apicid_to_cpu[NR_CPUS];
 extern volatile char x86_cpu_to_apicid[NR_CPUS];
 
-#define safe_smp_processor_id() (disable_apic ? 0 : x86_apicid_to_cpu[hard_smp_processor_id()])
+static inline char x86_apicid_to_cpu(char apicid)
+{
+	int i;
+	/* Reverse lookup: find the first CPU whose x86_cpu_to_apicid[] entry equals apicid. */
+	for (i = 0; i < NR_CPUS; ++i)
+		if (x86_cpu_to_apicid[i] == apicid)
+			return i;
+	/* NOTE(review): the index and -1 are returned as plain char -- confirm NR_CPUS fits. */
+	return -1;	/* no CPU matched */
+}
+
+#define safe_smp_processor_id() (disable_apic ? 0 : x86_apicid_to_cpu(hard_smp_processor_id()))
+
+extern u8 bios_cpu_apicid[];
+
+static inline int cpu_present_to_apicid(int mps_cpu)
+{	/* Look up bios_cpu_apicid[mps_cpu]; BAD_APICID when mps_cpu >= NR_CPUS. */
+	if (mps_cpu < NR_CPUS)	/* only the upper bound is checked here */
+		return (int)bios_cpu_apicid[mps_cpu];
+	else
+		return BAD_APICID;	/* index beyond the table */
+}
 
 #define cpu_online(cpu) cpu_isset(cpu, cpu_online_map)
 #endif /* !ASSEMBLY */
diff --git a/include/asm-x86_64/types.h b/include/asm-x86_64/types.h
index b7c4d2fb9509..c86c2e6793e2 100644
--- a/include/asm-x86_64/types.h
+++ b/include/asm-x86_64/types.h
@@ -33,8 +33,6 @@ typedef unsigned long long  __u64;
 
 #ifndef __ASSEMBLY__
 
-#include <linux/config.h>
-
 typedef signed char s8;
 typedef unsigned char u8;
 
@@ -50,10 +48,8 @@ typedef unsigned long long u64;
 typedef u64 dma64_addr_t;
 typedef u64 dma_addr_t;
 
-#ifdef CONFIG_LBD
 typedef u64 sector_t;
 #define HAVE_SECTOR_T
-#endif
 
 #endif /* __ASSEMBLY__ */
 
-- 
cgit v1.2.3


From 243c64b2cfea7e49e074c80db65fa7b90d765c6f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:39:51 -0700
Subject: [PATCH] feed devfs through Lindent

Nobody seems to have any outstanding work against devfs, so...
---
 fs/devfs/base.c                 | 2857 ++++++++++++++++++++-------------------
 fs/devfs/util.c                 |    2 +-
 include/linux/devfs_fs.h        |   32 +-
 include/linux/devfs_fs_kernel.h |   26 +-
 4 files changed, 1467 insertions(+), 1450 deletions(-)

(limited to 'include')

diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index 952031e7dd99..c7ea29114c4f 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -710,8 +710,8 @@
 #define DEBUG_UNREGISTER   0x0000004
 #define DEBUG_FREE         0x0000008
 #define DEBUG_SET_FLAGS    0x0000010
-#define DEBUG_S_READ       0x0000100        /*  Break  */
-#define DEBUG_I_LOOKUP     0x0001000        /*  Break  */
+#define DEBUG_S_READ       0x0000100	/*  Break  */
+#define DEBUG_I_LOOKUP     0x0001000	/*  Break  */
 #define DEBUG_I_CREATE     0x0002000
 #define DEBUG_I_GET        0x0004000
 #define DEBUG_I_CHANGE     0x0008000
@@ -719,8 +719,8 @@
 #define DEBUG_I_RLINK      0x0020000
 #define DEBUG_I_FLINK      0x0040000
 #define DEBUG_I_MKNOD      0x0080000
-#define DEBUG_F_READDIR    0x0100000        /*  Break  */
-#define DEBUG_D_DELETE     0x1000000        /*  Break  */
+#define DEBUG_F_READDIR    0x0100000	/*  Break  */
+#define DEBUG_D_DELETE     0x1000000	/*  Break  */
 #define DEBUG_D_RELEASE    0x2000000
 #define DEBUG_D_IPUT       0x4000000
 #define DEBUG_ALL          0xfffffff
@@ -753,88 +753,80 @@
 
 typedef struct devfs_entry *devfs_handle_t;
 
-struct directory_type
-{
-    rwlock_t lock;                   /*  Lock for searching(R)/updating(W)   */
-    struct devfs_entry *first;
-    struct devfs_entry *last;
-    unsigned char no_more_additions:1;
+struct directory_type {
+	rwlock_t lock;		/*  Lock for searching(R)/updating(W)   */
+	struct devfs_entry *first;
+	struct devfs_entry *last;
+	unsigned char no_more_additions:1;
 };
 
-struct symlink_type
-{
-    unsigned int length;         /*  Not including the NULL-termimator       */
-    char *linkname;              /*  This is NULL-terminated                 */
+struct symlink_type {
+	unsigned int length;	/*  Length of linkname, not including the NUL terminator  */
+	char *linkname;		/*  Link target string; NUL-terminated                    */
+};
 
-struct devfs_inode     /*  This structure is for "persistent" inode storage  */
-{
-    struct dentry *dentry;
-    struct timespec atime;
-    struct timespec mtime;
-    struct timespec ctime;
-    unsigned int ino;            /*  Inode number as seen in the VFS         */
-    uid_t uid;
-    gid_t gid;
+struct devfs_inode {		/*  This structure is for "persistent" inode storage  */
+	struct dentry *dentry;
+	struct timespec atime;
+	struct timespec mtime;
+	struct timespec ctime;
+	unsigned int ino;	/*  Inode number as seen in the VFS         */
+	uid_t uid;
+	gid_t gid;
 };
 
-struct devfs_entry
-{
+struct devfs_entry {
 #ifdef CONFIG_DEVFS_DEBUG
-    unsigned int magic_number;
+	unsigned int magic_number;
 #endif
-    void *info;
-    atomic_t refcount;           /*  When this drops to zero, it's unused    */
-    union 
-    {
-	struct directory_type dir;
-	dev_t dev;
-	struct symlink_type symlink;
-	const char *name;        /*  Only used for (mode == 0)               */
-    }
-    u;
-    struct devfs_entry *prev;    /*  Previous entry in the parent directory  */
-    struct devfs_entry *next;    /*  Next entry in the parent directory      */
-    struct devfs_entry *parent;  /*  The parent directory                    */
-    struct devfs_inode inode;
-    umode_t mode;
-    unsigned short namelen;      /*  I think 64k+ filenames are a way off... */
-    unsigned char vfs:1;/*  Whether the VFS may delete the entry   */
-    char name[1];                /*  This is just a dummy: the allocated array
-				     is bigger. This is NULL-terminated      */
+	void *info;
+	atomic_t refcount;	/*  When this drops to zero, it's unused    */
+	union {
+		struct directory_type dir;
+		dev_t dev;
+		struct symlink_type symlink;
+		const char *name;	/*  Only used for (mode == 0)               */
+	} u;
+	struct devfs_entry *prev;	/*  Previous entry in the parent directory  */
+	struct devfs_entry *next;	/*  Next entry in the parent directory      */
+	struct devfs_entry *parent;	/*  The parent directory                    */
+	struct devfs_inode inode;
+	umode_t mode;
+	unsigned short namelen;	/*  I think 64k+ filenames are a way off... */
+	unsigned char vfs:1;	/*  Whether the VFS may delete the entry   */
+	char name[1];		/*  This is just a dummy: the allocated array
+				   is bigger. This is NULL-terminated      */
 };
 
 /*  The root of the device tree  */
 static struct devfs_entry *root_entry;
 
-struct devfsd_buf_entry
-{
-    struct devfs_entry *de;      /*  The name is generated with this         */
-    unsigned short type;         /*  The type of event                       */
-    umode_t mode;
-    uid_t uid;
-    gid_t gid;
-    struct devfsd_buf_entry *next;
+struct devfsd_buf_entry {
+	struct devfs_entry *de;	/*  The name is generated with this         */
+	unsigned short type;	/*  The type of event                       */
+	umode_t mode;
+	uid_t uid;
+	gid_t gid;
+	struct devfsd_buf_entry *next;
 };
 
-struct fs_info                  /*  This structure is for the mounted devfs  */
-{
-    struct super_block *sb;
-    spinlock_t devfsd_buffer_lock;  /*  Lock when inserting/deleting events  */
-    struct devfsd_buf_entry *devfsd_first_event;
-    struct devfsd_buf_entry *devfsd_last_event;
-    volatile int devfsd_sleeping;
-    volatile struct task_struct *devfsd_task;
-    volatile pid_t devfsd_pgrp;
-    volatile struct file *devfsd_file;
-    struct devfsd_notify_struct *devfsd_info;
-    volatile unsigned long devfsd_event_mask;
-    atomic_t devfsd_overrun_count;
-    wait_queue_head_t devfsd_wait_queue;      /*  Wake devfsd on input       */
-    wait_queue_head_t revalidate_wait_queue;  /*  Wake when devfsd sleeps    */
+struct fs_info {		/*  This structure is for the mounted devfs  */
+	struct super_block *sb;
+	spinlock_t devfsd_buffer_lock;	/*  Lock when inserting/deleting events  */
+	struct devfsd_buf_entry *devfsd_first_event;
+	struct devfsd_buf_entry *devfsd_last_event;
+	volatile int devfsd_sleeping;
+	volatile struct task_struct *devfsd_task;
+	volatile pid_t devfsd_pgrp;
+	volatile struct file *devfsd_file;
+	struct devfsd_notify_struct *devfsd_info;
+	volatile unsigned long devfsd_event_mask;
+	atomic_t devfsd_overrun_count;
+	wait_queue_head_t devfsd_wait_queue;	/*  Wake devfsd on input       */
+	wait_queue_head_t revalidate_wait_queue;	/*  Wake when devfsd sleeps    */
 };
 
-static struct fs_info fs_info = {.devfsd_buffer_lock = SPIN_LOCK_UNLOCKED};
+static struct fs_info fs_info = {.devfsd_buffer_lock = SPIN_LOCK_UNLOCKED };
 static kmem_cache_t *devfsd_buf_cache;
 #ifdef CONFIG_DEVFS_DEBUG
 static unsigned int devfs_debug_init __initdata = DEBUG_NONE;
@@ -844,7 +836,7 @@ static unsigned int stat_num_entries;
 static unsigned int stat_num_bytes;
 #endif
 static unsigned char poison_array[8] =
-    {0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a};
+    { 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a };
 
 #ifdef CONFIG_DEVFS_MOUNT
 static unsigned int boot_options = OPTION_MOUNT;
@@ -853,75 +845,77 @@ static unsigned int boot_options = OPTION_NONE;
 #endif
 
 /*  Forward function declarations  */
-static devfs_handle_t _devfs_walk_path (struct devfs_entry *dir,
-					const char *name, int namelen,
-					int traverse_symlink);
-static ssize_t devfsd_read (struct file *file, char *buf, size_t len,
-			    loff_t *ppos);
-static int devfsd_ioctl (struct inode *inode, struct file *file,
-			 unsigned int cmd, unsigned long arg);
-static int devfsd_close (struct inode *inode, struct file *file);
+static devfs_handle_t _devfs_walk_path(struct devfs_entry *dir,
+				       const char *name, int namelen,
+				       int traverse_symlink);
+static ssize_t devfsd_read(struct file *file, char *buf, size_t len,
+			   loff_t * ppos);
+static int devfsd_ioctl(struct inode *inode, struct file *file,
+			unsigned int cmd, unsigned long arg);
+static int devfsd_close(struct inode *inode, struct file *file);
 #ifdef CONFIG_DEVFS_DEBUG
-static ssize_t stat_read (struct file *file, char *buf, size_t len,
-			  loff_t *ppos);
-static struct file_operations stat_fops =
-{
-    .read    = stat_read,
+static ssize_t stat_read(struct file *file, char *buf, size_t len,
+			 loff_t * ppos);
+static struct file_operations stat_fops = {
+	.read = stat_read,
 };
 #endif
 
-
 /*  Devfs daemon file operations  */
-static struct file_operations devfsd_fops =
-{
-    .read    = devfsd_read,
-    .ioctl   = devfsd_ioctl,
-    .release = devfsd_close,
+static struct file_operations devfsd_fops = {
+	.read = devfsd_read,
+	.ioctl = devfsd_ioctl,
+	.release = devfsd_close,
 };
 
-
 /*  Support functions follow  */
 
-
 /**
  *	devfs_get - Get a reference to a devfs entry.
  *	@de:  The devfs entry.
  */
 
-static struct devfs_entry *devfs_get (struct devfs_entry *de)
+static struct devfs_entry *devfs_get(struct devfs_entry *de)
 {
-    VERIFY_ENTRY (de);
-    if (de) atomic_inc (&de->refcount);
-    return de;
-}   /*  End Function devfs_get  */
+	VERIFY_ENTRY(de);
+	if (de)
+		atomic_inc(&de->refcount);
+	return de;
+}				/*  End Function devfs_get  */
 
 /**
  *	devfs_put - Put (release) a reference to a devfs entry.
  *	@de:  The handle to the devfs entry.
  */
 
-static void devfs_put (devfs_handle_t de)
-{
-    if (!de) return;
-    VERIFY_ENTRY (de);
-    if (de->info == POISON_PTR) OOPS ("(%p): poisoned pointer\n", de);
-    if ( !atomic_dec_and_test (&de->refcount) ) return;
-    if (de == root_entry) OOPS ("(%p): root entry being freed\n", de);
-    DPRINTK (DEBUG_FREE, "(%s): de: %p, parent: %p \"%s\"\n",
-	     de->name, de, de->parent,
-	     de->parent ? de->parent->name : "no parent");
-    if ( S_ISLNK (de->mode) ) kfree (de->u.symlink.linkname);
-    WRITE_ENTRY_MAGIC (de, 0);
+static void devfs_put(devfs_handle_t de)
+{
+	if (!de)
+		return;
+	VERIFY_ENTRY(de);
+	if (de->info == POISON_PTR)
+		OOPS("(%p): poisoned pointer\n", de);
+	if (!atomic_dec_and_test(&de->refcount))
+		return;
+	if (de == root_entry)
+		OOPS("(%p): root entry being freed\n", de);
+	DPRINTK(DEBUG_FREE, "(%s): de: %p, parent: %p \"%s\"\n",
+		de->name, de, de->parent,
+		de->parent ? de->parent->name : "no parent");
+	if (S_ISLNK(de->mode))
+		kfree(de->u.symlink.linkname);
+	WRITE_ENTRY_MAGIC(de, 0);
 #ifdef CONFIG_DEVFS_DEBUG
-    spin_lock (&stat_lock);
-    --stat_num_entries;
-    stat_num_bytes -= sizeof *de + de->namelen;
-    if ( S_ISLNK (de->mode) ) stat_num_bytes -= de->u.symlink.length + 1;
-    spin_unlock (&stat_lock);
+	spin_lock(&stat_lock);
+	--stat_num_entries;
+	stat_num_bytes -= sizeof *de + de->namelen;
+	if (S_ISLNK(de->mode))
+		stat_num_bytes -= de->u.symlink.length + 1;
+	spin_unlock(&stat_lock);
 #endif
-    de->info = POISON_PTR;
-    kfree (de);
-}   /*  End Function devfs_put  */
+	de->info = POISON_PTR;
+	kfree(de);
+}				/*  End Function devfs_put  */
 
 /**
  *	_devfs_search_dir - Search for a devfs entry in a directory.
@@ -934,26 +928,25 @@ static void devfs_put (devfs_handle_t de)
  *   An implicit devfs_get() is performed on the returned entry.
  */
 
-static struct devfs_entry *_devfs_search_dir (struct devfs_entry *dir,
-					      const char *name,
-					      unsigned int namelen)
+static struct devfs_entry *_devfs_search_dir(struct devfs_entry *dir,
+					     const char *name,
+					     unsigned int namelen)
 {
-    struct devfs_entry *curr;
-
-    if ( !S_ISDIR (dir->mode) )
-    {
-	PRINTK ("(%s): not a directory\n", dir->name);
-	return NULL;
-    }
-    for (curr = dir->u.dir.first; curr != NULL; curr = curr->next)
-    {
-	if (curr->namelen != namelen) continue;
-	if (memcmp (curr->name, name, namelen) == 0) break;
-	/*  Not found: try the next one  */
-    }
-    return devfs_get (curr);
-}   /*  End Function _devfs_search_dir  */
+	struct devfs_entry *curr;
 
+	if (!S_ISDIR(dir->mode)) {
+		PRINTK("(%s): not a directory\n", dir->name);
+		return NULL;
+	}
+	for (curr = dir->u.dir.first; curr != NULL; curr = curr->next) {
+		if (curr->namelen != namelen)
+			continue;
+		if (memcmp(curr->name, name, namelen) == 0)
+			break;
+		/*  Not found: try the next one  */
+	}
+	return devfs_get(curr);
+}				/*  End Function _devfs_search_dir  */
 
 /**
  *	_devfs_alloc_entry - Allocate a devfs entry.
@@ -965,36 +958,38 @@ static struct devfs_entry *_devfs_search_dir (struct devfs_entry *dir,
  *   %NULL.
  */
 
-static struct devfs_entry *_devfs_alloc_entry (const char *name,
-					       unsigned int namelen,
-					       umode_t mode)
+static struct devfs_entry *_devfs_alloc_entry(const char *name,
+					      unsigned int namelen,
+					      umode_t mode)
 {
-    struct devfs_entry *new;
-    static unsigned long inode_counter = FIRST_INODE;
-    static spinlock_t counter_lock = SPIN_LOCK_UNLOCKED;
-
-    if ( name && (namelen < 1) ) namelen = strlen (name);
-    if ( ( new = kmalloc (sizeof *new + namelen, GFP_KERNEL) ) == NULL )
-	return NULL;
-    memset (new, 0, sizeof *new + namelen);  /*  Will set '\0' on name  */
-    new->mode = mode;
-    if ( S_ISDIR (mode) ) rwlock_init (&new->u.dir.lock);
-    atomic_set (&new->refcount, 1);
-    spin_lock (&counter_lock);
-    new->inode.ino = inode_counter++;
-    spin_unlock (&counter_lock);
-    if (name) memcpy (new->name, name, namelen);
-    new->namelen = namelen;
-    WRITE_ENTRY_MAGIC (new, MAGIC_VALUE);
+	struct devfs_entry *new;
+	static unsigned long inode_counter = FIRST_INODE;
+	static spinlock_t counter_lock = SPIN_LOCK_UNLOCKED;
+
+	if (name && (namelen < 1))
+		namelen = strlen(name);
+	if ((new = kmalloc(sizeof *new + namelen, GFP_KERNEL)) == NULL)
+		return NULL;
+	memset(new, 0, sizeof *new + namelen);	/*  Will set '\0' on name  */
+	new->mode = mode;
+	if (S_ISDIR(mode))
+		rwlock_init(&new->u.dir.lock);
+	atomic_set(&new->refcount, 1);
+	spin_lock(&counter_lock);
+	new->inode.ino = inode_counter++;
+	spin_unlock(&counter_lock);
+	if (name)
+		memcpy(new->name, name, namelen);
+	new->namelen = namelen;
+	WRITE_ENTRY_MAGIC(new, MAGIC_VALUE);
 #ifdef CONFIG_DEVFS_DEBUG
-    spin_lock (&stat_lock);
-    ++stat_num_entries;
-    stat_num_bytes += sizeof *new + namelen;
-    spin_unlock (&stat_lock);
+	spin_lock(&stat_lock);
+	++stat_num_entries;
+	stat_num_bytes += sizeof *new + namelen;
+	spin_unlock(&stat_lock);
 #endif
-    return new;
-}   /*  End Function _devfs_alloc_entry  */
-
+	return new;
+}				/*  End Function _devfs_alloc_entry  */
 
 /**
  *	_devfs_append_entry - Append a devfs entry to a directory's child list.
@@ -1009,43 +1004,48 @@ static struct devfs_entry *_devfs_alloc_entry (const char *name,
  *   On failure, an implicit devfs_put() is performed on %de.
  */
 
-static int _devfs_append_entry (devfs_handle_t dir, devfs_handle_t de,
-				devfs_handle_t *old_de)
+static int _devfs_append_entry(devfs_handle_t dir, devfs_handle_t de,
+			       devfs_handle_t * old_de)
 {
-    int retval;
-
-    if (old_de) *old_de = NULL;
-    if ( !S_ISDIR (dir->mode) )
-    {
-	PRINTK ("(%s): dir: \"%s\" is not a directory\n", de->name, dir->name);
-	devfs_put (de);
-	return -ENOTDIR;
-    }
-    write_lock (&dir->u.dir.lock);
-    if (dir->u.dir.no_more_additions) retval = -ENOENT;
-    else
-    {
-	struct devfs_entry *old;
-
-	old = _devfs_search_dir (dir, de->name, de->namelen);
-	if (old_de) *old_de = old;
-	else devfs_put (old);
-	if (old == NULL)
-	{
-	    de->parent = dir;
-	    de->prev = dir->u.dir.last;
-	    /*  Append to the directory's list of children  */
-	    if (dir->u.dir.first == NULL) dir->u.dir.first = de;
-	    else dir->u.dir.last->next = de;
-	    dir->u.dir.last = de;
-	    retval = 0;
+	int retval;
+
+	if (old_de)
+		*old_de = NULL;
+	if (!S_ISDIR(dir->mode)) {
+		PRINTK("(%s): dir: \"%s\" is not a directory\n", de->name,
+		       dir->name);
+		devfs_put(de);
+		return -ENOTDIR;
 	}
-	else retval = -EEXIST;
-    }
-    write_unlock (&dir->u.dir.lock);
-    if (retval) devfs_put (de);
-    return retval;
-}   /*  End Function _devfs_append_entry  */
+	write_lock(&dir->u.dir.lock);
+	if (dir->u.dir.no_more_additions)
+		retval = -ENOENT;
+	else {
+		struct devfs_entry *old;
+
+		old = _devfs_search_dir(dir, de->name, de->namelen);
+		if (old_de)
+			*old_de = old;
+		else
+			devfs_put(old);
+		if (old == NULL) {
+			de->parent = dir;
+			de->prev = dir->u.dir.last;
+			/*  Append to the directory's list of children  */
+			if (dir->u.dir.first == NULL)
+				dir->u.dir.first = de;
+			else
+				dir->u.dir.last->next = de;
+			dir->u.dir.last = de;
+			retval = 0;
+		} else
+			retval = -EEXIST;
+	}
+	write_unlock(&dir->u.dir.lock);
+	if (retval)
+		devfs_put(de);
+	return retval;
+}				/*  End Function _devfs_append_entry  */
 
 /**
  *	_devfs_get_root_entry - Get the root devfs entry.
@@ -1067,7 +1067,7 @@ static struct devfs_entry *_devfs_get_root_entry(void)
 		return root_entry;
 
 	new = _devfs_alloc_entry(NULL, 0, MODE_DIR);
-	if (new == NULL )
+	if (new == NULL)
 		return NULL;
 
 	spin_lock(&root_lock);
@@ -1080,7 +1080,7 @@ static struct devfs_entry *_devfs_get_root_entry(void)
 	spin_unlock(&root_lock);
 
 	return root_entry;
-}   /*  End Function _devfs_get_root_entry  */
+}				/*  End Function _devfs_get_root_entry  */
 
 /**
  *	_devfs_descend - Descend down a tree using the next component name.
@@ -1096,142 +1096,134 @@ static struct devfs_entry *_devfs_get_root_entry(void)
  *   An implicit devfs_get() is performed on the returned entry.
  */
 
-static struct devfs_entry *_devfs_descend (struct devfs_entry *dir,
-					   const char *name, int namelen,
-					   int *next_pos)
-{
-    const char *stop, *ptr;
-    struct devfs_entry *entry;
-
-    if ( (namelen >= 3) && (strncmp (name, "../", 3) == 0) )
-    {   /*  Special-case going to parent directory  */
-	*next_pos = 3;
-	return devfs_get (dir->parent);
-    }
-    stop = name + namelen;
-    /*  Search for a possible '/'  */
-    for (ptr = name; (ptr < stop) && (*ptr != '/'); ++ptr);
-    *next_pos = ptr - name;
-    read_lock (&dir->u.dir.lock);
-    entry = _devfs_search_dir (dir, name, *next_pos);
-    read_unlock (&dir->u.dir.lock);
-    return entry;
-}   /*  End Function _devfs_descend  */
-
-
-static devfs_handle_t _devfs_make_parent_for_leaf (struct devfs_entry *dir,
-						   const char *name,
-						   int namelen, int *leaf_pos)
+static struct devfs_entry *_devfs_descend(struct devfs_entry *dir,
+					  const char *name, int namelen,
+					  int *next_pos)
 {
-    int next_pos = 0;
-
-    if (dir == NULL) dir = _devfs_get_root_entry ();
-    if (dir == NULL) return NULL;
-    devfs_get (dir);
-    /*  Search for possible trailing component and ignore it  */
-    for (--namelen; (namelen > 0) && (name[namelen] != '/'); --namelen);
-    *leaf_pos = (name[namelen] == '/') ? (namelen + 1) : 0;
-    for (; namelen > 0; name += next_pos, namelen -= next_pos)
-    {
-	struct devfs_entry *de, *old = NULL;
-
-	if ( ( de = _devfs_descend (dir, name, namelen, &next_pos) ) == NULL )
-	{
-	    de = _devfs_alloc_entry (name, next_pos, MODE_DIR);
-	    devfs_get (de);
-	    if ( !de || _devfs_append_entry (dir, de, &old) )
-	    {
-		devfs_put (de);
-		if ( !old || !S_ISDIR (old->mode) )
-		{
-		    devfs_put (old);
-		    devfs_put (dir);
-		    return NULL;
-		}
-		de = old;  /*  Use the existing directory  */
-	    }
+	const char *stop, *ptr;
+	struct devfs_entry *entry;
+
+	if ((namelen >= 3) && (strncmp(name, "../", 3) == 0)) {	/*  Special-case going to parent directory  */
+		*next_pos = 3;
+		return devfs_get(dir->parent);
 	}
-	if (de == dir->parent)
-	{
-	    devfs_put (dir);
-	    devfs_put (de);
-	    return NULL;
+	stop = name + namelen;
+	/*  Search for a possible '/'  */
+	for (ptr = name; (ptr < stop) && (*ptr != '/'); ++ptr) ;
+	*next_pos = ptr - name;
+	read_lock(&dir->u.dir.lock);
+	entry = _devfs_search_dir(dir, name, *next_pos);
+	read_unlock(&dir->u.dir.lock);
+	return entry;
+}				/*  End Function _devfs_descend  */
+
+static devfs_handle_t _devfs_make_parent_for_leaf(struct devfs_entry *dir,
+						  const char *name,
+						  int namelen, int *leaf_pos)
+{
+	int next_pos = 0;
+
+	if (dir == NULL)
+		dir = _devfs_get_root_entry();
+	if (dir == NULL)
+		return NULL;
+	devfs_get(dir);
+	/*  Search for possible trailing component and ignore it  */
+	for (--namelen; (namelen > 0) && (name[namelen] != '/'); --namelen) ;
+	*leaf_pos = (name[namelen] == '/') ? (namelen + 1) : 0;
+	for (; namelen > 0; name += next_pos, namelen -= next_pos) {
+		struct devfs_entry *de, *old = NULL;
+
+		if ((de =
+		     _devfs_descend(dir, name, namelen, &next_pos)) == NULL) {
+			de = _devfs_alloc_entry(name, next_pos, MODE_DIR);
+			devfs_get(de);
+			if (!de || _devfs_append_entry(dir, de, &old)) {
+				devfs_put(de);
+				if (!old || !S_ISDIR(old->mode)) {
+					devfs_put(old);
+					devfs_put(dir);
+					return NULL;
+				}
+				de = old;	/*  Use the existing directory  */
+			}
+		}
+		if (de == dir->parent) {
+			devfs_put(dir);
+			devfs_put(de);
+			return NULL;
+		}
+		devfs_put(dir);
+		dir = de;
+		if (name[next_pos] == '/')
+			++next_pos;
 	}
-	devfs_put (dir);
-	dir = de;
-	if (name[next_pos] == '/') ++next_pos;
-    }
-    return dir;
-}   /*  End Function _devfs_make_parent_for_leaf  */
-
+	return dir;
+}				/*  End Function _devfs_make_parent_for_leaf  */
 
-static devfs_handle_t _devfs_prepare_leaf (devfs_handle_t *dir,
-					   const char *name, umode_t mode)
+static devfs_handle_t _devfs_prepare_leaf(devfs_handle_t * dir,
+					  const char *name, umode_t mode)
 {
-    int namelen, leaf_pos;
-    struct devfs_entry *de;
-
-    namelen = strlen (name);
-    if ( ( *dir = _devfs_make_parent_for_leaf (*dir, name, namelen,
-					       &leaf_pos) ) == NULL )
-    {
-	PRINTK ("(%s): could not create parent path\n", name);
-	return NULL;
-    }
-    if ( ( de = _devfs_alloc_entry (name + leaf_pos, namelen - leaf_pos,mode) )
-	 == NULL )
-    {
-	PRINTK ("(%s): could not allocate entry\n", name);
-	devfs_put (*dir);
-	return NULL;
-    }
-    return de;
-}   /*  End Function _devfs_prepare_leaf  */
-
-
-static devfs_handle_t _devfs_walk_path (struct devfs_entry *dir,
-					const char *name, int namelen,
-					int traverse_symlink)
-{
-    int next_pos = 0;
-
-    if (dir == NULL) dir = _devfs_get_root_entry ();
-    if (dir == NULL) return NULL;
-    devfs_get (dir);
-    for (; namelen > 0; name += next_pos, namelen -= next_pos)
-    {
-	struct devfs_entry *de, *link;
-
-	if (!S_ISDIR (dir->mode))
-	{
-	    devfs_put (dir);
-	    return NULL;
-	}
+	int namelen, leaf_pos;
+	struct devfs_entry *de;
 
-	if ( ( de = _devfs_descend (dir, name, namelen, &next_pos) ) == NULL )
-	{
-	    devfs_put (dir);
-	    return NULL;
+	namelen = strlen(name);
+	if ((*dir = _devfs_make_parent_for_leaf(*dir, name, namelen,
+						&leaf_pos)) == NULL) {
+		PRINTK("(%s): could not create parent path\n", name);
+		return NULL;
 	}
-	if (S_ISLNK (de->mode) && traverse_symlink)
-	{   /*  Need to follow the link: this is a stack chomper  */
-		/* FIXME what if it puts outside of mounted tree? */
-	    link = _devfs_walk_path (dir, de->u.symlink.linkname,
-				     de->u.symlink.length, TRUE);
-	    devfs_put (de);
-	    if (!link)
-	    {
-		devfs_put (dir);
+	if ((de = _devfs_alloc_entry(name + leaf_pos, namelen - leaf_pos, mode))
+	    == NULL) {
+		PRINTK("(%s): could not allocate entry\n", name);
+		devfs_put(*dir);
 		return NULL;
-	    }
-	    de = link;
 	}
-	devfs_put (dir);
-	dir = de;
-	if (name[next_pos] == '/') ++next_pos;
-    }
-    return dir;
-}   /*  End Function _devfs_walk_path  */
+	return de;
+}				/*  End Function _devfs_prepare_leaf  */
+
+static devfs_handle_t _devfs_walk_path(struct devfs_entry *dir,
+				       const char *name, int namelen,
+				       int traverse_symlink)
+{
+	int next_pos = 0;
+
+	if (dir == NULL)
+		dir = _devfs_get_root_entry();
+	if (dir == NULL)
+		return NULL;
+	devfs_get(dir);
+	for (; namelen > 0; name += next_pos, namelen -= next_pos) {
+		struct devfs_entry *de, *link;
+
+		if (!S_ISDIR(dir->mode)) {
+			devfs_put(dir);
+			return NULL;
+		}
+
+		if ((de =
+		     _devfs_descend(dir, name, namelen, &next_pos)) == NULL) {
+			devfs_put(dir);
+			return NULL;
+		}
+		if (S_ISLNK(de->mode) && traverse_symlink) {	/*  Need to follow the link: this is a stack chomper  */
+			/* FIXME what if it puts outside of mounted tree? */
+			link = _devfs_walk_path(dir, de->u.symlink.linkname,
+						de->u.symlink.length, TRUE);
+			devfs_put(de);
+			if (!link) {
+				devfs_put(dir);
+				return NULL;
+			}
+			de = link;
+		}
+		devfs_put(dir);
+		dir = de;
+		if (name[next_pos] == '/')
+			++next_pos;
+	}
+	return dir;
+}				/*  End Function _devfs_walk_path  */
 
 /**
  *	_devfs_find_entry - Find a devfs entry.
@@ -1244,40 +1236,37 @@ static devfs_handle_t _devfs_walk_path (struct devfs_entry *dir,
  *	devfs_get() is performed.
  */
 
-static struct devfs_entry *_devfs_find_entry (devfs_handle_t dir,
-					      const char *name,
-					      int traverse_symlink)
+static struct devfs_entry *_devfs_find_entry(devfs_handle_t dir,
+					     const char *name,
+					     int traverse_symlink)
 {
-    unsigned int namelen = strlen (name);
-
-    if (name[0] == '/')
-    {
-	/*  Skip leading pathname component  */
-	if (namelen < 2)
-	{
-	    PRINTK ("(%s): too short\n", name);
-	    return NULL;
-	}
-	for (++name, --namelen; (*name != '/') && (namelen > 0);
-	     ++name, --namelen);
-	if (namelen < 2)
-	{
-	    PRINTK ("(%s): too short\n", name);
-	    return NULL;
+	unsigned int namelen = strlen(name);
+
+	if (name[0] == '/') {
+		/*  Name starts with '/': skip the leading pathname component  */
+		if (namelen < 2) {	/*  Nothing follows the leading '/'  */
+			PRINTK("(%s): too short\n", name);
+			return NULL;
+		}
+		for (++name, --namelen; (*name != '/') && (namelen > 0);
+		     ++name, --namelen) ;	/*  Empty body: advance to the next '/' or to the end  */
+		if (namelen < 2) {	/*  Nothing usable is left after the skipped component  */
+			PRINTK("(%s): too short\n", name);
+			return NULL;
+		}
+		++name;		/*  Step over the '/' that ended the skipped component  */
+		--namelen;
 	}
-	++name;
-	--namelen;
-    }
-    return _devfs_walk_path (dir, name, namelen, traverse_symlink);
-}   /*  End Function _devfs_find_entry  */
+	return _devfs_walk_path(dir, name, namelen, traverse_symlink);
+}				/*  End Function _devfs_find_entry  */
 
-static struct devfs_entry *get_devfs_entry_from_vfs_inode (struct inode *inode)
+static struct devfs_entry *get_devfs_entry_from_vfs_inode(struct inode *inode)
 {
-    if (inode == NULL) return NULL;
-    VERIFY_ENTRY ( (struct devfs_entry *) inode->u.generic_ip );
-    return inode->u.generic_ip;
-}   /*  End Function get_devfs_entry_from_vfs_inode  */
-
+	if (inode == NULL)
+		return NULL;
+	VERIFY_ENTRY((struct devfs_entry *)inode->u.generic_ip);
+	return inode->u.generic_ip;	/*  generic_ip is used as the devfs entry pointer of this inode  */
+}				/*  End Function get_devfs_entry_from_vfs_inode  */
 
 /**
  *	free_dentry - Free the dentry for a device entry and invalidate inode.
@@ -1287,20 +1276,21 @@ static struct devfs_entry *get_devfs_entry_from_vfs_inode (struct inode *inode)
  *	 parent directory.
  */
 
-static void free_dentry (struct devfs_entry *de)
+static void free_dentry(struct devfs_entry *de)
 {
-    struct dentry *dentry = de->inode.dentry;
-
-    if (!dentry) return;
-    spin_lock (&dcache_lock);
-    dget_locked (dentry);
-    spin_unlock (&dcache_lock);
-    /*  Forcefully remove the inode  */
-    if (dentry->d_inode != NULL) dentry->d_inode->i_nlink = 0;
-    d_drop (dentry);
-    dput (dentry);
-}   /*  End Function free_dentry  */
+	struct dentry *dentry = de->inode.dentry;
 
+	if (!dentry)
+		return;		/*  No dentry is recorded for this entry  */
+	spin_lock(&dcache_lock);
+	dget_locked(dentry);	/*  Paired with the dput() at the end of this function  */
+	spin_unlock(&dcache_lock);
+	/*  Forcefully remove the inode  */
+	if (dentry->d_inode != NULL)
+		dentry->d_inode->i_nlink = 0;
+	d_drop(dentry);
+	dput(dentry);
+}				/*  End Function free_dentry  */
 
 /**
  *	is_devfsd_or_child - Test if the current process is devfsd or one of its children.
@@ -1309,25 +1299,24 @@ static void free_dentry (struct devfs_entry *de)
  *	Returns %TRUE if devfsd or child, else %FALSE.
  */
 
-static int is_devfsd_or_child (struct fs_info *fs_info)
+static int is_devfsd_or_child(struct fs_info *fs_info)
 {
-    struct task_struct *p = current;
+	struct task_struct *p = current;	/*  Start from the calling task  */
 
-    if (p == fs_info->devfsd_task) return (TRUE);
-    if (process_group(p) == fs_info->devfsd_pgrp) return (TRUE);
-    read_lock(&tasklist_lock);
-    for ( ; p != &init_task; p = p->real_parent)
-    {
 	if (p == fs_info->devfsd_task)
-	{
-	    read_unlock (&tasklist_lock);
-	    return (TRUE);
+		return (TRUE);	/*  The caller is the devfsd task itself  */
+	if (process_group(p) == fs_info->devfsd_pgrp)
+		return (TRUE);	/*  The caller is in devfsd's process group  */
+	read_lock(&tasklist_lock);
+	for (; p != &init_task; p = p->real_parent) {	/*  Climb the real_parent chain up to init_task  */
+		if (p == fs_info->devfsd_task) {
+			read_unlock(&tasklist_lock);
+			return (TRUE);
+		}
 	}
-    }
-    read_unlock (&tasklist_lock);
-    return (FALSE);
-}   /*  End Function is_devfsd_or_child  */
-
+	read_unlock(&tasklist_lock);
+	return (FALSE);
+}				/*  End Function is_devfsd_or_child  */
 
 /**
  *	devfsd_queue_empty - Test if devfsd has work pending in its event queue.
@@ -1336,11 +1325,10 @@ static int is_devfsd_or_child (struct fs_info *fs_info)
  *	Returns %TRUE if the queue is empty, else %FALSE.
  */
 
-static inline int devfsd_queue_empty (struct fs_info *fs_info)
+static inline int devfsd_queue_empty(struct fs_info *fs_info)
 {
-    return (fs_info->devfsd_last_event) ? FALSE : TRUE;
-}   /*  End Function devfsd_queue_empty  */
-
+	return (fs_info->devfsd_last_event) ? FALSE : TRUE;	/*  A non-NULL last event means the queue is not empty  */
+}				/*  End Function devfsd_queue_empty  */
 
 /**
  *	wait_for_devfsd_finished - Wait for devfsd to finish processing its event queue.
@@ -1349,22 +1337,25 @@ static inline int devfsd_queue_empty (struct fs_info *fs_info)
  *	Returns %TRUE if no more waiting will be required, else %FALSE.
  */
 
-static int wait_for_devfsd_finished (struct fs_info *fs_info)
+static int wait_for_devfsd_finished(struct fs_info *fs_info)
 {
-    DECLARE_WAITQUEUE (wait, current);
-
-    if (fs_info->devfsd_task == NULL) return (TRUE);
-    if (devfsd_queue_empty (fs_info) && fs_info->devfsd_sleeping) return TRUE;
-    if ( is_devfsd_or_child (fs_info) ) return (FALSE);
-    set_current_state (TASK_UNINTERRUPTIBLE);
-    add_wait_queue (&fs_info->revalidate_wait_queue, &wait);
-    if (!devfsd_queue_empty (fs_info) || !fs_info->devfsd_sleeping)
-	if (fs_info->devfsd_task) schedule ();
-    remove_wait_queue (&fs_info->revalidate_wait_queue, &wait);
-    __set_current_state (TASK_RUNNING);
-    return (TRUE);
-}   /*  End Function wait_for_devfsd_finished  */
+	DECLARE_WAITQUEUE(wait, current);
 
+	if (fs_info->devfsd_task == NULL)
+		return (TRUE);	/*  No devfsd task is recorded: nothing to wait for  */
+	if (devfsd_queue_empty(fs_info) && fs_info->devfsd_sleeping)
+		return TRUE;	/*  Queue is empty and devfsd is marked as sleeping  */
+	if (is_devfsd_or_child(fs_info))
+		return (FALSE);	/*  devfsd and its children do not wait here  */
+	set_current_state(TASK_UNINTERRUPTIBLE);	/*  State is set before queueing and re-testing below  */
+	add_wait_queue(&fs_info->revalidate_wait_queue, &wait);
+	if (!devfsd_queue_empty(fs_info) || !fs_info->devfsd_sleeping)	/*  Same condition as above, re-tested while on the wait queue  */
+		if (fs_info->devfsd_task)
+			schedule();
+	remove_wait_queue(&fs_info->revalidate_wait_queue, &wait);
+	__set_current_state(TASK_RUNNING);
+	return (TRUE);
+}				/*  End Function wait_for_devfsd_finished  */
 
 /**
  *	devfsd_notify_de - Notify the devfsd daemon of a change.
@@ -1379,35 +1370,37 @@ static int wait_for_devfsd_finished (struct fs_info *fs_info)
  *	Returns %TRUE if an event was queued and devfsd woken up, else %FALSE.
  */
 
-static int devfsd_notify_de (struct devfs_entry *de,
-			     unsigned short type, umode_t mode,
-			     uid_t uid, gid_t gid, struct fs_info *fs_info)
+static int devfsd_notify_de(struct devfs_entry *de,
+			    unsigned short type, umode_t mode,
+			    uid_t uid, gid_t gid, struct fs_info *fs_info)
 {
-    struct devfsd_buf_entry *entry;
-    struct devfs_entry *curr;
-
-    if ( !( fs_info->devfsd_event_mask & (1 << type) ) ) return (FALSE);
-    if ( ( entry = kmem_cache_alloc (devfsd_buf_cache, SLAB_KERNEL) ) == NULL )
-    {
-	atomic_inc (&fs_info->devfsd_overrun_count);
-	return (FALSE);
-    }
-    for (curr = de; curr != NULL; curr = curr->parent) devfs_get (curr);
-    entry->de = de;
-    entry->type = type;
-    entry->mode = mode;
-    entry->uid = uid;
-    entry->gid = gid;
-    entry->next = NULL;
-    spin_lock (&fs_info->devfsd_buffer_lock);
-    if (!fs_info->devfsd_first_event) fs_info->devfsd_first_event = entry;
-    if (fs_info->devfsd_last_event) fs_info->devfsd_last_event->next = entry;
-    fs_info->devfsd_last_event = entry;
-    spin_unlock (&fs_info->devfsd_buffer_lock);
-    wake_up_interruptible (&fs_info->devfsd_wait_queue);
-    return (TRUE);
-}   /*  End Function devfsd_notify_de  */
+	struct devfsd_buf_entry *entry;
+	struct devfs_entry *curr;
 
+	if (!(fs_info->devfsd_event_mask & (1 << type)))
+		return (FALSE);	/*  The bit for this event type is not set in the mask  */
+	if ((entry = kmem_cache_alloc(devfsd_buf_cache, SLAB_KERNEL)) == NULL) {
+		atomic_inc(&fs_info->devfsd_overrun_count);	/*  Allocation failed: count the event that is dropped  */
+		return (FALSE);
+	}
+	for (curr = de; curr != NULL; curr = curr->parent)
+		devfs_get(curr);	/*  devfs_get() on @de and on each of its ancestors  */
+	entry->de = de;
+	entry->type = type;
+	entry->mode = mode;
+	entry->uid = uid;
+	entry->gid = gid;
+	entry->next = NULL;
+	spin_lock(&fs_info->devfsd_buffer_lock);
+	if (!fs_info->devfsd_first_event)
+		fs_info->devfsd_first_event = entry;	/*  No head yet: the new entry is also the head  */
+	if (fs_info->devfsd_last_event)
+		fs_info->devfsd_last_event->next = entry;	/*  Link behind the current tail  */
+	fs_info->devfsd_last_event = entry;	/*  The new entry is now the tail  */
+	spin_unlock(&fs_info->devfsd_buffer_lock);
+	wake_up_interruptible(&fs_info->devfsd_wait_queue);
+	return (TRUE);
+}				/*  End Function devfsd_notify_de  */
 
 /**
  *	devfsd_notify - Notify the devfsd daemon of a change.
@@ -1417,11 +1410,11 @@ static int devfsd_notify_de (struct devfs_entry *de,
  *		the event.
  */
 
-static void devfsd_notify (struct devfs_entry *de,unsigned short type)
+static void devfsd_notify(struct devfs_entry *de, unsigned short type)
 {
 	devfsd_notify_de(de, type, de->mode, current->euid,
 			 current->egid, &fs_info);
-} 
+}				/*  End Function devfsd_notify  */
 
 static int devfs_mk_dev(dev_t dev, umode_t mode, const char *fmt, va_list args)
 {
@@ -1432,15 +1425,15 @@ static int devfs_mk_dev(dev_t dev, umode_t mode, const char *fmt, va_list args)
 	n = vsnprintf(buf, sizeof(buf), fmt, args);
 	if (n >= sizeof(buf) || !buf[0]) {
 		printk(KERN_WARNING "%s: invalid format string %s\n",
-				__FUNCTION__, fmt);
+		       __FUNCTION__, fmt);
 		return -EINVAL;
 	}
-	
+
 	de = _devfs_prepare_leaf(&dir, buf, mode);
 	if (!de) {
 		printk(KERN_WARNING "%s: could not prepare leaf for %s\n",
-				__FUNCTION__, buf);
-		return -ENOMEM;		/* could be more accurate... */
+		       __FUNCTION__, buf);
+		return -ENOMEM;	/* could be more accurate... */
 	}
 
 	de->u.dev = dev;
@@ -1448,12 +1441,12 @@ static int devfs_mk_dev(dev_t dev, umode_t mode, const char *fmt, va_list args)
 	error = _devfs_append_entry(dir, de, NULL);
 	if (error) {
 		printk(KERN_WARNING "%s: could not append to parent for %s\n",
-				__FUNCTION__, buf);
+		       __FUNCTION__, buf);
 		goto out;
 	}
 
 	devfsd_notify(de, DEVFSD_NOTIFY_REGISTERED);
- out:
+      out:
 	devfs_put(dir);
 	return error;
 }
@@ -1464,7 +1457,7 @@ int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...)
 
 	if (!S_ISBLK(mode)) {
 		printk(KERN_WARNING "%s: invalide mode (%u) for %s\n",
-				__FUNCTION__, mode, fmt);
+		       __FUNCTION__, mode, fmt);
 		return -EINVAL;
 	}
 
@@ -1474,14 +1467,13 @@ int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...)
 
 EXPORT_SYMBOL(devfs_mk_bdev);
 
-
 int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...)
 {
 	va_list args;
 
 	if (!S_ISCHR(mode)) {
 		printk(KERN_WARNING "%s: invalide mode (%u) for %s\n",
-				__FUNCTION__, mode, fmt);
+		       __FUNCTION__, mode, fmt);
 		return -EINVAL;
 	}
 
@@ -1491,7 +1483,6 @@ int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...)
 
 EXPORT_SYMBOL(devfs_mk_cdev);
 
-
 /**
  *	_devfs_unhook - Unhook a device entry from its parents list
  *	@de: The entry to unhook.
@@ -1501,21 +1492,25 @@ EXPORT_SYMBOL(devfs_mk_cdev);
  *	The caller must have a write lock on the parent directory.
  */
 
-static int _devfs_unhook (struct devfs_entry *de)
-{
-    struct devfs_entry *parent;
-
-    if ( !de || (de->prev == de) ) return FALSE;
-    parent = de->parent;
-    if (de->prev == NULL) parent->u.dir.first = de->next;
-    else de->prev->next = de->next;
-    if (de->next == NULL) parent->u.dir.last = de->prev;
-    else de->next->prev = de->prev;
-    de->prev = de;          /*  Indicate we're unhooked                      */
-    de->next = NULL;        /*  Force early termination for <devfs_readdir>  */
-    return TRUE;
-}   /*  End Function _devfs_unhook  */
-
+static int _devfs_unhook(struct devfs_entry *de)
+{
+	struct devfs_entry *parent;
+
+	if (!de || (de->prev == de))
+		return FALSE;	/*  NULL entry, or prev points at itself (already unhooked)  */
+	parent = de->parent;
+	if (de->prev == NULL)
+		parent->u.dir.first = de->next;	/*  de was the first entry of the parent  */
+	else
+		de->prev->next = de->next;
+	if (de->next == NULL)
+		parent->u.dir.last = de->prev;	/*  de was the last entry of the parent  */
+	else
+		de->next->prev = de->prev;
+	de->prev = de;		/*  Indicate we're unhooked                      */
+	de->next = NULL;	/*  Force early termination for <devfs_readdir>  */
+	return TRUE;
+}				/*  End Function _devfs_unhook  */
 
 /**
  *	_devfs_unregister - Unregister a device entry from its parent.
@@ -1526,83 +1521,83 @@ static int _devfs_unhook (struct devfs_entry *de)
  *	unlocked by this function.
  */
 
-static void _devfs_unregister (struct devfs_entry *dir, struct devfs_entry *de)
+static void _devfs_unregister(struct devfs_entry *dir, struct devfs_entry *de)
 {
-    int unhooked = _devfs_unhook (de);
-
-    write_unlock (&dir->u.dir.lock);
-    if (!unhooked) return;
-    devfs_get (dir);
-    devfsd_notify (de, DEVFSD_NOTIFY_UNREGISTERED);
-    free_dentry (de);
-    devfs_put (dir);
-    if ( !S_ISDIR (de->mode) ) return;
-    while (TRUE)  /*  Recursively unregister: this is a stack chomper  */
-    {
-	struct devfs_entry *child;
-
-	write_lock (&de->u.dir.lock);
-	de->u.dir.no_more_additions = TRUE;
-	child = de->u.dir.first;
-	VERIFY_ENTRY (child);
-	_devfs_unregister (de, child);
-	if (!child) break;
-	DPRINTK (DEBUG_UNREGISTER, "(%s): child: %p  refcount: %d\n",
-		 child->name, child, atomic_read (&child->refcount) );
-	devfs_put (child);
-    }
-}   /*  End Function _devfs_unregister  */
-
-static int devfs_do_symlink (devfs_handle_t dir, const char *name,
-			     const char *link, devfs_handle_t *handle)
+	int unhooked = _devfs_unhook(de);
+
+	write_unlock(&dir->u.dir.lock);	/*  Releases the write lock the caller took on @dir  */
+	if (!unhooked)
+		return;		/*  Nothing was unhooked, so nothing else to tear down  */
+	devfs_get(dir);
+	devfsd_notify(de, DEVFSD_NOTIFY_UNREGISTERED);
+	free_dentry(de);
+	devfs_put(dir);
+	if (!S_ISDIR(de->mode))
+		return;		/*  Only directories have children to unregister  */
+	while (TRUE) {		/*  Recursively unregister: this is a stack chomper  */
+		struct devfs_entry *child;
+
+		write_lock(&de->u.dir.lock);
+		de->u.dir.no_more_additions = TRUE;
+		child = de->u.dir.first;
+		VERIFY_ENTRY(child);
+		_devfs_unregister(de, child);	/*  Also drops de->u.dir.lock, even when child is NULL  */
+		if (!child)
+			break;	/*  The directory had no first entry left  */
+		DPRINTK(DEBUG_UNREGISTER, "(%s): child: %p  refcount: %d\n",
+			child->name, child, atomic_read(&child->refcount));
+		devfs_put(child);
+	}
+}				/*  End Function _devfs_unregister  */
+
+static int devfs_do_symlink(devfs_handle_t dir, const char *name,
+			    const char *link, devfs_handle_t * handle)
 {
-    int err;
-    unsigned int linklength;
-    char *newlink;
-    struct devfs_entry *de;
-
-    if (handle != NULL) *handle = NULL;
-    if (name == NULL)
-    {
-	PRINTK ("(): NULL name pointer\n");
-	return -EINVAL;
-    }
-    if (link == NULL)
-    {
-	PRINTK ("(%s): NULL link pointer\n", name);
-	return -EINVAL;
-    }
-    linklength = strlen (link);
-    if ( ( newlink = kmalloc (linklength + 1, GFP_KERNEL) ) == NULL )
-	return -ENOMEM;
-    memcpy (newlink, link, linklength);
-    newlink[linklength] = '\0';
-    if ( ( de = _devfs_prepare_leaf (&dir, name, S_IFLNK | S_IRUGO | S_IXUGO) )
-	 == NULL )
-    {
-	PRINTK ("(%s): could not prepare leaf\n", name);
-	kfree (newlink);
-	return -ENOTDIR;
-    }
-    de->info = NULL;
-    de->u.symlink.linkname = newlink;
-    de->u.symlink.length = linklength;
-    if ( ( err = _devfs_append_entry (dir, de, NULL) ) != 0 )
-    {
-	PRINTK ("(%s): could not append to parent, err: %d\n", name, err);
-	devfs_put (dir);
-	return err;
-    }
-    devfs_put (dir);
+	int err;
+	unsigned int linklength;
+	char *newlink;
+	struct devfs_entry *de;
+
+	if (handle != NULL)
+		*handle = NULL;	/*  Caller sees NULL unless the link is created  */
+	if (name == NULL) {
+		PRINTK("(): NULL name pointer\n");
+		return -EINVAL;
+	}
+	if (link == NULL) {
+		PRINTK("(%s): NULL link pointer\n", name);
+		return -EINVAL;
+	}
+	linklength = strlen(link);
+	if ((newlink = kmalloc(linklength + 1, GFP_KERNEL)) == NULL)	/*  +1 for the terminating NUL  */
+		return -ENOMEM;
+	memcpy(newlink, link, linklength);
+	newlink[linklength] = '\0';
+	if ((de = _devfs_prepare_leaf(&dir, name, S_IFLNK | S_IRUGO | S_IXUGO))
+	    == NULL) {
+		PRINTK("(%s): could not prepare leaf\n", name);
+		kfree(newlink);	/*  The copy was not attached to any entry  */
+		return -ENOTDIR;
+	}
+	de->info = NULL;
+	de->u.symlink.linkname = newlink;	/*  The entry now points at the private copy  */
+	de->u.symlink.length = linklength;
+	if ((err = _devfs_append_entry(dir, de, NULL)) != 0) {
+		PRINTK("(%s): could not append to parent, err: %d\n", name,
+		       err);
+		devfs_put(dir);
+		return err;
+	}
+	devfs_put(dir);
 #ifdef CONFIG_DEVFS_DEBUG
-    spin_lock (&stat_lock);
-    stat_num_bytes += linklength + 1;
-    spin_unlock (&stat_lock);
+	spin_lock(&stat_lock);
+	stat_num_bytes += linklength + 1;	/*  Size of the link name copy allocated above  */
+	spin_unlock(&stat_lock);
 #endif
-    if (handle != NULL) *handle = de;
-    return 0;
-}   /*  End Function devfs_do_symlink  */
-
+	if (handle != NULL)
+		*handle = de;
+	return 0;
+}				/*  End Function devfs_do_symlink  */
 
 /**
  *	devfs_mk_symlink Create a symbolic link in the devfs namespace.
@@ -1626,7 +1621,6 @@ int devfs_mk_symlink(const char *from, const char *to)
 	return err;
 }
 
-
 /**
  *	devfs_mk_dir - Create a directory in the devfs namespace.
  *		new name is relative to the root of the devfs.
@@ -1668,19 +1662,18 @@ int devfs_mk_dir(const char *fmt, ...)
 		goto out_put;
 	} else if (error) {
 		PRINTK("(%s): could not append to dir: %p \"%s\"\n",
-				buf, dir, dir->name);
+		       buf, dir, dir->name);
 		devfs_put(old);
 		goto out_put;
 	}
-	
+
 	devfsd_notify(de, DEVFSD_NOTIFY_REGISTERED);
 
- out_put:
+      out_put:
 	devfs_put(dir);
 	return error;
 }
 
-
 void devfs_remove(const char *fmt, ...)
 {
 	char buf[64];
@@ -1706,7 +1699,6 @@ void devfs_remove(const char *fmt, ...)
 	}
 }
 
-
 /**
  *	devfs_generate_path - Generate a pathname for an entry, relative to the devfs root.
  *	@de: The devfs entry.
@@ -1718,90 +1710,93 @@ void devfs_remove(const char *fmt, ...)
  *	else a negative error code.
  */
 
-static int devfs_generate_path (devfs_handle_t de, char *path, int buflen)
+static int devfs_generate_path(devfs_handle_t de, char *path, int buflen)
 {
-    int pos;
+	int pos;		/*  Index in @path of the first byte written so far  */
 #define NAMEOF(de) ( (de)->mode ? (de)->name : (de)->u.name )
 
-    if (de == NULL) return -EINVAL;
-    VERIFY_ENTRY (de);
-    if (de->namelen >= buflen) return -ENAMETOOLONG; /*  Must be first       */
-    path[buflen - 1] = '\0';
-    if (de->parent == NULL) return buflen - 1;       /*  Don't prepend root  */
-    pos = buflen - de->namelen - 1;
-    memcpy (path + pos, NAMEOF (de), de->namelen);
-    for (de = de->parent; de->parent != NULL; de = de->parent)
-    {
-	if (pos - de->namelen - 1 < 0) return -ENAMETOOLONG;
-	path[--pos] = '/';
-	pos -= de->namelen;
-	memcpy (path + pos, NAMEOF (de), de->namelen);
-    }
-    return pos;
-}   /*  End Function devfs_generate_path  */
+	if (de == NULL)
+		return -EINVAL;
+	VERIFY_ENTRY(de);
+	if (de->namelen >= buflen)
+		return -ENAMETOOLONG;	/*  Must be first       */
+	path[buflen - 1] = '\0';	/*  The path is assembled backwards from the end of the buffer  */
+	if (de->parent == NULL)
+		return buflen - 1;	/*  Don't prepend root  */
+	pos = buflen - de->namelen - 1;
+	memcpy(path + pos, NAMEOF(de), de->namelen);
+	for (de = de->parent; de->parent != NULL; de = de->parent) {	/*  Stops at the entry that has no parent  */
+		if (pos - de->namelen - 1 < 0)
+			return -ENAMETOOLONG;	/*  No room for '/' plus this ancestor's name  */
+		path[--pos] = '/';
+		pos -= de->namelen;
+		memcpy(path + pos, NAMEOF(de), de->namelen);
+	}
+	return pos;		/*  Offset in @path where the generated name starts  */
+}				/*  End Function devfs_generate_path  */
 
 /**
  *	devfs_setup - Process kernel boot options.
  *	@str: The boot options after the "devfs=".
  */
 
-static int __init devfs_setup (char *str)
+static int __init devfs_setup(char *str)
 {
-    static struct
-    {
-	char *name;
-	unsigned int mask;
-	unsigned int *opt;
-    } devfs_options_tab[] __initdata =
-    {
+	static struct {
+		char *name;	/*  Option keyword as written after "devfs="  */
+		unsigned int mask;	/*  Bits set or cleared in *opt  */
+		unsigned int *opt;	/*  Variable the mask is applied to  */
+	} devfs_options_tab[] __initdata = {
 #ifdef CONFIG_DEVFS_DEBUG
-	{"dall",      DEBUG_ALL,          &devfs_debug_init},
-	{"dmod",      DEBUG_MODULE_LOAD,  &devfs_debug_init},
-	{"dreg",      DEBUG_REGISTER,     &devfs_debug_init},
-	{"dunreg",    DEBUG_UNREGISTER,   &devfs_debug_init},
-	{"dfree",     DEBUG_FREE,         &devfs_debug_init},
-	{"diget",     DEBUG_I_GET,        &devfs_debug_init},
-	{"dchange",   DEBUG_SET_FLAGS,    &devfs_debug_init},
-	{"dsread",    DEBUG_S_READ,       &devfs_debug_init},
-	{"dichange",  DEBUG_I_CHANGE,     &devfs_debug_init},
-	{"dimknod",   DEBUG_I_MKNOD,      &devfs_debug_init},
-	{"dilookup",  DEBUG_I_LOOKUP,     &devfs_debug_init},
-	{"diunlink",  DEBUG_I_UNLINK,     &devfs_debug_init},
-#endif  /*  CONFIG_DEVFS_DEBUG  */
-	{"mount",     OPTION_MOUNT,       &boot_options},
-	{NULL,        0,                  NULL}
-    };
-
-    while ( (*str != '\0') && !isspace (*str) )
-    {
-	int i, found = 0, invert = 0;
-
-	if (strncmp (str, "no", 2) == 0)
-	{
-	    invert = 1;
-	    str += 2;
-	}
-	for (i = 0; devfs_options_tab[i].name != NULL; i++)
-	{
-	    int len = strlen (devfs_options_tab[i].name);
-
-	    if (strncmp (str, devfs_options_tab[i].name, len) == 0)
-	    {
-		if (invert)
-		    *devfs_options_tab[i].opt &= ~devfs_options_tab[i].mask;
-		else
-		    *devfs_options_tab[i].opt |= devfs_options_tab[i].mask;
-		str += len;
-		found = 1;
-		break;
-	    }
+		{
+		"dall", DEBUG_ALL, &devfs_debug_init}, {
+		"dmod", DEBUG_MODULE_LOAD, &devfs_debug_init}, {
+		"dreg", DEBUG_REGISTER, &devfs_debug_init}, {
+		"dunreg", DEBUG_UNREGISTER, &devfs_debug_init}, {
+		"dfree", DEBUG_FREE, &devfs_debug_init}, {
+		"diget", DEBUG_I_GET, &devfs_debug_init}, {
+		"dchange", DEBUG_SET_FLAGS, &devfs_debug_init}, {
+		"dsread", DEBUG_S_READ, &devfs_debug_init}, {
+		"dichange", DEBUG_I_CHANGE, &devfs_debug_init}, {
+		"dimknod", DEBUG_I_MKNOD, &devfs_debug_init}, {
+		"dilookup", DEBUG_I_LOOKUP, &devfs_debug_init}, {
+		"diunlink", DEBUG_I_UNLINK, &devfs_debug_init},
+#endif				/*  CONFIG_DEVFS_DEBUG  */
+		{
+		"mount", OPTION_MOUNT, &boot_options}, {
+		NULL, 0, NULL}	/*  NULL name terminates the table  */
+	};
+
+	while ((*str != '\0') && !isspace(*str)) {
+		int i, found = 0, invert = 0;
+
+		if (strncmp(str, "no", 2) == 0) {	/*  A "no" prefix clears the option instead of setting it  */
+			invert = 1;
+			str += 2;
+		}
+		for (i = 0; devfs_options_tab[i].name != NULL; i++) {
+			int len = strlen(devfs_options_tab[i].name);
+
+			if (strncmp(str, devfs_options_tab[i].name, len) == 0) {	/*  Prefix match against the table name  */
+				if (invert)
+					*devfs_options_tab[i].opt &=
+					    ~devfs_options_tab[i].mask;
+				else
+					*devfs_options_tab[i].opt |=
+					    devfs_options_tab[i].mask;
+				str += len;
+				found = 1;
+				break;
+			}
+		}
+		if (!found)
+			return 0;	/*  No match         */
+		if (*str != ',')
+			return 0;	/*  No more options  */
+		++str;		/*  Step over the ',' to the next option  */
 	}
-	if (!found) return 0;       /*  No match         */
-	if (*str != ',') return 0;  /*  No more options  */
-	++str;
-    }
-    return 1;
-}   /*  End Function devfs_setup  */
+	return 1;
+}				/*  End Function devfs_setup  */
 
 __setup("devfs=", devfs_setup);
 
@@ -1809,7 +1804,6 @@ EXPORT_SYMBOL(devfs_mk_symlink);
 EXPORT_SYMBOL(devfs_mk_dir);
 EXPORT_SYMBOL(devfs_remove);
 
-
 /**
  *	try_modload - Notify devfsd of an inode lookup by a non-devfsd process.
  *	@parent: The parent devfs entry.
@@ -1822,26 +1816,26 @@ EXPORT_SYMBOL(devfs_remove);
  *	Returns 0 on success (event was queued), else a negative error code.
  */
 
-static int try_modload (struct devfs_entry *parent, struct fs_info *fs_info,
-			const char *name, unsigned namelen,
-			struct devfs_entry *buf)
-{
-    if ( !( fs_info->devfsd_event_mask & (1 << DEVFSD_NOTIFY_LOOKUP) ) )
-	return -ENOENT;
-    if ( is_devfsd_or_child (fs_info) ) return -ENOENT;
-    memset (buf, 0, sizeof *buf);
-    atomic_set (&buf->refcount, 1);
-    buf->parent = parent;
-    buf->namelen = namelen;
-    buf->u.name = name;
-    WRITE_ENTRY_MAGIC (buf, MAGIC_VALUE);
-    if ( !devfsd_notify_de (buf, DEVFSD_NOTIFY_LOOKUP, 0,
-			    current->euid, current->egid, fs_info) )
-	return -ENOENT;
-    /*  Possible success: event has been queued  */
-    return 0;
-}   /*  End Function try_modload  */
-
+static int try_modload(struct devfs_entry *parent, struct fs_info *fs_info,
+		       const char *name, unsigned namelen,
+		       struct devfs_entry *buf)
+{
+	if (!(fs_info->devfsd_event_mask & (1 << DEVFSD_NOTIFY_LOOKUP)))
+		return -ENOENT;	/*  Lookup events are not enabled in the mask  */
+	if (is_devfsd_or_child(fs_info))
+		return -ENOENT;	/*  Lookups by devfsd or its children are not queued  */
+	memset(buf, 0, sizeof *buf);	/*  @buf is filled in as a stand-in entry carrying parent and name  */
+	atomic_set(&buf->refcount, 1);
+	buf->parent = parent;
+	buf->namelen = namelen;
+	buf->u.name = name;	/*  Pointer is stored as is; the name is not copied  */
+	WRITE_ENTRY_MAGIC(buf, MAGIC_VALUE);
+	if (!devfsd_notify_de(buf, DEVFSD_NOTIFY_LOOKUP, 0,
+			      current->euid, current->egid, fs_info))
+		return -ENOENT;
+	/*  Possible success: event has been queued  */
+	return 0;
+}				/*  End Function try_modload  */
 
 /*  Superblock operations follow  */
 
@@ -1851,44 +1845,45 @@ static struct file_operations devfs_fops;
 static struct file_operations devfs_dir_fops;
 static struct inode_operations devfs_symlink_iops;
 
-static int devfs_notify_change (struct dentry *dentry, struct iattr *iattr)
+static int devfs_notify_change(struct dentry *dentry, struct iattr *iattr)
 {
-    int retval;
-    struct devfs_entry *de;
-    struct inode *inode = dentry->d_inode;
-    struct fs_info *fs_info = inode->i_sb->s_fs_info;
-
-    de = get_devfs_entry_from_vfs_inode (inode);
-    if (de == NULL) return -ENODEV;
-    retval = inode_change_ok (inode, iattr);
-    if (retval != 0) return retval;
-    retval = inode_setattr (inode, iattr);
-    if (retval != 0) return retval;
-    DPRINTK (DEBUG_I_CHANGE, "(%d): VFS inode: %p  devfs_entry: %p\n",
-	     (int) inode->i_ino, inode, de);
-    DPRINTK (DEBUG_I_CHANGE, "():   mode: 0%o  uid: %d  gid: %d\n",
-	     (int) inode->i_mode, (int) inode->i_uid, (int) inode->i_gid);
-    /*  Inode is not on hash chains, thus must save permissions here rather
-	than in a write_inode() method  */
-    de->mode = inode->i_mode;
-    de->inode.uid = inode->i_uid;
-    de->inode.gid = inode->i_gid;
-    de->inode.atime = inode->i_atime;
-    de->inode.mtime = inode->i_mtime;
-    de->inode.ctime = inode->i_ctime;
-    if ( ( iattr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID) ) &&
-	 !is_devfsd_or_child (fs_info) )
-	devfsd_notify_de (de, DEVFSD_NOTIFY_CHANGE, inode->i_mode,
-			  inode->i_uid, inode->i_gid, fs_info);
-    return 0;
-}   /*  End Function devfs_notify_change  */
-
-static struct super_operations devfs_sops =
-{ 
-    .drop_inode    = generic_delete_inode,
-    .statfs        = simple_statfs,
-};
+	int retval;
+	struct devfs_entry *de;
+	struct inode *inode = dentry->d_inode;
+	struct fs_info *fs_info = inode->i_sb->s_fs_info;
 
+	de = get_devfs_entry_from_vfs_inode(inode);
+	if (de == NULL)
+		return -ENODEV;
+	retval = inode_change_ok(inode, iattr);
+	if (retval != 0)
+		return retval;
+	retval = inode_setattr(inode, iattr);
+	if (retval != 0)
+		return retval;
+	DPRINTK(DEBUG_I_CHANGE, "(%d): VFS inode: %p  devfs_entry: %p\n",
+		(int)inode->i_ino, inode, de);
+	DPRINTK(DEBUG_I_CHANGE, "():   mode: 0%o  uid: %d  gid: %d\n",
+		(int)inode->i_mode, (int)inode->i_uid, (int)inode->i_gid);
+	/*  Inode is not on hash chains, thus must save permissions here rather
+	   than in a write_inode() method  */
+	de->mode = inode->i_mode;
+	de->inode.uid = inode->i_uid;
+	de->inode.gid = inode->i_gid;
+	de->inode.atime = inode->i_atime;
+	de->inode.mtime = inode->i_mtime;
+	de->inode.ctime = inode->i_ctime;
+	if ((iattr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) &&
+	    !is_devfsd_or_child(fs_info))	/*  Mode or owner changed, and not by devfsd or its children  */
+		devfsd_notify_de(de, DEVFSD_NOTIFY_CHANGE, inode->i_mode,
+				 inode->i_uid, inode->i_gid, fs_info);
+	return 0;
+}				/*  End Function devfs_notify_change  */
+
+static struct super_operations devfs_sops = {
+	.drop_inode = generic_delete_inode,
+	.statfs = simple_statfs,
+};
 
 /**
  *	_devfs_get_vfs_inode - Get a VFS inode.
@@ -1900,39 +1895,38 @@ static struct super_operations devfs_sops =
  *       performed if the inode is created.
  */
 
-static struct inode *_devfs_get_vfs_inode (struct super_block *sb,
-					   struct devfs_entry *de,
-					   struct dentry *dentry)
+static struct inode *_devfs_get_vfs_inode(struct super_block *sb,
+					  struct devfs_entry *de,
+					  struct dentry *dentry)
 {
-    struct inode *inode;
-
-    if (de->prev == de) return NULL;  /*  Quick check to see if unhooked  */
-    if ( ( inode = new_inode (sb) ) == NULL )
-    {
-	PRINTK ("(%s): new_inode() failed, de: %p\n", de->name, de);
-	return NULL;
-    }
-    if (de->parent)
-    {
-	read_lock (&de->parent->u.dir.lock);
-	if (de->prev != de) de->inode.dentry = dentry; /*      Not unhooked  */
-	read_unlock (&de->parent->u.dir.lock);
-    }
-    else de->inode.dentry = dentry;             /*  Root: no locking needed  */
-    if (de->inode.dentry != dentry)
-    {   /*  Must have been unhooked  */
-	iput (inode);
-	return NULL;
-    }
-    /* FIXME where is devfs_put? */
-    inode->u.generic_ip = devfs_get (de);
-    inode->i_ino = de->inode.ino;
-    DPRINTK (DEBUG_I_GET, "(%d): VFS inode: %p  devfs_entry: %p\n",
-	     (int) inode->i_ino, inode, de);
-    inode->i_blocks = 0;
-    inode->i_blksize = FAKE_BLOCK_SIZE;
-    inode->i_op = &devfs_iops;
-    inode->i_mode = de->mode;
+	struct inode *inode;
+
+	if (de->prev == de)
+		return NULL;	/*  Quick check to see if unhooked  */
+	if ((inode = new_inode(sb)) == NULL) {
+		PRINTK("(%s): new_inode() failed, de: %p\n", de->name, de);
+		return NULL;
+	}
+	if (de->parent) {
+		read_lock(&de->parent->u.dir.lock);
+		if (de->prev != de)
+			de->inode.dentry = dentry;	/*      Not unhooked  */
+		read_unlock(&de->parent->u.dir.lock);
+	} else
+		de->inode.dentry = dentry;	/*  Root: no locking needed  */
+	if (de->inode.dentry != dentry) {	/*  Must have been unhooked  */
+		iput(inode);
+		return NULL;
+	}
+	/* FIXME where is devfs_put? */
+	inode->u.generic_ip = devfs_get(de);
+	inode->i_ino = de->inode.ino;
+	DPRINTK(DEBUG_I_GET, "(%d): VFS inode: %p  devfs_entry: %p\n",
+		(int)inode->i_ino, inode, de);
+	inode->i_blocks = 0;
+	inode->i_blksize = FAKE_BLOCK_SIZE;
+	inode->i_op = &devfs_iops;
+	inode->i_mode = de->mode;
 	if (S_ISDIR(de->mode)) {
 		inode->i_op = &devfs_dir_iops;
 		inode->i_fop = &devfs_dir_fops;
@@ -1945,100 +1939,107 @@ static struct inode *_devfs_get_vfs_inode (struct super_block *sb,
 		init_special_inode(inode, de->mode, 0);
 	} else {
 		PRINTK("(%s): unknown mode %o de: %p\n",
-			de->name, de->mode, de);
+		       de->name, de->mode, de);
 		iput(inode);
 		devfs_put(de);
 		return NULL;
 	}
 
-    inode->i_uid = de->inode.uid;
-    inode->i_gid = de->inode.gid;
-    inode->i_atime = de->inode.atime;
-    inode->i_mtime = de->inode.mtime;
-    inode->i_ctime = de->inode.ctime;
-    DPRINTK (DEBUG_I_GET, "():   mode: 0%o  uid: %d  gid: %d\n",
-	     (int) inode->i_mode, (int) inode->i_uid, (int) inode->i_gid);
-    return inode;
-}   /*  End Function _devfs_get_vfs_inode  */
-
+	inode->i_uid = de->inode.uid;
+	inode->i_gid = de->inode.gid;
+	inode->i_atime = de->inode.atime;
+	inode->i_mtime = de->inode.mtime;
+	inode->i_ctime = de->inode.ctime;
+	DPRINTK(DEBUG_I_GET, "():   mode: 0%o  uid: %d  gid: %d\n",
+		(int)inode->i_mode, (int)inode->i_uid, (int)inode->i_gid);
+	return inode;
+}				/*  End Function _devfs_get_vfs_inode  */
 
 /*  File operations for device entries follow  */
 
-static int devfs_readdir (struct file *file, void *dirent, filldir_t filldir)
+static int devfs_readdir(struct file *file, void *dirent, filldir_t filldir)
 {
-    int err, count;
-    int stored = 0;
-    struct fs_info *fs_info;
-    struct devfs_entry *parent, *de, *next = NULL;
-    struct inode *inode = file->f_dentry->d_inode;
-
-    fs_info = inode->i_sb->s_fs_info;
-    parent = get_devfs_entry_from_vfs_inode (file->f_dentry->d_inode);
-    if ( (long) file->f_pos < 0 ) return -EINVAL;
-    DPRINTK (DEBUG_F_READDIR, "(%s): fs_info: %p  pos: %ld\n",
-	     parent->name, fs_info, (long) file->f_pos);
-    switch ( (long) file->f_pos )
-    {
-      case 0:
-	err = (*filldir) (dirent, "..", 2, file->f_pos,
-			  parent_ino (file->f_dentry), DT_DIR);
-	if (err == -EINVAL) break;
-	if (err < 0) return err;
-	file->f_pos++;
-	++stored;
-	/*  Fall through  */
-      case 1:
-	err = (*filldir) (dirent, ".", 1, file->f_pos, inode->i_ino, DT_DIR);
-	if (err == -EINVAL) break;
-	if (err < 0) return err;
-	file->f_pos++;
-	++stored;
-	/*  Fall through  */
-      default:
-	/*  Skip entries  */
-	count = file->f_pos - 2;
-	read_lock (&parent->u.dir.lock);
-	for (de = parent->u.dir.first; de && (count > 0); de = de->next)
-	    --count;
-	devfs_get (de);
-	read_unlock (&parent->u.dir.lock);
-	/*  Now add all remaining entries  */
-	while (de)
-	{
-	    err = (*filldir) (dirent, de->name, de->namelen,
-				file->f_pos, de->inode.ino, de->mode >> 12);
-	    if (err < 0) devfs_put (de);
-	    else
-	    {
-	        file->f_pos++;
-	        ++stored;
-	    }
-	    if (err == -EINVAL) break;
-	    if (err < 0) return err;
-	    read_lock (&parent->u.dir.lock);
-	    next = devfs_get (de->next);
-	    read_unlock (&parent->u.dir.lock);
-	    devfs_put (de);
-	    de = next;
+	int err, count;
+	int stored = 0;
+	struct fs_info *fs_info;
+	struct devfs_entry *parent, *de, *next = NULL;
+	struct inode *inode = file->f_dentry->d_inode;
+
+	fs_info = inode->i_sb->s_fs_info;
+	parent = get_devfs_entry_from_vfs_inode(file->f_dentry->d_inode);
+	if ((long)file->f_pos < 0)
+		return -EINVAL;
+	DPRINTK(DEBUG_F_READDIR, "(%s): fs_info: %p  pos: %ld\n",
+		parent->name, fs_info, (long)file->f_pos);
+	switch ((long)file->f_pos) {
+	case 0:
+		err = (*filldir) (dirent, "..", 2, file->f_pos,
+				  parent_ino(file->f_dentry), DT_DIR);
+		if (err == -EINVAL)
+			break;
+		if (err < 0)
+			return err;
+		file->f_pos++;
+		++stored;
+		/*  Fall through  */
+	case 1:
+		err =
+		    (*filldir) (dirent, ".", 1, file->f_pos, inode->i_ino,
+				DT_DIR);
+		if (err == -EINVAL)
+			break;
+		if (err < 0)
+			return err;
+		file->f_pos++;
+		++stored;
+		/*  Fall through  */
+	default:
+		/*  Skip entries  */
+		count = file->f_pos - 2;
+		read_lock(&parent->u.dir.lock);
+		for (de = parent->u.dir.first; de && (count > 0); de = de->next)
+			--count;
+		devfs_get(de);
+		read_unlock(&parent->u.dir.lock);
+		/*  Now add all remaining entries  */
+		while (de) {
+			err = (*filldir) (dirent, de->name, de->namelen,
+					  file->f_pos, de->inode.ino,
+					  de->mode >> 12);
+			if (err < 0)
+				devfs_put(de);
+			else {
+				file->f_pos++;
+				++stored;
+			}
+			if (err == -EINVAL)
+				break;
+			if (err < 0)
+				return err;
+			read_lock(&parent->u.dir.lock);
+			next = devfs_get(de->next);
+			read_unlock(&parent->u.dir.lock);
+			devfs_put(de);
+			de = next;
+		}
+		break;
 	}
-	break;
-    }
-    return stored;
-}   /*  End Function devfs_readdir  */
+	return stored;
+}				/*  End Function devfs_readdir  */
 
 /* Open devfs specific special files */
-static int devfs_open (struct inode *inode, struct file *file)
+static int devfs_open(struct inode *inode, struct file *file)
 {
 	int err;
 	int minor = MINOR(inode->i_rdev);
 	struct file_operations *old_fops, *new_fops;
 
 	switch (minor) {
-	case 0: /* /dev/.devfsd */
+	case 0:		/* /dev/.devfsd */
 		new_fops = fops_get(&devfsd_fops);
 		break;
 #ifdef CONFIG_DEVFS_DEBUG
-	case 1: /* /dev/.stat */
+	case 1:		/* /dev/.stat */
 		new_fops = fops_get(&stat_fops);
 		break;
 #endif
@@ -2057,32 +2058,28 @@ static int devfs_open (struct inode *inode, struct file *file)
 	} else
 		fops_put(old_fops);
 	return err;
-}   /*  End Function devfs_open  */
+}				/*  End Function devfs_open  */
 
-static struct file_operations devfs_fops =
-{
-    .open    = devfs_open,
+static struct file_operations devfs_fops = {
+	.open = devfs_open,
 };
 
-static struct file_operations devfs_dir_fops =
-{
-    .read    = generic_read_dir,
-    .readdir = devfs_readdir,
+static struct file_operations devfs_dir_fops = {
+	.read = generic_read_dir,
+	.readdir = devfs_readdir,
 };
 
-
 /*  Dentry operations for device entries follow  */
 
-
 /**
  *	devfs_d_release - Callback for when a dentry is freed.
  *	@dentry: The dentry.
  */
 
-static void devfs_d_release (struct dentry *dentry)
+static void devfs_d_release(struct dentry *dentry)
 {
-    DPRINTK (DEBUG_D_RELEASE, "(%p): inode: %p\n", dentry, dentry->d_inode);
-}   /*  End Function devfs_d_release  */
+	DPRINTK(DEBUG_D_RELEASE, "(%p): inode: %p\n", dentry, dentry->d_inode);
+}				/*  End Function devfs_d_release  */
 
 /**
  *	devfs_d_iput - Callback for when a dentry loses its inode.
@@ -2090,38 +2087,37 @@ static void devfs_d_release (struct dentry *dentry)
  *	@inode:	The inode.
  */
 
-static void devfs_d_iput (struct dentry *dentry, struct inode *inode)
+static void devfs_d_iput(struct dentry *dentry, struct inode *inode)
 {
-    struct devfs_entry *de;
-
-    de = get_devfs_entry_from_vfs_inode (inode);
-    DPRINTK (DEBUG_D_IPUT,"(%s): dentry: %p inode: %p de: %p de->dentry: %p\n",
-	     de->name, dentry, inode, de, de->inode.dentry);
-    if ( de->inode.dentry && (de->inode.dentry != dentry) )
-	OOPS ("(%s): de: %p dentry: %p de->dentry: %p\n",
-	      de->name, de, dentry, de->inode.dentry);
-    de->inode.dentry = NULL;
-    iput (inode);
-    devfs_put (de);
-}   /*  End Function devfs_d_iput  */
-
-static int devfs_d_delete (struct dentry *dentry);
-
-static struct dentry_operations devfs_dops =
-{
-    .d_delete     = devfs_d_delete,
-    .d_release    = devfs_d_release,
-    .d_iput       = devfs_d_iput,
+	struct devfs_entry *de;
+
+	de = get_devfs_entry_from_vfs_inode(inode);
+	DPRINTK(DEBUG_D_IPUT,
+		"(%s): dentry: %p inode: %p de: %p de->dentry: %p\n", de->name,
+		dentry, inode, de, de->inode.dentry);
+	if (de->inode.dentry && (de->inode.dentry != dentry))
+		OOPS("(%s): de: %p dentry: %p de->dentry: %p\n",
+		     de->name, de, dentry, de->inode.dentry);
+	de->inode.dentry = NULL;
+	iput(inode);
+	devfs_put(de);
+}				/*  End Function devfs_d_iput  */
+
+static int devfs_d_delete(struct dentry *dentry);
+
+static struct dentry_operations devfs_dops = {
+	.d_delete = devfs_d_delete,
+	.d_release = devfs_d_release,
+	.d_iput = devfs_d_iput,
 };
 
-static int devfs_d_revalidate_wait (struct dentry *dentry, struct nameidata *);
+static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *);
 
-static struct dentry_operations devfs_wait_dops =
-{
-    .d_delete     = devfs_d_delete,
-    .d_release    = devfs_d_release,
-    .d_iput       = devfs_d_iput,
-    .d_revalidate = devfs_d_revalidate_wait,
+static struct dentry_operations devfs_wait_dops = {
+	.d_delete = devfs_d_delete,
+	.d_release = devfs_d_release,
+	.d_iput = devfs_d_iput,
+	.d_revalidate = devfs_d_revalidate_wait,
 };
 
 /**
@@ -2129,653 +2125,673 @@ static struct dentry_operations devfs_wait_dops =
  *	@dentry: The dentry.
  */
 
-static int devfs_d_delete (struct dentry *dentry)
+static int devfs_d_delete(struct dentry *dentry)
 {
-    struct inode *inode = dentry->d_inode;
+	struct inode *inode = dentry->d_inode;
 
-    if (dentry->d_op == &devfs_wait_dops) dentry->d_op = &devfs_dops;
-    /*  Unhash dentry if negative (has no inode)  */
-    if (inode == NULL)
-    {
-	DPRINTK (DEBUG_D_DELETE, "(%p): dropping negative dentry\n", dentry);
-	return 1;
-    }
-    return 0;
-}   /*  End Function devfs_d_delete  */
+	if (dentry->d_op == &devfs_wait_dops)
+		dentry->d_op = &devfs_dops;
+	/*  Unhash dentry if negative (has no inode)  */
+	if (inode == NULL) {
+		DPRINTK(DEBUG_D_DELETE, "(%p): dropping negative dentry\n",
+			dentry);
+		return 1;
+	}
+	return 0;
+}				/*  End Function devfs_d_delete  */
 
-struct devfs_lookup_struct
-{
-    devfs_handle_t de;
-    wait_queue_head_t wait_queue;
+struct devfs_lookup_struct {
+	devfs_handle_t de;
+	wait_queue_head_t wait_queue;
 };
 
 /* XXX: this doesn't handle the case where we got a negative dentry
         but a devfs entry has been registered in the meanwhile */
-static int devfs_d_revalidate_wait (struct dentry *dentry, struct nameidata *nd)
+static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd)
 {
-    struct inode *dir = dentry->d_parent->d_inode;
-    struct fs_info *fs_info = dir->i_sb->s_fs_info;
-    devfs_handle_t parent = get_devfs_entry_from_vfs_inode (dir);
-    struct devfs_lookup_struct *lookup_info = dentry->d_fsdata;
-    DECLARE_WAITQUEUE (wait, current);
-    int need_lock;
-
-    /*
-     * FIXME HACK
-     *
-     * make sure that
-     *   d_instantiate always runs under lock
-     *   we release i_sem lock before going to sleep
-     *
-     * unfortunately sometimes d_revalidate is called with
-     * and sometimes without i_sem lock held. The following checks
-     * attempt to deduce when we need to add (and drop resp.) lock
-     * here. This relies on current (2.6.2) calling coventions:
-     *
-     *   lookup_hash is always run under i_sem and is passing NULL
-     *   as nd
-     *
-     *   open(...,O_CREATE,...) calls _lookup_hash under i_sem
-     *   and sets flags to LOOKUP_OPEN|LOOKUP_CREATE
-     *
-     *   all other invocations of ->d_revalidate seem to happen
-     *   outside of i_sem
-     */
-    need_lock = nd &&
-		(!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT));
-
-    if (need_lock)
-	down(&dir->i_sem);
-
-    if ( is_devfsd_or_child (fs_info) )
-    {
-	devfs_handle_t de = lookup_info->de;
-	struct inode *inode;
+	struct inode *dir = dentry->d_parent->d_inode;
+	struct fs_info *fs_info = dir->i_sb->s_fs_info;
+	devfs_handle_t parent = get_devfs_entry_from_vfs_inode(dir);
+	struct devfs_lookup_struct *lookup_info = dentry->d_fsdata;
+	DECLARE_WAITQUEUE(wait, current);
+	int need_lock;
 
-	DPRINTK (DEBUG_I_LOOKUP,
-		 "(%s): dentry: %p inode: %p de: %p by: \"%s\"\n",
-		 dentry->d_name.name, dentry, dentry->d_inode, de,
-		 current->comm);
-	if (dentry->d_inode)
-	    goto out;
-	if (de == NULL)
-	{
-	    read_lock (&parent->u.dir.lock);
-	    de = _devfs_search_dir (parent, dentry->d_name.name,
-				    dentry->d_name.len);
-	    read_unlock (&parent->u.dir.lock);
-	    if (de == NULL)
-		goto out;
-	    lookup_info->de = de;
-	}
-	/*  Create an inode, now that the driver information is available  */
-	inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry);
-	if (!inode)
-	    goto out;
-	DPRINTK (DEBUG_I_LOOKUP,
-		 "(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n",
-		 de->name, de->inode.ino, inode, de, current->comm);
-	d_instantiate (dentry, inode);
-	goto out;
-    }
-    if (lookup_info == NULL)
-	goto out;  /*  Early termination  */
-    read_lock (&parent->u.dir.lock);
-    if (dentry->d_fsdata)
-    {
-	set_current_state (TASK_UNINTERRUPTIBLE);
-	add_wait_queue (&lookup_info->wait_queue, &wait);
-	read_unlock (&parent->u.dir.lock);
-	/* at this point it is always (hopefully) locked */
-	up(&dir->i_sem);
-	schedule ();
-	down(&dir->i_sem);
 	/*
-	 * This does not need nor should remove wait from wait_queue.
-	 * Wait queue head is never reused - nothing is ever added to it
-	 * after all waiters have been waked up and head itself disappears
-	 * very soon after it. Moreover it is local variable on stack that
-	 * is likely to have already disappeared so any reference to it
-	 * at this point is buggy.
+	 * FIXME HACK
+	 *
+	 * make sure that
+	 *   d_instantiate always runs under lock
+	 *   we release i_sem lock before going to sleep
+	 *
+	 * unfortunately sometimes d_revalidate is called with
+	 * and sometimes without i_sem lock held. The following checks
+	 * attempt to deduce when we need to add (and drop resp.) lock
+	 * here. This relies on current (2.6.2) calling conventions:
+	 *
+	 *   lookup_hash is always run under i_sem and is passing NULL
+	 *   as nd
+	 *
+	 *   open(...,O_CREAT,...) calls _lookup_hash under i_sem
+	 *   and sets flags to LOOKUP_OPEN|LOOKUP_CREATE
+	 *
+	 *   all other invocations of ->d_revalidate seem to happen
+	 *   outside of i_sem
 	 */
+	need_lock = nd &&
+	    (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT));
+
+	if (need_lock)
+		down(&dir->i_sem);
+
+	if (is_devfsd_or_child(fs_info)) {
+		devfs_handle_t de = lookup_info->de;
+		struct inode *inode;
+
+		DPRINTK(DEBUG_I_LOOKUP,
+			"(%s): dentry: %p inode: %p de: %p by: \"%s\"\n",
+			dentry->d_name.name, dentry, dentry->d_inode, de,
+			current->comm);
+		if (dentry->d_inode)
+			goto out;
+		if (de == NULL) {
+			read_lock(&parent->u.dir.lock);
+			de = _devfs_search_dir(parent, dentry->d_name.name,
+					       dentry->d_name.len);
+			read_unlock(&parent->u.dir.lock);
+			if (de == NULL)
+				goto out;
+			lookup_info->de = de;
+		}
+		/*  Create an inode, now that the driver information is available  */
+		inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry);
+		if (!inode)
+			goto out;
+		DPRINTK(DEBUG_I_LOOKUP,
+			"(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n",
+			de->name, de->inode.ino, inode, de, current->comm);
+		d_instantiate(dentry, inode);
+		goto out;
+	}
+	if (lookup_info == NULL)
+		goto out;	/*  Early termination  */
+	read_lock(&parent->u.dir.lock);
+	if (dentry->d_fsdata) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		add_wait_queue(&lookup_info->wait_queue, &wait);
+		read_unlock(&parent->u.dir.lock);
+		/* at this point it is always (hopefully) locked */
+		up(&dir->i_sem);
+		schedule();
+		down(&dir->i_sem);
+		/*
+		 * We neither need to nor should remove wait from the wait_queue.
+		 * The wait queue head is never reused - nothing is ever added to
+		 * it after all waiters have been woken up, and the head itself
+		 * disappears very soon after that. Moreover, it is a local
+		 * variable on the stack that is likely to have already disappeared,
+		 * so any reference to it at this point is buggy.
+		 */
 
-    }
-    else read_unlock (&parent->u.dir.lock);
-
-out:
-    if (need_lock)
-	up(&dir->i_sem);
-    return 1;
-}   /*  End Function devfs_d_revalidate_wait  */
+	} else
+		read_unlock(&parent->u.dir.lock);
 
+      out:
+	if (need_lock)
+		up(&dir->i_sem);
+	return 1;
+}				/*  End Function devfs_d_revalidate_wait  */
 
 /*  Inode operations for device entries follow  */
 
-static struct dentry *devfs_lookup (struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+static struct dentry *devfs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
 {
-    struct devfs_entry tmp;  /*  Must stay in scope until devfsd idle again  */
-    struct devfs_lookup_struct lookup_info;
-    struct fs_info *fs_info = dir->i_sb->s_fs_info;
-    struct devfs_entry *parent, *de;
-    struct inode *inode;
-    struct dentry *retval = NULL;
-
-    /*  Set up the dentry operations before anything else, to ensure cleaning
-	up on any error  */
-    dentry->d_op = &devfs_dops;
-    /*  First try to get the devfs entry for this directory  */
-    parent = get_devfs_entry_from_vfs_inode (dir);
-    DPRINTK (DEBUG_I_LOOKUP, "(%s): dentry: %p parent: %p by: \"%s\"\n",
-	     dentry->d_name.name, dentry, parent, current->comm);
-    if (parent == NULL) return ERR_PTR (-ENOENT);
-    read_lock (&parent->u.dir.lock);
-    de = _devfs_search_dir (parent, dentry->d_name.name, dentry->d_name.len);
-    read_unlock (&parent->u.dir.lock);
-    lookup_info.de = de;
-    init_waitqueue_head (&lookup_info.wait_queue);
-    dentry->d_fsdata = &lookup_info;
-    if (de == NULL)
-    {   /*  Try with devfsd. For any kind of failure, leave a negative dentry
-	    so someone else can deal with it (in the case where the sysadmin
-	    does a mknod()). It's important to do this before hashing the
-	    dentry, so that the devfsd queue is filled before revalidates
-	    can start  */
-	if (try_modload (parent, fs_info,
-			 dentry->d_name.name, dentry->d_name.len, &tmp) < 0)
-	{   /*  Lookup event was not queued to devfsd  */
-	    d_add (dentry, NULL);
-	    return NULL;
+	struct devfs_entry tmp;	/*  Must stay in scope until devfsd idle again  */
+	struct devfs_lookup_struct lookup_info;
+	struct fs_info *fs_info = dir->i_sb->s_fs_info;
+	struct devfs_entry *parent, *de;
+	struct inode *inode;
+	struct dentry *retval = NULL;
+
+	/*  Set up the dentry operations before anything else, to ensure cleaning
+	   up on any error  */
+	dentry->d_op = &devfs_dops;
+	/*  First try to get the devfs entry for this directory  */
+	parent = get_devfs_entry_from_vfs_inode(dir);
+	DPRINTK(DEBUG_I_LOOKUP, "(%s): dentry: %p parent: %p by: \"%s\"\n",
+		dentry->d_name.name, dentry, parent, current->comm);
+	if (parent == NULL)
+		return ERR_PTR(-ENOENT);
+	read_lock(&parent->u.dir.lock);
+	de = _devfs_search_dir(parent, dentry->d_name.name, dentry->d_name.len);
+	read_unlock(&parent->u.dir.lock);
+	lookup_info.de = de;
+	init_waitqueue_head(&lookup_info.wait_queue);
+	dentry->d_fsdata = &lookup_info;
+	if (de == NULL) {	/*  Try with devfsd. For any kind of failure, leave a negative dentry
+				   so someone else can deal with it (in the case where the sysadmin
+				   does a mknod()). It's important to do this before hashing the
+				   dentry, so that the devfsd queue is filled before revalidates
+				   can start  */
+		if (try_modload(parent, fs_info, dentry->d_name.name, dentry->d_name.len, &tmp) < 0) {	/*  Lookup event was not queued to devfsd  */
+			d_add(dentry, NULL);
+			return NULL;
+		}
 	}
-    }
-    dentry->d_op = &devfs_wait_dops;
-    d_add (dentry, NULL);  /*  Open the floodgates  */
-    /*  Unlock directory semaphore, which will release any waiters. They
-	will get the hashed dentry, and may be forced to wait for
-	revalidation  */
-    up (&dir->i_sem);
-    wait_for_devfsd_finished (fs_info);  /*  If I'm not devfsd, must wait  */
-    down (&dir->i_sem);      /*  Grab it again because them's the rules  */
-    de = lookup_info.de;
-    /*  If someone else has been so kind as to make the inode, we go home
-	early  */
-    if (dentry->d_inode) goto out;
-    if (de == NULL)
-    {
-	read_lock (&parent->u.dir.lock);
-	de = _devfs_search_dir (parent, dentry->d_name.name,
-				dentry->d_name.len);
-	read_unlock (&parent->u.dir.lock);
-	if (de == NULL) goto out;
-	/*  OK, there's an entry now, but no VFS inode yet  */
-    }
-    /*  Create an inode, now that the driver information is available  */
-    inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry);
-    if (!inode)
-    {
-	retval = ERR_PTR (-ENOMEM);
-	goto out;
-    }
-    DPRINTK (DEBUG_I_LOOKUP, "(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n",
-	     de->name, de->inode.ino, inode, de, current->comm);
-    d_instantiate (dentry, inode);
-out:
-    write_lock (&parent->u.dir.lock);
-    dentry->d_op = &devfs_dops;
-    dentry->d_fsdata = NULL;
-    wake_up (&lookup_info.wait_queue);
-    write_unlock (&parent->u.dir.lock);
-    devfs_put (de);
-    return retval;
-}   /*  End Function devfs_lookup  */
-
-static int devfs_unlink (struct inode *dir, struct dentry *dentry)
-{
-    int unhooked;
-    struct devfs_entry *de;
-    struct inode *inode = dentry->d_inode;
-    struct fs_info *fs_info = dir->i_sb->s_fs_info;
-
-    de = get_devfs_entry_from_vfs_inode (inode);
-    DPRINTK (DEBUG_I_UNLINK, "(%s): de: %p\n", dentry->d_name.name, de);
-    if (de == NULL) return -ENOENT;
-    if (!de->vfs) return -EPERM;
-    write_lock (&de->parent->u.dir.lock);
-    unhooked = _devfs_unhook (de);
-    write_unlock (&de->parent->u.dir.lock);
-    if (!unhooked) return -ENOENT;
-    if ( !is_devfsd_or_child (fs_info) )
-	devfsd_notify_de (de, DEVFSD_NOTIFY_DELETE, inode->i_mode,
-			  inode->i_uid, inode->i_gid, fs_info);
-    free_dentry (de);
-    devfs_put (de);
-    return 0;
-}   /*  End Function devfs_unlink  */
-
-static int devfs_symlink (struct inode *dir, struct dentry *dentry,
-			  const char *symname)
-{
-    int err;
-    struct fs_info *fs_info = dir->i_sb->s_fs_info;
-    struct devfs_entry *parent, *de;
-    struct inode *inode;
-
-    /*  First try to get the devfs entry for this directory  */
-    parent = get_devfs_entry_from_vfs_inode (dir);
-    if (parent == NULL) return -ENOENT;
-    err = devfs_do_symlink (parent, dentry->d_name.name, symname, &de);
-    DPRINTK (DEBUG_DISABLED, "(%s): errcode from <devfs_do_symlink>: %d\n",
-	     dentry->d_name.name, err);
-    if (err < 0) return err;
-    de->vfs = TRUE;
-    de->inode.uid = current->euid;
-    de->inode.gid = current->egid;
-    de->inode.atime = CURRENT_TIME;
-    de->inode.mtime = CURRENT_TIME;
-    de->inode.ctime = CURRENT_TIME;
-    if ( ( inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry) ) == NULL )
-	return -ENOMEM;
-    DPRINTK (DEBUG_DISABLED, "(%s): new VFS inode(%u): %p  dentry: %p\n",
-	     dentry->d_name.name, de->inode.ino, inode, dentry);
-    d_instantiate (dentry, inode);
-    if ( !is_devfsd_or_child (fs_info) )
-	devfsd_notify_de (de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
-			  inode->i_uid, inode->i_gid, fs_info);
-    return 0;
-}   /*  End Function devfs_symlink  */
-
-static int devfs_mkdir (struct inode *dir, struct dentry *dentry, int mode)
-{
-    int err;
-    struct fs_info *fs_info = dir->i_sb->s_fs_info;
-    struct devfs_entry *parent, *de;
-    struct inode *inode;
-
-    mode = (mode & ~S_IFMT) | S_IFDIR;  /*  VFS doesn't pass S_IFMT part  */
-    parent = get_devfs_entry_from_vfs_inode (dir);
-    if (parent == NULL) return -ENOENT;
-    de = _devfs_alloc_entry (dentry->d_name.name, dentry->d_name.len, mode);
-    if (!de) return -ENOMEM;
-    de->vfs = TRUE;
-    if ( ( err = _devfs_append_entry (parent, de, NULL) ) != 0 )
-	return err;
-    de->inode.uid = current->euid;
-    de->inode.gid = current->egid;
-    de->inode.atime = CURRENT_TIME;
-    de->inode.mtime = CURRENT_TIME;
-    de->inode.ctime = CURRENT_TIME;
-    if ( ( inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry) ) == NULL )
-	return -ENOMEM;
-    DPRINTK (DEBUG_DISABLED, "(%s): new VFS inode(%u): %p  dentry: %p\n",
-	     dentry->d_name.name, de->inode.ino, inode, dentry);
-    d_instantiate (dentry, inode);
-    if ( !is_devfsd_or_child (fs_info) )
-	devfsd_notify_de (de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
-			  inode->i_uid, inode->i_gid, fs_info);
-    return 0;
-}   /*  End Function devfs_mkdir  */
-
-static int devfs_rmdir (struct inode *dir, struct dentry *dentry)
+	dentry->d_op = &devfs_wait_dops;
+	d_add(dentry, NULL);	/*  Open the floodgates  */
+	/*  Unlock directory semaphore, which will release any waiters. They
+	   will get the hashed dentry, and may be forced to wait for
+	   revalidation  */
+	up(&dir->i_sem);
+	wait_for_devfsd_finished(fs_info);	/*  If I'm not devfsd, must wait  */
+	down(&dir->i_sem);	/*  Grab it again because them's the rules  */
+	de = lookup_info.de;
+	/*  If someone else has been so kind as to make the inode, we go home
+	   early  */
+	if (dentry->d_inode)
+		goto out;
+	if (de == NULL) {
+		read_lock(&parent->u.dir.lock);
+		de = _devfs_search_dir(parent, dentry->d_name.name,
+				       dentry->d_name.len);
+		read_unlock(&parent->u.dir.lock);
+		if (de == NULL)
+			goto out;
+		/*  OK, there's an entry now, but no VFS inode yet  */
+	}
+	/*  Create an inode, now that the driver information is available  */
+	inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry);
+	if (!inode) {
+		retval = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	DPRINTK(DEBUG_I_LOOKUP,
+		"(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n", de->name,
+		de->inode.ino, inode, de, current->comm);
+	d_instantiate(dentry, inode);
+      out:
+	write_lock(&parent->u.dir.lock);
+	dentry->d_op = &devfs_dops;
+	dentry->d_fsdata = NULL;
+	wake_up(&lookup_info.wait_queue);
+	write_unlock(&parent->u.dir.lock);
+	devfs_put(de);
+	return retval;
+}				/*  End Function devfs_lookup  */
+
+static int devfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int unhooked;
+	struct devfs_entry *de;
+	struct inode *inode = dentry->d_inode;
+	struct fs_info *fs_info = dir->i_sb->s_fs_info;
+
+	de = get_devfs_entry_from_vfs_inode(inode);
+	DPRINTK(DEBUG_I_UNLINK, "(%s): de: %p\n", dentry->d_name.name, de);
+	if (de == NULL)
+		return -ENOENT;
+	if (!de->vfs)
+		return -EPERM;
+	write_lock(&de->parent->u.dir.lock);
+	unhooked = _devfs_unhook(de);
+	write_unlock(&de->parent->u.dir.lock);
+	if (!unhooked)
+		return -ENOENT;
+	if (!is_devfsd_or_child(fs_info))
+		devfsd_notify_de(de, DEVFSD_NOTIFY_DELETE, inode->i_mode,
+				 inode->i_uid, inode->i_gid, fs_info);
+	free_dentry(de);
+	devfs_put(de);
+	return 0;
+}				/*  End Function devfs_unlink  */
+
+static int devfs_symlink(struct inode *dir, struct dentry *dentry,
+			 const char *symname)
 {
-    int err = 0;
-    struct devfs_entry *de;
-    struct fs_info *fs_info = dir->i_sb->s_fs_info;
-    struct inode *inode = dentry->d_inode;
-
-    if (dir->i_sb->s_fs_info != inode->i_sb->s_fs_info) return -EINVAL;
-    de = get_devfs_entry_from_vfs_inode (inode);
-    if (de == NULL) return -ENOENT;
-    if ( !S_ISDIR (de->mode) ) return -ENOTDIR;
-    if (!de->vfs) return -EPERM;
-    /*  First ensure the directory is empty and will stay that way  */
-    write_lock (&de->u.dir.lock);
-    if (de->u.dir.first) err = -ENOTEMPTY;
-    else de->u.dir.no_more_additions = TRUE;
-    write_unlock (&de->u.dir.lock);
-    if (err) return err;
-    /*  Now unhook the directory from its parent  */
-    write_lock (&de->parent->u.dir.lock);
-    if ( !_devfs_unhook (de) ) err = -ENOENT;
-    write_unlock (&de->parent->u.dir.lock);
-    if (err) return err;
-    if ( !is_devfsd_or_child (fs_info) )
-	devfsd_notify_de (de, DEVFSD_NOTIFY_DELETE, inode->i_mode,
-			  inode->i_uid, inode->i_gid, fs_info);
-    free_dentry (de);
-    devfs_put (de);
-    return 0;
-}   /*  End Function devfs_rmdir  */
-
-static int devfs_mknod (struct inode *dir, struct dentry *dentry, int mode,
-			dev_t rdev)
+	int err;
+	struct fs_info *fs_info = dir->i_sb->s_fs_info;
+	struct devfs_entry *parent, *de;
+	struct inode *inode;
+
+	/*  First try to get the devfs entry for this directory  */
+	parent = get_devfs_entry_from_vfs_inode(dir);
+	if (parent == NULL)
+		return -ENOENT;
+	err = devfs_do_symlink(parent, dentry->d_name.name, symname, &de);
+	DPRINTK(DEBUG_DISABLED, "(%s): errcode from <devfs_do_symlink>: %d\n",
+		dentry->d_name.name, err);
+	if (err < 0)
+		return err;
+	de->vfs = TRUE;
+	de->inode.uid = current->euid;
+	de->inode.gid = current->egid;
+	de->inode.atime = CURRENT_TIME;
+	de->inode.mtime = CURRENT_TIME;
+	de->inode.ctime = CURRENT_TIME;
+	if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL)
+		return -ENOMEM;
+	DPRINTK(DEBUG_DISABLED, "(%s): new VFS inode(%u): %p  dentry: %p\n",
+		dentry->d_name.name, de->inode.ino, inode, dentry);
+	d_instantiate(dentry, inode);
+	if (!is_devfsd_or_child(fs_info))
+		devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
+				 inode->i_uid, inode->i_gid, fs_info);
+	return 0;
+}				/*  End Function devfs_symlink  */
+
+static int devfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
-    int err;
-    struct fs_info *fs_info = dir->i_sb->s_fs_info;
-    struct devfs_entry *parent, *de;
-    struct inode *inode;
-
-    DPRINTK (DEBUG_I_MKNOD, "(%s): mode: 0%o  dev: %u:%u\n",
-	     dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
-    parent = get_devfs_entry_from_vfs_inode (dir);
-    if (parent == NULL) return -ENOENT;
-    de = _devfs_alloc_entry (dentry->d_name.name, dentry->d_name.len, mode);
-    if (!de) return -ENOMEM;
-    de->vfs = TRUE;
-    if (S_ISCHR(mode) || S_ISBLK(mode))
-	de->u.dev = rdev;
-    if ( ( err = _devfs_append_entry (parent, de, NULL) ) != 0 )
-	return err;
-    de->inode.uid = current->euid;
-    de->inode.gid = current->egid;
-    de->inode.atime = CURRENT_TIME;
-    de->inode.mtime = CURRENT_TIME;
-    de->inode.ctime = CURRENT_TIME;
-    if ( ( inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry) ) == NULL )
-	return -ENOMEM;
-    DPRINTK (DEBUG_I_MKNOD, ":   new VFS inode(%u): %p  dentry: %p\n",
-	     de->inode.ino, inode, dentry);
-    d_instantiate (dentry, inode);
-    if ( !is_devfsd_or_child (fs_info) )
-	devfsd_notify_de (de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
-			  inode->i_uid, inode->i_gid, fs_info);
-    return 0;
-}   /*  End Function devfs_mknod  */
-
-static int devfs_readlink (struct dentry *dentry, char *buffer, int buflen)
+	int err;
+	struct fs_info *fs_info = dir->i_sb->s_fs_info;
+	struct devfs_entry *parent, *de;
+	struct inode *inode;
+
+	mode = (mode & ~S_IFMT) | S_IFDIR;	/*  VFS doesn't pass S_IFMT part  */
+	parent = get_devfs_entry_from_vfs_inode(dir);
+	if (parent == NULL)
+		return -ENOENT;
+	de = _devfs_alloc_entry(dentry->d_name.name, dentry->d_name.len, mode);
+	if (!de)
+		return -ENOMEM;
+	de->vfs = TRUE;
+	if ((err = _devfs_append_entry(parent, de, NULL)) != 0)
+		return err;
+	de->inode.uid = current->euid;
+	de->inode.gid = current->egid;
+	de->inode.atime = CURRENT_TIME;
+	de->inode.mtime = CURRENT_TIME;
+	de->inode.ctime = CURRENT_TIME;
+	if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL)
+		return -ENOMEM;
+	DPRINTK(DEBUG_DISABLED, "(%s): new VFS inode(%u): %p  dentry: %p\n",
+		dentry->d_name.name, de->inode.ino, inode, dentry);
+	d_instantiate(dentry, inode);
+	if (!is_devfsd_or_child(fs_info))
+		devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
+				 inode->i_uid, inode->i_gid, fs_info);
+	return 0;
+}				/*  End Function devfs_mkdir  */
+
+static int devfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int err = 0;
+	struct devfs_entry *de;
+	struct fs_info *fs_info = dir->i_sb->s_fs_info;
+	struct inode *inode = dentry->d_inode;
+
+	if (dir->i_sb->s_fs_info != inode->i_sb->s_fs_info)
+		return -EINVAL;
+	de = get_devfs_entry_from_vfs_inode(inode);
+	if (de == NULL)
+		return -ENOENT;
+	if (!S_ISDIR(de->mode))
+		return -ENOTDIR;
+	if (!de->vfs)
+		return -EPERM;
+	/*  First ensure the directory is empty and will stay that way  */
+	write_lock(&de->u.dir.lock);
+	if (de->u.dir.first)
+		err = -ENOTEMPTY;
+	else
+		de->u.dir.no_more_additions = TRUE;
+	write_unlock(&de->u.dir.lock);
+	if (err)
+		return err;
+	/*  Now unhook the directory from its parent  */
+	write_lock(&de->parent->u.dir.lock);
+	if (!_devfs_unhook(de))
+		err = -ENOENT;
+	write_unlock(&de->parent->u.dir.lock);
+	if (err)
+		return err;
+	if (!is_devfsd_or_child(fs_info))
+		devfsd_notify_de(de, DEVFSD_NOTIFY_DELETE, inode->i_mode,
+				 inode->i_uid, inode->i_gid, fs_info);
+	free_dentry(de);
+	devfs_put(de);
+	return 0;
+}				/*  End Function devfs_rmdir  */
+
+static int devfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		       dev_t rdev)
 {
-    int err;
-    struct devfs_entry *de;
+	int err;
+	struct fs_info *fs_info = dir->i_sb->s_fs_info;
+	struct devfs_entry *parent, *de;
+	struct inode *inode;
 
-    de = get_devfs_entry_from_vfs_inode (dentry->d_inode);
-    if (!de) return -ENODEV;
-    err = vfs_readlink (dentry, buffer, buflen, de->u.symlink.linkname);
-    return err;
-}   /*  End Function devfs_readlink  */
+	DPRINTK(DEBUG_I_MKNOD, "(%s): mode: 0%o  dev: %u:%u\n",
+		dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev));
+	parent = get_devfs_entry_from_vfs_inode(dir);
+	if (parent == NULL)
+		return -ENOENT;
+	de = _devfs_alloc_entry(dentry->d_name.name, dentry->d_name.len, mode);
+	if (!de)
+		return -ENOMEM;
+	de->vfs = TRUE;
+	if (S_ISCHR(mode) || S_ISBLK(mode))
+		de->u.dev = rdev;
+	if ((err = _devfs_append_entry(parent, de, NULL)) != 0)
+		return err;
+	de->inode.uid = current->euid;
+	de->inode.gid = current->egid;
+	de->inode.atime = CURRENT_TIME;
+	de->inode.mtime = CURRENT_TIME;
+	de->inode.ctime = CURRENT_TIME;
+	if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL)
+		return -ENOMEM;
+	DPRINTK(DEBUG_I_MKNOD, ":   new VFS inode(%u): %p  dentry: %p\n",
+		de->inode.ino, inode, dentry);
+	d_instantiate(dentry, inode);
+	if (!is_devfsd_or_child(fs_info))
+		devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode,
+				 inode->i_uid, inode->i_gid, fs_info);
+	return 0;
+}				/*  End Function devfs_mknod  */
 
-static int devfs_follow_link (struct dentry *dentry, struct nameidata *nd)
+static int devfs_readlink(struct dentry *dentry, char *buffer, int buflen)
 {
-    int err;
-    struct devfs_entry *de;
+	int err;
+	struct devfs_entry *de;
 
-    de = get_devfs_entry_from_vfs_inode (dentry->d_inode);
-    if (!de) return -ENODEV;
-    err = vfs_follow_link (nd, de->u.symlink.linkname);
-    return err;
-}   /*  End Function devfs_follow_link  */
+	de = get_devfs_entry_from_vfs_inode(dentry->d_inode);
+	if (!de)
+		return -ENODEV;
+	err = vfs_readlink(dentry, buffer, buflen, de->u.symlink.linkname);
+	return err;
+}				/*  End Function devfs_readlink  */
 
-static struct inode_operations devfs_iops =
+static int devfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-    .setattr        = devfs_notify_change,
+	int err;
+	struct devfs_entry *de;
+
+	de = get_devfs_entry_from_vfs_inode(dentry->d_inode);
+	if (!de)
+		return -ENODEV;
+	err = vfs_follow_link(nd, de->u.symlink.linkname);
+	return err;
+}				/*  End Function devfs_follow_link  */
+
+static struct inode_operations devfs_iops = {
+	.setattr = devfs_notify_change,
 };
 
-static struct inode_operations devfs_dir_iops =
-{
-    .lookup         = devfs_lookup,
-    .unlink         = devfs_unlink,
-    .symlink        = devfs_symlink,
-    .mkdir          = devfs_mkdir,
-    .rmdir          = devfs_rmdir,
-    .mknod          = devfs_mknod,
-    .setattr        = devfs_notify_change,
+static struct inode_operations devfs_dir_iops = {
+	.lookup = devfs_lookup,
+	.unlink = devfs_unlink,
+	.symlink = devfs_symlink,
+	.mkdir = devfs_mkdir,
+	.rmdir = devfs_rmdir,
+	.mknod = devfs_mknod,
+	.setattr = devfs_notify_change,
 };
 
-static struct inode_operations devfs_symlink_iops =
-{
-    .readlink       = devfs_readlink,
-    .follow_link    = devfs_follow_link,
-    .setattr        = devfs_notify_change,
+static struct inode_operations devfs_symlink_iops = {
+	.readlink = devfs_readlink,
+	.follow_link = devfs_follow_link,
+	.setattr = devfs_notify_change,
 };
 
-static int devfs_fill_super (struct super_block *sb, void *data, int silent)
+static int devfs_fill_super(struct super_block *sb, void *data, int silent)
 {
-    struct inode *root_inode = NULL;
-
-    if (_devfs_get_root_entry () == NULL) goto out_no_root;
-    atomic_set (&fs_info.devfsd_overrun_count, 0);
-    init_waitqueue_head (&fs_info.devfsd_wait_queue);
-    init_waitqueue_head (&fs_info.revalidate_wait_queue);
-    fs_info.sb = sb;
-    sb->s_fs_info = &fs_info;
-    sb->s_blocksize = 1024;
-    sb->s_blocksize_bits = 10;
-    sb->s_magic = DEVFS_SUPER_MAGIC;
-    sb->s_op = &devfs_sops;
-    if ( ( root_inode = _devfs_get_vfs_inode (sb, root_entry, NULL) ) == NULL )
-	goto out_no_root;
-    sb->s_root = d_alloc_root (root_inode);
-    if (!sb->s_root) goto out_no_root;
-    DPRINTK (DEBUG_S_READ, "(): made devfs ptr: %p\n", sb->s_fs_info);
-    return 0;
-
-out_no_root:
-    PRINTK ("(): get root inode failed\n");
-    if (root_inode) iput (root_inode);
-    return -EINVAL;
-}   /*  End Function devfs_fill_super  */
-
-static struct super_block *
-devfs_get_sb (struct file_system_type *fs_type, int flags,
-	      const char *dev_name, void *data)
+	struct inode *root_inode = NULL;
+
+	if (_devfs_get_root_entry() == NULL)
+		goto out_no_root;
+	atomic_set(&fs_info.devfsd_overrun_count, 0);
+	init_waitqueue_head(&fs_info.devfsd_wait_queue);
+	init_waitqueue_head(&fs_info.revalidate_wait_queue);
+	fs_info.sb = sb;
+	sb->s_fs_info = &fs_info;
+	sb->s_blocksize = 1024;
+	sb->s_blocksize_bits = 10;
+	sb->s_magic = DEVFS_SUPER_MAGIC;
+	sb->s_op = &devfs_sops;
+	if ((root_inode = _devfs_get_vfs_inode(sb, root_entry, NULL)) == NULL)
+		goto out_no_root;
+	sb->s_root = d_alloc_root(root_inode);
+	if (!sb->s_root)
+		goto out_no_root;
+	DPRINTK(DEBUG_S_READ, "(): made devfs ptr: %p\n", sb->s_fs_info);
+	return 0;
+
+      out_no_root:
+	PRINTK("(): get root inode failed\n");
+	if (root_inode)
+		iput(root_inode);
+	return -EINVAL;
+}				/*  End Function devfs_fill_super  */
+
+static struct super_block *devfs_get_sb(struct file_system_type *fs_type,
+					int flags, const char *dev_name,
+					void *data)
 {
-    return get_sb_single (fs_type, flags, data, devfs_fill_super);
+	return get_sb_single(fs_type, flags, data, devfs_fill_super);
 }
 
-static struct file_system_type devfs_fs_type =
-{
-    .name	= DEVFS_NAME,
-    .get_sb	= devfs_get_sb,
-    .kill_sb	= kill_anon_super,
+static struct file_system_type devfs_fs_type = {
+	.name = DEVFS_NAME,
+	.get_sb = devfs_get_sb,
+	.kill_sb = kill_anon_super,
 };
 
 /*  File operations for devfsd follow  */
 
-static ssize_t devfsd_read (struct file *file, char *buf, size_t len,
-			    loff_t *ppos)
+static ssize_t devfsd_read(struct file *file, char *buf, size_t len,
+			   loff_t * ppos)
 {
-    int done = FALSE;
-    int ival;
-    loff_t pos, devname_offset, tlen, rpos;
-    devfs_handle_t de;
-    struct devfsd_buf_entry *entry;
-    struct fs_info *fs_info = file->f_dentry->d_inode->i_sb->s_fs_info;
-    struct devfsd_notify_struct *info = fs_info->devfsd_info;
-    DECLARE_WAITQUEUE (wait, current);
-
-    /*  Can't seek (pread) on this device  */
-    if (ppos != &file->f_pos) return -ESPIPE;
-    /*  Verify the task has grabbed the queue  */
-    if (fs_info->devfsd_task != current) return -EPERM;
-    info->major = 0;
-    info->minor = 0;
-    /*  Block for a new entry  */
-    set_current_state (TASK_INTERRUPTIBLE);
-    add_wait_queue (&fs_info->devfsd_wait_queue, &wait);
-    while ( devfsd_queue_empty (fs_info) )
-    {
-	fs_info->devfsd_sleeping = TRUE;
-	wake_up (&fs_info->revalidate_wait_queue);
-	schedule ();
-	fs_info->devfsd_sleeping = FALSE;
-	if ( signal_pending (current) )
-	{
-	    remove_wait_queue (&fs_info->devfsd_wait_queue, &wait);
-	    __set_current_state (TASK_RUNNING);
-	    return -EINTR;
+	int done = FALSE;
+	int ival;
+	loff_t pos, devname_offset, tlen, rpos;
+	devfs_handle_t de;
+	struct devfsd_buf_entry *entry;
+	struct fs_info *fs_info = file->f_dentry->d_inode->i_sb->s_fs_info;
+	struct devfsd_notify_struct *info = fs_info->devfsd_info;
+	DECLARE_WAITQUEUE(wait, current);
+
+	/*  Can't seek (pread) on this device  */
+	if (ppos != &file->f_pos)
+		return -ESPIPE;
+	/*  Verify the task has grabbed the queue  */
+	if (fs_info->devfsd_task != current)
+		return -EPERM;
+	info->major = 0;
+	info->minor = 0;
+	/*  Block for a new entry  */
+	set_current_state(TASK_INTERRUPTIBLE);
+	add_wait_queue(&fs_info->devfsd_wait_queue, &wait);
+	while (devfsd_queue_empty(fs_info)) {
+		fs_info->devfsd_sleeping = TRUE;
+		wake_up(&fs_info->revalidate_wait_queue);
+		schedule();
+		fs_info->devfsd_sleeping = FALSE;
+		if (signal_pending(current)) {
+			remove_wait_queue(&fs_info->devfsd_wait_queue, &wait);
+			__set_current_state(TASK_RUNNING);
+			return -EINTR;
+		}
+		set_current_state(TASK_INTERRUPTIBLE);
 	}
-	set_current_state (TASK_INTERRUPTIBLE);
-    }
-    remove_wait_queue (&fs_info->devfsd_wait_queue, &wait);
-    __set_current_state (TASK_RUNNING);
-    /*  Now play with the data  */
-    ival = atomic_read (&fs_info->devfsd_overrun_count);
-    info->overrun_count = ival;
-    entry = fs_info->devfsd_first_event;
-    info->type = entry->type;
-    info->mode = entry->mode;
-    info->uid = entry->uid;
-    info->gid = entry->gid;
-    de = entry->de;
-    if (S_ISCHR(de->mode) || S_ISBLK(de->mode)) {
-	info->major = MAJOR(de->u.dev);
-	info->minor = MINOR(de->u.dev);
-    }
-    pos = devfs_generate_path (de, info->devname, DEVFS_PATHLEN);
-    if (pos < 0) return pos;
-    info->namelen = DEVFS_PATHLEN - pos - 1;
-    if (info->mode == 0) info->mode = de->mode;
-    devname_offset = info->devname - (char *) info;
-    rpos = *ppos;
-    if (rpos < devname_offset)
-    {
-	/*  Copy parts of the header  */
-	tlen = devname_offset - rpos;
-	if (tlen > len) tlen = len;
-	if ( copy_to_user (buf, (char *) info + rpos, tlen) )
-	{
-	    return -EFAULT;
+	remove_wait_queue(&fs_info->devfsd_wait_queue, &wait);
+	__set_current_state(TASK_RUNNING);
+	/*  Now play with the data  */
+	ival = atomic_read(&fs_info->devfsd_overrun_count);
+	info->overrun_count = ival;
+	entry = fs_info->devfsd_first_event;
+	info->type = entry->type;
+	info->mode = entry->mode;
+	info->uid = entry->uid;
+	info->gid = entry->gid;
+	de = entry->de;
+	if (S_ISCHR(de->mode) || S_ISBLK(de->mode)) {
+		info->major = MAJOR(de->u.dev);
+		info->minor = MINOR(de->u.dev);
 	}
-	rpos += tlen;
-	buf += tlen;
-	len -= tlen;
-    }
-    if ( (rpos >= devname_offset) && (len > 0) )
-    {
-	/*  Copy the name  */
-	tlen = info->namelen + 1;
-	if (tlen > len) tlen = len;
-	else done = TRUE;
-	if ( copy_to_user (buf, info->devname + pos + rpos - devname_offset,
-			   tlen) )
-	{
-	    return -EFAULT;
+	pos = devfs_generate_path(de, info->devname, DEVFS_PATHLEN);
+	if (pos < 0)
+		return pos;
+	info->namelen = DEVFS_PATHLEN - pos - 1;
+	if (info->mode == 0)
+		info->mode = de->mode;
+	devname_offset = info->devname - (char *)info;
+	rpos = *ppos;
+	if (rpos < devname_offset) {
+		/*  Copy parts of the header  */
+		tlen = devname_offset - rpos;
+		if (tlen > len)
+			tlen = len;
+		if (copy_to_user(buf, (char *)info + rpos, tlen)) {
+			return -EFAULT;
+		}
+		rpos += tlen;
+		buf += tlen;
+		len -= tlen;
 	}
-	rpos += tlen;
-    }
-    tlen = rpos - *ppos;
-    if (done)
-    {
-	devfs_handle_t parent;
-
-	spin_lock (&fs_info->devfsd_buffer_lock);
-	fs_info->devfsd_first_event = entry->next;
-	if (entry->next == NULL) fs_info->devfsd_last_event = NULL;
-	spin_unlock (&fs_info->devfsd_buffer_lock);
-	for (; de != NULL; de = parent)
-	{
-	    parent = de->parent;
-	    devfs_put (de);
+	if ((rpos >= devname_offset) && (len > 0)) {
+		/*  Copy the name  */
+		tlen = info->namelen + 1;
+		if (tlen > len)
+			tlen = len;
+		else
+			done = TRUE;
+		if (copy_to_user
+		    (buf, info->devname + pos + rpos - devname_offset, tlen)) {
+			return -EFAULT;
+		}
+		rpos += tlen;
 	}
-	kmem_cache_free (devfsd_buf_cache, entry);
-	if (ival > 0) atomic_sub (ival, &fs_info->devfsd_overrun_count);
-	*ppos = 0;
-    }
-    else *ppos = rpos;
-    return tlen;
-}   /*  End Function devfsd_read  */
-
-static int devfsd_ioctl (struct inode *inode, struct file *file,
-			 unsigned int cmd, unsigned long arg)
+	tlen = rpos - *ppos;
+	if (done) {
+		devfs_handle_t parent;
+
+		spin_lock(&fs_info->devfsd_buffer_lock);
+		fs_info->devfsd_first_event = entry->next;
+		if (entry->next == NULL)
+			fs_info->devfsd_last_event = NULL;
+		spin_unlock(&fs_info->devfsd_buffer_lock);
+		for (; de != NULL; de = parent) {
+			parent = de->parent;
+			devfs_put(de);
+		}
+		kmem_cache_free(devfsd_buf_cache, entry);
+		if (ival > 0)
+			atomic_sub(ival, &fs_info->devfsd_overrun_count);
+		*ppos = 0;
+	} else
+		*ppos = rpos;
+	return tlen;
+}				/*  End Function devfsd_read  */
+
+static int devfsd_ioctl(struct inode *inode, struct file *file,
+			unsigned int cmd, unsigned long arg)
 {
-    int ival;
-    struct fs_info *fs_info = inode->i_sb->s_fs_info;
-
-    switch (cmd)
-    {
-      case DEVFSDIOC_GET_PROTO_REV:
-	ival = DEVFSD_PROTOCOL_REVISION_KERNEL;
-	if ( copy_to_user ( (void *)arg, &ival, sizeof ival ) ) return -EFAULT;
-	break;
-      case DEVFSDIOC_SET_EVENT_MASK:
-	/*  Ensure only one reader has access to the queue. This scheme will
-	    work even if the global kernel lock were to be removed, because it
-	    doesn't matter who gets in first, as long as only one gets it  */
-	if (fs_info->devfsd_task == NULL)
-	{
-	    static spinlock_t lock = SPIN_LOCK_UNLOCKED;
-
-	    if ( !spin_trylock (&lock) ) return -EBUSY;
-	    if (fs_info->devfsd_task != NULL)
-	    {   /*  We lost the race...  */
-		spin_unlock (&lock);
-		return -EBUSY;
-	    }
-	    fs_info->devfsd_task = current;
-	    spin_unlock (&lock);
-	    fs_info->devfsd_pgrp = (process_group(current) == current->pid) ?
-		process_group(current) : 0;
-	    fs_info->devfsd_file = file;
-	    fs_info->devfsd_info = kmalloc (sizeof *fs_info->devfsd_info,
-					    GFP_KERNEL);
-	    if (!fs_info->devfsd_info)
-	    {
-		devfsd_close (inode, file);
-		return -ENOMEM;
-	    }
-	}
-	else if (fs_info->devfsd_task != current) return -EBUSY;
-	fs_info->devfsd_event_mask = arg;  /*  Let the masses come forth  */
-	break;
-      case DEVFSDIOC_RELEASE_EVENT_QUEUE:
-	if (fs_info->devfsd_file != file) return -EPERM;
-	return devfsd_close (inode, file);
-	/*break;*/
+	int ival;
+	struct fs_info *fs_info = inode->i_sb->s_fs_info;
+
+	switch (cmd) {
+	case DEVFSDIOC_GET_PROTO_REV:
+		ival = DEVFSD_PROTOCOL_REVISION_KERNEL;
+		if (copy_to_user((void *)arg, &ival, sizeof ival))
+			return -EFAULT;
+		break;
+	case DEVFSDIOC_SET_EVENT_MASK:
+		/*  Ensure only one reader has access to the queue. This scheme will
+		   work even if the global kernel lock were to be removed, because it
+		   doesn't matter who gets in first, as long as only one gets it  */
+		if (fs_info->devfsd_task == NULL) {
+			static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+
+			if (!spin_trylock(&lock))
+				return -EBUSY;
+			if (fs_info->devfsd_task != NULL) {	/*  We lost the race...  */
+				spin_unlock(&lock);
+				return -EBUSY;
+			}
+			fs_info->devfsd_task = current;
+			spin_unlock(&lock);
+			fs_info->devfsd_pgrp =
+			    (process_group(current) ==
+			     current->pid) ? process_group(current) : 0;
+			fs_info->devfsd_file = file;
+			fs_info->devfsd_info =
+			    kmalloc(sizeof *fs_info->devfsd_info, GFP_KERNEL);
+			if (!fs_info->devfsd_info) {
+				devfsd_close(inode, file);
+				return -ENOMEM;
+			}
+		} else if (fs_info->devfsd_task != current)
+			return -EBUSY;
+		fs_info->devfsd_event_mask = arg;	/*  Let the masses come forth  */
+		break;
+	case DEVFSDIOC_RELEASE_EVENT_QUEUE:
+		if (fs_info->devfsd_file != file)
+			return -EPERM;
+		return devfsd_close(inode, file);
+		/*break; */
 #ifdef CONFIG_DEVFS_DEBUG
-      case DEVFSDIOC_SET_DEBUG_MASK:
-	if ( copy_from_user (&ival, (void *) arg, sizeof ival) )return -EFAULT;
-	devfs_debug = ival;
-	break;
+	case DEVFSDIOC_SET_DEBUG_MASK:
+		if (copy_from_user(&ival, (void *)arg, sizeof ival))
+			return -EFAULT;
+		devfs_debug = ival;
+		break;
 #endif
-      default:
-	return -ENOIOCTLCMD;
-    }
-    return 0;
-}   /*  End Function devfsd_ioctl  */
-
-static int devfsd_close (struct inode *inode, struct file *file)
-{
-    struct devfsd_buf_entry *entry, *next;
-    struct fs_info *fs_info = inode->i_sb->s_fs_info;
-
-    if (fs_info->devfsd_file != file) return 0;
-    fs_info->devfsd_event_mask = 0;
-    fs_info->devfsd_file = NULL;
-    spin_lock (&fs_info->devfsd_buffer_lock);
-    entry = fs_info->devfsd_first_event;
-    fs_info->devfsd_first_event = NULL;
-    fs_info->devfsd_last_event = NULL;
-    if (fs_info->devfsd_info)
-    {
-	kfree (fs_info->devfsd_info);
-	fs_info->devfsd_info = NULL;
-    }
-    spin_unlock (&fs_info->devfsd_buffer_lock);
-    fs_info->devfsd_pgrp = 0;
-    fs_info->devfsd_task = NULL;
-    wake_up (&fs_info->revalidate_wait_queue);
-    for (; entry; entry = next)
-    {
-	next = entry->next;
-	kmem_cache_free (devfsd_buf_cache, entry);
-    }
-    return 0;
-}   /*  End Function devfsd_close  */
+	default:
+		return -ENOIOCTLCMD;
+	}
+	return 0;
+}				/*  End Function devfsd_ioctl  */
+
+static int devfsd_close(struct inode *inode, struct file *file)
+{
+	struct devfsd_buf_entry *entry, *next;
+	struct fs_info *fs_info = inode->i_sb->s_fs_info;
+
+	if (fs_info->devfsd_file != file)
+		return 0;
+	fs_info->devfsd_event_mask = 0;
+	fs_info->devfsd_file = NULL;
+	spin_lock(&fs_info->devfsd_buffer_lock);
+	entry = fs_info->devfsd_first_event;
+	fs_info->devfsd_first_event = NULL;
+	fs_info->devfsd_last_event = NULL;
+	if (fs_info->devfsd_info) {
+		kfree(fs_info->devfsd_info);
+		fs_info->devfsd_info = NULL;
+	}
+	spin_unlock(&fs_info->devfsd_buffer_lock);
+	fs_info->devfsd_pgrp = 0;
+	fs_info->devfsd_task = NULL;
+	wake_up(&fs_info->revalidate_wait_queue);
+	for (; entry; entry = next) {
+		next = entry->next;
+		kmem_cache_free(devfsd_buf_cache, entry);
+	}
+	return 0;
+}				/*  End Function devfsd_close  */
 
 #ifdef CONFIG_DEVFS_DEBUG
-static ssize_t stat_read (struct file *file, char *buf, size_t len,
-			  loff_t *ppos)
-{
-    ssize_t num;
-    char txt[80];
-
-    num = sprintf (txt, "Number of entries: %u  number of bytes: %u\n",
-		   stat_num_entries, stat_num_bytes) + 1;
-    /*  Can't seek (pread) on this device  */
-    if (ppos != &file->f_pos) return -ESPIPE;
-    if (*ppos >= num) return 0;
-    if (*ppos + len > num) len = num - *ppos;
-    if ( copy_to_user (buf, txt + *ppos, len) ) return -EFAULT;
-    *ppos += len;
-    return len;
-}   /*  End Function stat_read  */
+static ssize_t stat_read(struct file *file, char *buf, size_t len,
+			 loff_t * ppos)
+{
+	ssize_t num;
+	char txt[80];
+
+	num = sprintf(txt, "Number of entries: %u  number of bytes: %u\n",
+		      stat_num_entries, stat_num_bytes) + 1;
+	/*  Can't seek (pread) on this device  */
+	if (ppos != &file->f_pos)
+		return -ESPIPE;
+	if (*ppos >= num)
+		return 0;
+	if (*ppos + len > num)
+		len = num - *ppos;
+	if (copy_to_user(buf, txt + *ppos, len))
+		return -EFAULT;
+	*ppos += len;
+	return len;
+}				/*  End Function stat_read  */
 #endif
 
 static int __init init_devfs_fs(void)
@@ -2793,8 +2809,8 @@ static int __init init_devfs_fs(void)
 	printk(KERN_INFO "%s: %s Richard Gooch (rgooch@atnf.csiro.au)\n",
 	       DEVFS_NAME, DEVFS_VERSION);
 	devfsd_buf_cache = kmem_cache_create("devfsd_event",
-					  sizeof (struct devfsd_buf_entry),
-					  0, 0, NULL, NULL);
+					     sizeof(struct devfsd_buf_entry),
+					     0, 0, NULL, NULL);
 	if (!devfsd_buf_cache)
 		OOPS("(): unable to allocate event slab\n");
 #ifdef CONFIG_DEVFS_DEBUG
@@ -2809,32 +2825,35 @@ static int __init init_devfs_fs(void)
 		return major;
 
 	/*  And create the entry for ".devfsd"  */
-	devfsd = _devfs_alloc_entry(".devfsd", 0, S_IFCHR|S_IRUSR|S_IWUSR);
-	if (devfsd == NULL )
+	devfsd = _devfs_alloc_entry(".devfsd", 0, S_IFCHR | S_IRUSR | S_IWUSR);
+	if (devfsd == NULL)
 		return -ENOMEM;
 	devfsd->u.dev = MKDEV(major, 0);
 	_devfs_append_entry(root_entry, devfsd, NULL);
 
 #ifdef CONFIG_DEVFS_DEBUG
-	stat = _devfs_alloc_entry(".stat", 0, S_IFCHR|S_IRUGO);
-	if (stat == NULL )
+	stat = _devfs_alloc_entry(".stat", 0, S_IFCHR | S_IRUGO);
+	if (stat == NULL)
 		return -ENOMEM;
 	stat->u.dev = MKDEV(major, 1);
-	_devfs_append_entry (root_entry, stat, NULL);
+	_devfs_append_entry(root_entry, stat, NULL);
 #endif
 
 	err = register_filesystem(&devfs_fs_type);
 	return err;
-}   /*  End Function init_devfs_fs  */
+}				/*  End Function init_devfs_fs  */
 
-void __init mount_devfs_fs (void)
+void __init mount_devfs_fs(void)
 {
-    int err;
+	int err;
 
-    if ( !(boot_options & OPTION_MOUNT) ) return;
-    err = do_mount ("none", "/dev", "devfs", 0, NULL);
-    if (err == 0) printk (KERN_INFO "Mounted devfs on /dev\n");
-    else PRINTK ("(): unable to mount devfs, err: %d\n", err);
-}   /*  End Function mount_devfs_fs  */
+	if (!(boot_options & OPTION_MOUNT))
+		return;
+	err = do_mount("none", "/dev", "devfs", 0, NULL);
+	if (err == 0)
+		printk(KERN_INFO "Mounted devfs on /dev\n");
+	else
+		PRINTK("(): unable to mount devfs, err: %d\n", err);
+}				/*  End Function mount_devfs_fs  */
 
 module_init(init_devfs_fs)
diff --git a/fs/devfs/util.c b/fs/devfs/util.c
index a6ecc014b471..06a2d827e3e5 100644
--- a/fs/devfs/util.c
+++ b/fs/devfs/util.c
@@ -73,7 +73,6 @@
 #include <linux/genhd.h>
 #include <asm/bitops.h>
 
-
 int devfs_register_tape(const char *name)
 {
 	char tname[32], dest[64];
@@ -86,6 +85,7 @@ int devfs_register_tape(const char *name)
 
 	return n;
 }
+
 EXPORT_SYMBOL(devfs_register_tape);
 
 void devfs_unregister_tape(int num)
diff --git a/include/linux/devfs_fs.h b/include/linux/devfs_fs.h
index 48da59012021..de236f431877 100644
--- a/include/linux/devfs_fs.h
+++ b/include/linux/devfs_fs.h
@@ -22,22 +22,20 @@
 #define DEVFSD_NOTIFY_CREATE        6
 #define DEVFSD_NOTIFY_DELETE        7
 
-#define DEVFS_PATHLEN               1024  /*  Never change this otherwise the
-					      binary interface will change   */
-
-struct devfsd_notify_struct
-{   /*  Use native C types to ensure same types in kernel and user space     */
-    unsigned int type;           /*  DEVFSD_NOTIFY_* value                   */
-    unsigned int mode;           /*  Mode of the inode or device entry       */
-    unsigned int major;          /*  Major number of device entry            */
-    unsigned int minor;          /*  Minor number of device entry            */
-    unsigned int uid;            /*  Uid of process, inode or device entry   */
-    unsigned int gid;            /*  Gid of process, inode or device entry   */
-    unsigned int overrun_count;  /*  Number of lost events                   */
-    unsigned int namelen;        /*  Number of characters not including '\0' */
-    /*  The device name MUST come last                                       */
-    char devname[DEVFS_PATHLEN]; /*  This will be '\0' terminated            */
+#define DEVFS_PATHLEN               1024	/*  Never change this otherwise the
+						   binary interface will change   */
+
+struct devfsd_notify_struct {	/*  Use native C types to ensure same types in kernel and user space     */
+	unsigned int type;	/*  DEVFSD_NOTIFY_* value                   */
+	unsigned int mode;	/*  Mode of the inode or device entry       */
+	unsigned int major;	/*  Major number of device entry            */
+	unsigned int minor;	/*  Minor number of device entry            */
+	unsigned int uid;	/*  Uid of process, inode or device entry   */
+	unsigned int gid;	/*  Gid of process, inode or device entry   */
+	unsigned int overrun_count;	/*  Number of lost events                   */
+	unsigned int namelen;	/*  Number of characters not including '\0' */
+	/*  The device name MUST come last                                       */
+	char devname[DEVFS_PATHLEN];	/*  This will be '\0' terminated            */
 };
 
-
-#endif  /*  _LINUX_DEVFS_FS_H  */
+#endif				/*  _LINUX_DEVFS_FS_H  */
diff --git a/include/linux/devfs_fs_kernel.h b/include/linux/devfs_fs_kernel.h
index 16c78f54f427..89810e73d256 100644
--- a/include/linux/devfs_fs_kernel.h
+++ b/include/linux/devfs_fs_kernel.h
@@ -12,18 +12,18 @@
 
 #ifdef CONFIG_DEVFS_FS
 extern int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...)
-	__attribute__((format (printf, 3, 4)));
+    __attribute__ ((format(printf, 3, 4)));
 extern int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...)
-	__attribute__((format (printf, 3, 4)));
+    __attribute__ ((format(printf, 3, 4)));
 extern int devfs_mk_symlink(const char *name, const char *link);
 extern int devfs_mk_dir(const char *fmt, ...)
-	__attribute__((format (printf, 1, 2)));
+    __attribute__ ((format(printf, 1, 2)));
 extern void devfs_remove(const char *fmt, ...)
-	__attribute__((format (printf, 1, 2)));
+    __attribute__ ((format(printf, 1, 2)));
 extern int devfs_register_tape(const char *name);
 extern void devfs_unregister_tape(int num);
 extern void mount_devfs_fs(void);
-#else  /*  CONFIG_DEVFS_FS  */
+#else				/*  CONFIG_DEVFS_FS  */
 static inline int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...)
 {
 	return 0;
@@ -32,9 +32,9 @@ static inline int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...)
 {
 	return 0;
 }
-static inline int devfs_mk_symlink (const char *name, const char *link)
+static inline int devfs_mk_symlink(const char *name, const char *link)
 {
-    return 0;
+	return 0;
 }
 static inline int devfs_mk_dir(const char *fmt, ...)
 {
@@ -43,16 +43,16 @@ static inline int devfs_mk_dir(const char *fmt, ...)
 static inline void devfs_remove(const char *fmt, ...)
 {
 }
-static inline int devfs_register_tape (const char *name)
+static inline int devfs_register_tape(const char *name)
 {
-    return -1;
+	return -1;
 }
 static inline void devfs_unregister_tape(int num)
 {
 }
-static inline void mount_devfs_fs (void)
+static inline void mount_devfs_fs(void)
 {
-    return;
+	return;
 }
-#endif  /*  CONFIG_DEVFS_FS  */
-#endif  /*  _LINUX_DEVFS_FS_KERNEL_H  */
+#endif				/*  CONFIG_DEVFS_FS  */
+#endif				/*  _LINUX_DEVFS_FS_KERNEL_H  */
-- 
cgit v1.2.3


From 0eb217f9b539fccf5aafaba8c9a06e170825f68b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:40:05 -0700
Subject: [PATCH] generalise system_running

From: Olof Johansson <olof@austin.ibm.com>

It's currently a boolean, but that means that system_running goes to zero
again when shutting down.  So we then use code (in the page allocator) which
is only designed to be used during bootup - it is marked __init.

So we need to be able to distinguish early boot state from late shutdown
state.  Rename system_running to system_state and give it the three
appropriate states.
---
 arch/ppc/platforms/pmac_nvram.c | 8 ++++----
 include/linux/kernel.h          | 8 +++++++-
 init/main.c                     | 8 ++------
 kernel/kmod.c                   | 2 +-
 kernel/printk.c                 | 3 ++-
 kernel/sched.c                  | 3 ++-
 kernel/sys.c                    | 8 ++++----
 mm/page_alloc.c                 | 2 +-
 8 files changed, 23 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/arch/ppc/platforms/pmac_nvram.c b/arch/ppc/platforms/pmac_nvram.c
index f381f3f745f9..3b3f984fb929 100644
--- a/arch/ppc/platforms/pmac_nvram.c
+++ b/arch/ppc/platforms/pmac_nvram.c
@@ -154,11 +154,11 @@ static unsigned char __pmac pmu_nvram_read_byte(int addr)
 	struct adb_request req;
 	DECLARE_COMPLETION(req_complete); 
 	
-	req.arg = system_running ? &req_complete : NULL;
+	req.arg = system_state == SYSTEM_RUNNING ? &req_complete : NULL;
 	if (pmu_request(&req, pmu_nvram_complete, 3, PMU_READ_NVRAM,
 			(addr >> 8) & 0xff, addr & 0xff))
 		return 0xff;
-	if (system_running)
+	if (system_state == SYSTEM_RUNNING)
 		wait_for_completion(&req_complete);
 	while (!req.complete)
 		pmu_poll();
@@ -170,11 +170,11 @@ static void __pmac pmu_nvram_write_byte(int addr, unsigned char val)
 	struct adb_request req;
 	DECLARE_COMPLETION(req_complete); 
 	
-	req.arg = system_running ? &req_complete : NULL;
+	req.arg = system_state == SYSTEM_RUNNING ? &req_complete : NULL;
 	if (pmu_request(&req, pmu_nvram_complete, 4, PMU_WRITE_NVRAM,
 			(addr >> 8) & 0xff, addr & 0xff, val))
 		return;
-	if (system_running)
+	if (system_state == SYSTEM_RUNNING)
 		wait_for_completion(&req_complete);
 	while (!req.complete)
 		pmu_poll();
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e11e79199357..c1171e77c76b 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -109,9 +109,15 @@ static inline void console_verbose(void)
 extern void bust_spinlocks(int yes);
 extern int oops_in_progress;		/* If set, an oops, panic(), BUG() or die() is in progress */
 extern int panic_on_oops;
-extern int system_running;
+extern int system_state;		/* See values below */
 extern int tainted;
 extern const char *print_tainted(void);
+
+/* Values used for system_state */
+#define SYSTEM_BOOTING 0
+#define SYSTEM_RUNNING 1
+#define SYSTEM_SHUTDOWN 2
+
 #define TAINT_PROPRIETARY_MODULE	(1<<0)
 #define TAINT_FORCED_MODULE		(1<<1)
 #define TAINT_UNSAFE_SMP		(1<<2)
diff --git a/init/main.c b/init/main.c
index 9d1ed1de14c5..348ce7db30f3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -94,11 +94,7 @@ extern void driver_init(void);
 extern void tc_init(void);
 #endif
 
-/*
- * Are we up and running (ie do we have all the infrastructure
- * set up)
- */
-int system_running;
+int system_state;	/* SYSTEM_BOOTING/RUNNING/SHUTDOWN */
 
 /*
  * Boot command-line arguments
@@ -613,7 +609,7 @@ static int init(void * unused)
 	 */
 	free_initmem();
 	unlock_kernel();
-	system_running = 1;
+	system_state = SYSTEM_RUNNING;
 
 	if (sys_open("/dev/console", O_RDWR, 0) < 0)
 		printk("Warning: unable to open an initial console.\n");
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5261de82029b..0002fcd4c554 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -249,7 +249,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 	};
 	DECLARE_WORK(work, __call_usermodehelper, &sub_info);
 
-	if (!system_running)
+	if (system_state != SYSTEM_RUNNING)
 		return -EBUSY;
 
 	if (path[0] == '\0')
diff --git a/kernel/printk.c b/kernel/printk.c
index a7be1f922f34..5f2b3c9bbd6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -522,7 +522,8 @@ asmlinkage int printk(const char *fmt, ...)
 			log_level_unknown = 1;
 	}
 
-	if (!cpu_online(smp_processor_id()) && !system_running) {
+	if (!cpu_online(smp_processor_id()) &&
+	    system_state != SYSTEM_RUNNING) {
 		/*
 		 * Some console drivers may assume that per-cpu resources have
 		 * been allocated.  So don't allow them to be called by this
diff --git a/kernel/sched.c b/kernel/sched.c
index d5f21712ffbb..9e19d4c0d4a9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2982,7 +2982,8 @@ void __might_sleep(char *file, int line)
 #if defined(in_atomic)
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) && system_running) {
+	if ((in_atomic() || irqs_disabled()) &&
+	    system_state == SYSTEM_RUNNING) {
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
diff --git a/kernel/sys.c b/kernel/sys.c
index 33a14e13079e..bc498b12edcc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -436,7 +436,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 	switch (cmd) {
 	case LINUX_REBOOT_CMD_RESTART:
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system.\n");
 		machine_restart(NULL);
@@ -452,7 +452,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 
 	case LINUX_REBOOT_CMD_HALT:
 		notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "System halted.\n");
 		machine_halt();
@@ -462,7 +462,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 
 	case LINUX_REBOOT_CMD_POWER_OFF:
 		notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Power down.\n");
 		machine_power_off();
@@ -478,7 +478,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
 		buffer[sizeof(buffer) - 1] = '\0';
 
 		notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
-		system_running = 0;
+		system_state = SYSTEM_SHUTDOWN;
 		device_shutdown();
 		printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
 		machine_restart(buffer);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5d035d836c15..9764a4e78e45 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -734,7 +734,7 @@ fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int orde
 	struct page * page;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(!system_running))
+	if (unlikely(system_state == SYSTEM_BOOTING))
 		return get_boot_pages(gfp_mask, order);
 #endif
 	page = alloc_pages(gfp_mask, order);
-- 
cgit v1.2.3


From efffe9c8536bf9ee28f2f381bd285824bedcdbcd Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:40:55 -0700
Subject: [PATCH] Fix VT open/close race

The race is that con_close() can sleep, and drops the BKL while
tty->count==1.  But another thread can come into init_dev() and will take a
new ref against the tty and start using it.

But con_close() doesn't notice that new ref and proceeds to null out
tty->driver_data while someone else is using the resurrected tty.

So the patch serialises con_close() against init_dev() with tty_sem.


Here's a test app which reproduced the oops instantly on 2-way.  It really
needs to be run against all tty-capable devices.

/*
 * Run this against a tty which nobody currently has open, such as /dev/tty9
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kd.h>

/*
 * Open the tty named by filename, issue a single KDKBDREP ioctl against it
 * and close it again.  Terminates the program with status 1 if the open
 * fails.
 */
void doit(char *filename)
{
	/*
	 * KDKBDREP operates on a struct kbd_repeat, not an int: handing it the
	 * address of an int makes the kernel read and write past the end of
	 * that object.  Per <linux/kd.h>, fields <= 0 mean "don't change", so
	 * a zeroed struct only queries the current settings.
	 */
	struct kbd_repeat rep = { 0, 0 };
	int fd;

	fd = open(filename, O_RDWR);
	if (fd < 0) {
		perror("open");
		exit(1);
	}
	/* The ioctl result is ignored; only the open/ioctl/close cycle matters. */
	ioctl(fd, KDKBDREP, &rep);
	close(fd);
}

/*
 * Repeatedly open/ioctl/close the tty named by argv[1] until the process is
 * killed.  Returns 1 (after printing a usage line) if no device was given.
 */
int main(int argc, char *argv[])
{
	if (argc < 2) {
		/* argv[0] may be absent when argc is 0, so fall back to a fixed name. */
		fprintf(stderr, "usage: %s <tty-device>\n",
			argc > 0 ? argv[0] : "vt-race");
		return 1;
	}

	for ( ; ; )
		doit(argv[1]);
}
---
 drivers/char/tty_io.c |  2 +-
 drivers/char/vt.c     | 14 ++++++++++++++
 include/linux/tty.h   |  3 +++
 3 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 6bb5ae7e41a5..0ba52078f637 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -123,7 +123,7 @@ LIST_HEAD(tty_drivers);			/* linked list of tty drivers */
 struct tty_ldisc ldiscs[NR_LDISCS];	/* line disc dispatch table	*/
 
 /* Semaphore to protect creating and releasing a tty */
-static DECLARE_MUTEX(tty_sem);
+DECLARE_MUTEX(tty_sem);
 
 #ifdef CONFIG_UNIX98_PTYS
 extern struct tty_driver *ptm_driver;	/* Unix98 pty masters; for /dev/ptmx */
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index a5ddfc5ac9c1..2febed52e19f 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2480,8 +2480,16 @@ static int con_open(struct tty_struct *tty, struct file *filp)
 	return ret;
 }
 
+/*
+ * We take tty_sem in here to prevent another thread from coming in via init_dev
+ * and taking a ref against the tty while we're in the process of forgetting
+ * about it and cleaning things up.
+ *
+ * This is because vcs_remove_devfs() can sleep and will drop the BKL.
+ */
 static void con_close(struct tty_struct *tty, struct file *filp)
 {
+	down(&tty_sem);
 	acquire_console_sem();
 	if (tty && tty->count == 1) {
 		struct vt_struct *vt;
@@ -2492,9 +2500,15 @@ static void con_close(struct tty_struct *tty, struct file *filp)
 		tty->driver_data = 0;
 		release_console_sem();
 		vcs_remove_devfs(tty);
+		up(&tty_sem);
+		/*
+		 * tty_sem is released, but we still hold BKL, so there is
+		 * still exclusion against init_dev()
+		 */
 		return;
 	}
 	release_console_sem();
+	up(&tty_sem);
 }
 
 static void vc_init(unsigned int currcons, unsigned int rows,
diff --git a/include/linux/tty.h b/include/linux/tty.h
index fbcc401e8b28..6e61f3b27157 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -363,6 +363,9 @@ extern void tty_flip_buffer_push(struct tty_struct *tty);
 extern int tty_get_baud_rate(struct tty_struct *tty);
 extern int tty_termios_baud_rate(struct termios *termios);
 
+struct semaphore;
+extern struct semaphore tty_sem;
+
 /* n_tty.c */
 extern struct tty_ldisc tty_ldisc_N_TTY;
 
-- 
cgit v1.2.3


From ee28db843649533f5650186251ae4a8bd49a3da9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:41:07 -0700
Subject: [PATCH] i4l: kernelcapi receive workqueue and locking rework

From: Armin Schindler <armin@melware.de>

With this patch the ISDN kernel CAPI code uses a per application workqueue
with proper locking to prevent message re-ordering due to the fact that a
workqueue may run on another CPU at the same time.  Also, some locks for
internal data are added.

Removed the global recv_queue work; use a per application workqueue instead.
Added proper locking mechanisms for application, controller and application
workqueue function.  Increased the maximum number of possible applications
and controllers.
---
 drivers/isdn/capi/kcapi.c  | 96 ++++++++++++++++++++++++++++++++--------------
 include/linux/kernelcapi.h | 11 ++++--
 2 files changed, 75 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c
index 064dc3003716..8524997b10b6 100644
--- a/drivers/isdn/capi/kcapi.c
+++ b/drivers/isdn/capi/kcapi.c
@@ -1,4 +1,4 @@
-/* $Id: kcapi.c,v 1.1.2.7 2004/03/16 08:01:47 armin Exp $
+/* $Id: kcapi.c,v 1.1.2.8 2004/03/26 19:57:20 armin Exp $
  * 
  * Kernel CAPI 2.0 Module
  * 
@@ -31,7 +31,7 @@
 #include <linux/b1lli.h>
 #endif
 
-static char *revision = "$Revision: 1.1.2.7 $";
+static char *revision = "$Revision: 1.1.2.8 $";
 
 /* ------------------------------------------------------------- */
 
@@ -63,13 +63,13 @@ static char capi_manufakturer[64] = "AVM Berlin";
 LIST_HEAD(capi_drivers);
 rwlock_t capi_drivers_list_lock = RW_LOCK_UNLOCKED;
 
+static rwlock_t application_lock = RW_LOCK_UNLOCKED;
+static DECLARE_MUTEX(controller_sem);
+
 struct capi20_appl *capi_applications[CAPI_MAXAPPL];
 struct capi_ctr *capi_cards[CAPI_MAXCONTR];
 
 static int ncards;
-static struct sk_buff_head recv_queue;
-
-static struct work_struct tq_recv_notify;
 
 /* -------- controller ref counting -------------------------------------- */
 
@@ -174,7 +174,7 @@ static void notify_up(u32 contr)
 
 	for (applid = 1; applid <= CAPI_MAXAPPL; applid++) {
 		ap = get_capi_appl_by_nr(applid);
-		if (ap && ap->callback)
+		if (ap && ap->callback && !ap->release_in_progress)
 			ap->callback(KCI_CONTRUP, contr, &card->profile);
 	}
 }
@@ -192,7 +192,7 @@ static void notify_down(u32 contr)
 
 	for (applid = 1; applid <= CAPI_MAXAPPL; applid++) {
 		ap = get_capi_appl_by_nr(applid);
-		if (ap && ap->callback)
+		if (ap && ap->callback && !ap->release_in_progress)
 			ap->callback(KCI_CONTRDOWN, contr, 0);
 	}
 }
@@ -237,38 +237,39 @@ static int notify_push(unsigned int cmd, u32 controller, u16 applid, u32 ncci)
 	
 /* -------- Receiver ------------------------------------------ */
 
-static void recv_handler(void *dummy)
+static void recv_handler(void *_ap)
 {
 	struct sk_buff *skb;
-	struct capi20_appl *ap;
+	struct capi20_appl *ap = (struct capi20_appl *) _ap;
 
-	while ((skb = skb_dequeue(&recv_queue)) != 0) {
-		ap = get_capi_appl_by_nr(CAPIMSG_APPID(skb->data));
-		if (!ap) {
-			printk(KERN_ERR "kcapi: recv_handler: applid %d ? (%s)\n",
-				CAPIMSG_APPID(skb->data), capi_message2str(skb->data));
-			kfree_skb(skb);
-			continue;
-		}
+	if ((!ap) || (ap->release_in_progress))
+		return;
 
+	down(&ap->recv_sem);
+	while ((skb = skb_dequeue(&ap->recv_queue))) {
 		if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_IND)
 			ap->nrecvdatapkt++;
 		else
 			ap->nrecvctlpkt++;
+
 		ap->recv_message(ap, skb);
 	}
+	up(&ap->recv_sem);
 }
 
 void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb)
 {
+	struct capi20_appl *ap;
 	int showctl = 0;
 	u8 cmd, subcmd;
+	unsigned long flags;
 
 	if (card->cardstate != CARD_RUNNING) {
 		printk(KERN_INFO "kcapi: controller %d not active, got: %s",
 		       card->cnr, capi_message2str(skb->data));
 		goto error;
 	}
+
 	cmd = CAPIMSG_COMMAND(skb->data);
         subcmd = CAPIMSG_SUBCOMMAND(skb->data);
 	if (cmd == CAPI_DATA_B3 && subcmd == CAPI_IND) {
@@ -293,8 +294,19 @@ void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *s
 		}
 
 	}
-	skb_queue_tail(&recv_queue, skb);
-	schedule_work(&tq_recv_notify);
+
+	read_lock_irqsave(&application_lock, flags);
+	ap = get_capi_appl_by_nr(CAPIMSG_APPID(skb->data));
+	if ((!ap) || (ap->release_in_progress)) {
+		read_unlock_irqrestore(&application_lock, flags);
+		printk(KERN_ERR "kcapi: handle_message: applid %d state released (%s)\n",
+			CAPIMSG_APPID(skb->data), capi_message2str(skb->data));
+		goto error;
+	}
+	skb_queue_tail(&ap->recv_queue, skb);
+	schedule_work(&ap->recv_work);
+	read_unlock_irqrestore(&application_lock, flags);
+
 	return;
 
 error:
@@ -310,11 +322,13 @@ void capi_ctr_ready(struct capi_ctr * card)
 
 	card->cardstate = CARD_RUNNING;
 
+	down(&controller_sem);
 	for (appl = 1; appl <= CAPI_MAXAPPL; appl++) {
 		ap = get_capi_appl_by_nr(appl);
-		if (!ap) continue;
+		if (!ap || ap->release_in_progress) continue;
 		register_appl(card, appl, &ap->rparam);
 	}
+	up(&controller_sem);
 
         printk(KERN_NOTICE "kcapi: card %d \"%s\" ready.\n",
 	       card->cnr, card->name);
@@ -342,7 +356,7 @@ void capi_ctr_reseted(struct capi_ctr * card)
 
 	for (appl = 1; appl <= CAPI_MAXAPPL; appl++) {
 		struct capi20_appl *ap = get_capi_appl_by_nr(appl);
-		if (!ap)
+		if (!ap || ap->release_in_progress)
 			continue;
 
 		capi_ctr_put(card);
@@ -382,16 +396,21 @@ attach_capi_ctr(struct capi_ctr *card)
 {
 	int i;
 
+	down(&controller_sem);
+
 	for (i = 0; i < CAPI_MAXCONTR; i++) {
 		if (capi_cards[i] == NULL)
 			break;
 	}
 	if (i == CAPI_MAXCONTR) {
+		up(&controller_sem);
 		printk(KERN_ERR "kcapi: out of controller slots\n");
 	   	return -EBUSY;
 	}
 	capi_cards[i] = card;
 
+	up(&controller_sem);
+
 	card->nrecvctlpkt = 0;
 	card->nrecvdatapkt = 0;
 	card->nsentctlpkt = 0;
@@ -480,18 +499,23 @@ u16 capi20_register(struct capi20_appl *ap)
 {
 	int i;
 	u16 applid;
+	unsigned long flags;
 
 	DBG("");
 
 	if (ap->rparam.datablklen < 128)
 		return CAPI_LOGBLKSIZETOSMALL;
 
+	write_lock_irqsave(&application_lock, flags);
+
 	for (applid = 1; applid <= CAPI_MAXAPPL; applid++) {
 		if (capi_applications[applid - 1] == NULL)
 			break;
 	}
-	if (applid > CAPI_MAXAPPL)
+	if (applid > CAPI_MAXAPPL) {
+		write_unlock_irqrestore(&application_lock, flags);
 		return CAPI_TOOMANYAPPLS;
+	}
 
 	ap->applid = applid;
 	capi_applications[applid - 1] = ap;
@@ -501,12 +525,21 @@ u16 capi20_register(struct capi20_appl *ap)
 	ap->nsentctlpkt = 0;
 	ap->nsentdatapkt = 0;
 	ap->callback = 0;
+	init_MUTEX(&ap->recv_sem);
+	skb_queue_head_init(&ap->recv_queue);
+	INIT_WORK(&ap->recv_work, recv_handler, (void *)ap);
+	ap->release_in_progress = 0;
+
+	write_unlock_irqrestore(&application_lock, flags);
 	
+	down(&controller_sem);
 	for (i = 0; i < CAPI_MAXCONTR; i++) {
 		if (!capi_cards[i] || capi_cards[i]->cardstate != CARD_RUNNING)
 			continue;
 		register_appl(capi_cards[i], applid, &ap->rparam);
 	}
+	up(&controller_sem);
+
 	if (showcapimsgs & 1) {
 		printk(KERN_DEBUG "kcapi: appl %d up\n", applid);
 	}
@@ -519,15 +552,26 @@ EXPORT_SYMBOL(capi20_register);
 u16 capi20_release(struct capi20_appl *ap)
 {
 	int i;
+	unsigned long flags;
 
 	DBG("applid %#x", ap->applid);
 
+	write_lock_irqsave(&application_lock, flags);
+	ap->release_in_progress = 1;
+	capi_applications[ap->applid - 1] = NULL;
+	write_unlock_irqrestore(&application_lock, flags);
+
+	down(&controller_sem);
 	for (i = 0; i < CAPI_MAXCONTR; i++) {
 		if (!capi_cards[i] || capi_cards[i]->cardstate != CARD_RUNNING)
 			continue;
 		release_appl(capi_cards[i], ap->applid);
 	}
-	capi_applications[ap->applid - 1] = NULL;
+	up(&controller_sem);
+
+	flush_scheduled_work();
+	skb_queue_purge(&ap->recv_queue);
+
 	if (showcapimsgs & 1) {
 		printk(KERN_DEBUG "kcapi: appl %d down\n", ap->applid);
 	}
@@ -547,7 +591,7 @@ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb)
  
 	if (ncards == 0)
 		return CAPI_REGNOTINSTALLED;
-	if (ap->applid == 0)
+	if ((ap->applid == 0) || ap->release_in_progress)
 		return CAPI_ILLAPPNR;
 	if (skb->len < 12
 	    || !capi_cmd_valid(CAPIMSG_COMMAND(skb->data))
@@ -925,10 +969,6 @@ static int __init kcapi_init(void)
 	char *p;
 	char rev[32];
 
-	skb_queue_head_init(&recv_queue);
-
-	INIT_WORK(&tq_recv_notify, recv_handler, NULL);
-
         kcapi_proc_init();
 
 	if ((p = strchr(revision, ':')) != 0 && p[1]) {
diff --git a/include/linux/kernelcapi.h b/include/linux/kernelcapi.h
index b982d5b77ae9..1d4b1b15d0b8 100644
--- a/include/linux/kernelcapi.h
+++ b/include/linux/kernelcapi.h
@@ -10,10 +10,8 @@
 #ifndef __KERNELCAPI_H__
 #define __KERNELCAPI_H__
 
-#include <linux/list.h>
-
-#define CAPI_MAXAPPL	128	/* maximum number of applications  */
-#define CAPI_MAXCONTR	16	/* maximum number of controller    */
+#define CAPI_MAXAPPL	240	/* maximum number of applications  */
+#define CAPI_MAXCONTR	32	/* maximum number of controller    */
 #define CAPI_MAXDATAWINDOW	8
 
 
@@ -47,6 +45,7 @@ typedef struct kcapi_carddef {
 
 #ifdef __KERNEL__
 
+#include <linux/list.h>
 #include <linux/skbuff.h>
 
 #define	KCI_CONTRUP	0	/* arg: struct capi_profile */
@@ -63,6 +62,10 @@ struct capi20_appl {
 	unsigned long nrecvdatapkt;
 	unsigned long nsentctlpkt;
 	unsigned long nsentdatapkt;
+	struct semaphore recv_sem;
+	struct sk_buff_head recv_queue;
+	struct work_struct recv_work;
+	int release_in_progress;
 
 	/* ugly hack to allow for notification of added/removed
 	 * controllers. The Right Way (tm) is known. XXX
-- 
cgit v1.2.3


From b283f09cf8f51c29bf90e42e22099f76d0f33378 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:41:20 -0700
Subject: [PATCH] Fix get_wchan() FIXME wrt. order of functions

From: William Lee Irwin III <wli@holomorphy.com>

This addresses the issue with get_wchan() that the various functions acting
as scheduling-related primitives are not, in fact, contiguous in the text
segment.  It creates an ELF section for scheduling primitives to be placed
in, and places currently-detected (i.e.  skipped during stack decoding)
scheduling primitives and others like io_schedule() and down(), which are
currently missed by get_wchan() code, into this section also.

The net effects are more reliability of get_wchan()'s results and the new
ability, made use of by this code, to arbitrarily place scheduling
primitives in the source code without disturbing get_wchan()'s accuracy.

Suggestions by Arnd Bergmann and Matthew Wilcox regarding reducing the
invasiveness of the patch were incorporated during prior rounds of review.
I've at least tried to sweep all arches in this patch.
---
 arch/alpha/kernel/process.c                |  2 --
 arch/alpha/kernel/semaphore.c              |  9 ++++----
 arch/alpha/kernel/vmlinux.lds.S            |  1 +
 arch/arm/kernel/process.c                  |  2 --
 arch/arm/kernel/semaphore.c                |  8 ++++---
 arch/arm/kernel/vmlinux.lds.S              |  1 +
 arch/arm26/kernel/process.c                |  2 --
 arch/arm26/kernel/semaphore.c              |  8 ++++---
 arch/arm26/kernel/vmlinux-arm26-xip.lds.in |  1 +
 arch/arm26/kernel/vmlinux-arm26.lds.in     |  1 +
 arch/cris/arch-v10/kernel/process.c        |  3 +--
 arch/cris/arch-v10/vmlinux.lds.S           |  1 +
 arch/cris/kernel/semaphore.c               |  5 ++--
 arch/h8300/kernel/process.c                |  3 ---
 arch/h8300/kernel/semaphore.c              |  5 ++--
 arch/h8300/kernel/vmlinux.lds.S            |  1 +
 arch/i386/kernel/process.c                 |  2 --
 arch/i386/kernel/semaphore.c               | 17 +++++++-------
 arch/i386/kernel/vmlinux.lds.S             |  1 +
 arch/ia64/kernel/process.c                 |  2 --
 arch/ia64/kernel/semaphore.c               |  7 +++---
 arch/ia64/kernel/vmlinux.lds.S             |  1 +
 arch/m68k/kernel/process.c                 |  5 ----
 arch/m68k/kernel/semaphore.c               |  5 ++--
 arch/m68k/kernel/vmlinux-std.lds           |  1 +
 arch/m68k/kernel/vmlinux-sun3.lds          |  1 +
 arch/m68knommu/kernel/process.c            |  5 ----
 arch/m68knommu/kernel/semaphore.c          |  5 ++--
 arch/m68knommu/kernel/vmlinux.lds.S        |  1 +
 arch/mips/kernel/process.c                 |  2 --
 arch/mips/kernel/semaphore.c               |  5 ++--
 arch/mips/kernel/vmlinux.lds.S             |  1 +
 arch/parisc/kernel/semaphore.c             |  5 ++--
 arch/parisc/kernel/vmlinux.lds.S           |  1 +
 arch/ppc/kernel/process.c                  |  2 --
 arch/ppc/kernel/semaphore.c                |  5 ++--
 arch/ppc/kernel/vmlinux.lds.S              |  1 +
 arch/ppc64/kernel/process.c                |  2 --
 arch/ppc64/kernel/semaphore.c              |  5 ++--
 arch/ppc64/kernel/vmlinux.lds.S            |  1 +
 arch/s390/kernel/process.c                 |  2 --
 arch/s390/kernel/semaphore.c               |  5 ++--
 arch/s390/kernel/vmlinux.lds.S             |  1 +
 arch/sh/kernel/process.c                   |  4 +---
 arch/sh/kernel/semaphore.c                 |  5 ++--
 arch/sh/kernel/vmlinux.lds.S               |  1 +
 arch/sparc/kernel/process.c                |  4 +---
 arch/sparc/kernel/semaphore.c              |  5 ++--
 arch/sparc/kernel/vmlinux.lds.S            |  1 +
 arch/sparc/lib/rwsem.S                     |  3 ++-
 arch/sparc64/kernel/process.c              |  4 +---
 arch/sparc64/kernel/semaphore.c            |  9 ++++----
 arch/sparc64/kernel/vmlinux.lds.S          |  1 +
 arch/sparc64/lib/rwsem.c                   |  5 ++--
 arch/v850/kernel/process.c                 |  3 ---
 arch/v850/kernel/semaphore.c               |  5 ++--
 arch/v850/kernel/vmlinux.lds.S             |  1 +
 arch/x86_64/kernel/process.c               |  2 --
 arch/x86_64/kernel/semaphore.c             |  5 ++--
 arch/x86_64/kernel/vmlinux.lds.S           |  1 +
 arch/x86_64/lib/thunk.S                    |  3 ++-
 include/asm-generic/vmlinux.lds.h          |  5 ++++
 include/linux/init.h                       |  2 ++
 include/linux/sched.h                      |  2 ++
 kernel/sched.c                             | 37 ++++++++++++++++--------------
 kernel/timer.c                             |  4 ++--
 lib/rwsem.c                                |  5 ++--
 67 files changed, 137 insertions(+), 124 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index e427bae12ffe..297e4b48bfe2 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -513,8 +513,6 @@ thread_saved_pc(task_t *t)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/alpha/kernel/semaphore.c b/arch/alpha/kernel/semaphore.c
index b52a0df303fe..4d60a0ccd6f7 100644
--- a/arch/alpha/kernel/semaphore.c
+++ b/arch/alpha/kernel/semaphore.c
@@ -7,6 +7,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 
 /*
  * This is basically the PPC semaphore scheme ported to use
@@ -60,7 +61,7 @@ static inline int __sem_update_count(struct semaphore *sem, int incr)
  * Either form may be used in conjunction with "up()".
  */
 
-void
+void __sched
 __down_failed(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
@@ -101,7 +102,7 @@ __down_failed(struct semaphore *sem)
 #endif
 }
 
-int
+int __sched
 __down_failed_interruptible(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
@@ -159,7 +160,7 @@ __up_wakeup(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-void
+void __sched
 down(struct semaphore *sem)
 {
 #if WAITQUEUE_DEBUG
@@ -173,7 +174,7 @@ down(struct semaphore *sem)
 	__down(sem);
 }
 
-int
+int __sched
 down_interruptible(struct semaphore *sem)
 {
 #if WAITQUEUE_DEBUG
diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S
index 7afd00d5d46b..d159b8f0d022 100644
--- a/arch/alpha/kernel/vmlinux.lds.S
+++ b/arch/alpha/kernel/vmlinux.lds.S
@@ -17,6 +17,7 @@ SECTIONS
   _text = .;					/* Text and read-only data */
   .text : { 
 	*(.text) 
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
   } :kernel
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 863c4076daad..8423921e821a 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -414,8 +414,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/arm/kernel/semaphore.c b/arch/arm/kernel/semaphore.c
index a50902e8bec7..da39eb3dca31 100644
--- a/arch/arm/kernel/semaphore.c
+++ b/arch/arm/kernel/semaphore.c
@@ -13,6 +13,7 @@
  */
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -54,7 +55,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -87,7 +88,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -176,7 +177,8 @@ int __down_trylock(struct semaphore * sem)
  * registers (r0 to r3 and lr), but not ip, as we use it as a return
  * value in some cases..
  */
-asm("	.align	5				\n\
+asm("	.section .sched.text			\n\
+	.align	5				\n\
 	.globl	__down_failed			\n\
 __down_failed:					\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 56af3401b34d..a5db0ddca6a4 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -73,6 +73,7 @@ SECTIONS
 	.text : {			/* Real text segment		*/
 		_text = .;		/* Text and read-only data	*/
 			*(.text)
+			SCHED_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 			*(.rodata)
diff --git a/arch/arm26/kernel/process.c b/arch/arm26/kernel/process.c
index 09a2f52ad8a8..ce23571617a1 100644
--- a/arch/arm26/kernel/process.c
+++ b/arch/arm26/kernel/process.c
@@ -400,8 +400,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/arm26/kernel/semaphore.c b/arch/arm26/kernel/semaphore.c
index e7964ce1d0d9..60591a738592 100644
--- a/arch/arm26/kernel/semaphore.c
+++ b/arch/arm26/kernel/semaphore.c
@@ -15,6 +15,7 @@
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -56,7 +57,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -89,7 +90,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -178,7 +179,8 @@ int __down_trylock(struct semaphore * sem)
  * registers (r0 to r3 and lr), but not ip, as we use it as a return
  * value in some cases..
  */
-asm("	.align	5				\n\
+asm("	.section .sched.text			\n\
+	.align	5				\n\
 	.globl	__down_failed			\n\
 __down_failed:					\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
diff --git a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in
index 602a77c022d7..61eedf0bc42f 100644
--- a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in
+++ b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in
@@ -66,6 +66,7 @@ SECTIONS
 	.text : {			/* Real text segment		*/
 		_text = .;		/* Text and read-only data	*/
 			*(.text)
+			SCHED_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 			*(.rodata)
diff --git a/arch/arm26/kernel/vmlinux-arm26.lds.in b/arch/arm26/kernel/vmlinux-arm26.lds.in
index 8782fe36f0a8..2393f3805a49 100644
--- a/arch/arm26/kernel/vmlinux-arm26.lds.in
+++ b/arch/arm26/kernel/vmlinux-arm26.lds.in
@@ -67,6 +67,7 @@ SECTIONS
 	.text : {			/* Real text segment		*/
 		_text = .;		/* Text and read-only data	*/
 			*(.text)
+			SCHED_TEXT
 			*(.fixup)
 			*(.gnu.warning)
 			*(.rodata)
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index 62e3a4fbf33a..c785b54e6cbd 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -16,6 +16,7 @@
 #include <linux/err.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
+#include <linux/init.h>
 
 #ifdef CONFIG_ETRAX_GPIO
 void etrax_gpio_wake_up_check(void); /* drivers/gpio.c */
@@ -216,8 +217,6 @@ asmlinkage int sys_execve(const char *fname, char **argv, char **envp,
  * These bracket the sleeping functions..
  */
 
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched     ((unsigned long) scheduling_functions_start_here)
 #define last_sched      ((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/cris/arch-v10/vmlinux.lds.S b/arch/cris/arch-v10/vmlinux.lds.S
index b2c27e147f29..6b73a2c0dad8 100644
--- a/arch/cris/arch-v10/vmlinux.lds.S
+++ b/arch/cris/arch-v10/vmlinux.lds.S
@@ -25,6 +25,7 @@ SECTIONS
 	__stext = .;
 	.text : {
 		*(.text)
+		SCHED_TEXT
 		*(.fixup)
 		*(.text.__*)
 	}
diff --git a/arch/cris/kernel/semaphore.c b/arch/cris/kernel/semaphore.c
index d62b355e1706..b884263d3cd4 100644
--- a/arch/cris/kernel/semaphore.c
+++ b/arch/cris/kernel/semaphore.c
@@ -4,6 +4,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 /*
@@ -94,7 +95,7 @@ void __up(struct semaphore *sem)
 	tsk->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DOWN_VAR
 	DOWN_HEAD(TASK_UNINTERRUPTIBLE)
@@ -104,7 +105,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int ret = 0;
 	DOWN_VAR
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index bd6ccd542399..8640ea20dba0 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -264,8 +264,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -289,7 +287,6 @@ unsigned long get_wchan(struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
diff --git a/arch/h8300/kernel/semaphore.c b/arch/h8300/kernel/semaphore.c
index 690efce1e437..1ebb79baaa8c 100644
--- a/arch/h8300/kernel/semaphore.c
+++ b/arch/h8300/kernel/semaphore.c
@@ -5,6 +5,7 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 #ifndef CONFIG_RMW_INSNS
@@ -95,7 +96,7 @@ void __up(struct semaphore *sem)
 	current->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -106,7 +107,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S
index 60787f07eb2b..3a643954a8fe 100644
--- a/arch/h8300/kernel/vmlinux.lds.S
+++ b/arch/h8300/kernel/vmlinux.lds.S
@@ -82,6 +82,7 @@ SECTIONS
 #endif
 	__stext = . ;
         	*(.text)
+	SCHED_TEXT
 	. = ALIGN(0x4) ;
 		*(.exit.text)
 		*(.text.*)
diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c
index 3495f1aedf67..7fed9d3823ed 100644
--- a/arch/i386/kernel/process.c
+++ b/arch/i386/kernel/process.c
@@ -632,8 +632,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 #define top_esp                (THREAD_SIZE - sizeof(unsigned long))
diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c
index 5acd544f0cbd..073912cfcf44 100644
--- a/arch/i386/kernel/semaphore.c
+++ b/arch/i386/kernel/semaphore.c
@@ -15,6 +15,7 @@
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <linux/init.h>
 #include <asm/semaphore.h>
 
 /*
@@ -53,7 +54,7 @@ asmlinkage void __up(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-asmlinkage void __down(struct semaphore * sem)
+asmlinkage void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -90,7 +91,7 @@ asmlinkage void __down(struct semaphore * sem)
 	tsk->state = TASK_RUNNING;
 }
 
-asmlinkage int __down_interruptible(struct semaphore * sem)
+asmlinkage int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -187,7 +188,7 @@ asmlinkage int __down_trylock(struct semaphore * sem)
  * value..
  */
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __down_failed\n"
 "__down_failed:\n\t"
@@ -210,7 +211,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __down_failed_interruptible\n"
 "__down_failed_interruptible:\n\t"
@@ -231,7 +232,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __down_failed_trylock\n"
 "__down_failed_trylock:\n\t"
@@ -252,7 +253,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align 4\n"
 ".globl __up_wakeup\n"
 "__up_wakeup:\n\t"
@@ -271,7 +272,7 @@ asm(
  */
 #if defined(CONFIG_SMP)
 asm(
-".text\n"
+".section .sched.text\n"
 ".align	4\n"
 ".globl	__write_lock_failed\n"
 "__write_lock_failed:\n\t"
@@ -285,7 +286,7 @@ asm(
 );
 
 asm(
-".text\n"
+".section .sched.text\n"
 ".align	4\n"
 ".globl	__read_lock_failed\n"
 "__read_lock_failed:\n\t"
diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S
index 3623d7e2934a..0253c586547b 100644
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -16,6 +16,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x9090
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index a1d09d5c91c4..0d245cbcd1f6 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -660,8 +660,6 @@ get_wchan (struct task_struct *p)
 	/*
 	 * These bracket the sleeping functions..
 	 */
-	extern void scheduling_functions_start_here(void);
-	extern void scheduling_functions_end_here(void);
 #	define first_sched	((unsigned long) scheduling_functions_start_here)
 #	define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c
index f3926a3c4d73..2724ef3fbae2 100644
--- a/arch/ia64/kernel/semaphore.c
+++ b/arch/ia64/kernel/semaphore.c
@@ -24,6 +24,7 @@
  * <asm/semaphore.h> where we want to avoid any extra jumps and calls.
  */
 #include <linux/sched.h>
+#include <linux/init.h>
 
 #include <asm/errno.h>
 #include <asm/semaphore.h>
@@ -44,8 +45,7 @@ __up (struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-void
-__down (struct semaphore *sem)
+void __sched __down (struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -82,8 +82,7 @@ __down (struct semaphore *sem)
 	tsk->state = TASK_RUNNING;
 }
 
-int
-__down_interruptible (struct semaphore * sem)
+int __sched __down_interruptible (struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index e5589e49d9da..5c45718a9c82 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -41,6 +41,7 @@ SECTIONS
     {
 	*(.text.ivt)
 	*(.text)
+	SCHED_TEXT
 	*(.gnu.linkonce.t*)
     }
   .text2 : AT(ADDR(.text2) - LOAD_OFFSET)
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 8d72a5c5b0c7..fc2c753c332b 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -65,8 +65,6 @@ asmlinkage void ret_from_fork(void);
  */
 unsigned long thread_saved_pc(struct task_struct *tsk)
 {
-	extern void scheduling_functions_start_here(void);
-	extern void scheduling_functions_end_here(void);
 	struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp;
 	/* Check whether the thread is blocked in resume() */
 	if (sw->retpc > (unsigned long)scheduling_functions_start_here &&
@@ -387,8 +385,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -407,7 +403,6 @@ unsigned long get_wchan(struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
diff --git a/arch/m68k/kernel/semaphore.c b/arch/m68k/kernel/semaphore.c
index 690efce1e437..1ebb79baaa8c 100644
--- a/arch/m68k/kernel/semaphore.c
+++ b/arch/m68k/kernel/semaphore.c
@@ -5,6 +5,7 @@
 
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 #ifndef CONFIG_RMW_INSNS
@@ -95,7 +96,7 @@ void __up(struct semaphore *sem)
 	current->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -106,7 +107,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds
index bd41fc992169..6dc62684c7b9 100644
--- a/arch/m68k/kernel/vmlinux-std.lds
+++ b/arch/m68k/kernel/vmlinux-std.lds
@@ -12,6 +12,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x4e75
diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds
index 2e81cde14987..f293e567192c 100644
--- a/arch/m68k/kernel/vmlinux-sun3.lds
+++ b/arch/m68k/kernel/vmlinux-sun3.lds
@@ -13,6 +13,7 @@ SECTIONS
   .text : {
 	*(.head)
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x4e75
diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c
index c8b87371641a..896d596a1bd8 100644
--- a/arch/m68knommu/kernel/process.c
+++ b/arch/m68knommu/kernel/process.c
@@ -406,8 +406,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -426,7 +424,6 @@ unsigned long get_wchan(struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
@@ -439,8 +436,6 @@ unsigned long get_wchan(struct task_struct *p)
  */
 unsigned long thread_saved_pc(struct task_struct *tsk)
 {
-	extern void scheduling_functions_start_here(void);
-	extern void scheduling_functions_end_here(void);
 	struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp;
 
 	/* Check whether the thread is blocked in resume() */
diff --git a/arch/m68knommu/kernel/semaphore.c b/arch/m68knommu/kernel/semaphore.c
index 33d704fcf883..c083f4772add 100644
--- a/arch/m68knommu/kernel/semaphore.c
+++ b/arch/m68knommu/kernel/semaphore.c
@@ -6,6 +6,7 @@
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/err.h>
+#include <linux/init.h>
 #include <asm/semaphore-helper.h>
 
 #ifndef CONFIG_RMW_INSNS
@@ -96,7 +97,7 @@ void __up(struct semaphore *sem)
 	current->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 
@@ -107,7 +108,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	int ret = 0;
diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S
index 1ab8a31ef964..a362870b6e4e 100644
--- a/arch/m68knommu/kernel/vmlinux.lds.S
+++ b/arch/m68knommu/kernel/vmlinux.lds.S
@@ -191,6 +191,7 @@ SECTIONS {
 	.text : {
 		_stext = . ;
         	*(.text)
+		SCHED_TEXT
         	*(.text.lock)
 
 		. = ALIGN(16);          /* Exception table              */
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index f8ba26770bf4..f4ab9c66b27f 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -283,8 +283,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/mips/kernel/semaphore.c b/arch/mips/kernel/semaphore.c
index 11b937f20604..51c3e772c029 100644
--- a/arch/mips/kernel/semaphore.c
+++ b/arch/mips/kernel/semaphore.c
@@ -6,6 +6,7 @@
 #include <linux/config.h>
 #include <linux/errno.h>
 #include <linux/module.h>
+#include <linux/init.h>
 #include <linux/sched.h>
 
 #ifdef CONFIG_CPU_HAS_LLDSCD
@@ -104,7 +105,7 @@ static inline int waking_non_zero(struct semaphore *sem)
  * Either form may be used in conjunction with "up()".
  */
 
-void __down_failed(struct semaphore * sem)
+void __sched __down_failed(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	wait_queue_t wait;
@@ -227,7 +228,7 @@ static inline int waking_non_zero_interruptible(struct semaphore *sem,
 
 #endif /* !CONFIG_CPU_HAS_LLDSCD */
 
-int __down_failed_interruptible(struct semaphore * sem)
+int __sched __down_failed_interruptible(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	wait_queue_t wait;
diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S
index b72639f8db65..098cfaa23c0e 100644
--- a/arch/mips/kernel/vmlinux.lds.S
+++ b/arch/mips/kernel/vmlinux.lds.S
@@ -28,6 +28,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
     *(.text)
+    SCHED_TEXT
     *(.fixup)
     *(.gnu.warning)
   } =0
diff --git a/arch/parisc/kernel/semaphore.c b/arch/parisc/kernel/semaphore.c
index ffb4851451fc..ee806bcc3726 100644
--- a/arch/parisc/kernel/semaphore.c
+++ b/arch/parisc/kernel/semaphore.c
@@ -5,6 +5,7 @@
 #include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 /*
  * Semaphores are complex as we wish to avoid using two variables.
@@ -58,7 +59,7 @@ void __up(struct semaphore *sem)
 	sem->count += (sem->count < 0) ? 1 : - 1;
 	
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DOWN_HEAD
 
@@ -74,7 +75,7 @@ void __down(struct semaphore * sem)
 	UPDATE_COUNT
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	DOWN_HEAD
 
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index 14d0882a19d2..e5d5aeef96e5 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -50,6 +50,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text ALIGN(16) : {
 	*(.text*)
+	SCHED_TEXT
 	*(.PARISC.unwind)
 	*(.fixup)
 	*(.lock.text)		/* out-of-line lock text */
diff --git a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c
index ada32baeda19..3363a030e00f 100644
--- a/arch/ppc/kernel/process.c
+++ b/arch/ppc/kernel/process.c
@@ -661,8 +661,6 @@ void __init ll_puts(const char *s)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched    ((unsigned long) scheduling_functions_start_here)
 #define last_sched     ((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/ppc/kernel/semaphore.c b/arch/ppc/kernel/semaphore.c
index 7bf51fba5c14..2fe429b27c14 100644
--- a/arch/ppc/kernel/semaphore.c
+++ b/arch/ppc/kernel/semaphore.c
@@ -15,6 +15,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/errno.h>
@@ -69,7 +70,7 @@ void __up(struct semaphore *sem)
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __down(struct semaphore *sem)
+void __sched __down(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -99,7 +100,7 @@ void __down(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S
index 81b95d449a22..b710d55c5b08 100644
--- a/arch/ppc/kernel/vmlinux.lds.S
+++ b/arch/ppc/kernel/vmlinux.lds.S
@@ -31,6 +31,7 @@ SECTIONS
   .text      :
   {
     *(.text)
+    SCHED_TEXT
     *(.fixup)
     *(.got1)
     __got2_start = .;
diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c
index cec7225a6ac1..f74b14d7e58e 100644
--- a/arch/ppc64/kernel/process.c
+++ b/arch/ppc64/kernel/process.c
@@ -475,8 +475,6 @@ static inline int validate_sp(unsigned long sp, struct task_struct *p)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-#define first_sched    (*(unsigned long *)scheduling_functions_start_here)
-#define last_sched     (*(unsigned long *)scheduling_functions_end_here)
+#define first_sched    ((unsigned long) scheduling_functions_start_here)
+#define last_sched     ((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/ppc64/kernel/semaphore.c b/arch/ppc64/kernel/semaphore.c
index c977029e2465..d723632d59f3 100644
--- a/arch/ppc64/kernel/semaphore.c
+++ b/arch/ppc64/kernel/semaphore.c
@@ -17,6 +17,7 @@
  */
 
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 #include <asm/errno.h>
@@ -70,7 +71,7 @@ void __up(struct semaphore *sem)
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __down(struct semaphore *sem)
+void __sched __down(struct semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -99,7 +100,7 @@ void __down(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/ppc64/kernel/vmlinux.lds.S b/arch/ppc64/kernel/vmlinux.lds.S
index a8531b1f9ef2..1d9b61143aaa 100644
--- a/arch/ppc64/kernel/vmlinux.lds.S
+++ b/arch/ppc64/kernel/vmlinux.lds.S
@@ -13,6 +13,7 @@ SECTIONS
   /* Read-only sections, merged into text segment: */
   .text : {
 	*(.text .text.*)
+	SCHED_TEXT
 	*(.fixup)
 	. = ALIGN(4096);
 	_etext = .;
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 3676307d1d8a..050585ab5d2a 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -384,8 +384,6 @@ void dump_thread(struct pt_regs * regs, struct user * dump)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/s390/kernel/semaphore.c b/arch/s390/kernel/semaphore.c
index 8203f5e0228d..8dfb690c159f 100644
--- a/arch/s390/kernel/semaphore.c
+++ b/arch/s390/kernel/semaphore.c
@@ -11,6 +11,7 @@
  */
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -60,7 +61,7 @@ void __up(struct semaphore *sem)
  *   count > 0: decrement count, wake up queue and exit.
  *   count <= 0: set count to -1, go to sleep.
  */
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -82,7 +83,7 @@ void __down(struct semaphore * sem)
  *   count > 0: wake up queue and exit.
  *   count <= 0: set count to 0, wake up queue and exit.
  */
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S
index c9ca7a8e93b3..b4534b2867c3 100644
--- a/arch/s390/kernel/vmlinux.lds.S
+++ b/arch/s390/kernel/vmlinux.lds.S
@@ -23,6 +23,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x0700
diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c
index 773006661b50..7d45ea0acd09 100644
--- a/arch/sh/kernel/process.c
+++ b/arch/sh/kernel/process.c
@@ -464,8 +464,6 @@ out:
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -481,7 +479,7 @@ unsigned long get_wchan(struct task_struct *p)
 	 * The same comment as on the Alpha applies here, too ...
 	 */
 	pc = thread_saved_pc(p);
-	if (pc >= (unsigned long) interruptible_sleep_on && pc < (unsigned long) add_timer) {
+	if (pc >= first_sched && pc < last_sched) {
 		schedule_frame = ((unsigned long *)(long)p->thread.sp)[1];
 		return (unsigned long)((unsigned long *)schedule_frame)[1];
 	}
diff --git a/arch/sh/kernel/semaphore.c b/arch/sh/kernel/semaphore.c
index 0943ad666a67..a3c24dcbf01d 100644
--- a/arch/sh/kernel/semaphore.c
+++ b/arch/sh/kernel/semaphore.c
@@ -10,6 +10,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/init.h>
 #include <asm/semaphore.h>
 #include <asm/semaphore-helper.h>
 
@@ -103,7 +104,7 @@ void __up(struct semaphore *sem)
 	tsk->state = TASK_RUNNING;		\
 	remove_wait_queue(&sem->wait, &wait);
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	DOWN_VAR
 	DOWN_HEAD(TASK_UNINTERRUPTIBLE)
@@ -113,7 +114,7 @@ void __down(struct semaphore * sem)
 	DOWN_TAIL(TASK_UNINTERRUPTIBLE)
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int ret = 0;
 	DOWN_VAR
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 2cc86534c130..da0f5d728b3e 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -22,6 +22,7 @@ SECTIONS
 	} = 0
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x0009
diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c
index beae70a970e4..70261b211997 100644
--- a/arch/sparc/kernel/process.c
+++ b/arch/sparc/kernel/process.c
@@ -28,6 +28,7 @@
 #include <linux/reboot.h>
 #include <linux/delay.h>
 #include <linux/pm.h>
+#include <linux/init.h>
 
 #include <asm/auxio.h>
 #include <asm/oplib.h>
@@ -694,9 +695,6 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 	return retval;
 }
 
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-
 unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc, fp, bias = 0;
diff --git a/arch/sparc/kernel/semaphore.c b/arch/sparc/kernel/semaphore.c
index 5a8f3d176a8f..77e63b92ca30 100644
--- a/arch/sparc/kernel/semaphore.c
+++ b/arch/sparc/kernel/semaphore.c
@@ -4,6 +4,7 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -45,7 +46,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -78,7 +79,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S
index 0862360d865d..8d4bbfaf304c 100644
--- a/arch/sparc/kernel/vmlinux.lds.S
+++ b/arch/sparc/kernel/vmlinux.lds.S
@@ -12,6 +12,7 @@ SECTIONS
   .text 0xf0004000 :
   {
     *(.text)
+    SCHED_TEXT
     *(.gnu.warning)
   } =0
   _etext = .;
diff --git a/arch/sparc/lib/rwsem.S b/arch/sparc/lib/rwsem.S
index 98b757cb67c6..e7578dc600b8 100644
--- a/arch/sparc/lib/rwsem.S
+++ b/arch/sparc/lib/rwsem.S
@@ -8,7 +8,7 @@
 #include <asm/ptrace.h>
 #include <asm/psr.h>
 
-	.text
+	.section .sched.text
 	.align	4
 
 	.globl		___down_read
@@ -113,6 +113,7 @@ ___down_write:
 	ba		2b
 	 restore	%l5, %g0, %g5
 
+	.text
 	.globl		___up_read
 ___up_read:
 	rd		%psr, %g3
diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c
index 1be2b97e4672..0caf962e8155 100644
--- a/arch/sparc64/kernel/process.c
+++ b/arch/sparc64/kernel/process.c
@@ -28,6 +28,7 @@
 #include <linux/config.h>
 #include <linux/reboot.h>
 #include <linux/delay.h>
+#include <linux/init.h>
 
 #include <asm/oplib.h>
 #include <asm/uaccess.h>
@@ -823,9 +824,6 @@ out:
 	return error;
 }
 
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
-
 unsigned long get_wchan(struct task_struct *task)
 {
 	unsigned long pc, fp, bias = 0;
diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c
index a9e66d666ceb..9ddfcb9a1900 100644
--- a/arch/sparc64/kernel/semaphore.c
+++ b/arch/sparc64/kernel/semaphore.c
@@ -8,6 +8,7 @@
 
 #include <linux/sched.h>
 #include <linux/errno.h>
+#include <linux/init.h>
 
 /*
  * Atomically update sem->count.
@@ -90,7 +91,7 @@ void up(struct semaphore *sem)
 	: "g5", "g7", "memory", "cc");
 }
 
-static void __down(struct semaphore * sem)
+static void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -108,7 +109,7 @@ static void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-void down(struct semaphore *sem)
+void __sched down(struct semaphore *sem)
 {
 	might_sleep();
 	/* This atomically does:
@@ -192,7 +193,7 @@ int down_trylock(struct semaphore *sem)
 	return ret;
 }
 
-static int __down_interruptible(struct semaphore * sem)
+static int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -216,7 +217,7 @@ static int __down_interruptible(struct semaphore * sem)
 	return retval;
 }
 
-int down_interruptible(struct semaphore *sem)
+int __sched down_interruptible(struct semaphore *sem)
 {
 	int ret = 0;
 	
diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S
index ad95e88a3cbc..8faeee09fab2 100644
--- a/arch/sparc64/kernel/vmlinux.lds.S
+++ b/arch/sparc64/kernel/vmlinux.lds.S
@@ -15,6 +15,7 @@ SECTIONS
   .text 0x0000000000404000 :
   {
     *(.text)
+    SCHED_TEXT
     *(.gnu.warning)
   } =0
   _etext = .;
diff --git a/arch/sparc64/lib/rwsem.c b/arch/sparc64/lib/rwsem.c
index 8e1dfdda91fa..e19968dbc2d1 100644
--- a/arch/sparc64/lib/rwsem.c
+++ b/arch/sparc64/lib/rwsem.c
@@ -6,6 +6,7 @@
 
 #include <linux/kernel.h>
 #include <linux/rwsem.h>
+#include <linux/init.h>
 #include <linux/module.h>
 
 extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem));
@@ -13,7 +14,7 @@ extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore
 extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *));
 extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *));
 
-void __down_read(struct rw_semaphore *sem)
+void __sched __down_read(struct rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"! beginning __down_read\n"
@@ -72,7 +73,7 @@ int __down_read_trylock(struct rw_semaphore *sem)
 }
 EXPORT_SYMBOL(__down_read_trylock);
 
-void __down_write(struct rw_semaphore *sem)
+void __sched __down_write(struct rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"! beginning __down_write\n\t"
diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c
index 5c29ae51a303..977d75772d81 100644
--- a/arch/v850/kernel/process.c
+++ b/arch/v850/kernel/process.c
@@ -203,8 +203,6 @@ int sys_execve (char *name, char **argv, char **envp, struct pt_regs *regs)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here (void);
-extern void scheduling_functions_end_here (void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
@@ -228,7 +226,6 @@ unsigned long get_wchan (struct task_struct *p)
 		    fp >= 8184+stack_page)
 			return 0;
 		pc = ((unsigned long *)fp)[1];
-		/* FIXME: This depends on the order of these functions. */
 		if (pc < first_sched || pc >= last_sched)
 			return pc;
 		fp = *(unsigned long *) fp;
diff --git a/arch/v850/kernel/semaphore.c b/arch/v850/kernel/semaphore.c
index b78d714384db..2d20886863d8 100644
--- a/arch/v850/kernel/semaphore.c
+++ b/arch/v850/kernel/semaphore.c
@@ -15,6 +15,7 @@
 
 #include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 
 #include <asm/semaphore.h>
 
@@ -56,7 +57,7 @@ void __up(struct semaphore *sem)
 
 static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED;
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -89,7 +90,7 @@ void __down(struct semaphore * sem)
 	wake_up(&sem->wait);
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S
index 028c224fa66a..07ab0f292d1c 100644
--- a/arch/v850/kernel/vmlinux.lds.S
+++ b/arch/v850/kernel/vmlinux.lds.S
@@ -64,6 +64,7 @@
 #define TEXT_CONTENTS							      \
 		__stext = . ;						      \
         	*(.text)						      \
+		SCHED_TEXT	/* .sched.text section */		      \
 			*(.exit.text)	/* 2.5 convention */		      \
 			*(.text.exit)	/* 2.4 convention */		      \
 			*(.text.lock)					      \
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 7b2414765ca3..d1d9471581a8 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -576,8 +576,6 @@ asmlinkage long sys_vfork(struct pt_regs regs)
 /*
  * These bracket the sleeping functions..
  */
-extern void scheduling_functions_start_here(void);
-extern void scheduling_functions_end_here(void);
 #define first_sched	((unsigned long) scheduling_functions_start_here)
 #define last_sched	((unsigned long) scheduling_functions_end_here)
 
diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c
index 5e517814dd07..2bcd4a7ec38d 100644
--- a/arch/x86_64/kernel/semaphore.c
+++ b/arch/x86_64/kernel/semaphore.c
@@ -14,6 +14,7 @@
  */
 #include <linux/config.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <asm/errno.h>
 
 #include <asm/semaphore.h>
@@ -54,7 +55,7 @@ void __up(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-void __down(struct semaphore * sem)
+void __sched __down(struct semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -91,7 +92,7 @@ void __down(struct semaphore * sem)
 	tsk->state = TASK_RUNNING;
 }
 
-int __down_interruptible(struct semaphore * sem)
+int __sched __down_interruptible(struct semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index 7b9e1beb360e..c612e4d213a1 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -15,6 +15,7 @@ SECTIONS
   _text = .;			/* Text and read-only data */
   .text : {
 	*(.text)
+	SCHED_TEXT
 	*(.fixup)
 	*(.gnu.warning)
 	} = 0x9090
diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S
index 876cb937f9f1..acc1e2ca7ed7 100644
--- a/arch/x86_64/lib/thunk.S
+++ b/arch/x86_64/lib/thunk.S
@@ -35,6 +35,7 @@
 	.endm
 	
 
+	.section .sched.text
 #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
 	thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
 	thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
@@ -65,7 +66,7 @@ restore_norax:
 
 #ifdef CONFIG_SMP
 /* Support for read/write spinlocks. */
-	
+	.text
 /* rax:	pointer to rwlock_t */	
 ENTRY(__write_lock_failed)
 	lock
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 59c2b950e8b8..a4b6c768cf49 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -51,3 +51,8 @@
 		*(.security_initcall.init) 				\
 		__security_initcall_end = .;				\
 	}
+
+#define SCHED_TEXT							\
+		__scheduling_functions_start_here = .;			\
+		*(.sched.text)						\
+		__scheduling_functions_end_here = .;
diff --git a/include/linux/init.h b/include/linux/init.h
index 45069e275b3d..c6842477243c 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -46,6 +46,8 @@
 #define __exitdata	__attribute__ ((__section__(".exit.data")))
 #define __exit_call	__attribute_used__ __attribute__ ((__section__ (".exitcall.exit")))
 
+#define __sched		__attribute__((__section__(".sched.text")))
+
 #ifdef MODULE
 #define __exit		__attribute__ ((__section__(".exit.text")))
 #else
diff --git a/include/linux/sched.h b/include/linux/sched.h
index f5fa0c07a7f8..054b3c0d5962 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -170,6 +170,8 @@ extern void update_one_process(struct task_struct *p, unsigned long user,
 			       unsigned long system, int cpu);
 extern void scheduler_tick(int user_tick, int system);
 extern unsigned long cache_decay_ticks;
+extern const unsigned long scheduling_functions_start_here;
+extern const unsigned long scheduling_functions_end_here;
 
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
diff --git a/kernel/sched.c b/kernel/sched.c
index 9e19d4c0d4a9..b42029abe679 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -225,6 +225,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+extern unsigned long __scheduling_functions_start_here;
+extern unsigned long __scheduling_functions_end_here;
+const unsigned long scheduling_functions_start_here =
+			(unsigned long)&__scheduling_functions_start_here;
+const unsigned long scheduling_functions_end_here =
+			(unsigned long)&__scheduling_functions_end_here;
+
 /*
  * Default context-switch locking:
  */
@@ -1587,12 +1594,10 @@ out:
 	rebalance_tick(rq, 0);
 }
 
-void scheduling_functions_start_here(void) { }
-
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	long *switch_count;
 	task_t *prev, *next;
@@ -1731,7 +1736,7 @@ EXPORT_SYMBOL(schedule);
  * off of preempt_enable.  Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
-asmlinkage void preempt_schedule(void)
+asmlinkage void __sched preempt_schedule(void)
 {
 	struct thread_info *ti = current_thread_info();
 
@@ -1869,7 +1874,7 @@ void fastcall complete_all(struct completion *x)
 	spin_unlock_irqrestore(&x->wait.lock, flags);
 }
 
-void fastcall wait_for_completion(struct completion *x)
+void fastcall __sched wait_for_completion(struct completion *x)
 {
 	might_sleep();
 	spin_lock_irq(&x->wait.lock);
@@ -1907,7 +1912,7 @@ EXPORT_SYMBOL(wait_for_completion);
 	__remove_wait_queue(q, &wait);			\
 	spin_unlock_irqrestore(&q->lock, flags);
 
-void fastcall interruptible_sleep_on(wait_queue_head_t *q)
+void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR
 
@@ -1920,7 +1925,7 @@ void fastcall interruptible_sleep_on(wait_queue_head_t *q)
 
 EXPORT_SYMBOL(interruptible_sleep_on);
 
-long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR
 
@@ -1935,7 +1940,7 @@ long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(interruptible_sleep_on_timeout);
 
-void fastcall sleep_on(wait_queue_head_t *q)
+void fastcall __sched sleep_on(wait_queue_head_t *q)
 {
 	SLEEP_ON_VAR
 
@@ -1948,7 +1953,7 @@ void fastcall sleep_on(wait_queue_head_t *q)
 
 EXPORT_SYMBOL(sleep_on);
 
-long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
 {
 	SLEEP_ON_VAR
 
@@ -1963,8 +1968,6 @@ long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout)
 
 EXPORT_SYMBOL(sleep_on_timeout);
 
-void scheduling_functions_end_here(void) { }
-
 void set_user_nice(task_t *p, long nice)
 {
 	unsigned long flags;
@@ -2424,7 +2427,7 @@ asmlinkage long sys_sched_yield(void)
 	return 0;
 }
 
-void __cond_resched(void)
+void __sched __cond_resched(void)
 {
 	set_current_state(TASK_RUNNING);
 	schedule();
@@ -2438,7 +2441,7 @@ EXPORT_SYMBOL(__cond_resched);
  * this is a shortcut for kernel-space yielding - it marks the
  * thread runnable and calls sys_sched_yield().
  */
-void yield(void)
+void __sched yield(void)
 {
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
@@ -2453,7 +2456,7 @@ EXPORT_SYMBOL(yield);
  * But don't do that if it is a deliberate, throttling IO wait (this task
  * has set its backing_dev_info: the queue against which it should throttle)
  */
-void io_schedule(void)
+void __sched io_schedule(void)
 {
 	struct runqueue *rq = this_rq();
 
@@ -2464,7 +2467,7 @@ void io_schedule(void)
 
 EXPORT_SYMBOL(io_schedule);
 
-long io_schedule_timeout(long timeout)
+long __sched io_schedule_timeout(long timeout)
 {
 	struct runqueue *rq = this_rq();
 	long ret;
@@ -3010,7 +3013,7 @@ EXPORT_SYMBOL(__might_sleep);
  *
  * Called inside preempt_disable().
  */
-void __preempt_spin_lock(spinlock_t *lock)
+void __sched __preempt_spin_lock(spinlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_spin_lock(lock);
@@ -3026,7 +3029,7 @@ void __preempt_spin_lock(spinlock_t *lock)
 
 EXPORT_SYMBOL(__preempt_spin_lock);
 
-void __preempt_write_lock(rwlock_t *lock)
+void __sched __preempt_write_lock(rwlock_t *lock)
 {
 	if (preempt_count() > 1) {
 		_raw_write_lock(lock);
diff --git a/kernel/timer.c b/kernel/timer.c
index f53e0749b0d2..cbcb5522866d 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -996,7 +996,7 @@ static void process_timeout(unsigned long __data)
  *
  * In all cases the return value is guaranteed to be non-negative.
  */
-fastcall signed long schedule_timeout(signed long timeout)
+fastcall signed long __sched schedule_timeout(signed long timeout)
 {
 	struct timer_list timer;
 	unsigned long expire;
@@ -1056,7 +1056,7 @@ asmlinkage long sys_gettid(void)
 	return current->pid;
 }
 
-static long nanosleep_restart(struct restart_block *restart)
+static long __sched nanosleep_restart(struct restart_block *restart)
 {
 	unsigned long expire = restart->arg0, now = jiffies;
 	struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
diff --git a/lib/rwsem.c b/lib/rwsem.c
index 95469d7fb796..85dcae7e9337 100644
--- a/lib/rwsem.c
+++ b/lib/rwsem.c
@@ -5,6 +5,7 @@
  */
 #include <linux/rwsem.h>
 #include <linux/sched.h>
+#include <linux/init.h>
 #include <linux/module.h>
 
 struct rwsem_waiter {
@@ -162,7 +163,7 @@ static inline struct rw_semaphore *rwsem_down_failed_common(struct rw_semaphore
 /*
  * wait for the read lock to be granted
  */
-struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem)
+struct rw_semaphore fastcall __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 
@@ -178,7 +179,7 @@ struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem)
 /*
  * wait for the write lock to be granted
  */
-struct rw_semaphore fastcall *rwsem_down_write_failed(struct rw_semaphore *sem)
+struct rw_semaphore fastcall __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 
-- 
cgit v1.2.3


From 906648b4864649cd72317718ae25ce5b33b0b8c7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:41:32 -0700
Subject: [PATCH] get_wchan() sparc64 fix

From: William Lee Irwin III <wli@holomorphy.com>

Now that the scheduler text is in its own ELF section, this branch is asking
for an illegal displacement.
---
 include/asm-sparc64/system.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h
index c41ddfc89a2b..c03b2d9d59e7 100644
--- a/include/asm-sparc64/system.h
+++ b/include/asm-sparc64/system.h
@@ -205,8 +205,10 @@ do {	if (test_thread_flag(TIF_PERFCTR)) {				\
 	"ldx	[%%g6 + %7], %%g4\n\t"					\
 	"wrpr	%%g0, 0x96, %%pstate\n\t"				\
 	"andcc	%%o7, %6, %%g0\n\t"					\
-	"bne,pn	%%icc, ret_from_syscall\n\t"				\
+	"beq,pn %%icc, 1f\n\t"						\
 	" mov	%%g5, %0\n\t"						\
+	"b,a ret_from_syscall\n\t"					\
+	"1:\n\t"							\
 	: "=&r" (last)							\
 	: "0" (next->thread_info),					\
 	  "i" (TI_WSTATE), "i" (TI_KSP), "i" (TI_FLAGS), "i" (TI_CWP),	\
-- 
cgit v1.2.3


From b4e0dd09f6ee56aa1c25ca9dfb4e897f241a5b57 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:42:11 -0700
Subject: [PATCH] ppc64: Fix bug in hugepage support

From: David Gibson <david@gibson.dropbear.id.au>

The PPC64 version of is_aligned_hugepage_range() is buggy.  It is supposed to
test not only that the given range is hugepage aligned, but that it lies
within the address space allowed for hugepages.  We were checking only that
the given range intersected the hugepage range, not that it lay entirely
within it.  This patch fixes the problem and renames some macros to make
that misunderstanding less likely to recur.
---
 arch/ppc64/mm/hugetlbpage.c |  7 ++++---
 include/asm-ppc64/page.h    | 12 ++++++++----
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index b763fb9d06e3..125b45ed4cb2 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -230,7 +230,8 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
 		return -EINVAL;
 	if (addr & ~HPAGE_MASK)
 		return -EINVAL;
-	if (! is_hugepage_only_range(addr, len))
+	if (! (within_hugepage_low_range(addr, len)
+	       || within_hugepage_high_range(addr, len)) )
 		return -EINVAL;
 	return 0;
 }
@@ -300,9 +301,9 @@ static int open_32bit_htlbpage_range(struct mm_struct *mm)
 
 int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
-	if (is_hugepage_high_range(addr, len))
+	if (within_hugepage_high_range(addr, len))
 		return 0;
-	else if (is_hugepage_low_range(addr, len))
+	else if (within_hugepage_low_range(addr, len))
 		return open_32bit_htlbpage_range(current->mm);
 
 	return -EINVAL;
diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h
index fd707bb57da5..1c53c228ff22 100644
--- a/include/asm-ppc64/page.h
+++ b/include/asm-ppc64/page.h
@@ -40,15 +40,19 @@
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
 
-#define is_hugepage_low_range(addr, len) \
+#define touches_hugepage_low_range(addr, len) \
 	(((addr) > (TASK_HPAGE_BASE_32-(len))) && ((addr) < TASK_HPAGE_END_32))
-#define is_hugepage_high_range(addr, len) \
+#define touches_hugepage_high_range(addr, len) \
 	(((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END))
+#define within_hugepage_low_range(addr, len) (((addr) >= TASK_HPAGE_BASE_32) \
+	  && ((addr)+(len) <= TASK_HPAGE_END_32) && ((addr)+(len) >= (addr)))
+#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \
+	  && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr)))
 
 #define is_hugepage_only_range(addr, len) \
-	(is_hugepage_high_range((addr), (len)) || \
+	(touches_hugepage_high_range((addr), (len)) || \
 	 (current->mm->context.low_hpages \
-	  && is_hugepage_low_range((addr), (len))))
+	  && touches_hugepage_low_range((addr), (len))))
 #define hugetlb_free_pgtables free_pgtables
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 
-- 
cgit v1.2.3


From 81c31b894ebb3d1409ffa3d54e477a91b5a5ebf3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:43:37 -0700
Subject: [PATCH] ppc64: allow hugepages anywhere in low 4GB

From: David Gibson <david@gibson.dropbear.id.au>

On PPC64, to deal with the restrictions imposed by the PPC MMU's segment
design, hugepages are only allowed to be mapped in two fixed address
ranges, one 2-3G (for use by 32-bit processes) and one 1-1.5T (for use in
64-bit processes).  This is quite limiting, particularly for 32-bit
processes which want to use a lot of large page memory.

This patch relaxes this restriction, and allows any of the low 16 segments
(i.e.  those below 4G) to be individually switched over to allow hugepage
mappings (provided the segment does not already have any normal page
mappings).  The 1-1.5T fixed range for 64-bit processes remains.
---
 arch/ppc64/mm/hugetlbpage.c | 203 ++++++++++++++++++++++++++++----------------
 include/asm-ppc64/mmu.h     |   4 +-
 include/asm-ppc64/page.h    |  32 +++----
 3 files changed, 151 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index ceaefc4d9700..f5b5781d21a4 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -241,31 +241,25 @@ static void do_slbia(void *unused)
 	asm volatile ("isync; slbia; isync":::"memory");
 }
 
-/* Activate the low hpage region for 32bit processes.  mmap_sem must
- * be held*/
-static int open_32bit_htlbpage_range(struct mm_struct *mm)
+static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg)
 {
+	unsigned long start = seg << SID_SHIFT;
+	unsigned long end = (seg+1) << SID_SHIFT;
 	struct vm_area_struct *vma;
 	unsigned long addr;
 	struct mmu_gather *tlb;
 
-	if (mm->context.low_hpages)
-		return 0; /* The window is already open */
-	
-	/* Check no VMAs are in the region */
-	vma = find_vma(mm, TASK_HPAGE_BASE_32);
+	BUG_ON(seg >= 16);
 
-	if (vma && (vma->vm_start < TASK_HPAGE_END_32)) {
-		printk(KERN_DEBUG "Low HTLB region busy: PID=%d  vma @ %lx-%lx\n",
-		       current->pid, vma->vm_start, vma->vm_end);
+	/* Check no VMAs are in the region */
+	vma = find_vma(mm, start);
+	if (vma && (vma->vm_start < end))
 		return -EBUSY;
-	}
 
 	/* Clean up any leftover PTE pages in the region */
 	spin_lock(&mm->page_table_lock);
 	tlb = tlb_gather_mmu(mm, 0);
-	for (addr = TASK_HPAGE_BASE_32; addr < TASK_HPAGE_END_32;
-	     addr += PMD_SIZE) {
+	for (addr = start; addr < end; addr += PMD_SIZE) {
 		pgd_t *pgd = pgd_offset(mm, addr);
 		pmd_t *pmd;
 		struct page *page;
@@ -293,15 +287,29 @@ static int open_32bit_htlbpage_range(struct mm_struct *mm)
 		pgtable_remove_rmap(page);
 		pte_free_tlb(tlb, page);
 	}
-	tlb_finish_mmu(tlb, TASK_HPAGE_BASE_32, TASK_HPAGE_END_32);
+	tlb_finish_mmu(tlb, start, end);
 	spin_unlock(&mm->page_table_lock);
 
-	mm->context.low_hpages = 1;
+	return 0;
+}
+
+static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs)
+{
+	unsigned long i;
+
+	newsegs &= ~(mm->context.htlb_segs);
+	if (! newsegs)
+		return 0; /* The segments we want are already open */
 
+	for (i = 0; i < 16; i++)
+		if ((1 << i) & newsegs)
+			if (prepare_low_seg_for_htlb(mm, i) != 0)
+				return -EBUSY;
+
+	mm->context.htlb_segs |= newsegs;
 	/* the context change must make it to memory before the slbia,
 	 * so that further SLB misses do the right thing. */
 	mb();
-
 	on_each_cpu(do_slbia, NULL, 0, 1);
 
 	return 0;
@@ -311,8 +319,18 @@ int prepare_hugepage_range(unsigned long addr, unsigned long len)
 {
 	if (within_hugepage_high_range(addr, len))
 		return 0;
-	else if (within_hugepage_low_range(addr, len))
-		return open_32bit_htlbpage_range(current->mm);
+	else if ((addr < 0x100000000) && ((addr+len) < 0x100000000)) {
+		int err;
+		/* Yes, we need both tests, in case addr+len overflows
+		 * 64-bit arithmetic */
+		err = open_low_hpage_segs(current->mm,
+					  LOW_ESID_MASK(addr, len));
+		if (err)
+			printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)"
+			       " failed (segs: 0x%04hx)\n", addr, len,
+			       LOW_ESID_MASK(addr, len));
+		return err;
+	}
 
 	return -EINVAL;
 }
@@ -559,7 +577,7 @@ out:
 
 /* Because we have an exclusive hugepage region which lies within the
  * normal user address space, we have to take special measures to make
- * non-huge mmap()s evade the hugepage reserved region. */
+ * non-huge mmap()s evade the hugepage reserved regions. */
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 				     unsigned long len, unsigned long pgoff,
 				     unsigned long flags)
@@ -574,36 +592,29 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
-		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start) &&
-		    !is_hugepage_only_range(addr,len))
+		if (((TASK_SIZE - len) >= addr)
+		    && (!vma || (addr+len) <= vma->vm_start)
+		    && !is_hugepage_only_range(addr,len))
 			return addr;
 	}
 	start_addr = addr = mm->free_area_cache;
 
 full_search:
-	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
-		/* At this point:  (!vma || addr < vma->vm_end). */
-		if (TASK_SIZE - len < addr) {
-			/*
-			 * Start a new search - just in case we missed
-			 * some holes.
-			 */
-			if (start_addr != TASK_UNMAPPED_BASE) {
-				start_addr = addr = TASK_UNMAPPED_BASE;
-				goto full_search;
-			}
-			return -ENOMEM;
+	vma = find_vma(mm, addr);
+	while (TASK_SIZE - len >= addr) {
+		BUG_ON(vma && (addr >= vma->vm_end));
+
+		if (touches_hugepage_low_range(addr, len)) {
+			addr = ALIGN(addr+1, 1<<SID_SHIFT);
+			vma = find_vma(mm, addr);
+			continue;
+		}
+		if (touches_hugepage_high_range(addr, len)) {
+			addr = TASK_HPAGE_END;
+			vma = find_vma(mm, addr);
+			continue;
 		}
 		if (!vma || addr + len <= vma->vm_start) {
-			if (is_hugepage_only_range(addr, len)) {
-				if (addr < TASK_HPAGE_END_32)
-					addr = TASK_HPAGE_END_32;
-				else
-					addr = TASK_HPAGE_END;
-
-				continue;
-			}
 			/*
 			 * Remember the place where we stopped the search:
 			 */
@@ -611,16 +622,70 @@ full_search:
 			return addr;
 		}
 		addr = vma->vm_end;
+		vma = vma->vm_next;
+	}
+
+	/* Make sure we didn't miss any holes */
+	if (start_addr != TASK_UNMAPPED_BASE) {
+		start_addr = addr = TASK_UNMAPPED_BASE;
+		goto full_search;
 	}
+	return -ENOMEM;
+}
+
+static unsigned long htlb_get_low_area(unsigned long len, u16 segmask)
+{
+	unsigned long addr = 0;
+	struct vm_area_struct *vma;
+
+	vma = find_vma(current->mm, addr);
+	while (addr + len <= 0x100000000UL) {
+		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
+
+		if (! __within_hugepage_low_range(addr, len, segmask)) {
+			addr = ALIGN(addr+1, 1<<SID_SHIFT);
+			vma = find_vma(current->mm, addr);
+			continue;
+		}
+
+		if (!vma || (addr + len) <= vma->vm_start)
+			return addr;
+		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+		/* Depending on segmask this might not be a confirmed
+		 * hugepage region, so the ALIGN could have skipped
+		 * some VMAs */
+		vma = find_vma(current->mm, addr);
+	}
+
+	return -ENOMEM;
+}
+
+static unsigned long htlb_get_high_area(unsigned long len)
+{
+	unsigned long addr = TASK_HPAGE_BASE;
+	struct vm_area_struct *vma;
+
+	vma = find_vma(current->mm, addr);
+	for (vma = find_vma(current->mm, addr);
+	     addr + len <= TASK_HPAGE_END;
+	     vma = vma->vm_next) {
+		BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */
+		BUG_ON(! within_hugepage_high_range(addr, len));
+
+		if (!vma || (addr + len) <= vma->vm_start)
+			return addr;
+		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+		/* Because we're in a hugepage region, this alignment
+		 * should not skip us over any VMAs */
+	}
+
+	return -ENOMEM;
 }
 
 unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 					unsigned long len, unsigned long pgoff,
 					unsigned long flags)
 {
-	struct vm_area_struct *vma;
-	unsigned long base, end;
-
 	if (len & ~HPAGE_MASK)
 		return -EINVAL;
 
@@ -628,34 +693,30 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		return -EINVAL;
 
 	if (test_thread_flag(TIF_32BIT)) {
-		int err;
-
-		err = open_32bit_htlbpage_range(current->mm);
-		if (err)
-			return err; /* Should this just be EINVAL? */
-
-		base = TASK_HPAGE_BASE_32;
-		end = TASK_HPAGE_END_32;
-	} else {
-		base = TASK_HPAGE_BASE;
-		end = TASK_HPAGE_END;
-	}
-	
-	if (!in_hugepage_area(current->mm->context, addr) 
-	    || (addr & (HPAGE_SIZE - 1)))
-		addr = base;
+		int lastshift = 0;
+		u16 segmask, cursegs = current->mm->context.htlb_segs;
 
-	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
-		/* At this point:  (!vma || addr < vma->vm_end). */
-		if (addr + len > end)
-			return -ENOMEM;
-		if (!vma || (addr + len) <= vma->vm_start)
+		/* First see if we can do the mapping in the existing
+		 * low hpage segments */
+		addr = htlb_get_low_area(len, cursegs);
+		if (addr != -ENOMEM)
 			return addr;
-		addr = ALIGN(vma->vm_end, HPAGE_SIZE);
 
-		/* Because we're in an exclusively hugepage region,
-		 * this alignment shouldn't have skipped over any
-		 * other vmas */
+		for (segmask = LOW_ESID_MASK(0x100000000UL-len, len);
+		     ! lastshift; segmask >>=1) {
+			if (segmask & 1)
+				lastshift = 1;
+
+			addr = htlb_get_low_area(len, cursegs | segmask);
+			if ((addr != -ENOMEM)
+			    && open_low_hpage_segs(current->mm, segmask) == 0)
+				return addr;
+		}
+		printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open"
+		       " enough segments\n");
+		return -ENOMEM;
+	} else {
+		return htlb_get_high_area(len);
 	}
 }
 
diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h
index a68e47f717e7..b42d9a4db08f 100644
--- a/include/asm-ppc64/mmu.h
+++ b/include/asm-ppc64/mmu.h
@@ -23,12 +23,12 @@ typedef unsigned long mm_context_id_t;
 typedef struct {
 	mm_context_id_t id;
 #ifdef CONFIG_HUGETLB_PAGE
-	int low_hpages;
+	u16 htlb_segs; /* bitmask */
 #endif
 } mm_context_t;
 
 #ifdef CONFIG_HUGETLB_PAGE
-#define KERNEL_LOW_HPAGES	.low_hpages = 0,
+#define KERNEL_LOW_HPAGES	.htlb_segs = 0,
 #else
 #define KERNEL_LOW_HPAGES
 #endif
diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h
index 1c53c228ff22..984602ae4fcc 100644
--- a/include/asm-ppc64/page.h
+++ b/include/asm-ppc64/page.h
@@ -22,6 +22,10 @@
 #define PAGE_MASK	(~(PAGE_SIZE-1))
 #define PAGE_OFFSET_MASK (PAGE_SIZE-1)
 
+#define SID_SHIFT       28
+#define SID_MASK        0xfffffffff
+#define GET_ESID(x)     (((x) >> SID_SHIFT) & SID_MASK)
+
 #ifdef CONFIG_HUGETLB_PAGE
 
 #define HPAGE_SHIFT	24
@@ -33,34 +37,36 @@
 #define TASK_HPAGE_BASE 	(0x0000010000000000UL)
 #define TASK_HPAGE_END 	(0x0000018000000000UL)
 
-/* For 32-bit processes the hugepage range is 2-3G */
-#define TASK_HPAGE_BASE_32	(0x80000000UL)
-#define TASK_HPAGE_END_32	(0xc0000000UL)
+#define LOW_ESID_MASK(addr, len)	(((1U << (GET_ESID(addr+len-1)+1)) \
+	   	                	- (1U << GET_ESID(addr))) & 0xffff)
 
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
 
 #define touches_hugepage_low_range(addr, len) \
-	(((addr) > (TASK_HPAGE_BASE_32-(len))) && ((addr) < TASK_HPAGE_END_32))
+	(LOW_ESID_MASK((addr), (len)) & current->mm->context.htlb_segs)
 #define touches_hugepage_high_range(addr, len) \
 	(((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END))
-#define within_hugepage_low_range(addr, len) (((addr) >= TASK_HPAGE_BASE_32) \
-	  && ((addr)+(len) <= TASK_HPAGE_END_32) && ((addr)+(len) >= (addr)))
+
+#define __within_hugepage_low_range(addr, len, segmask) \
+	((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask))
+#define within_hugepage_low_range(addr, len) \
+	__within_hugepage_low_range((addr), (len), \
+				    current->mm->context.htlb_segs)
 #define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \
 	  && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr)))
 
 #define is_hugepage_only_range(addr, len) \
 	(touches_hugepage_high_range((addr), (len)) || \
-	 (current->mm->context.low_hpages \
-	  && touches_hugepage_low_range((addr), (len))))
+	  touches_hugepage_low_range((addr), (len)))
 #define hugetlb_free_pgtables free_pgtables
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 
 #define in_hugepage_area(context, addr) \
 	((cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE) && \
-	 ((((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \
-	  ((context).low_hpages && \
-	   (((addr) >= TASK_HPAGE_BASE_32) && ((addr) < TASK_HPAGE_END_32)))))
+	 ( (((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \
+	   ( ((addr) < 0x100000000L) && \
+	     ((1 << GET_ESID(addr)) & (context).htlb_segs) ) ) )
 
 #else /* !CONFIG_HUGETLB_PAGE */
 
@@ -68,10 +74,6 @@
 
 #endif /* !CONFIG_HUGETLB_PAGE */
 
-#define SID_SHIFT       28
-#define SID_MASK        0xfffffffff
-#define GET_ESID(x)     (((x) >> SID_SHIFT) & SID_MASK)
-
 /* align addr on a size boundary - adjust address up/down if needed */
 #define _ALIGN_UP(addr,size)	(((addr)+((size)-1))&(~((size)-1)))
 #define _ALIGN_DOWN(addr,size)	((addr)&(~((size)-1)))
-- 
cgit v1.2.3


From d9110d3abbe16e21ea3d16ed8f37f22354ca9d4e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:44:15 -0700
Subject: [PATCH] ppc64: Allow PCI devices to use address that happens to fall
 in the ISA range

From: Jake Moilanen <moilanen@austin.ibm.com>

Allow PCI devices to use addresses that happen to fall in the ISA range,
but still protect against ISA device accesses when there is not an ISA
bus.
---
 arch/ppc64/kernel/eeh.c         |  6 ++++++
 arch/ppc64/kernel/pSeries_pci.c | 28 ++++++++++++++++++++++++++--
 include/asm-ppc64/eeh.h         | 33 +++++++++++++++------------------
 3 files changed, 47 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c
index 008ad1ef1783..303eac178519 100644
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -395,6 +395,12 @@ unsigned long eeh_check_failure(void *token, unsigned long val)
 		return val;
 	}
 
+        /* Make sure we aren't ISA */
+        if (!strcmp(dn->type, "isa")) {
+                pci_dev_put(dev);
+                return val;
+        }
+
 	if (!dn->eeh_config_addr) {
 		pci_dev_put(dev);
 		return val;
diff --git a/arch/ppc64/kernel/pSeries_pci.c b/arch/ppc64/kernel/pSeries_pci.c
index bae29d0f670d..4014ccd9fb60 100644
--- a/arch/ppc64/kernel/pSeries_pci.c
+++ b/arch/ppc64/kernel/pSeries_pci.c
@@ -44,6 +44,12 @@
 #include "open_pic.h"
 #include "pci.h"
 
+/* legal IO pages under MAX_ISA_PORT.  This is to ensure we don't touch
+   devices we don't have access to. */
+unsigned long io_page_mask;
+
+EXPORT_SYMBOL(io_page_mask);
+
 /* RTAS tokens */
 static int read_pci_config;
 static int write_pci_config;
@@ -280,6 +286,8 @@ static void __init pci_process_bridge_OF_ranges(struct pci_controller *hose,
 					pci_process_ISA_OF_ranges(isa_dn,
 						hose->io_base_phys,
 						hose->io_base_virt);
+                                        /* Allow all IO */
+                                        io_page_mask = -1;
 				}
 			}
 
@@ -523,8 +531,24 @@ void __devinit pcibios_fixup_device_resources(struct pci_dev *dev,
 	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
 		if (dev->resource[i].flags & IORESOURCE_IO) {
 			unsigned long offset = (unsigned long)hose->io_base_virt - pci_io_base;
-			dev->resource[i].start += offset;
-			dev->resource[i].end += offset;
+                        unsigned long start, end, mask;
+
+                        start = dev->resource[i].start += offset;
+                        end = dev->resource[i].end += offset;
+
+                        /* Need to allow IO access to pages that are in the
+                           ISA range */
+                        if (start < MAX_ISA_PORT) {
+                                if (end > MAX_ISA_PORT)
+                                        end = MAX_ISA_PORT;
+
+                                start >>= PAGE_SHIFT;
+                                end >>= PAGE_SHIFT;
+
+                                /* get the range of pages for the map */
+                                mask = ((1 << (end+1))-1) ^ ((1 << start)-1);
+                                io_page_mask |= mask;
+                        }
 		}
                 else if (dev->resource[i].flags & IORESOURCE_MEM) {
 			dev->resource[i].start += hose->pci_mem_offset;
diff --git a/include/asm-ppc64/eeh.h b/include/asm-ppc64/eeh.h
index d426126ddab1..4ccf43666ee5 100644
--- a/include/asm-ppc64/eeh.h
+++ b/include/asm-ppc64/eeh.h
@@ -199,74 +199,71 @@ static inline void eeh_memcpy_toio(void *dest, void *src, unsigned long n) {
 	memcpy(vdest, src, n);
 }
 
-/* The I/O macros must handle ISA ports as well as PCI I/O bars.
- * ISA does not implement EEH and ISA may not exist in the system.
- * For PCI we check for EEH failures.
- */
-#define _IO_IS_ISA(port) ((port) < 0x10000)
-#define _IO_HAS_ISA_BUS	(isa_io_base != 0)
+#define MAX_ISA_PORT 0x10000
+extern unsigned long io_page_mask;
+#define _IO_IS_VALID(port) ((port) >= MAX_ISA_PORT || (1 << (port>>PAGE_SHIFT)) & io_page_mask)
 
 static inline u8 eeh_inb(unsigned long port) {
 	u8 val;
-	if (_IO_IS_ISA(port) && !_IO_HAS_ISA_BUS)
+	if (!_IO_IS_VALID(port))
 		return ~0;
 	val = in_8((u8 *)(port+pci_io_base));
-	if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR(val, u8))
+	if (EEH_POSSIBLE_IO_ERROR(val, u8))
 		return eeh_check_failure((void*)(port), val);
 	return val;
 }
 
 static inline void eeh_outb(u8 val, unsigned long port) {
-	if (!_IO_IS_ISA(port) || _IO_HAS_ISA_BUS)
+	if (_IO_IS_VALID(port))
 		return out_8((u8 *)(port+pci_io_base), val);
 }
 
 static inline u16 eeh_inw(unsigned long port) {
 	u16 val;
-	if (_IO_IS_ISA(port) && !_IO_HAS_ISA_BUS)
+	if (!_IO_IS_VALID(port))
 		return ~0;
 	val = in_le16((u16 *)(port+pci_io_base));
-	if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR(val, u16))
+	if (EEH_POSSIBLE_IO_ERROR(val, u16))
 		return eeh_check_failure((void*)(port), val);
 	return val;
 }
 
 static inline void eeh_outw(u16 val, unsigned long port) {
-	if (!_IO_IS_ISA(port) || _IO_HAS_ISA_BUS)
+	if (_IO_IS_VALID(port))
 		return out_le16((u16 *)(port+pci_io_base), val);
 }
 
 static inline u32 eeh_inl(unsigned long port) {
 	u32 val;
-	if (_IO_IS_ISA(port) && !_IO_HAS_ISA_BUS)
+	if (!_IO_IS_VALID(port))
 		return ~0;
 	val = in_le32((u32 *)(port+pci_io_base));
-	if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR(val, u32))
+	if (EEH_POSSIBLE_IO_ERROR(val, u32))
 		return eeh_check_failure((void*)(port), val);
 	return val;
 }
 
 static inline void eeh_outl(u32 val, unsigned long port) {
-	if (!_IO_IS_ISA(port) || _IO_HAS_ISA_BUS)
+	if (_IO_IS_VALID(port))
 		return out_le32((u32 *)(port+pci_io_base), val);
 }
 
 /* in-string eeh macros */
 static inline void eeh_insb(unsigned long port, void * buf, int ns) {
 	_insb((u8 *)(port+pci_io_base), buf, ns);
-	if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR((*(((u8*)buf)+ns-1)), u8))
+	if (EEH_POSSIBLE_IO_ERROR((*(((u8*)buf)+ns-1)), u8))
 		eeh_check_failure((void*)(port), *(u8*)buf);
 }
 
 static inline void eeh_insw_ns(unsigned long port, void * buf, int ns) {
 	_insw_ns((u16 *)(port+pci_io_base), buf, ns);
-	if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR((*(((u16*)buf)+ns-1)), u16))
+	if (EEH_POSSIBLE_IO_ERROR((*(((u16*)buf)+ns-1)), u16))
 		eeh_check_failure((void*)(port), *(u16*)buf);
 }
 
 static inline void eeh_insl_ns(unsigned long port, void * buf, int nl) {
 	_insl_ns((u32 *)(port+pci_io_base), buf, nl);
-	if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR((*(((u32*)buf)+nl-1)), u32))
+	if (EEH_POSSIBLE_IO_ERROR((*(((u32*)buf)+nl-1)), u32))
 		eeh_check_failure((void*)(port), *(u32*)buf);
 }
 
-- 
cgit v1.2.3


From e80bc2ce5fd11792993c43d02a6425cc0a2138b8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:44:40 -0700
Subject: [PATCH] ppc64: Correct comments for the offsets of fields in paca

From: Will Schmidt <willschm@us.ibm.com>

Correct comments for the offsets of fields in paca
---
 include/asm-ppc64/paca.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/asm-ppc64/paca.h b/include/asm-ppc64/paca.h
index 24a2e99faae1..d368afedd735 100644
--- a/include/asm-ppc64/paca.h
+++ b/include/asm-ppc64/paca.h
@@ -64,13 +64,13 @@ struct paca_struct {
         u16 xHwProcNum;                 /* Physical processor number            0x1A */
 	u32 default_decr;		/* Default decrementer value		0x1c */	
 	u64 xKsave;			/* Saved Kernel stack addr or zero	0x20 */
-	struct ItLpQueue *lpQueuePtr;	/* LpQueue handled by this processor    0x30 */
-	u64  xTOC;			/* Kernel TOC address			0x38 */
-	STAB xStab_data;		/* Segment table information		0x40,0x48,0x50 */
-	u8 *exception_sp;		/*                                      0x58 */
-	u8 xProcEnabled;		/*                                      0x59 */
-	u8 prof_enabled;		/* 1=iSeries profiling enabled          0x60 */
-	u8 resv1[38];			/*					0x61-0x7F */
+	struct ItLpQueue *lpQueuePtr;	/* LpQueue handled by this processor    0x28 */
+	u64  xTOC;			/* Kernel TOC address			0x30 */
+	STAB xStab_data;		/* Segment table information		0x38,0x40,0x48 */
+	u8 *exception_sp;		/*                                      0x50 */
+	u8 xProcEnabled;		/*                                      0x58 */
+	u8 prof_enabled;		/* 1=iSeries profiling enabled          0x59 */
+	u8 resv1[38];			/*					0x5a-0x7f*/
 
 /*=====================================================================================
  * CACHE_LINE_2 0x0080 - 0x00FF
-- 
cgit v1.2.3


From 87fb698cd58394a5e1729a9ee56b4ada4fb0a51b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:44:51 -0700
Subject: [PATCH] ppc64: Make rtasd dump KERN_DEBUG

From: Jake Moilanen <moilanen@austin.ibm.com>

Change the loglevel at which an error log is printed so that it
does not go to the console.  Since error logs can
be up to 2k in size, they can spam the console.
---
 arch/ppc64/kernel/rtasd.c | 8 ++++----
 include/asm-ppc64/rtas.h  | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/rtasd.c b/arch/ppc64/kernel/rtasd.c
index 99125ccfe648..b31ffad73195 100644
--- a/arch/ppc64/kernel/rtasd.c
+++ b/arch/ppc64/kernel/rtasd.c
@@ -78,7 +78,7 @@ static void printk_log_rtas(char *buf, int len)
 	char buffer[64];
 	char * str = "RTAS event";
 
-	printk(RTAS_ERR "%d -------- %s begin --------\n", error_log_cnt, str);
+	printk(RTAS_DEBUG "%d -------- %s begin --------\n", error_log_cnt, str);
 
 	/*
 	 * Print perline bytes on each line, each line will start
@@ -99,12 +99,12 @@ static void printk_log_rtas(char *buf, int len)
 		n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]);
 
 		if (j == (perline-1))
-			printk(KERN_ERR "%s\n", buffer);
+			printk(KERN_DEBUG "%s\n", buffer);
 	}
 	if ((i % perline) != 0)
-		printk(KERN_ERR "%s\n", buffer);
+		printk(KERN_DEBUG "%s\n", buffer);
 
-	printk(RTAS_ERR "%d -------- %s end ----------\n", error_log_cnt, str);
+	printk(RTAS_DEBUG "%d -------- %s end ----------\n", error_log_cnt, str);
 }
 
 static int log_rtas_len(char * buf)
diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h
index 5ce76143dce1..62838ce91e59 100644
--- a/include/asm-ppc64/rtas.h
+++ b/include/asm-ppc64/rtas.h
@@ -198,7 +198,7 @@ extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 /* All the types and not flags */
 #define ERR_TYPE_MASK	(ERR_TYPE_RTAS_LOG | ERR_TYPE_KERNEL_PANIC)
 
-#define RTAS_ERR KERN_ERR "RTAS: "
+#define RTAS_DEBUG KERN_DEBUG "RTAS: "
  
 #define RTAS_ERROR_LOG_MAX 2048
  
-- 
cgit v1.2.3


From 15cddddb837495b9b6441ae201d3be66e897614e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:45:43 -0700
Subject: [PATCH] ppc64: Add support for hotplug cpus

From: Joel Schopp <jschopp@austin.ibm.com>

Add support for hotplug cpus
---
 arch/ppc64/Kconfig        |   8 ++
 arch/ppc64/kernel/idle.c  |  14 +++
 arch/ppc64/kernel/irq.c   |  28 +++--
 arch/ppc64/kernel/rtas.c  |  19 +++
 arch/ppc64/kernel/setup.c |  11 +-
 arch/ppc64/kernel/smp.c   | 302 ++++++++++++++++++++++++++++++++++++++++++----
 arch/ppc64/kernel/xics.c  |  98 +++++++++++++--
 include/asm-ppc64/rtas.h  |   2 +
 include/asm-ppc64/smp.h   |   3 +
 9 files changed, 444 insertions(+), 41 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig
index 729a949ebe35..9b2f319d0cca 100644
--- a/arch/ppc64/Kconfig
+++ b/arch/ppc64/Kconfig
@@ -248,6 +248,14 @@ source "fs/Kconfig.binfmt"
 
 source "drivers/pci/Kconfig"
 
+config HOTPLUG_CPU
+	bool "Support for hot-pluggable CPUs"
+	depends on SMP && HOTPLUG && EXPERIMENTAL
+	---help---
+	  Say Y here to be able to turn CPUs off and on.
+
+	  Say N if you are unsure.
+
 source "drivers/pcmcia/Kconfig"
 
 source "drivers/pci/hotplug/Kconfig"
diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c
index 3ec662afac29..b30aea273974 100644
--- a/arch/ppc64/kernel/idle.c
+++ b/arch/ppc64/kernel/idle.c
@@ -26,6 +26,7 @@
 #include <linux/unistd.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
+#include <linux/cpu.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
@@ -150,12 +151,18 @@ int default_idle(void)
 		}
 
 		schedule();
+		if (cpu_is_offline(smp_processor_id()) &&
+				system_state == SYSTEM_RUNNING)
+			cpu_die();
 	}
 
 	return 0;
 }
 
 #ifdef CONFIG_PPC_PSERIES
+
+DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
+
 int dedicated_idle(void)
 {
 	long oldval;
@@ -236,6 +243,9 @@ int dedicated_idle(void)
 		HMT_medium();
 		lpaca->xLpPaca.xIdle = 0;
 		schedule();
+		if (cpu_is_offline(smp_processor_id()) &&
+				system_state == SYSTEM_RUNNING)
+			cpu_die();
 	}
 	return 0;
 }
@@ -245,6 +255,10 @@ int shared_idle(void)
 	struct paca_struct *lpaca = get_paca();
 
 	while (1) {
+		if (cpu_is_offline(smp_processor_id()) &&
+				system_state == SYSTEM_RUNNING)
+			cpu_die();
+
 		/* Indicate to the HV that we are idle.  Now would be
 		 * a good time to find other work to dispatch. */
 		lpaca->xLpPaca.xIdle = 1;
diff --git a/arch/ppc64/kernel/irq.c b/arch/ppc64/kernel/irq.c
index 29a66ad7dc5d..70d7c0ed892e 100644
--- a/arch/ppc64/kernel/irq.c
+++ b/arch/ppc64/kernel/irq.c
@@ -683,6 +683,7 @@ static struct proc_dir_entry * root_irq_dir;
 static struct proc_dir_entry * irq_dir [NR_IRQS];
 static struct proc_dir_entry * smp_affinity_entry [NR_IRQS];
 
+/* Protected by irq descriptor spinlock */
 #ifdef CONFIG_IRQ_ALL_CPUS
 cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
 #else  /* CONFIG_IRQ_ALL_CPUS */
@@ -702,16 +703,17 @@ static int irq_affinity_read_proc (char *page, char **start, off_t off,
 static int irq_affinity_write_proc (struct file *file, const char *buffer,
 					unsigned long count, void *data)
 {
-	int irq = (long)data, full_count = count, err;
+	int irq = (long)data;
+	int ret;
 	cpumask_t new_value, tmp;
 	cpumask_t allcpus = CPU_MASK_ALL;
 
 	if (!irq_desc[irq].handler->set_affinity)
 		return -EIO;
 
-	err = cpumask_parse(buffer, count, new_value);
-	if (err)
-		return err;
+	ret = cpumask_parse(buffer, count, new_value);
+	if (ret != 0)
+		return ret;
 
 	/*
 	 * We check for CPU_MASK_ALL in xics to send irqs to all cpus.
@@ -721,19 +723,30 @@ static int irq_affinity_write_proc (struct file *file, const char *buffer,
 	 */
 	cpus_and(new_value, new_value, allcpus);
 
+	/*
+	 * Grab lock here so cpu_online_map can't change, and also
+	 * protect irq_affinity[].
+	 */
+	spin_lock(&irq_desc[irq].lock);
+
 	/*
 	 * Do not allow disabling IRQs completely - it's a too easy
 	 * way to make the system unusable accidentally :-) At least
 	 * one online CPU still has to be targeted.
 	 */
 	cpus_and(tmp, new_value, cpu_online_map);
-	if (cpus_empty(tmp))
-		return -EINVAL;
+	if (cpus_empty(tmp)) {
+		ret = -EINVAL;
+		goto out;
+	}
 
 	irq_affinity[irq] = new_value;
 	irq_desc[irq].handler->set_affinity(irq, new_value);
+	ret = count;
 
-	return full_count;
+out:
+	spin_unlock(&irq_desc[irq].lock);
+	return ret;
 }
 
 static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
@@ -946,5 +959,4 @@ unsigned int real_irq_to_virt_slowpath(unsigned int real_irq)
 
 }
 
-
 #endif
diff --git a/arch/ppc64/kernel/rtas.c b/arch/ppc64/kernel/rtas.c
index 143b2f7ce5e7..afa36ac32add 100644
--- a/arch/ppc64/kernel/rtas.c
+++ b/arch/ppc64/kernel/rtas.c
@@ -494,6 +494,25 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
 	return 0;
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+/* This version can't take the spinlock. */
+
+void rtas_stop_self(void)
+{
+	struct rtas_args *rtas_args = &(get_paca()->xRtas);
+
+	rtas_args->token = rtas_token("stop-self");
+	BUG_ON(rtas_args->token == RTAS_UNKNOWN_SERVICE);
+	rtas_args->nargs = 0;
+	rtas_args->nret  = 1;
+	rtas_args->rets  = &(rtas_args->args[0]);
+
+	printk("%u %u Ready to die...\n",
+	       smp_processor_id(), hard_smp_processor_id());
+	enter_rtas((void *)__pa(rtas_args));
+	panic("Alas, I survived.\n");
+}
+#endif /* CONFIG_HOTPLUG_CPU */
 
 EXPORT_SYMBOL(rtas_firmware_flash_list);
 EXPORT_SYMBOL(rtas_token);
diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c
index efd3a598466e..0c230d04e9d6 100644
--- a/arch/ppc64/kernel/setup.c
+++ b/arch/ppc64/kernel/setup.c
@@ -25,6 +25,7 @@
 #include <linux/version.h>
 #include <linux/tty.h>
 #include <linux/root_dev.h>
+#include <linux/cpu.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/processor.h>
@@ -338,8 +339,13 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		return 0;
 	}
 
-	if (!cpu_online(cpu_id))
+	/* We only show online cpus: disable preempt (overzealous, I
+	 * know) to prevent cpu going down. */
+	preempt_disable();
+	if (!cpu_online(cpu_id)) {
+		preempt_enable();
 		return 0;
+	}
 
 #ifdef CONFIG_SMP
 	pvr = per_cpu(pvr, cpu_id);
@@ -372,7 +378,8 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 		   ppc_proc_freq % 1000000);
 
 	seq_printf(m, "revision\t: %hd.%hd\n\n", maj, min);
-	
+
+	preempt_enable();
 	return 0;
 }
 
diff --git a/arch/ppc64/kernel/smp.c b/arch/ppc64/kernel/smp.c
index f671515c0676..72144b6122f9 100644
--- a/arch/ppc64/kernel/smp.c
+++ b/arch/ppc64/kernel/smp.c
@@ -230,10 +230,237 @@ static void __devinit smp_openpic_setup_cpu(int cpu)
 	do_openpic_setup_cpu();
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+/* Get state of physical CPU.
+ * Return codes:
+ *	0	- The processor is in the RTAS stopped state
+ *	1	- stop-self is in progress
+ *	2	- The processor is not in the RTAS stopped state
+ *	-1	- Hardware Error
+ *	-2	- Hardware Busy, Try again later.
+ */
+static int query_cpu_stopped(unsigned int pcpu)
+{
+	long cpu_status;
+	int status, qcss_tok;
+
+	qcss_tok = rtas_token("query-cpu-stopped-state");
+	BUG_ON(qcss_tok == RTAS_UNKNOWN_SERVICE);
+	status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu);
+	if (status != 0) {
+		printk(KERN_ERR
+		       "RTAS query-cpu-stopped-state failed: %i\n", status);
+		return status;
+	}
+
+	return cpu_status;
+}
+
+int __cpu_disable(void)
+{
+	/* FIXME: go put this in a header somewhere */
+	extern void xics_migrate_irqs_away(void);
+
+	systemcfg->processorCount--;
+
+	/*fix boot_cpuid here*/
+	if (smp_processor_id() == boot_cpuid)
+		boot_cpuid = any_online_cpu(cpu_online_map);
+
+	/* FIXME: abstract this to not be platform specific later on */
+	xics_migrate_irqs_away();
+	return 0;
+}
+
+void __cpu_die(unsigned int cpu)
+{
+	int tries;
+	int cpu_status;
+	unsigned int pcpu = get_hard_smp_processor_id(cpu);
+
+	for (tries = 0; tries < 5; tries++) {
+		cpu_status = query_cpu_stopped(pcpu);
+
+		if (cpu_status == 0)
+			break;
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(HZ);
+	}
+	if (cpu_status != 0) {
+		printk("Querying DEAD? cpu %i (%i) shows %i\n",
+		       cpu, pcpu, cpu_status);
+	}
+
+	/* Isolation and deallocation are definitely done by
+	 * drslot_chrp_cpu.  If they were not they would be
+	 * done here.  Change isolate state to Isolate and
+	 * change allocation-state to Unusable.
+	 */
+	paca[cpu].xProcStart = 0;
+
+	/* So we can recognize if it fails to come up next time. */
+	cpu_callin_map[cpu] = 0;
+}
+
+/* Kill this cpu */
+void cpu_die(void)
+{
+	local_irq_disable();
+	rtas_stop_self();
+	/* Should never get here... */
+	BUG();
+	for(;;);
+}
+
+/* Search all cpu device nodes for an offline logical cpu.  If a
+ * device node has a "ibm,my-drc-index" property (meaning this is an
+ * LPAR), paranoid-check whether we own the cpu.  For each "thread"
+ * of a cpu, if it is offline and has the same hw index as before,
+ * grab that in preference.
+ */
+static unsigned int find_physical_cpu_to_start(unsigned int old_hwindex)
+{
+	struct device_node *np = NULL;
+	unsigned int best = -1U;
+
+	while ((np = of_find_node_by_type(np, "cpu"))) {
+		int nr_threads, len;
+		u32 *index = (u32 *)get_property(np, "ibm,my-drc-index", NULL);
+		u32 *tid = (u32 *)
+			get_property(np, "ibm,ppc-interrupt-server#s", &len);
+
+		if (!tid)
+			tid = (u32 *)get_property(np, "reg", &len);
+
+		if (!tid)
+			continue;
+
+		/* If there is a drc-index, make sure that we own
+		 * the cpu.
+		 */
+		if (index) {
+			int state;
+			int rc = rtas_get_sensor(9003, *index, &state);
+			if (rc != 0 || state != 1)
+				continue;
+		}
+
+		nr_threads = len / sizeof(u32);
+
+		while (nr_threads--) {
+			if (0 == query_cpu_stopped(tid[nr_threads])) {
+				best = tid[nr_threads];
+				if (best == old_hwindex)
+					goto out;
+			}
+		}
+	}
+out:
+	of_node_put(np);
+	return best;
+}
+
+/**
+ * smp_startup_cpu() - start the given cpu
+ *
+ * At boot time, there is nothing to do.  At run-time, call RTAS with
+ * the appropriate start location, if the cpu is in the RTAS stopped
+ * state.
+ *
+ * Returns:
+ *	0	- failure
+ *	1	- success
+ */
+static inline int __devinit smp_startup_cpu(unsigned int lcpu)
+{
+	int status;
+	extern void (*pseries_secondary_smp_init)(unsigned int cpu);
+	unsigned long start_here = __pa(pseries_secondary_smp_init);
+	unsigned int pcpu;
+
+	/* At boot time the cpus are already spinning in hold
+	 * loops, so nothing to do. */
+ 	if (system_state == SYSTEM_BOOTING)
+		return 1;
+
+	pcpu = find_physical_cpu_to_start(get_hard_smp_processor_id(lcpu));
+	if (pcpu == -1U) {
+		printk(KERN_INFO "No more cpus available, failing\n");
+		return 0;
+	}
+
+	/* Fixup atomic count: it exited inside IRQ handler. */
+	((struct task_struct *)paca[lcpu].xCurrent)->thread_info->preempt_count
+		= 0;
+	/* Fixup SLB round-robin so next segment (kernel) goes in segment 0 */
+	paca[lcpu].xStab_data.next_round_robin = 0;
+
+	/* At boot this is done in prom.c. */
+	paca[lcpu].xHwProcNum = pcpu;
+
+	status = rtas_call(rtas_token("start-cpu"), 3, 1, NULL,
+			   pcpu, start_here, lcpu);
+	if (status != 0) {
+		printk(KERN_ERR "start-cpu failed: %i\n", status);
+		return 0;
+	}
+	return 1;
+}
+
+static inline void look_for_more_cpus(void)
+{
+	int num_addr_cell, num_size_cell, len, i, maxcpus;
+	struct device_node *np;
+	unsigned int *ireg;
+
+	/* Find the property which will tell us about how many CPUs
+	 * we're allowed to have. */
+	if ((np = find_path_device("/rtas")) == NULL) {
+		printk(KERN_ERR "Could not find /rtas in device tree!\n");
+		return;
+	}
+	num_addr_cell = prom_n_addr_cells(np);
+	num_size_cell = prom_n_size_cells(np);
+
+	ireg = (unsigned int *)get_property(np, "ibm,lrdr-capacity", &len);
+	if (ireg == NULL) {
+		/* FIXME: make sure not marked as lrdr_capable() */
+		return;
+	}
+
+	maxcpus = ireg[num_addr_cell + num_size_cell];
+	/* DRENG need to account for threads here too */
+
+	if (maxcpus > NR_CPUS) {
+		printk(KERN_WARNING
+		       "Partition configured for %d cpus, "
+		       "operating system maximum is %d.\n", maxcpus, NR_CPUS);
+		maxcpus = NR_CPUS;
+	} else
+		printk(KERN_INFO "Partition configured for %d cpus.\n",
+		       maxcpus);
+
+	/* Make those cpus (which might appear later) possible too. */
+	for (i = 0; i < maxcpus; i++)
+		cpu_set(i, cpu_possible_map);
+}
+#else /* ... CONFIG_HOTPLUG_CPU */
+static inline int __devinit smp_startup_cpu(unsigned int lcpu)
+{
+	return 1;
+}
+static inline void look_for_more_cpus(void)
+{
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
 static void smp_pSeries_kick_cpu(int nr)
 {
 	BUG_ON(nr < 0 || nr >= NR_CPUS);
 
+	if (!smp_startup_cpu(nr))
+		return;
+
 	/* The processor is currently spinning, waiting
 	 * for the xProcStart field to become non-zero
 	 * After we set xProcStart, the processor will
@@ -241,7 +468,7 @@ static void smp_pSeries_kick_cpu(int nr)
 	 */
 	paca[nr].xProcStart = 1;
 }
-#endif
+#endif /* CONFIG_PPC_PSERIES */
 
 static void __init smp_space_timers(unsigned int max_cpus)
 {
@@ -462,12 +689,9 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 		       int wait)
 { 
 	struct call_data_struct data;
-	int ret = -1, cpus = num_online_cpus()-1;
+	int ret = -1, cpus;
 	unsigned long timeout;
 
-	if (!cpus)
-		return 0;
-
 	data.func = func;
 	data.info = info;
 	atomic_set(&data.started, 0);
@@ -476,6 +700,14 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic,
 		atomic_set(&data.finished, 0);
 
 	spin_lock(&call_lock);
+	/* Must grab online cpu count with preempt disabled, otherwise
+	 * it can change. */
+	cpus = num_online_cpus() - 1;
+	if (!cpus) {
+		ret = 0;
+		goto out;
+	}
+
 	call_data = &data;
 	wmb();
 	/* Send a message to all other CPUs and wait for them to respond */
@@ -565,8 +797,31 @@ static void __devinit smp_store_cpu_info(int id)
 	per_cpu(pvr, id) = _get_PVR();
 }
 
+static void __init smp_create_idle(unsigned int cpu)
+{
+	struct pt_regs regs;
+	struct task_struct *p;
+
+	/* create a process for the processor */
+	/* only regs.msr is actually used, and 0 is OK for it */
+	memset(&regs, 0, sizeof(struct pt_regs));
+	p = copy_process(CLONE_VM | CLONE_IDLETASK,
+			 0, &regs, 0, NULL, NULL);
+	if (IS_ERR(p))
+		panic("failed fork for CPU %u: %li", cpu, PTR_ERR(p));
+
+	wake_up_forked_process(p);
+	init_idle(p, cpu);
+	unhash_process(p);
+
+	paca[cpu].xCurrent = (u64)p;
+	current_set[cpu] = p->thread_info;
+}
+
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
+	unsigned int cpu;
+
 	/* 
 	 * setup_cpu may need to be called on the boot cpu. We havent
 	 * spun any cpus up but lets be paranoid.
@@ -593,6 +848,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	 * number of msecs off until someone does a settimeofday()
 	 */
 	do_gtod.tb_orig_stamp = tb_last_stamp;
+
+	look_for_more_cpus();
 #endif
 
 	max_cpus = smp_ops->probe();
@@ -601,20 +858,31 @@ void __init smp_prepare_cpus(unsigned int max_cpus)
 	__save_cpu_setup();
 
 	smp_space_timers(max_cpus);
+
+	for_each_cpu(cpu)
+		if (cpu != boot_cpuid)
+			smp_create_idle(cpu);
 }
 
 void __devinit smp_prepare_boot_cpu(void)
 {
-	cpu_set(smp_processor_id(), cpu_online_map);
-	/* FIXME: what about cpu_possible()? */
+	BUG_ON(smp_processor_id() != boot_cpuid);
+
+	/* cpu_possible is set up in prom.c */
+	cpu_set(boot_cpuid, cpu_online_map);
+
+	paca[boot_cpuid].xCurrent = (u64)current;
+	current_set[boot_cpuid] = current->thread_info;
 }
 
 int __devinit __cpu_up(unsigned int cpu)
 {
-	struct pt_regs regs;
-	struct task_struct *p;
 	int c;
 
+	/* At boot, don't bother with non-present cpus -JSCHOPP */
+	if (system_state == SYSTEM_BOOTING && !cpu_present_at_boot(cpu))
+		return -ENOENT;
+
 	paca[cpu].prof_counter = 1;
 	paca[cpu].prof_multiplier = 1;
 	paca[cpu].default_decr = tb_ticks_per_jiffy / decr_overclock;
@@ -632,19 +900,9 @@ int __devinit __cpu_up(unsigned int cpu)
 		paca[cpu].xStab_data.real = virt_to_abs(tmp);
 	}
 
-	/* create a process for the processor */
-	/* only regs.msr is actually used, and 0 is OK for it */
-	memset(&regs, 0, sizeof(struct pt_regs));
-	p = copy_process(CLONE_VM|CLONE_IDLETASK, 0, &regs, 0, NULL, NULL);
-	if (IS_ERR(p))
-		panic("failed fork for CPU %u: %li", cpu, PTR_ERR(p));
-
-	wake_up_forked_process(p);
-	init_idle(p, cpu);
-	unhash_process(p);
-
-	paca[cpu].xCurrent = (u64)p;
-	current_set[cpu] = p->thread_info;
+	/* The information for processor bringup must be written out
+	 * to main store before we release the processor. */
+	mb();
 
 	/* The information for processor bringup must
 	 * be written out to main store before we release
diff --git a/arch/ppc64/kernel/xics.c b/arch/ppc64/kernel/xics.c
index 9696dc866540..c4d4574cc675 100644
--- a/arch/ppc64/kernel/xics.c
+++ b/arch/ppc64/kernel/xics.c
@@ -19,6 +19,7 @@
 #include <linux/init.h>
 #include <linux/gfp.h>
 #include <linux/radix-tree.h>
+#include <linux/cpu.h>
 #include <asm/prom.h>
 #include <asm/io.h>
 #include <asm/pgtable.h>
@@ -372,6 +373,9 @@ irqreturn_t xics_ipi_action(int irq, void *dev_id, struct pt_regs *regs)
 	int cpu = smp_processor_id();
 
 	ops->qirr_info(cpu, 0xff);
+
+	WARN_ON(cpu_is_offline(cpu));
+
 	while (xics_ipi_message[cpu].value) {
 		if (test_and_clear_bit(PPC_MSG_CALL_FUNCTION,
 				       &xics_ipi_message[cpu].value)) {
@@ -514,6 +518,9 @@ nextnode:
 	if (systemcfg->platform == PLATFORM_PSERIES) {
 #ifdef CONFIG_SMP
 		for_each_cpu(i) {
+			/* FIXME: Do this dynamically! --RR */
+			if (!cpu_present_at_boot(i))
+				continue;
 			xics_per_cpu[i] = __ioremap((ulong)inodes[get_hard_smp_processor_id(i)].addr, 
 						    (ulong)inodes[get_hard_smp_processor_id(i)].size,
 						    _PAGE_NO_CACHE);
@@ -575,9 +582,7 @@ void xics_request_IPIs(void)
 
 static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
 {
-        irq_desc_t *desc = irq_desc + virq;
 	unsigned int irq;
-	unsigned long flags;
 	long status;
 	unsigned long xics_status[2];
 	unsigned long newmask;
@@ -589,14 +594,12 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
 	if (irq == XICS_IPI)
 		return;
 
-        spin_lock_irqsave(&desc->lock, flags);
-
 	status = rtas_call(ibm_get_xive, 1, 3, (void *)&xics_status, irq);
 
 	if (status) {
 		printk(KERN_ERR "xics_set_affinity: irq=%d ibm,get-xive "
 		       "returns %ld\n", irq, status);
-		goto out;
+		return;
 	}
 
 	/* For the moment only implement delivery to all cpus or one cpu */
@@ -605,7 +608,7 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
 	} else {
 		cpus_and(tmp, cpu_online_map, cpumask);
 		if (cpus_empty(tmp))
-			goto out;
+			return;
 		newmask = get_hard_smp_processor_id(first_cpu(cpumask));
 	}
 
@@ -615,9 +618,86 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
 	if (status) {
 		printk(KERN_ERR "xics_set_affinity irq=%d ibm,set-xive "
 		       "returns %ld\n", irq, status);
-		goto out;
+		return;
+	}
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* Interrupts are disabled. */
+void xics_migrate_irqs_away(void)
+{
+	int set_indicator = rtas_token("set-indicator");
+	const unsigned long giqs = 9005UL; /* Global Interrupt Queue Server */
+	unsigned long status = 0;
+	unsigned int irq, cpu = smp_processor_id();
+	unsigned long xics_status[2];
+	unsigned long flags;
+
+	BUG_ON(set_indicator == RTAS_UNKNOWN_SERVICE);
+
+	/* Reject any interrupt that was queued to us... */
+	ops->cppr_info(cpu, 0);
+	iosync();
+
+	/* Refuse any new interrupts... */
+	rtas_call(set_indicator, 3, 1, &status, giqs,
+		  hard_smp_processor_id(), 0UL);
+	WARN_ON(status != 0);
+
+	/* Allow IPIs again... */
+	ops->cppr_info(cpu, DEFAULT_PRIORITY);
+	iosync();
+
+	printk(KERN_WARNING "HOTPLUG: Migrating IRQs away\n");
+	for_each_irq(irq) {
+		irq_desc_t *desc = get_irq_desc(irq);
+
+		/* We need to get IPIs still. */
+		if (irq_offset_down(irq) == XICS_IPI)
+			continue;
+
+		/* We only need to migrate enabled IRQS */
+		if (desc == NULL || desc->handler == NULL
+		    || desc->action == NULL
+		    || desc->handler->set_affinity == NULL)
+			continue;
+
+		spin_lock_irqsave(&desc->lock, flags);
+
+		status = rtas_call(ibm_get_xive, 1, 3, (void *)&xics_status,
+				   irq);
+		if (status) {
+			printk(KERN_ERR "migrate_irqs_away: irq=%d "
+					"ibm,get-xive returns %ld\n",
+					irq, status);
+			goto unlock;
+		}
+
+		/*
+		 * We only support delivery to all cpus or to one cpu.
+		 * The irq has to be migrated only in the single cpu
+		 * case.
+		 */
+		if (xics_status[0] != get_hard_smp_processor_id(cpu))
+			goto unlock;
+
+		printk(KERN_WARNING "IRQ %d affinity broken off cpu %u\n",
+		       irq, cpu);
+
+		/* Reset affinity to all cpus */
+		xics_status[0] = default_distrib_server;
+
+		status = rtas_call(ibm_set_xive, 3, 1, NULL,
+				irq, xics_status[0], xics_status[1]);
+		if (status)
+			printk(KERN_ERR "migrate_irqs_away irq=%d "
+					"ibm,set-xive returns %ld\n",
+					irq, status);
+
+unlock:
+		spin_unlock_irqrestore(&desc->lock, flags);
 	}
 
-out:
-        spin_unlock_irqrestore(&desc->lock, flags);
 }
+#endif
diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h
index 62838ce91e59..712747a2b3f9 100644
--- a/include/asm-ppc64/rtas.h
+++ b/include/asm-ppc64/rtas.h
@@ -219,6 +219,8 @@ extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal);
 extern spinlock_t rtas_data_buf_lock;
 extern char rtas_data_buf[RTAS_DATA_BUF_SIZE];
 
+extern void rtas_stop_self(void);
+
 /* RMO buffer reserved for user-space RTAS use */
 extern unsigned long rtas_rmo_buf;
 
diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h
index 22fc412bcfc9..8a96f975e496 100644
--- a/include/asm-ppc64/smp.h
+++ b/include/asm-ppc64/smp.h
@@ -70,6 +70,9 @@ extern cpumask_t cpu_available_map;
 void smp_init_iSeries(void);
 void smp_init_pSeries(void);
 
+extern int __cpu_disable(void);
+extern void __cpu_die(unsigned int cpu);
+extern void cpu_die(void) __attribute__((noreturn));
 #endif /* !(CONFIG_SMP) */
 
 #define get_hard_smp_processor_id(CPU) (paca[(CPU)].xHwProcNum)
-- 
cgit v1.2.3


From 69bc70b9552bcf74b35053ba91bc4c909522dcf0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:45:56 -0700
Subject: [PATCH] ppc64: Add RTAS os-term call for panic on pSeries

From: Michael Strosaker <strosake@us.ibm.com>

Add RTAS os-term call for panic on pSeries
---
 arch/ppc64/kernel/chrp_setup.c |  1 +
 arch/ppc64/kernel/rtas.c       | 21 +++++++++++++++++++++
 arch/ppc64/kernel/setup.c      | 19 +++++++++++++++++++
 include/asm-ppc64/machdep.h    |  1 +
 include/asm-ppc64/rtas.h       |  1 +
 5 files changed, 43 insertions(+)

(limited to 'include')

diff --git a/arch/ppc64/kernel/chrp_setup.c b/arch/ppc64/kernel/chrp_setup.c
index a2e281768381..4d7756de570f 100644
--- a/arch/ppc64/kernel/chrp_setup.c
+++ b/arch/ppc64/kernel/chrp_setup.c
@@ -267,6 +267,7 @@ chrp_init(unsigned long r3, unsigned long r4, unsigned long r5,
 	ppc_md.restart        = rtas_restart;
 	ppc_md.power_off      = rtas_power_off;
 	ppc_md.halt           = rtas_halt;
+	ppc_md.panic          = rtas_os_term;
 
 	ppc_md.get_boot_time  = pSeries_get_boot_time;
 	ppc_md.get_rtc_time   = pSeries_get_rtc_time;
diff --git a/arch/ppc64/kernel/rtas.c b/arch/ppc64/kernel/rtas.c
index afa36ac32add..4a27c3d8312c 100644
--- a/arch/ppc64/kernel/rtas.c
+++ b/arch/ppc64/kernel/rtas.c
@@ -448,6 +448,27 @@ rtas_halt(void)
         rtas_power_off();
 }
 
+/* Must be in the RMO region, so we place it here */
+static char rtas_os_term_buf[2048];
+
+void rtas_os_term(char *str)
+{
+	long status;
+
+	snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str);
+
+	do {
+		status = rtas_call(rtas_token("ibm,os-term"), 1, 1, NULL,
+				   __pa(rtas_os_term_buf));
+
+		if (status == RTAS_BUSY)
+			udelay(1);
+		else if (status != 0)
+			printk(KERN_EMERG "ibm,os-term call failed %ld\n",
+			       status);
+	} while (status == RTAS_BUSY);
+}
+
 unsigned long rtas_rmo_buf = 0;
 
 asmlinkage int ppc_rtas(struct rtas_args __user *uargs)
diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c
index 0c230d04e9d6..9c220a4c039c 100644
--- a/arch/ppc64/kernel/setup.c
+++ b/arch/ppc64/kernel/setup.c
@@ -26,6 +26,7 @@
 #include <linux/tty.h>
 #include <linux/root_dev.h>
 #include <linux/cpu.h>
+#include <linux/notifier.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/processor.h>
@@ -94,6 +95,13 @@ unsigned long SYSRQ_KEY;
 
 struct machdep_calls ppc_md;
 
+static int ppc64_panic_event(struct notifier_block *, unsigned long, void *);
+
+static struct notifier_block ppc64_panic_block = {
+	notifier_call: ppc64_panic_event,
+	priority: INT_MIN /* may not return; must be done last */
+};
+
 /*
  * Perhaps we can put the pmac screen_info[] here
  * on pmac as well so we don't need the ifdef's.
@@ -319,6 +327,14 @@ EXPORT_SYMBOL(machine_halt);
 unsigned long ppc_proc_freq;
 unsigned long ppc_tb_freq;
 
+static int ppc64_panic_event(struct notifier_block *this,
+                             unsigned long event, void *ptr)
+{
+	ppc_md.panic((char *)ptr);  /* May not return */
+	return NOTIFY_DONE;
+}
+
+
 #ifdef CONFIG_SMP
 DEFINE_PER_CPU(unsigned int, pvr);
 #endif
@@ -605,6 +621,9 @@ void __init setup_arch(char **cmdline_p)
 	/* reboot on panic */
 	panic_timeout = 180;
 
+	if (ppc_md.panic)
+		notifier_chain_register(&panic_notifier_list, &ppc64_panic_block);
+
 	init_mm.start_code = PAGE_OFFSET;
 	init_mm.end_code = (unsigned long) _etext;
 	init_mm.end_data = (unsigned long) _edata;
diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h
index a4d181d79c21..10e7e9ec6251 100644
--- a/include/asm-ppc64/machdep.h
+++ b/include/asm-ppc64/machdep.h
@@ -79,6 +79,7 @@ struct machdep_calls {
 	void		(*restart)(char *cmd);
 	void		(*power_off)(void);
 	void		(*halt)(void);
+	void		(*panic)(char *str);
 
 	int		(*set_rtc_time)(struct rtc_time *);
 	void		(*get_rtc_time)(struct rtc_time *);
diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h
index 712747a2b3f9..7f6139064c7c 100644
--- a/include/asm-ppc64/rtas.h
+++ b/include/asm-ppc64/rtas.h
@@ -175,6 +175,7 @@ extern void call_rtas_display_status(char);
 extern void rtas_restart(char *cmd);
 extern void rtas_power_off(void);
 extern void rtas_halt(void);
+extern void rtas_os_term(char *str);
 extern int rtas_get_sensor(int sensor, int index, int *state);
 extern int rtas_get_power_level(int powerdomain, int *level);
 extern int rtas_set_power_level(int powerdomain, int level, int *setlevel);
-- 
cgit v1.2.3


From b902751692396b28b9dc5b9c7266bfc32090f333 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:46:22 -0700
Subject: [PATCH] ppc64: irq cleanups

From: Paul Mackerras <paulus@samba.org>

Create and use irq_offset_up/down, get_irq_desc, for_each_irq
---
 arch/ppc64/kernel/chrp_setup.c    |  2 +-
 arch/ppc64/kernel/i8259.c         |  4 +--
 arch/ppc64/kernel/iSeries_irq.c   |  4 +--
 arch/ppc64/kernel/iSeries_setup.h |  3 --
 arch/ppc64/kernel/irq.c           | 59 ++++++++++++++++++++-------------------
 arch/ppc64/kernel/open_pic.c      | 23 +++++++++------
 arch/ppc64/kernel/open_pic.h      |  8 +-----
 arch/ppc64/kernel/prom.c          |  4 +--
 arch/ppc64/kernel/ras.c           |  5 ++--
 arch/ppc64/kernel/setup.c         |  3 +-
 arch/ppc64/kernel/vio.c           |  3 +-
 arch/ppc64/kernel/xics.c          | 36 +++++++++++-------------
 include/asm-ppc64/hw_irq.h        | 21 ++++++++++++--
 include/asm-ppc64/irq.h           | 41 ++++++++++++++++++++++-----
 include/asm-ppc64/smp.h           |  2 ++
 15 files changed, 130 insertions(+), 88 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/chrp_setup.c b/arch/ppc64/kernel/chrp_setup.c
index 4d7756de570f..5a1d60221c15 100644
--- a/arch/ppc64/kernel/chrp_setup.c
+++ b/arch/ppc64/kernel/chrp_setup.c
@@ -252,7 +252,7 @@ chrp_init(unsigned long r3, unsigned long r4, unsigned long r5,
 
 	ppc_md.setup_arch     = chrp_setup_arch;
 	ppc_md.get_cpuinfo    = chrp_get_cpuinfo;
-	if(naca->interrupt_controller == IC_OPEN_PIC) {
+	if (naca->interrupt_controller == IC_OPEN_PIC) {
 		ppc_md.init_IRQ       = pSeries_init_openpic; 
 		ppc_md.get_irq        = openpic_get_irq;
 	} else {
diff --git a/arch/ppc64/kernel/i8259.c b/arch/ppc64/kernel/i8259.c
index c1026da59fb7..2f2b9bf8cf1c 100644
--- a/arch/ppc64/kernel/i8259.c
+++ b/arch/ppc64/kernel/i8259.c
@@ -124,8 +124,8 @@ static void i8259_unmask_irq(unsigned int irq_nr)
 
 static void i8259_end_irq(unsigned int irq)
 {
-	if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) &&
-	    irq_desc[irq].action)
+	if (!(get_irq_desc(irq)->status & (IRQ_DISABLED|IRQ_INPROGRESS)) &&
+	    get_irq_desc(irq)->action)
 		i8259_unmask_irq(irq);
 }
 
diff --git a/arch/ppc64/kernel/iSeries_irq.c b/arch/ppc64/kernel/iSeries_irq.c
index abbe9a499377..fe6d63676c09 100644
--- a/arch/ppc64/kernel/iSeries_irq.c
+++ b/arch/ppc64/kernel/iSeries_irq.c
@@ -122,8 +122,8 @@ void __init iSeries_activate_IRQs()
 	int irq;
 	unsigned long flags;
 
-	for (irq = 0; irq < NR_IRQS; irq++) {
-		irq_desc_t *desc = &irq_desc[irq];
+	for_each_irq (irq) {
+		irq_desc_t *desc = get_irq_desc(irq);
 
 		if (desc && desc->handler && desc->handler->startup) {
 			spin_lock_irqsave(&desc->lock, flags);
diff --git a/arch/ppc64/kernel/iSeries_setup.h b/arch/ppc64/kernel/iSeries_setup.h
index 53776d403508..240dad4ef20c 100644
--- a/arch/ppc64/kernel/iSeries_setup.h
+++ b/arch/ppc64/kernel/iSeries_setup.h
@@ -19,8 +19,6 @@
 #ifndef	__ISERIES_SETUP_H__
 #define	__ISERIES_SETUP_H__
 
-#include <linux/irq.h>		/* for irq_desc_t */
-
 extern void iSeries_init_early(void);
 extern void iSeries_init(unsigned long r3, unsigned long ird_start,
 		unsigned long ird_end, unsigned long cline_start,
@@ -29,7 +27,6 @@ extern void iSeries_setup_arch(void);
 extern void iSeries_setup_residual(struct seq_file *m, int cpu_id);
 extern void iSeries_get_cpuinfo(struct seq_file *m);
 extern void iSeries_init_IRQ(void);
-extern void iSeries_init_irq_desc(irq_desc_t *);
 extern int iSeries_get_irq(struct pt_regs *regs);
 extern void iSeries_restart(char *cmd);
 extern void iSeries_power_off(void);
diff --git a/arch/ppc64/kernel/irq.c b/arch/ppc64/kernel/irq.c
index 70d7c0ed892e..5a4d3e47241e 100644
--- a/arch/ppc64/kernel/irq.c
+++ b/arch/ppc64/kernel/irq.c
@@ -67,6 +67,7 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 	}
 };
 
+int __irq_offset_value;
 int ppc_spurious_interrupts = 0;
 unsigned long lpEvent_count = 0;
 
@@ -76,7 +77,7 @@ setup_irq(unsigned int irq, struct irqaction * new)
 	int shared = 0;
 	unsigned long flags;
 	struct irqaction *old, **p;
-	irq_desc_t *desc = irq_desc + irq;
+	irq_desc_t *desc = get_irq_desc(irq);
 
 	/*
 	 * Some drivers like serial.c use request_irq() heavily,
@@ -134,7 +135,7 @@ setup_irq(unsigned int irq, struct irqaction * new)
 
 inline void synchronize_irq(unsigned int irq)
 {
-	while (irq_desc[irq].status & IRQ_INPROGRESS)
+	while (get_irq_desc(irq)->status & IRQ_INPROGRESS)
 		cpu_relax();
 }
 
@@ -148,11 +149,10 @@ EXPORT_SYMBOL(synchronize_irq);
 static int
 do_free_irq(int irq, void* dev_id)
 {
-	irq_desc_t *desc;
+	irq_desc_t *desc = get_irq_desc(irq);
 	struct irqaction **p;
 	unsigned long flags;
 
-	desc = irq_desc + irq;
 	spin_lock_irqsave(&desc->lock,flags);
 	p = &desc->action;
 	for (;;) {
@@ -247,7 +247,7 @@ EXPORT_SYMBOL(free_irq);
  
 inline void disable_irq_nosync(unsigned int irq)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	irq_desc_t *desc = get_irq_desc(irq);
 	unsigned long flags;
 
 	spin_lock_irqsave(&desc->lock, flags);
@@ -276,7 +276,7 @@ EXPORT_SYMBOL(disable_irq_nosync);
  
 void disable_irq(unsigned int irq)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	irq_desc_t *desc = get_irq_desc(irq);
 	disable_irq_nosync(irq);
 	if (desc->action)
 		synchronize_irq(irq);
@@ -296,7 +296,7 @@ EXPORT_SYMBOL(disable_irq);
  
 void enable_irq(unsigned int irq)
 {
-	irq_desc_t *desc = irq_desc + irq;
+	irq_desc_t *desc = get_irq_desc(irq);
 	unsigned long flags;
 
 	spin_lock_irqsave(&desc->lock, flags);
@@ -327,6 +327,7 @@ int show_interrupts(struct seq_file *p, void *v)
 {
 	int i = *(loff_t *) v, j;
 	struct irqaction * action;
+	irq_desc_t *desc;
 	unsigned long flags;
 
 	if (i == 0) {
@@ -339,8 +340,9 @@ int show_interrupts(struct seq_file *p, void *v)
 	}
 
 	if (i < NR_IRQS) {
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
-		action = irq_desc[i].action;
+		desc = get_irq_desc(i);
+		spin_lock_irqsave(&desc->lock, flags);
+		action = desc->action;
 		if (!action || !action->handler)
 			goto skip;
 		seq_printf(p, "%3d: ", i);
@@ -352,17 +354,17 @@ int show_interrupts(struct seq_file *p, void *v)
 #else
 		seq_printf(p, "%10u ", kstat_irqs(i));
 #endif /* CONFIG_SMP */
-		if (irq_desc[i].handler)		
-			seq_printf(p, " %s ", irq_desc[i].handler->typename );
+		if (desc->handler)
+			seq_printf(p, " %s ", desc->handler->typename );
 		else
 			seq_printf(p, "  None      ");
-		seq_printf(p, "%s", (irq_desc[i].status & IRQ_LEVEL) ? "Level " : "Edge  ");
+		seq_printf(p, "%s", (desc->status & IRQ_LEVEL) ? "Level " : "Edge  ");
 		seq_printf(p, "    %s",action->name);
 		for (action=action->next; action; action = action->next)
 			seq_printf(p, ", %s", action->name);
 		seq_putc(p, '\n');
 skip:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+		spin_unlock_irqrestore(&desc->lock, flags);
 	} else if (i == NR_IRQS)
 		seq_printf(p, "BAD: %10u\n", ppc_spurious_interrupts);
 	return 0;
@@ -482,7 +484,7 @@ void ppc_irq_dispatch_handler(struct pt_regs *regs, int irq)
 	int status;
 	struct irqaction *action;
 	int cpu = smp_processor_id();
-	irq_desc_t *desc = irq_desc + irq;
+	irq_desc_t *desc = get_irq_desc(irq);
 	irqreturn_t action_ret;
 
 	kstat_cpu(cpu).irqs[irq]++;
@@ -564,11 +566,11 @@ out:
 	 * The ->end() handler has to deal with interrupts which got
 	 * disabled while the handler was running.
 	 */
-	if (irq_desc[irq].handler) {
-		if (irq_desc[irq].handler->end)
-			irq_desc[irq].handler->end(irq);
-		else if (irq_desc[irq].handler->enable)
-			irq_desc[irq].handler->enable(irq);
+	if (desc->handler) {
+		if (desc->handler->end)
+			desc->handler->end(irq);
+		else if (desc->handler->enable)
+			desc->handler->enable(irq);
 	}
 	spin_unlock(&desc->lock);
 }
@@ -683,7 +685,7 @@ static struct proc_dir_entry * root_irq_dir;
 static struct proc_dir_entry * irq_dir [NR_IRQS];
 static struct proc_dir_entry * smp_affinity_entry [NR_IRQS];
 
-/* Protected by irq descriptor spinlock */
+/* Protected by get_irq_desc(irq)->lock. */
 #ifdef CONFIG_IRQ_ALL_CPUS
 cpumask_t irq_affinity [NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
 #else  /* CONFIG_IRQ_ALL_CPUS */
@@ -703,12 +705,13 @@ static int irq_affinity_read_proc (char *page, char **start, off_t off,
 static int irq_affinity_write_proc (struct file *file, const char *buffer,
 					unsigned long count, void *data)
 {
-	int irq = (long)data;
+	unsigned int irq = (long)data;
+	irq_desc_t *desc = get_irq_desc(irq);
 	int ret;
 	cpumask_t new_value, tmp;
 	cpumask_t allcpus = CPU_MASK_ALL;
 
-	if (!irq_desc[irq].handler->set_affinity)
+	if (!desc->handler->set_affinity)
 		return -EIO;
 
 	ret = cpumask_parse(buffer, count, new_value);
@@ -727,7 +730,7 @@ static int irq_affinity_write_proc (struct file *file, const char *buffer,
 	 * Grab lock here so cpu_online_map can't change, and also
 	 * protect irq_affinity[].
 	 */
-	spin_lock(&irq_desc[irq].lock);
+	spin_lock(&desc->lock);
 
 	/*
 	 * Do not allow disabling IRQs completely - it's a too easy
@@ -741,11 +744,11 @@ static int irq_affinity_write_proc (struct file *file, const char *buffer,
 	}
 
 	irq_affinity[irq] = new_value;
-	irq_desc[irq].handler->set_affinity(irq, new_value);
+	desc->handler->set_affinity(irq, new_value);
 	ret = count;
 
 out:
-	spin_unlock(&irq_desc[irq].lock);
+	spin_unlock(&desc->lock);
 	return ret;
 }
 
@@ -841,8 +844,8 @@ void init_irq_proc (void)
 	/*
 	 * Create entries for all existing IRQs.
 	 */
-	for (i = 0; i < NR_IRQS; i++) {
-		if (irq_desc[i].handler == NULL)
+	for_each_irq(i) {
+		if (get_irq_desc(i)->handler == NULL)
 			continue;
 		register_irq_proc(i);
 	}
@@ -870,7 +873,7 @@ unsigned int virt_irq_to_real_map[NR_IRQS];
  * we don't end up with an interrupt number >= NR_IRQS.
  */
 #define MIN_VIRT_IRQ	3
-#define MAX_VIRT_IRQ	(NR_IRQS - NUM_8259_INTERRUPTS - 1)
+#define MAX_VIRT_IRQ	(NR_IRQS - NUM_ISA_INTERRUPTS - 1)
 #define NR_VIRT_IRQS	(MAX_VIRT_IRQ - MIN_VIRT_IRQ + 1)
 
 void
diff --git a/arch/ppc64/kernel/open_pic.c b/arch/ppc64/kernel/open_pic.c
index e97d6ddc18d9..0eed791f3eb6 100644
--- a/arch/ppc64/kernel/open_pic.c
+++ b/arch/ppc64/kernel/open_pic.c
@@ -67,7 +67,6 @@ static void openpic_disable_irq(u_int irq);
 static void openpic_initirq(u_int irq, u_int pri, u_int vector, int polarity,
 			    int is_level);
 static void openpic_mapirq(u_int irq, u_int cpumask);
-static void openpic_set_sense(u_int irq, int sense);
 
 static void find_ISUs(void);
 
@@ -170,7 +169,7 @@ void __init pSeries_init_openpic(void)
         int i;
         unsigned int *addrp;
         unsigned char* chrp_int_ack_special = 0;
-        unsigned char init_senses[NR_IRQS - NUM_8259_INTERRUPTS];
+        unsigned char init_senses[NR_IRQS - NUM_ISA_INTERRUPTS];
         int nmi_irq = -1;
 #if defined(CONFIG_VT) && defined(CONFIG_ADB_KEYBOARD) && defined(XMON)
         struct device_node *kbd;
@@ -185,12 +184,12 @@ void __init pSeries_init_openpic(void)
 			__ioremap(addrp[prom_n_addr_cells(np)-1], 1, _PAGE_NO_CACHE);
         /* hydra still sets OpenPIC_InitSenses to a static set of values */
         if (OpenPIC_InitSenses == NULL) {
-                prom_get_irq_senses(init_senses, NUM_8259_INTERRUPTS, NR_IRQS);
+                prom_get_irq_senses(init_senses, NUM_ISA_INTERRUPTS, NR_IRQS);
                 OpenPIC_InitSenses = init_senses;
-                OpenPIC_NumInitSenses = NR_IRQS - NUM_8259_INTERRUPTS;
+                OpenPIC_NumInitSenses = NR_IRQS - NUM_ISA_INTERRUPTS;
         }
-        openpic_init(1, NUM_8259_INTERRUPTS, chrp_int_ack_special, nmi_irq);
-        for ( i = 0 ; i < NUM_8259_INTERRUPTS  ; i++ )
+        openpic_init(1, NUM_ISA_INTERRUPTS, chrp_int_ack_special, nmi_irq);
+        for (i = 0; i < NUM_ISA_INTERRUPTS; i++)
                 irq_desc[i].handler = &i8259_pic;
 	of_node_put(np);
 }
@@ -441,7 +440,7 @@ static int __init openpic_setup_i8259(void)
 
 	if (naca->interrupt_controller == IC_OPEN_PIC) {
 		/* Initialize the cascade */
-		if (request_irq(NUM_8259_INTERRUPTS, no_action, SA_INTERRUPT,
+		if (request_irq(NUM_ISA_INTERRUPTS, no_action, SA_INTERRUPT,
 				"82c59 cascade", NULL))
 			printk(KERN_ERR "Unable to get OpenPIC IRQ 0 for cascade\n");
 		i8259_init();
@@ -820,13 +819,21 @@ static void openpic_mapirq(u_int irq, u_int physmask)
  *
  *  sense: 1 for level, 0 for edge
  */
-static inline void openpic_set_sense(u_int irq, int sense)
+#if 0	/* not used */
+static void openpic_set_sense(u_int irq, int sense)
 {
 	openpic_safe_writefield(&GET_ISU(irq).Vector_Priority,
 				OPENPIC_SENSE_LEVEL,
 				(sense ? OPENPIC_SENSE_LEVEL : 0));
 }
 
+static int openpic_get_sense(u_int irq)
+{
+	return openpic_readfield(&GET_ISU(irq).Vector_Priority,
+				 OPENPIC_SENSE_LEVEL) != 0;
+}
+#endif
+
 static void openpic_end_irq(unsigned int irq_nr)
 {
 	openpic_eoi();
diff --git a/arch/ppc64/kernel/open_pic.h b/arch/ppc64/kernel/open_pic.h
index cf6a31f55c71..21f0a7afb84a 100644
--- a/arch/ppc64/kernel/open_pic.h
+++ b/arch/ppc64/kernel/open_pic.h
@@ -14,6 +14,7 @@
 
 #include <linux/config.h>
 #include <linux/cpumask.h>
+#include <linux/irq.h>
 
 #define OPENPIC_SIZE	0x40000
 
@@ -38,11 +39,4 @@ extern void openpic_init_processor(u_int cpumask);
 extern void openpic_setup_ISU(int isu_num, unsigned long addr);
 extern void openpic_cause_IPI(u_int ipi, u_int cpumask);
 
-extern inline int openpic_to_irq(int irq)
-{
-	if (systemcfg->platform == PLATFORM_POWERMAC)
-		return irq;
-	return irq += NUM_8259_INTERRUPTS;
-}
-/*extern int open_pic_irq_offset;*/
 #endif /* _PPC64_KERNEL_OPEN_PIC_H */
diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c
index e092a13594ce..f1cfd43dd39c 100644
--- a/arch/ppc64/kernel/prom.c
+++ b/arch/ppc64/kernel/prom.c
@@ -2191,7 +2191,7 @@ finish_node_interrupts(struct device_node *np, unsigned long mem_start)
 			printk(KERN_CRIT "Could not allocate interrupt "
 			       "number for %s\n", np->full_name);
 		} else
-			np->intrs[i].line = openpic_to_irq(virq);
+			np->intrs[i].line = irq_offset_up(virq);
 
 		/* We offset irq numbers for the u3 MPIC by 128 in PowerMac */
 		if (systemcfg->platform == PLATFORM_POWERMAC && ic && ic->parent) {
@@ -3019,7 +3019,7 @@ static int of_finish_dynamic_node_interrupts(struct device_node *node)
 			       "number for %s\n", node->full_name);
 			return -ENOMEM;
 		}
-		node->intrs[i].line = openpic_to_irq(virq);
+		node->intrs[i].line = irq_offset_up(virq);
 		if (n > 1)
 			node->intrs[i].sense = irq[1];
 		if (n > 2) {
diff --git a/arch/ppc64/kernel/ras.c b/arch/ppc64/kernel/ras.c
index 405bf1c51db0..c21e797a3c6d 100644
--- a/arch/ppc64/kernel/ras.c
+++ b/arch/ppc64/kernel/ras.c
@@ -1,4 +1,3 @@
-
 /*
  * ras.c
  * Copyright (C) 2001 Dave Engebretsen IBM Corporation
@@ -80,7 +79,7 @@ static int __init init_ras_IRQ(void)
 				       "number for %s\n", np->full_name);
 				break;
 			}
-			request_irq(virq + NUM_8259_INTERRUPTS, 
+			request_irq(irq_offset_up(virq),
 				    ras_error_interrupt, 0, 
 				    "RAS_ERROR", NULL);
 			ireg++;
@@ -98,7 +97,7 @@ static int __init init_ras_IRQ(void)
 				       " number for %s\n", np->full_name);
 				break;
 			}
-			request_irq(virq + NUM_8259_INTERRUPTS, 
+			request_irq(irq_offset_up(virq),
 				    ras_epow_interrupt, 0, 
 				    "RAS_EPOW", NULL);
 			ireg++;
diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c
index 9c220a4c039c..24b62c83479c 100644
--- a/arch/ppc64/kernel/setup.c
+++ b/arch/ppc64/kernel/setup.c
@@ -25,8 +25,8 @@
 #include <linux/version.h>
 #include <linux/tty.h>
 #include <linux/root_dev.h>
-#include <linux/cpu.h>
 #include <linux/notifier.h>
+#include <linux/cpu.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/processor.h>
@@ -224,6 +224,7 @@ void setup_system(unsigned long r3, unsigned long r4, unsigned long r5,
 	if (systemcfg->platform & PLATFORM_PSERIES) {
 		early_console_initialized = 1;
 		register_console(&udbg_console);
+		__irq_offset_value = NUM_ISA_INTERRUPTS;
 		finish_device_tree();
 		chrp_init(r3, r4, r5, r6, r7);
 
diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c
index f5eb219233ef..fba53a437e60 100644
--- a/arch/ppc64/kernel/vio.c
+++ b/arch/ppc64/kernel/vio.c
@@ -26,7 +26,6 @@
 #include <asm/ppcdebug.h>
 #include <asm/vio.h>
 #include <asm/hvcall.h>
-#include "open_pic.h"
 
 #define DBGENTER() pr_debug("%s entered\n", __FUNCTION__)
 
@@ -256,7 +255,7 @@ struct vio_dev * __devinit vio_register_device(struct device_node *of_node)
 			printk(KERN_ERR "Unable to allocate interrupt "
 			       "number for %s\n", of_node->full_name);
 		} else
-			viodev->irq = openpic_to_irq(virq);
+			viodev->irq = irq_offset_up(virq);
 	}
 
 	/* init generic 'struct device' fields: */
diff --git a/arch/ppc64/kernel/xics.c b/arch/ppc64/kernel/xics.c
index e1d557dac270..50dadf89f5f7 100644
--- a/arch/ppc64/kernel/xics.c
+++ b/arch/ppc64/kernel/xics.c
@@ -59,7 +59,6 @@ struct hw_interrupt_type xics_8259_pic = {
 static struct radix_tree_root irq_map = RADIX_TREE_INIT(GFP_KERNEL);
 
 #define XICS_IPI		2
-#define XICS_IRQ_OFFSET		0x10
 #define XICS_IRQ_SPURIOUS	0
 
 /* Want a priority other than 0.  Various HW issues require this. */
@@ -217,7 +216,7 @@ xics_ops pSeriesLP_ops = {
 
 static unsigned int xics_startup(unsigned int virq)
 {
-	virq -= XICS_IRQ_OFFSET;
+	virq = irq_offset_down(virq);
 	if (radix_tree_insert(&irq_map, virt_irq_to_real(virq),
 			      &virt_irq_to_real_map[virq]) == -ENOMEM)
 		printk(KERN_CRIT "Out of memory creating real -> virtual"
@@ -242,8 +241,7 @@ static void xics_enable_irq(unsigned int virq)
 	long call_status;
 	unsigned int server;
 
-	virq -= XICS_IRQ_OFFSET;
-	irq = virt_irq_to_real(virq);
+	irq = virt_irq_to_real(irq_offset_down(virq));
 	if (irq == XICS_IPI)
 		return;
 
@@ -301,25 +299,25 @@ static void xics_disable_irq(unsigned int virq)
 {
 	unsigned int irq;
 
-	virq -= XICS_IRQ_OFFSET;
-	irq = virt_irq_to_real(virq);
+	irq = virt_irq_to_real(irq_offset_down(virq));
 	xics_disable_real_irq(irq);
 }
 
-static void xics_end_irq(unsigned int	irq)
+static void xics_end_irq(unsigned int irq)
 {
 	int cpu = smp_processor_id();
 
 	iosync();
-	ops->xirr_info_set(cpu, ((0xff<<24) |
-				 (virt_irq_to_real(irq-XICS_IRQ_OFFSET))));
+	ops->xirr_info_set(cpu, ((0xff << 24) |
+				 (virt_irq_to_real(irq_offset_down(irq)))));
+
 }
 
 static void xics_mask_and_ack_irq(unsigned int irq)
 {
 	int cpu = smp_processor_id();
 
-	if (irq < XICS_IRQ_OFFSET) {
+	if (irq < irq_offset_value()) {
 		i8259_pic.ack(irq);
 		iosync();
 		ops->xirr_info_set(cpu, ((0xff<<24) |
@@ -345,7 +343,8 @@ int xics_get_irq(struct pt_regs *regs)
 		irq = i8259_irq(cpu);
 		if (irq == -1) {
 			/* Spurious cascaded interrupt.  Still must ack xics */
-                        xics_end_irq(XICS_IRQ_OFFSET + xics_irq_8259_cascade);
+			xics_end_irq(irq_offset_up(xics_irq_8259_cascade));
+
 			irq = -1;
 		}
 	} else if (vec == XICS_IRQ_SPURIOUS) {
@@ -359,7 +358,7 @@ int xics_get_irq(struct pt_regs *regs)
 			       " disabling it.\n", vec);
 			xics_disable_real_irq(vec);
 		} else
-			irq += XICS_IRQ_OFFSET;
+			irq = irq_offset_up(irq);
 	}
 	return irq;
 }
@@ -541,9 +540,9 @@ nextnode:
 	xics_8259_pic.enable = i8259_pic.enable;
 	xics_8259_pic.disable = i8259_pic.disable;
 	for (i = 0; i < 16; ++i)
-		irq_desc[i].handler = &xics_8259_pic;
+		get_irq_desc(i)->handler = &xics_8259_pic;
 	for (; i < NR_IRQS; ++i)
-		irq_desc[i].handler = &xics_pic;
+		get_irq_desc(i)->handler = &xics_pic;
 
 	ops->cppr_info(boot_cpuid, 0xff);
 	iosync();
@@ -559,7 +558,7 @@ static int __init xics_setup_i8259(void)
 {
 	if (naca->interrupt_controller == IC_PPC_XIC &&
 	    xics_irq_8259_cascade != -1) {
-		if (request_irq(xics_irq_8259_cascade + XICS_IRQ_OFFSET,
+		if (request_irq(irq_offset_up(xics_irq_8259_cascade),
 				no_action, 0, "8259 cascade", 0))
 			printk(KERN_ERR "xics_init_IRQ: couldn't get 8259 cascade\n");
 		i8259_init();
@@ -574,9 +573,9 @@ void xics_request_IPIs(void)
 	virt_irq_to_real_map[XICS_IPI] = XICS_IPI;
 
 	/* IPIs are marked SA_INTERRUPT as they must run with irqs disabled */
-	request_irq(XICS_IPI + XICS_IRQ_OFFSET, xics_ipi_action, SA_INTERRUPT,
+	request_irq(irq_offset_up(XICS_IPI), xics_ipi_action, SA_INTERRUPT,
 		    "IPI", 0);
-	irq_desc[XICS_IPI+XICS_IRQ_OFFSET].status |= IRQ_PER_CPU;
+	get_irq_desc(irq_offset_up(XICS_IPI))->status |= IRQ_PER_CPU;
 }
 #endif
 
@@ -589,8 +588,7 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask)
 	cpumask_t allcpus = CPU_MASK_ALL;
 	cpumask_t tmp = CPU_MASK_NONE;
 
-	virq -= XICS_IRQ_OFFSET;
-	irq = virt_irq_to_real(virq);
+	irq = virt_irq_to_real(irq_offset_down(virq));
 	if (irq == XICS_IPI)
 		return;
 
diff --git a/include/asm-ppc64/hw_irq.h b/include/asm-ppc64/hw_irq.h
index 8db7a1a70756..baea40e695ec 100644
--- a/include/asm-ppc64/hw_irq.h
+++ b/include/asm-ppc64/hw_irq.h
@@ -75,9 +75,24 @@ static inline void __do_save_and_cli(unsigned long *flags)
 
 #endif /* CONFIG_PPC_ISERIES */
 
-#define mask_irq(irq) ({if (irq_desc[irq].handler && irq_desc[irq].handler->disable) irq_desc[irq].handler->disable(irq);})
-#define unmask_irq(irq) ({if (irq_desc[irq].handler && irq_desc[irq].handler->enable) irq_desc[irq].handler->enable(irq);})
-#define ack_irq(irq) ({if (irq_desc[irq].handler && irq_desc[irq].handler->ack) irq_desc[irq].handler->ack(irq);})
+#define mask_irq(irq)						\
+	({							\
+	 	irq_desc_t *desc = get_irq_desc(irq);		\
+		if (desc->handler && desc->handler->disable)	\
+			desc->handler->disable(irq);		\
+	})
+#define unmask_irq(irq)						\
+	({							\
+	 	irq_desc_t *desc = get_irq_desc(irq);		\
+		if (desc->handler && desc->handler->enable)	\
+			desc->handler->enable(irq);		\
+	})
+#define ack_irq(irq)						\
+	({							\
+	 	irq_desc_t *desc = get_irq_desc(irq);		\
+		if (desc->handler && desc->handler->ack)	\
+			desc->handler->ack(irq);		\
+	})
 
 /* Should we handle this via lost interrupts and IPIs or should we don't care like
  * we do now ? --BenH.
diff --git a/include/asm-ppc64/irq.h b/include/asm-ppc64/irq.h
index f4ed6fe326dd..949e19f96be1 100644
--- a/include/asm-ppc64/irq.h
+++ b/include/asm-ppc64/irq.h
@@ -11,6 +11,11 @@
 
 #include <asm/atomic.h>
 
+/*
+ * Maximum number of interrupt sources that we can handle.
+ */
+#define NR_IRQS		512
+
 extern void disable_irq(unsigned int);
 extern void disable_irq_nosync(unsigned int);
 extern void enable_irq(unsigned int);
@@ -18,12 +23,11 @@ extern void enable_irq(unsigned int);
 /* this number is used when no interrupt has been assigned */
 #define NO_IRQ			(-1)
 
-/*
- * this is the maximum number of virtual irqs we will use.
- */
-#define NR_IRQS			512
+#define get_irq_desc(irq) (&irq_desc[(irq)])
 
-#define NUM_8259_INTERRUPTS	16
+/* Define a way to iterate across irqs. */
+#define for_each_irq(i) \
+	for ((i) = 0; (i) < NR_IRQS; ++(i))
 
 /* Interrupt numbers are virtual in case they are sparsely
  * distributed by the hardware.
@@ -41,12 +45,35 @@ static inline unsigned int virt_irq_to_real(unsigned int virt_irq)
 	return virt_irq_to_real_map[virt_irq];
 }
 
+/*
+ * Because many systems have two overlapping name spaces for
+ * interrupts (ISA and XICS for example), and the ISA interrupts
+ * have historically not been easy to renumber, we allow ISA
+ * interrupts to take values 0 - 15, and shift up the remaining
+ * interrupts by 0x10.
+ */
+#define NUM_ISA_INTERRUPTS	0x10
+extern int __irq_offset_value;
+
+static inline int irq_offset_up(int irq)
+{
+	return(irq + __irq_offset_value);
+}
+
+static inline int irq_offset_down(int irq)
+{
+	return(irq - __irq_offset_value);
+}
+
+static inline int irq_offset_value(void)
+{
+	return __irq_offset_value;
+}
+
 static __inline__ int irq_canonicalize(int irq)
 {
 	return irq;
 }
 
-#define NR_MASK_WORDS	((NR_IRQS + 63) / 64)
-
 #endif /* _ASM_IRQ_H */
 #endif /* __KERNEL__ */
diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h
index 8a96f975e496..3d7e3d7c7663 100644
--- a/include/asm-ppc64/smp.h
+++ b/include/asm-ppc64/smp.h
@@ -67,6 +67,8 @@ extern cpumask_t cpu_available_map;
 #endif
 #define PPC_MSG_DEBUGGER_BREAK  3
 
+extern cpumask_t irq_affinity[];
+
 void smp_init_iSeries(void);
 void smp_init_pSeries(void);
 
-- 
cgit v1.2.3


From 0e75cd7813f82e49975d3ba8a1bf6113aa497547 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:46:59 -0700
Subject: [PATCH] ppc64: Update CPU features

From: Anton Blanchard <anton@samba.org>

Update CPU features. Remove DABR feature, all cpus have it. Add MMCRA,
PMC8, SMT, COHERENT_ICACHE, LOCKLESS_TLBIE features
---
 arch/ppc64/kernel/cputable.c | 30 +++++++++++++++++-------------
 arch/ppc64/xmon/xmon.c       |  9 ++-------
 include/asm-ppc64/cputable.h |  8 ++++++--
 3 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/cputable.c b/arch/ppc64/kernel/cputable.c
index 672cd6a30169..df13c89ff25c 100644
--- a/arch/ppc64/kernel/cputable.c
+++ b/arch/ppc64/kernel/cputable.c
@@ -48,7 +48,7 @@ struct cpu_spec	cpu_specs[] = {
     {	/* Power3 */
 	    0xffff0000, 0x00400000, "POWER3 (630)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-		    CPU_FTR_DABR | CPU_FTR_IABR,
+		    CPU_FTR_IABR | CPU_FTR_PMC8,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power3,
@@ -57,7 +57,7 @@ struct cpu_spec	cpu_specs[] = {
     {	/* Power3+ */
 	    0xffff0000, 0x00410000, "POWER3 (630+)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-		    CPU_FTR_DABR | CPU_FTR_IABR,
+		    CPU_FTR_IABR | CPU_FTR_PMC8,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power3,
@@ -66,7 +66,7 @@ struct cpu_spec	cpu_specs[] = {
     {	/* Northstar */
 	    0xffff0000, 0x00330000, "RS64-II (northstar)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-		    CPU_FTR_DABR | CPU_FTR_IABR,
+		    CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power3,
@@ -75,7 +75,7 @@ struct cpu_spec	cpu_specs[] = {
     {	/* Pulsar */
 	    0xffff0000, 0x00340000, "RS64-III (pulsar)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-		    CPU_FTR_DABR | CPU_FTR_IABR,
+		    CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power3,
@@ -84,7 +84,7 @@ struct cpu_spec	cpu_specs[] = {
     {	/* I-star */
 	    0xffff0000, 0x00360000, "RS64-III (icestar)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-		    CPU_FTR_DABR | CPU_FTR_IABR,
+		    CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power3,
@@ -93,7 +93,7 @@ struct cpu_spec	cpu_specs[] = {
     {	/* S-star */
 	    0xffff0000, 0x00370000, "RS64-IV (sstar)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-		    CPU_FTR_DABR | CPU_FTR_IABR,
+		    CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power3,
@@ -102,7 +102,7 @@ struct cpu_spec	cpu_specs[] = {
     {	/* Power4 */
 	    0xffff0000, 0x00350000, "POWER4 (gp)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_DABR,
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power4,
@@ -111,7 +111,7 @@ struct cpu_spec	cpu_specs[] = {
     {	/* Power4+ */
 	    0xffff0000, 0x00380000, "POWER4+ (gq)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_DABR,
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power4,
@@ -120,7 +120,8 @@ struct cpu_spec	cpu_specs[] = {
     {	/* PPC970 */
 	    0xffff0000, 0x00390000, "PPC970",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP,
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP |
+		    CPU_FTR_CAN_NAP | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
 	    COMMON_USER_PPC64 | PPC_FEATURE_HAS_ALTIVEC_COMP,
 	    128, 128,
 	    __setup_cpu_ppc970,
@@ -129,7 +130,8 @@ struct cpu_spec	cpu_specs[] = {
     {	/* PPC970FX */
 	    0xffff0000, 0x003c0000, "PPC970FX",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP,
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP |
+		    CPU_FTR_CAN_NAP | CPU_FTR_PMC8 | CPU_FTR_MMCRA,
 	    COMMON_USER_PPC64 | PPC_FEATURE_HAS_ALTIVEC_COMP,
 	    128, 128,
 	    __setup_cpu_ppc970,
@@ -138,7 +140,8 @@ struct cpu_spec	cpu_specs[] = {
     {	/* Power5 */
 	    0xffff0000, 0x003a0000, "POWER5 (gr)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-	    CPU_FTR_PPCAS_ARCH_V2,
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_MMCRA | CPU_FTR_SMT |
+		    CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power4,
@@ -147,7 +150,8 @@ struct cpu_spec	cpu_specs[] = {
     {	/* Power5 */
 	    0xffff0000, 0x003b0000, "POWER5 (gs)",
 	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-	    CPU_FTR_PPCAS_ARCH_V2,
+		    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_MMCRA | CPU_FTR_SMT |
+		    CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power4,
@@ -156,7 +160,7 @@ struct cpu_spec	cpu_specs[] = {
     {	/* default match */
 	    0x00000000, 0x00000000, "POWER4 (compatible)",
   	    CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE |
-	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_DABR,
+		    CPU_FTR_PPCAS_ARCH_V2,
 	    COMMON_USER_PPC64,
 	    128, 128,
 	    __setup_cpu_power4,
diff --git a/arch/ppc64/xmon/xmon.c b/arch/ppc64/xmon/xmon.c
index 7695c3ff4962..8bf490b348f1 100644
--- a/arch/ppc64/xmon/xmon.c
+++ b/arch/ppc64/xmon/xmon.c
@@ -452,7 +452,7 @@ insert_bpts()
 		}
 	}
 
-	if ((cur_cpu_spec->cpu_features & CPU_FTR_DABR) && dabr.enabled)
+	if (dabr.enabled)
 		set_dabr(dabr.address);
 	if ((cur_cpu_spec->cpu_features & CPU_FTR_IABR) && iabr.enabled)
 		set_iabr(iabr.address);
@@ -465,8 +465,7 @@ remove_bpts()
 	struct bpt *bp;
 	unsigned instr;
 
-	if ((cur_cpu_spec->cpu_features & CPU_FTR_DABR))
-		set_dabr(0);
+	set_dabr(0);
 	if ((cur_cpu_spec->cpu_features & CPU_FTR_IABR))
 		set_iabr(0);
 
@@ -751,10 +750,6 @@ bpt_cmds(void)
 	cmd = inchar();
 	switch (cmd) {
 	case 'd':	/* bd - hardware data breakpoint */
-		if (!(cur_cpu_spec->cpu_features & CPU_FTR_DABR)) {
-			printf("Not implemented on this cpu\n");
-			break;
-		}
 		mode = 7;
 		cmd = inchar();
 		if (cmd == 'r')
diff --git a/include/asm-ppc64/cputable.h b/include/asm-ppc64/cputable.h
index 99c3abfba704..abca635f9f9b 100644
--- a/include/asm-ppc64/cputable.h
+++ b/include/asm-ppc64/cputable.h
@@ -125,8 +125,12 @@ extern firmware_feature_t firmware_features_table[];
 #define CPU_FTR_TLBIEL         		0x0000000400000000
 #define CPU_FTR_NOEXECUTE     		0x0000000800000000
 #define CPU_FTR_NODSISRALIGN  		0x0000001000000000
-#define CPU_FTR_DABR  			0x0000002000000000
-#define CPU_FTR_IABR  			0x0000004000000000
+#define CPU_FTR_IABR  			0x0000002000000000
+#define CPU_FTR_MMCRA  			0x0000004000000000
+#define CPU_FTR_PMC8  			0x0000008000000000
+#define CPU_FTR_SMT  			0x0000010000000000
+#define CPU_FTR_COHERENT_ICACHE  	0x0000020000000000
+#define CPU_FTR_LOCKLESS_TLBIE		0x0000040000000000
 
 /* Platform firmware features */
 #define FW_FTR_                         0x0000000000000001
-- 
cgit v1.2.3


From c1a86d3b4fb53cd954a18edb6157b58584209c8b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:47:25 -0700
Subject: [PATCH] ppc64: Add some POWER5 specific optimisations

From: Anton Blanchard <anton@samba.org>

Add some POWER5 specific optimisations:
- icache is coherent, no need to explicitly flush
- tlbie lock no longer required
---
 arch/ppc64/kernel/misc.S         |  2 +-
 arch/ppc64/kernel/pSeries_htab.c | 32 ++++++++++++++++++++++----------
 arch/ppc64/kernel/pSeries_lpar.c | 11 +++++++----
 arch/ppc64/mm/hash_low.S         |  2 ++
 arch/ppc64/mm/init.c             | 10 +++++++++-
 include/asm-ppc64/cacheflush.h   | 10 ++++++++--
 6 files changed, 49 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S
index a9a0a9953f7d..dced77d38ece 100644
--- a/arch/ppc64/kernel/misc.S
+++ b/arch/ppc64/kernel/misc.S
@@ -132,7 +132,7 @@ _GLOBAL(flush_instruction_cache)
  *   flush all bytes from start through stop-1 inclusive
  */
 
-_GLOBAL(flush_icache_range)
+_GLOBAL(__flush_icache_range)
 
 /*
  * Flush the data cache to memory 
diff --git a/arch/ppc64/kernel/pSeries_htab.c b/arch/ppc64/kernel/pSeries_htab.c
index dbe18cf5715d..8f556f3c9df7 100644
--- a/arch/ppc64/kernel/pSeries_htab.c
+++ b/arch/ppc64/kernel/pSeries_htab.c
@@ -221,9 +221,11 @@ static long pSeries_hpte_updatepp(unsigned long slot, unsigned long newpp,
 	if ((cur_cpu_spec->cpu_features & CPU_FTR_TLBIEL) && !large && local) {
 		tlbiel(va);
 	} else {
-		spin_lock_irqsave(&pSeries_tlbie_lock, flags);
+		if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+			spin_lock_irqsave(&pSeries_tlbie_lock, flags);
 		tlbie(va, large);
-		spin_unlock_irqrestore(&pSeries_tlbie_lock, flags);
+		if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+			spin_unlock_irqrestore(&pSeries_tlbie_lock, flags);
 	}
 
 	return ret;
@@ -255,9 +257,11 @@ static void pSeries_hpte_updateboltedpp(unsigned long newpp, unsigned long ea)
 	set_pp_bit(newpp, hptep);
 
 	/* Ensure it is out of the tlb too */
-	spin_lock_irqsave(&pSeries_tlbie_lock, flags);
+	if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+		spin_lock_irqsave(&pSeries_tlbie_lock, flags);
 	tlbie(va, 0);
-	spin_unlock_irqrestore(&pSeries_tlbie_lock, flags);
+	if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+		spin_unlock_irqrestore(&pSeries_tlbie_lock, flags);
 }
 
 static void pSeries_hpte_invalidate(unsigned long slot, unsigned long va,
@@ -287,9 +291,11 @@ static void pSeries_hpte_invalidate(unsigned long slot, unsigned long va,
 	if ((cur_cpu_spec->cpu_features & CPU_FTR_TLBIEL) && !large && local) {
 		tlbiel(va);
 	} else {
-		spin_lock_irqsave(&pSeries_tlbie_lock, flags);
+		if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+			spin_lock_irqsave(&pSeries_tlbie_lock, flags);
 		tlbie(va, large);
-		spin_unlock_irqrestore(&pSeries_tlbie_lock, flags);
+		if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+			spin_unlock_irqrestore(&pSeries_tlbie_lock, flags);
 	}
 }
 
@@ -356,7 +362,8 @@ static void pSeries_flush_hash_range(unsigned long context,
 		asm volatile("ptesync":::"memory");
 	} else {
 		/* XXX double check that it is safe to take this late */
-		spin_lock_irqsave(&pSeries_tlbie_lock, flags);
+		if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+			spin_lock_irqsave(&pSeries_tlbie_lock, flags);
 
 		asm volatile("ptesync":::"memory");
 
@@ -365,7 +372,8 @@ static void pSeries_flush_hash_range(unsigned long context,
 
 		asm volatile("eieio; tlbsync; ptesync":::"memory");
 
-		spin_unlock_irqrestore(&pSeries_tlbie_lock, flags);
+		if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+			spin_unlock_irqrestore(&pSeries_tlbie_lock, flags);
 	}
 }
 
@@ -384,8 +392,12 @@ void hpte_init_pSeries(void)
 	root = of_find_node_by_path("/");
 	if (root) {
 		model = get_property(root, "model", NULL);
-		if (strcmp(model, "CHRP IBM,9076-N81"))
-			ppc_md.flush_hash_range = pSeries_flush_hash_range;
+		if (!strcmp(model, "CHRP IBM,9076-N81")) {
+			of_node_put(root);
+			return;
+		}
 		of_node_put(root);
 	}
+
+	ppc_md.flush_hash_range = pSeries_flush_hash_range;
 }
diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c
index d1a28982f378..12b0fb86acad 100644
--- a/arch/ppc64/kernel/pSeries_lpar.c
+++ b/arch/ppc64/kernel/pSeries_lpar.c
@@ -21,6 +21,7 @@
 
 #include <linux/config.h>
 #include <linux/kernel.h>
+#include <linux/pci.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/page.h>
@@ -30,13 +31,13 @@
 #include <asm/mmu_context.h>
 #include <asm/ppcdebug.h>
 #include <asm/iommu.h>
-#include <linux/pci.h>
 #include <asm/naca.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include <asm/hvcall.h>
 #include <asm/prom.h>
 #include <asm/abs_addr.h>
+#include <asm/cputable.h>
 
 /* in pSeries_hvCall.S */
 EXPORT_SYMBOL(plpar_hcall);
@@ -146,7 +147,7 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npage
 				   (u64)tcenum << 12, 
 				   tce.te_word );
 		
-		if(rc && printk_ratelimit()) {
+		if (rc && printk_ratelimit()) {
 			printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%ld\n", rc);
 			printk("\tindex   = 0x%lx\n", (u64)tbl->it_index);
 			printk("\ttcenum  = 0x%lx\n", (u64)tcenum);
@@ -559,12 +560,14 @@ void pSeries_lpar_flush_hash_range(unsigned long context, unsigned long number,
 	unsigned long flags;
 	struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch);
 
-	spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
+	if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+		spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags);
 
 	for (i = 0; i < number; i++)
 		flush_hash_page(context, batch->addr[i], batch->pte[i], local);
 
-	spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
+	if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE))
+		spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags);
 }
 
 void pSeries_lpar_mm_init(void)
diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S
index 94e187a8bc40..0d6b5c29b645 100644
--- a/arch/ppc64/mm/hash_low.S
+++ b/arch/ppc64/mm/hash_low.S
@@ -125,11 +125,13 @@ _GLOBAL(__hash_page)
 	/* We eventually do the icache sync here (maybe inline that
 	 * code rather than call a C function...) 
 	 */
+BEGIN_FTR_SECTION
 BEGIN_FTR_SECTION
 	mr	r4,r30
 	mr	r5,r7
 	bl	.hash_page_do_lazy_icache
 END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE)
+END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE)
 
 	/* At this point, r3 contains new PP bits, save them in
 	 * place of "access" in the param area (sic)
diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c
index 61708dc2dd50..a62225a645d1 100644
--- a/arch/ppc64/mm/init.c
+++ b/arch/ppc64/mm/init.c
@@ -696,6 +696,8 @@ void __init mem_init(void)
  */
 void flush_dcache_page(struct page *page)
 {
+	if (cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE)
+		return;
 	/* avoid an atomic op if possible */
 	if (test_bit(PG_arch_1, &page->flags))
 		clear_bit(PG_arch_1, &page->flags);
@@ -705,6 +707,8 @@ void clear_user_page(void *page, unsigned long vaddr, struct page *pg)
 {
 	clear_page(page);
 
+	if (cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE)
+		return;
 	/*
 	 * We shouldnt have to do this, but some versions of glibc
 	 * require it (ld.so assumes zero filled pages are icache clean)
@@ -736,6 +740,9 @@ void copy_user_page(void *vto, void *vfrom, unsigned long vaddr,
 		return;
 #endif
 
+	if (cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE)
+		return;
+
 	/* avoid an atomic op if possible */
 	if (test_bit(PG_arch_1, &pg->flags))
 		clear_bit(PG_arch_1, &pg->flags);
@@ -768,7 +775,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea,
 	cpumask_t tmp;
 
 	/* handle i-cache coherency */
-	if (!(cur_cpu_spec->cpu_features & CPU_FTR_NOEXECUTE)) {
+	if (!(cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE) &&
+	    !(cur_cpu_spec->cpu_features & CPU_FTR_NOEXECUTE)) {
 		unsigned long pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			struct page *page = pfn_to_page(pfn);
diff --git a/include/asm-ppc64/cacheflush.h b/include/asm-ppc64/cacheflush.h
index 7d958ac381b0..d6f24f9a1fed 100644
--- a/include/asm-ppc64/cacheflush.h
+++ b/include/asm-ppc64/cacheflush.h
@@ -1,8 +1,8 @@
 #ifndef _PPC64_CACHEFLUSH_H
 #define _PPC64_CACHEFLUSH_H
 
-/* Keep includes the same across arches.  */
 #include <linux/mm.h>
+#include <asm/cputable.h>
 
 /*
  * No cache flushing is required when address mappings are
@@ -18,7 +18,7 @@
 #define flush_cache_vunmap(start, end)		do { } while (0)
 
 extern void flush_dcache_page(struct page *page);
-extern void flush_icache_range(unsigned long, unsigned long);
+extern void __flush_icache_range(unsigned long, unsigned long);
 extern void flush_icache_user_range(struct vm_area_struct *vma,
 				    struct page *page, unsigned long addr,
 				    int len);
@@ -35,4 +35,10 @@ do { memcpy(dst, src, len); \
 
 extern void __flush_dcache_icache(void *page_va);
 
+static inline void flush_icache_range(unsigned long start, unsigned long stop)
+{
+	if (!(cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE)) /* no flush when the coherent-icache feature bit is set */
+		__flush_icache_range(start, stop);
+}
+
 #endif /* _PPC64_CACHEFLUSH_H */
-- 
cgit v1.2.3


From 12c9ae0de28d9fef2766c4ae5a1f01b7ab6aca20 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:47:39 -0700
Subject: [PATCH] ppc64: Add PMCs to sysfs

From: Anton Blanchard <anton@samba.org>

Add PMCs to sysfs.
---
 arch/ppc64/kernel/sysfs.c     | 108 ++++++++++++++++++++++++++++++++++++++++++
 include/asm-ppc64/processor.h |  29 +++++++-----
 2 files changed, 125 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c
index 7699817c3fc4..b168d4a52864 100644
--- a/arch/ppc64/kernel/sysfs.c
+++ b/arch/ppc64/kernel/sysfs.c
@@ -4,7 +4,113 @@
 #include <linux/smp.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
+#include <linux/sched.h>
+#include <asm/current.h>
 #include <asm/processor.h>
+#include <asm/cputable.h>
+#include <asm/hvcall.h>
+
+/* PMC stuff */
+
+/* Call func(arg) after restricting current's allowed cpus to just cpu. XXX convert to rusty's on_one_cpu */
+static unsigned long run_on_cpu(unsigned long cpu,
+			        unsigned long (*func)(unsigned long),
+				unsigned long arg)
+{
+	cpumask_t old_affinity = current->cpus_allowed;
+	unsigned long ret;
+
+	/* set_cpus_allowed failed: func is not called and 0 is returned; should return -EINVAL to userspace */
+	if (set_cpus_allowed(current, cpumask_of_cpu(cpu)))
+		return 0;
+
+	ret = func(arg);
+	/* put back the affinity mask that was saved on entry */
+	set_cpus_allowed(current, old_affinity);
+
+	return ret;
+}
+
+#define SYSFS_PMCSETUP(NAME, ADDRESS) /* emits read_/write_/show_/store_ helpers for SPR ADDRESS */ \
+static unsigned long read_##NAME(unsigned long junk) /* junk is unused */ \
+{ \
+	return mfspr(ADDRESS); \
+} \
+static unsigned long write_##NAME(unsigned long val) \
+{ \
+	mtspr(ADDRESS, val); \
+	return 0; \
+} \
+static ssize_t show_##NAME(struct sys_device *dev, char *buf) \
+{ \
+	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+	unsigned long val = run_on_cpu(cpu->sysdev.id, read_##NAME, 0); /* SPR is read via run_on_cpu on the sysdev's cpu id */ \
+	return sprintf(buf, "%lx\n", val); \
+} \
+static ssize_t store_##NAME(struct sys_device *dev, const char *buf, \
+			    size_t count) \
+{ \
+	struct cpu *cpu = container_of(dev, struct cpu, sysdev); \
+	unsigned long val; \
+	int ret = sscanf(buf, "%lx", &val); /* input is parsed as hex */ \
+	if (ret != 1) \
+		return -EINVAL; \
+	run_on_cpu(cpu->sysdev.id, write_##NAME, val); /* return value is ignored */ \
+	return count; \
+}
+
+SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0);
+SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1);
+SYSFS_PMCSETUP(mmcra, SPRN_MMCRA);
+SYSFS_PMCSETUP(pmc1, SPRN_PMC1);
+SYSFS_PMCSETUP(pmc2, SPRN_PMC2);
+SYSFS_PMCSETUP(pmc3, SPRN_PMC3);
+SYSFS_PMCSETUP(pmc4, SPRN_PMC4);
+SYSFS_PMCSETUP(pmc5, SPRN_PMC5);
+SYSFS_PMCSETUP(pmc6, SPRN_PMC6);
+SYSFS_PMCSETUP(pmc7, SPRN_PMC7);
+SYSFS_PMCSETUP(pmc8, SPRN_PMC8);
+SYSFS_PMCSETUP(purr, SPRN_PURR);
+
+static SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0);
+static SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1);
+static SYSDEV_ATTR(mmcra, 0600, show_mmcra, store_mmcra);
+static SYSDEV_ATTR(pmc1, 0600, show_pmc1, store_pmc1);
+static SYSDEV_ATTR(pmc2, 0600, show_pmc2, store_pmc2);
+static SYSDEV_ATTR(pmc3, 0600, show_pmc3, store_pmc3);
+static SYSDEV_ATTR(pmc4, 0600, show_pmc4, store_pmc4);
+static SYSDEV_ATTR(pmc5, 0600, show_pmc5, store_pmc5);
+static SYSDEV_ATTR(pmc6, 0600, show_pmc6, store_pmc6);
+static SYSDEV_ATTR(pmc7, 0600, show_pmc7, store_pmc7);
+static SYSDEV_ATTR(pmc8, 0600, show_pmc8, store_pmc8);
+static SYSDEV_ATTR(purr, 0600, show_purr, NULL);
+
+static void __init register_cpu_pmc(struct sys_device *s)
+{
+	sysdev_create_file(s, &attr_mmcr0);
+	sysdev_create_file(s, &attr_mmcr1);
+	/* mmcra, pmc7/pmc8 and purr are created only when the matching cpu_features bit is set; sysdev_create_file results are not checked */
+	if (cur_cpu_spec->cpu_features & CPU_FTR_MMCRA)
+		sysdev_create_file(s, &attr_mmcra);
+
+	sysdev_create_file(s, &attr_pmc1);
+	sysdev_create_file(s, &attr_pmc2);
+	sysdev_create_file(s, &attr_pmc3);
+	sysdev_create_file(s, &attr_pmc4);
+	sysdev_create_file(s, &attr_pmc5);
+	sysdev_create_file(s, &attr_pmc6);
+
+	if (cur_cpu_spec->cpu_features & CPU_FTR_PMC8) {
+		sysdev_create_file(s, &attr_pmc7);
+		sysdev_create_file(s, &attr_pmc8);
+	}
+
+	if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
+		sysdev_create_file(s, &attr_purr);
+}
+
+
+/* NUMA stuff */
 
 #ifdef CONFIG_NUMA
 static struct node node_devices[MAX_NUMNODES];
@@ -60,6 +166,8 @@ static int __init topology_init(void)
 #endif
 		register_cpu(c, cpu, parent);
 
+		register_cpu_pmc(&c->sysdev);
+
 		sysdev_create_file(&c->sysdev, &attr_physical_id);
 	}
 
diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h
index b8c7d26b947b..ea8bf67f7007 100644
--- a/include/asm-ppc64/processor.h
+++ b/include/asm-ppc64/processor.h
@@ -235,8 +235,6 @@
 #define	SPRN_IMMR	0x27E  	/* Internal Memory Map Register */
 #define	SPRN_L2CR	0x3F9	/* Level 2 Cache Control Regsiter */
 #define	SPRN_LR		0x008	/* Link Register */
-#define	SPRN_MMCR0	0x3B8	/* Monitor Mode Control Register 0 */
-#define	SPRN_MMCR1	0x3BC	/* Monitor Mode Control Register 1 */
 #define	SPRN_PBL1	0x3FC	/* Protection Bound Lower 1 */
 #define	SPRN_PBL2	0x3FE	/* Protection Bound Lower 2 */
 #define	SPRN_PBU1	0x3FD	/* Protection Bound Upper 1 */
@@ -244,10 +242,7 @@
 #define	SPRN_PID	0x3B1	/* Process ID */
 #define	SPRN_PIR	0x3FF	/* Processor Identification Register */
 #define	SPRN_PIT	0x3DB	/* Programmable Interval Timer */
-#define	SPRN_PMC1	0x3B9	/* Performance Counter Register 1 */
-#define	SPRN_PMC2	0x3BA	/* Performance Counter Register 2 */
-#define	SPRN_PMC3	0x3BD	/* Performance Counter Register 3 */
-#define	SPRN_PMC4	0x3BE	/* Performance Counter Register 4 */
+#define	SPRN_PURR	0x135	/* Processor Utilization of Resources Register */
 #define	SPRN_PVR	0x11F	/* Processor Version Register */
 #define	SPRN_RPA	0x3D6	/* Required Physical Address Register */
 #define	SPRN_SDA	0x3BF	/* Sampled Data Address Register */
@@ -307,17 +302,26 @@
 #define	    WRS_SYSTEM		3		/* WDT forced system reset */
 #define	  TSR_PIS		0x08000000	/* PIT Interrupt Status */
 #define	  TSR_FIS		0x04000000	/* FIT Interrupt Status */
-#define	SPRN_UMMCR0	0x3A8	/* User Monitor Mode Control Register 0 */
-#define	SPRN_UMMCR1	0x3AC	/* User Monitor Mode Control Register 0 */
-#define	SPRN_UPMC1	0x3A9	/* User Performance Counter Register 1 */
-#define	SPRN_UPMC2	0x3AA	/* User Performance Counter Register 2 */
-#define	SPRN_UPMC3	0x3AD	/* User Performance Counter Register 3 */
-#define	SPRN_UPMC4	0x3AE	/* User Performance Counter Register 4 */
 #define	SPRN_USIA	0x3AB	/* User Sampled Instruction Address Register */
 #define	SPRN_XER	0x001	/* Fixed Point Exception Register */
 #define	SPRN_ZPR	0x3B0	/* Zone Protection Register */
 #define SPRN_VRSAVE     0x100   /* Vector save */
 
+/* Performance monitor SPRs */
+#define SPRN_SIAR	780
+#define SPRN_SDAR	781
+#define SPRN_MMCRA	786
+#define SPRN_PMC1	787
+#define SPRN_PMC2	788
+#define SPRN_PMC3	789
+#define SPRN_PMC4	790
+#define SPRN_PMC5	791
+#define SPRN_PMC6	792
+#define SPRN_PMC7	793
+#define SPRN_PMC8	794
+#define SPRN_MMCR0	795
+#define SPRN_MMCR1	798
+
 /* Short-hand versions for a number of the above SPRNs */
 
 #define	CTR	SPRN_CTR	/* Counter Register */
@@ -343,6 +347,7 @@
 #define	__LR	SPRN_LR
 #define	PVR	SPRN_PVR	/* Processor Version */
 #define	PIR	SPRN_PIR	/* Processor ID */
+#define	PURR	SPRN_PURR	/* Processor Utilization of Resource Register */
 #define	RPA	SPRN_RPA	/* Required Physical Address Register */
 #define	SDR1	SPRN_SDR1      	/* MMU hash base register */
 #define	SPR0	SPRN_SPRG0	/* Supervisor Private Registers */
-- 
cgit v1.2.3


From f4421b9c28e02a7260d6896b25fc4ac4f158baf0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:48:25 -0700
Subject: [PATCH] ppc64: Use enum dma_data_direction for the vio DMA api
 routines.

From: Stephen Rothwell <sfr@canb.auug.org.au>

This patch uses enum dma_data_direction for the vio DMA api routines.
This allows us to remove some includes of linux/pci.h.

Also missed some pci_dma_mapping_error uses.
---
 arch/ppc64/kernel/dma.c | 16 ++++++++--------
 arch/ppc64/kernel/vio.c | 23 +++++++++++------------
 drivers/net/ibmveth.c   | 25 ++++++++++++-------------
 include/asm-ppc64/vio.h | 21 ++++++++++-----------
 4 files changed, 41 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/dma.c b/arch/ppc64/kernel/dma.c
index eb6f7996c7fe..26839a571415 100644
--- a/arch/ppc64/kernel/dma.c
+++ b/arch/ppc64/kernel/dma.c
@@ -77,7 +77,7 @@ dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size,
 		return pci_map_single(to_pci_dev(dev), cpu_addr, size, (int)direction);
 #ifdef CONFIG_PPC_PSERIES
 	if (dev->bus == &vio_bus_type)
-		return vio_map_single(to_vio_dev(dev), cpu_addr, size, (int)direction);
+		return vio_map_single(to_vio_dev(dev), cpu_addr, size, direction);
 #endif
 	BUG();
 	return (dma_addr_t)0;
@@ -91,7 +91,7 @@ void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
 		pci_unmap_single(to_pci_dev(dev), dma_addr, size, (int)direction);
 #ifdef CONFIG_PPC_PSERIES
 	else if (dev->bus == &vio_bus_type)
-		vio_unmap_single(to_vio_dev(dev), dma_addr, size, (int)direction);
+		vio_unmap_single(to_vio_dev(dev), dma_addr, size, direction);
 #endif
 	else
 		BUG();
@@ -106,7 +106,7 @@ dma_addr_t dma_map_page(struct device *dev, struct page *page,
 		return pci_map_page(to_pci_dev(dev), page, offset, size, (int)direction);
 #ifdef CONFIG_PPC_PSERIES
 	if (dev->bus == &vio_bus_type)
-		return vio_map_page(to_vio_dev(dev), page, offset, size, (int)direction);
+		return vio_map_page(to_vio_dev(dev), page, offset, size, direction);
 #endif
 	BUG();
 	return (dma_addr_t)0;
@@ -120,7 +120,7 @@ void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
 		pci_unmap_page(to_pci_dev(dev), dma_address, size, (int)direction);
 #ifdef CONFIG_PPC_PSERIES
 	else if (dev->bus == &vio_bus_type)
-		vio_unmap_page(to_vio_dev(dev), dma_address, size, (int)direction);
+		vio_unmap_page(to_vio_dev(dev), dma_address, size, direction);
 #endif
 	else
 		BUG();
@@ -134,7 +134,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 		return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction);
 #ifdef CONFIG_PPC_PSERIES
 	if (dev->bus == &vio_bus_type)
-		return vio_map_sg(to_vio_dev(dev), sg, nents, (int)direction);
+		return vio_map_sg(to_vio_dev(dev), sg, nents, direction);
 #endif
 	BUG();
 	return 0;
@@ -148,7 +148,7 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
 		pci_unmap_sg(to_pci_dev(dev), sg, nhwentries, (int)direction);
 #ifdef CONFIG_PPC_PSERIES
 	else if (dev->bus == &vio_bus_type)
-		vio_unmap_sg(to_vio_dev(dev), sg, nhwentries, (int)direction);
+		vio_unmap_sg(to_vio_dev(dev), sg, nhwentries, direction);
 #endif
 	else
 		BUG();
@@ -162,7 +162,7 @@ void dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size,
 		pci_dma_sync_single(to_pci_dev(dev), dma_handle, size, (int)direction);
 #ifdef CONFIG_PPC_PSERIES
 	else if (dev->bus == &vio_bus_type)
-		vio_dma_sync_single(to_vio_dev(dev), dma_handle, size, (int)direction);
+		vio_dma_sync_single(to_vio_dev(dev), dma_handle, size, direction);
 #endif
 	else
 		BUG();
@@ -176,7 +176,7 @@ void dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems,
 		pci_dma_sync_sg(to_pci_dev(dev), sg, nelems, (int)direction);
 #ifdef CONFIG_PPC_PSERIES
 	else if (dev->bus == &vio_bus_type)
-		vio_dma_sync_sg(to_vio_dev(dev), sg, nelems, (int)direction);
+		vio_dma_sync_sg(to_vio_dev(dev), sg, nelems, direction);
 #endif
 	else
 		BUG();
diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c
index fba53a437e60..054027f898fb 100644
--- a/arch/ppc64/kernel/vio.c
+++ b/arch/ppc64/kernel/vio.c
@@ -14,7 +14,6 @@
 
 #include <linux/init.h>
 #include <linux/console.h>
-#include <linux/pci.h>
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/kobject.h>
@@ -416,14 +415,14 @@ EXPORT_SYMBOL(vio_disable_interrupts);
 
 
 dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr,
-			  size_t size, int direction )
+			  size_t size, enum dma_data_direction direction)
 {
 	struct iommu_table *tbl;
 	dma_addr_t dma_handle = DMA_ERROR_CODE;
 	unsigned long uaddr;
 	unsigned int npages;
 
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 
 	uaddr = (unsigned long)vaddr;
 	npages = PAGE_ALIGN( uaddr + size ) - ( uaddr & PAGE_MASK );
@@ -432,7 +431,7 @@ dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr,
 	tbl = dev->iommu_table;
 
 	if (tbl) {
-		dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
+		dma_handle = iommu_alloc(tbl, vaddr, npages, (int)direction);
 		dma_handle |= (uaddr & ~PAGE_MASK);
 	}
 
@@ -441,12 +440,12 @@ dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr,
 EXPORT_SYMBOL(vio_map_single);
 
 void vio_unmap_single(struct vio_dev *dev, dma_addr_t dma_handle,
-		      size_t size, int direction)
+		      size_t size, enum dma_data_direction direction)
 {
 	struct iommu_table * tbl;
 	unsigned int npages;
 
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 
 	npages = PAGE_ALIGN( dma_handle + size ) - ( dma_handle & PAGE_MASK );
 	npages >>= PAGE_SHIFT;
@@ -458,11 +457,11 @@ void vio_unmap_single(struct vio_dev *dev, dma_addr_t dma_handle,
 EXPORT_SYMBOL(vio_unmap_single);
 
 int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems,
-	       int direction)
+	       enum dma_data_direction direction)
 {
 	struct iommu_table *tbl;
 
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 
 	if (nelems == 0)
 		return 0;
@@ -471,16 +470,16 @@ int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems,
 	if (!tbl)
 		return 0;
 
-	return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, direction);
+	return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, (int)direction);
 }
 EXPORT_SYMBOL(vio_map_sg);
 
 void vio_unmap_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems,
-		  int direction)
+		  enum dma_data_direction direction)
 {
 	struct iommu_table *tbl;
 
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 
 	tbl = vdev->iommu_table;
 	if (tbl)
@@ -516,7 +515,7 @@ void *vio_alloc_consistent(struct vio_dev *dev, size_t size,
 			/* Page allocation succeeded */
 			memset(ret, 0, npages << PAGE_SHIFT);
 			/* Set up tces to cover the allocated range */
-			tce = iommu_alloc(tbl, ret, npages, PCI_DMA_BIDIRECTIONAL);
+			tce = iommu_alloc(tbl, ret, npages, (int)DMA_BIDIRECTIONAL);
 			if (tce == DMA_ERROR_CODE) {
 				PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: iommu_alloc failed\n" );
 				free_pages((unsigned long)ret, order);
diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index c3f4944e724e..6427a25c2719 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -39,7 +39,6 @@
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
-#include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
@@ -218,7 +217,7 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
 		ibmveth_assert(index != 0xffff);
 		ibmveth_assert(pool->skbuff[index] == NULL);
 
-		dma_addr = vio_map_single(adapter->vdev, skb->data, pool->buff_size, PCI_DMA_FROMDEVICE);
+		dma_addr = vio_map_single(adapter->vdev, skb->data, pool->buff_size, DMA_FROM_DEVICE);
 
 		pool->dma_addr[index] = dma_addr;
 		pool->skbuff[index] = skb;
@@ -236,7 +235,7 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc
 		if(lpar_rc != H_Success) {
 			pool->skbuff[index] = NULL;
 			pool->consumer_index--;
-			vio_unmap_single(adapter->vdev, pool->dma_addr[index], pool->buff_size, PCI_DMA_FROMDEVICE);
+			vio_unmap_single(adapter->vdev, pool->dma_addr[index], pool->buff_size, DMA_FROM_DEVICE);
 			dev_kfree_skb_any(skb);
 			adapter->replenish_add_buff_failure++;
 			break;
@@ -300,7 +299,7 @@ static void ibmveth_free_buffer_pool(struct ibmveth_adapter *adapter, struct ibm
 				vio_unmap_single(adapter->vdev,
 						 pool->dma_addr[i],
 						 pool->buff_size,
-						 PCI_DMA_FROMDEVICE);
+						 DMA_FROM_DEVICE);
 				dev_kfree_skb_any(skb);
 				pool->skbuff[i] = NULL;
 			}
@@ -338,7 +337,7 @@ static void ibmveth_remove_buffer_from_pool(struct ibmveth_adapter *adapter, u64
 	vio_unmap_single(adapter->vdev,
 			 adapter->rx_buff_pool[pool].dma_addr[index],
 			 adapter->rx_buff_pool[pool].buff_size,
-			 PCI_DMA_FROMDEVICE);
+			 DMA_FROM_DEVICE);
 
 	free_index = adapter->rx_buff_pool[pool].producer_index++ % adapter->rx_buff_pool[pool].size;
 	adapter->rx_buff_pool[pool].free_map[free_index] = index;
@@ -406,7 +405,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 {
 	if(adapter->buffer_list_addr != NULL) {
 		if(!vio_dma_mapping_error(adapter->buffer_list_dma)) {
-			vio_unmap_single(adapter->vdev, adapter->buffer_list_dma, 4096, PCI_DMA_BIDIRECTIONAL);
+			vio_unmap_single(adapter->vdev, adapter->buffer_list_dma, 4096, DMA_BIDIRECTIONAL);
 			adapter->buffer_list_dma = DMA_ERROR_CODE;
 		}
 		free_page((unsigned long)adapter->buffer_list_addr);
@@ -415,7 +414,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 
 	if(adapter->filter_list_addr != NULL) {
 		if(!vio_dma_mapping_error(adapter->filter_list_dma)) {
-			vio_unmap_single(adapter->vdev, adapter->filter_list_dma, 4096, PCI_DMA_BIDIRECTIONAL);
+			vio_unmap_single(adapter->vdev, adapter->filter_list_dma, 4096, DMA_BIDIRECTIONAL);
 			adapter->filter_list_dma = DMA_ERROR_CODE;
 		}
 		free_page((unsigned long)adapter->filter_list_addr);
@@ -424,7 +423,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 
 	if(adapter->rx_queue.queue_addr != NULL) {
 		if(!vio_dma_mapping_error(adapter->rx_queue.queue_dma)) {
-			vio_unmap_single(adapter->vdev, adapter->rx_queue.queue_dma, adapter->rx_queue.queue_len, PCI_DMA_BIDIRECTIONAL);
+			vio_unmap_single(adapter->vdev, adapter->rx_queue.queue_dma, adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL);
 			adapter->rx_queue.queue_dma = DMA_ERROR_CODE;
 		}
 		kfree(adapter->rx_queue.queue_addr);
@@ -470,9 +469,9 @@ static int ibmveth_open(struct net_device *netdev)
 		return -ENOMEM;
 	}
 
-	adapter->buffer_list_dma = vio_map_single(adapter->vdev, adapter->buffer_list_addr, 4096, PCI_DMA_BIDIRECTIONAL);
-	adapter->filter_list_dma = vio_map_single(adapter->vdev, adapter->filter_list_addr, 4096, PCI_DMA_BIDIRECTIONAL);
-	adapter->rx_queue.queue_dma = vio_map_single(adapter->vdev, adapter->rx_queue.queue_addr, adapter->rx_queue.queue_len, PCI_DMA_BIDIRECTIONAL);
+	adapter->buffer_list_dma = vio_map_single(adapter->vdev, adapter->buffer_list_addr, 4096, DMA_BIDIRECTIONAL);
+	adapter->filter_list_dma = vio_map_single(adapter->vdev, adapter->filter_list_addr, 4096, DMA_BIDIRECTIONAL);
+	adapter->rx_queue.queue_dma = vio_map_single(adapter->vdev, adapter->rx_queue.queue_addr, adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL);
 
 	if((vio_dma_mapping_error(adapter->buffer_list_dma) ) ||
 	   (vio_dma_mapping_error(adapter->filter_list_dma)) ||
@@ -673,7 +672,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 				vio_unmap_single(adapter->vdev,
 						 desc[curfrag+1].fields.address,
 						 desc[curfrag+1].fields.length,
-						 PCI_DMA_TODEVICE);
+						 DMA_TO_DEVICE);
 				curfrag++;
 			}
 			dev_kfree_skb(skb);
@@ -710,7 +709,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 	}
 
 	do {
-		vio_unmap_single(adapter->vdev, desc[nfrags].fields.address, desc[nfrags].fields.length, PCI_DMA_TODEVICE);
+		vio_unmap_single(adapter->vdev, desc[nfrags].fields.address, desc[nfrags].fields.length, DMA_TO_DEVICE);
 	} while(--nfrags >= 0);
 
 	dev_kfree_skb(skb);
diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h
index d70eeb87dfd9..7fe3a23736e7 100644
--- a/include/asm-ppc64/vio.h
+++ b/include/asm-ppc64/vio.h
@@ -17,7 +17,6 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/device.h>
-#include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <asm/hvcall.h>
 #include <asm/prom.h>
@@ -58,13 +57,13 @@ int vio_enable_interrupts(struct vio_dev *dev);
 int vio_disable_interrupts(struct vio_dev *dev);
 
 dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr, 
-			  size_t size, int direction);
+			  size_t size, enum dma_data_direction direction);
 void vio_unmap_single(struct vio_dev *dev, dma_addr_t dma_handle, 
-		      size_t size, int direction);
+		      size_t size, enum dma_data_direction direction);
 int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, 
-	       int nelems, int direction);
+	       int nelems, enum dma_data_direction direction);
 void vio_unmap_sg(struct vio_dev *vdev, struct scatterlist *sglist, 
-		  int nelems, int direction);
+		  int nelems, enum dma_data_direction direction);
 void *vio_alloc_consistent(struct vio_dev *dev, size_t size, 
 			   dma_addr_t *dma_handle);
 void vio_free_consistent(struct vio_dev *dev, size_t size, void *vaddr, 
@@ -81,18 +80,18 @@ static inline int vio_dma_supported(struct vio_dev *hwdev, u64 mask)
 
 
 static inline void vio_dma_sync_single(struct vio_dev *hwdev,
-				       dma_addr_t dma_handle,
-				       size_t size, int direction)
+				       dma_addr_t dma_handle, size_t size,
+				       enum dma_data_direction direction)
 {
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 	/* nothing to do */
 }
 
 static inline void vio_dma_sync_sg(struct vio_dev *hwdev,
-				   struct scatterlist *sg,
-				   int nelems, int direction)
+				   struct scatterlist *sg, int nelems,
+				   enum dma_data_direction direction)
 {
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 	/* nothing to do */
 }
 static inline int vio_set_dma_mask(struct vio_dev *dev, u64 mask) { return -EIO; }
-- 
cgit v1.2.3


From 9b678c1e2247e6dddf4bf245554bd3c099e456ab Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:48:41 -0700
Subject: [PATCH] ppc64: Use enum dma_data_direction for all APIs

From: Stephen Rothwell <sfr@canb.auug.org.au>

This is just a cleanup to use enum dma_data_direction for all APIs
except the pci_dma_ ones (since they are defined generically).

Also make most of the functions in arch/ppc64/kernel/pci_iommu.c
static.
---
 arch/ppc64/kernel/iSeries_iommu.c  |  7 ++++---
 arch/ppc64/kernel/iommu.c          |  6 +++---
 arch/ppc64/kernel/pSeries_iommu.c  |  5 +++--
 arch/ppc64/kernel/pSeries_lpar.c   |  9 +++++----
 arch/ppc64/kernel/pci_dma_direct.c |  9 +++++----
 arch/ppc64/kernel/pci_iommu.c      | 30 +++++++++++++++---------------
 arch/ppc64/kernel/pmac_iommu.c     |  3 ++-
 arch/ppc64/kernel/vio.c            |  6 +++---
 include/asm-ppc64/iommu.h          |  6 ++++--
 include/asm-ppc64/machdep.h        |  3 ++-
 include/asm-ppc64/pci-bridge.h     |  2 ++
 include/asm-ppc64/pci.h            | 20 ++++++++++++--------
 12 files changed, 60 insertions(+), 46 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/iSeries_iommu.c b/arch/ppc64/kernel/iSeries_iommu.c
index 1922af2dbd43..ea4ef7497193 100644
--- a/arch/ppc64/kernel/iSeries_iommu.c
+++ b/arch/ppc64/kernel/iSeries_iommu.c
@@ -33,6 +33,7 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -69,7 +70,7 @@ extern struct list_head iSeries_Global_Device_List;
 
 
 static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
-			      unsigned long uaddr, int direction)
+		unsigned long uaddr, enum dma_data_direction direction)
 {
 	u64 rc;
 	union tce_entry tce;
@@ -82,12 +83,12 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages,
 			/* Virtual Bus */
 			tce.te_bits.tb_valid = 1;
 			tce.te_bits.tb_allio = 1;
-			if (direction != PCI_DMA_TODEVICE)
+			if (direction != DMA_TO_DEVICE)
 				tce.te_bits.tb_rdwr = 1;
 		} else {
 			/* PCI Bus */
 			tce.te_bits.tb_rdwr = 1; /* Read allowed */
-			if (direction != PCI_DMA_TODEVICE)
+			if (direction != DMA_TO_DEVICE)
 				tce.te_bits.tb_pciwr = 1;
 		}
 		
diff --git a/arch/ppc64/kernel/iommu.c b/arch/ppc64/kernel/iommu.c
index e3f032bbbab4..aa6b207cd321 100644
--- a/arch/ppc64/kernel/iommu.c
+++ b/arch/ppc64/kernel/iommu.c
@@ -31,7 +31,6 @@
 #include <linux/mm.h>
 #include <linux/spinlock.h>
 #include <linux/string.h>
-#include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/init.h>
 #include <asm/io.h>
@@ -142,7 +141,7 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, unsigned long np
 }
 
 dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
-		       unsigned int npages, int direction)
+		       unsigned int npages, enum dma_data_direction direction)
 {
 	unsigned long entry, flags;
 	dma_addr_t ret = DMA_ERROR_CODE;
@@ -227,7 +226,8 @@ void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 }
 
 int iommu_alloc_sg(struct iommu_table *tbl, struct device *dev,
-		   struct scatterlist *sglist, int nelems, int direction)
+		   struct scatterlist *sglist, int nelems,
+		   enum dma_data_direction direction)
 {
 	dma_addr_t dma_next, dma_addr;
 	unsigned long flags;
diff --git a/arch/ppc64/kernel/pSeries_iommu.c b/arch/ppc64/kernel/pSeries_iommu.c
index cfa278643ab5..367da0eb5b51 100644
--- a/arch/ppc64/kernel/pSeries_iommu.c
+++ b/arch/ppc64/kernel/pSeries_iommu.c
@@ -33,6 +33,7 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/rtas.h>
@@ -46,7 +47,7 @@
 
 static void tce_build_pSeries(struct iommu_table *tbl, long index, 
 			      long npages, unsigned long uaddr, 
-			      int direction)
+			      enum dma_data_direction direction)
 {
 	union tce_entry t;
 	union tce_entry *tp;
@@ -54,7 +55,7 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index,
 	t.te_word = 0;
 	t.te_rdwr = 1; // Read allowed 
 
-	if (direction != PCI_DMA_TODEVICE)
+	if (direction != DMA_TO_DEVICE)
 		t.te_pciwr = 1;
 
 	tp = ((union tce_entry *)tbl->it_base) + index;
diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c
index 12b0fb86acad..15a9eb4ed014 100644
--- a/arch/ppc64/kernel/pSeries_lpar.c
+++ b/arch/ppc64/kernel/pSeries_lpar.c
@@ -21,7 +21,7 @@
 
 #include <linux/config.h>
 #include <linux/kernel.h>
-#include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <asm/processor.h>
 #include <asm/mmu.h>
 #include <asm/page.h>
@@ -130,8 +130,9 @@ long plpar_put_term_char(unsigned long termno,
 				  lbuf[1]);
 }
 
-static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages,
-				unsigned long uaddr, int direction )
+static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+		long npages, unsigned long uaddr,
+		enum dma_data_direction direction)
 {
 	u64 rc;
 	union tce_entry tce;
@@ -139,7 +140,7 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npage
 	tce.te_word = 0;
 	tce.te_rpn = (virt_to_abs(uaddr)) >> PAGE_SHIFT;
 	tce.te_rdwr = 1;
-	if (direction != PCI_DMA_TODEVICE)
+	if (direction != DMA_TO_DEVICE)
 		tce.te_pciwr = 1;
 
 	while (npages--) {
diff --git a/arch/ppc64/kernel/pci_dma_direct.c b/arch/ppc64/kernel/pci_dma_direct.c
index 1cd843237ed9..dc96c0cf2b91 100644
--- a/arch/ppc64/kernel/pci_dma_direct.c
+++ b/arch/ppc64/kernel/pci_dma_direct.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/bootmem.h>
 #include <linux/mm.h>
+#include <linux/dma-mapping.h>
 
 #include <asm/sections.h>
 #include <asm/io.h>
@@ -49,18 +50,18 @@ static void pci_direct_free_consistent(struct pci_dev *hwdev, size_t size,
 }
 
 static dma_addr_t pci_direct_map_single(struct pci_dev *hwdev, void *ptr,
-				  size_t size, int direction)
+		size_t size, enum dma_data_direction direction)
 {
 	return virt_to_abs(ptr);
 }
 
 static void pci_direct_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
-			      size_t size, int direction)
+		size_t size, enum dma_data_direction direction)
 {
 }
 
 static int pci_direct_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
-		       int nents, int direction)
+		int nents, enum dma_data_direction direction)
 {
 	int i;
 
@@ -73,7 +74,7 @@ static int pci_direct_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
 }
 
 static void pci_direct_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
-			  int nents, int direction)
+		int nents, enum dma_data_direction direction)
 {
 }
 
diff --git a/arch/ppc64/kernel/pci_iommu.c b/arch/ppc64/kernel/pci_iommu.c
index 8d9869173efd..0d4da23394e7 100644
--- a/arch/ppc64/kernel/pci_iommu.c
+++ b/arch/ppc64/kernel/pci_iommu.c
@@ -66,7 +66,7 @@ static inline struct iommu_table *devnode_table(struct pci_dev *dev)
  * Returns the virtual address of the buffer and sets dma_handle
  * to the dma address (mapping) of the first page.
  */
-void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size,
+static void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size,
 			   dma_addr_t *dma_handle)
 {
 	struct iommu_table *tbl;
@@ -100,7 +100,7 @@ void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size,
 	memset(ret, 0, size);
 
 	/* Set up tces to cover the allocated range */
-	mapping = iommu_alloc(tbl, ret, npages, PCI_DMA_BIDIRECTIONAL);
+	mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
 
 	if (mapping == DMA_ERROR_CODE) {
 		free_pages((unsigned long)ret, order);
@@ -112,7 +112,7 @@ void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size,
 }
 
 
-void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size,
+static void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size,
 			 void *vaddr, dma_addr_t dma_handle)
 {
 	struct iommu_table *tbl;
@@ -136,15 +136,15 @@ void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size,
  * need not be page aligned, the dma_addr_t returned will point to the same
  * byte within the page as vaddr.
  */
-dma_addr_t pci_iommu_map_single(struct pci_dev *hwdev, void *vaddr,
-				size_t size, int direction)
+static dma_addr_t pci_iommu_map_single(struct pci_dev *hwdev, void *vaddr,
+		size_t size, enum dma_data_direction direction)
 {
 	struct iommu_table * tbl;
 	dma_addr_t dma_handle = DMA_ERROR_CODE;
 	unsigned long uaddr;
 	unsigned int npages;
 
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 
 	uaddr = (unsigned long)vaddr;
 	npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
@@ -167,13 +167,13 @@ dma_addr_t pci_iommu_map_single(struct pci_dev *hwdev, void *vaddr,
 }
 
 
-void pci_iommu_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_handle,
-		      size_t size, int direction)
+static void pci_iommu_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction direction)
 {
 	struct iommu_table *tbl;
 	unsigned int npages;
 	
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 
 	npages = (PAGE_ALIGN(dma_handle + size) - (dma_handle & PAGE_MASK))
 		>> PAGE_SHIFT;
@@ -185,12 +185,12 @@ void pci_iommu_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_handle,
 }
 
 
-int pci_iommu_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems,
-	       int direction)
+static int pci_iommu_map_sg(struct pci_dev *pdev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
 {
 	struct iommu_table * tbl;
 
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 
 	if (nelems == 0)
 		return 0;
@@ -202,12 +202,12 @@ int pci_iommu_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelem
 	return iommu_alloc_sg(tbl, &pdev->dev, sglist, nelems, direction);
 }
 
-void pci_iommu_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems,
-		  int direction)
+static void pci_iommu_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
 {
 	struct iommu_table *tbl;
 
-	BUG_ON(direction == PCI_DMA_NONE);
+	BUG_ON(direction == DMA_NONE);
 
 	tbl = devnode_table(pdev); 
 	if (!tbl)
diff --git a/arch/ppc64/kernel/pmac_iommu.c b/arch/ppc64/kernel/pmac_iommu.c
index e04c344c127e..0e91536b73c1 100644
--- a/arch/ppc64/kernel/pmac_iommu.c
+++ b/arch/ppc64/kernel/pmac_iommu.c
@@ -33,6 +33,7 @@
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/pci.h>
+#include <linux/dma-mapping.h>
 #include <linux/vmalloc.h>
 #include <asm/io.h>
 #include <asm/prom.h>
@@ -141,7 +142,7 @@ static void dart_flush(struct iommu_table *tbl)
 
 static void dart_build_pmac(struct iommu_table *tbl, long index, 
 			    long npages, unsigned long uaddr,
-			    int direction)
+			    enum dma_data_direction direction)
 {
 	unsigned int *dp;
 	unsigned int rpn;
diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c
index 054027f898fb..530528a3fbfb 100644
--- a/arch/ppc64/kernel/vio.c
+++ b/arch/ppc64/kernel/vio.c
@@ -431,7 +431,7 @@ dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr,
 	tbl = dev->iommu_table;
 
 	if (tbl) {
-		dma_handle = iommu_alloc(tbl, vaddr, npages, (int)direction);
+		dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
 		dma_handle |= (uaddr & ~PAGE_MASK);
 	}
 
@@ -470,7 +470,7 @@ int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems,
 	if (!tbl)
 		return 0;
 
-	return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, (int)direction);
+	return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, direction);
 }
 EXPORT_SYMBOL(vio_map_sg);
 
@@ -515,7 +515,7 @@ void *vio_alloc_consistent(struct vio_dev *dev, size_t size,
 			/* Page allocation succeeded */
 			memset(ret, 0, npages << PAGE_SHIFT);
 			/* Set up tces to cover the allocated range */
-			tce = iommu_alloc(tbl, ret, npages, (int)DMA_BIDIRECTIONAL);
+			tce = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
 			if (tce == DMA_ERROR_CODE) {
 				PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: iommu_alloc failed\n" );
 				free_pages((unsigned long)ret, order);
diff --git a/include/asm-ppc64/iommu.h b/include/asm-ppc64/iommu.h
index 3aeadc38d5f3..a2cc850ef10a 100644
--- a/include/asm-ppc64/iommu.h
+++ b/include/asm-ppc64/iommu.h
@@ -25,6 +25,7 @@
 #include <asm/types.h>
 #include <linux/spinlock.h>
 #include <linux/device.h>
+#include <linux/dma-mapping.h>
 
 /*
  * IOMAP_MAX_ORDER defines the largest contiguous block
@@ -132,14 +133,15 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl);
 
 /* allocates a range of tces and sets them to the pages  */
 extern dma_addr_t iommu_alloc(struct iommu_table *, void *page, 
-			      unsigned int numPages, int direction);
+			      unsigned int numPages,
+			      enum dma_data_direction direction);
 extern void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 
 		       unsigned int npages);
 
 /* same with sg lists */
 extern int iommu_alloc_sg(struct iommu_table *table, struct device *dev,
 			  struct scatterlist *sglist, int nelems,
-			  int direction);
+			  enum dma_data_direction direction);
 extern void iommu_free_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 			  int nelems);
 
diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h
index 10e7e9ec6251..bb961a029fae 100644
--- a/include/asm-ppc64/machdep.h
+++ b/include/asm-ppc64/machdep.h
@@ -11,6 +11,7 @@
 
 #include <linux/config.h>
 #include <linux/seq_file.h>
+#include <linux/dma-mapping.h>
 
 struct pt_regs;
 struct pci_bus;	
@@ -57,7 +58,7 @@ struct machdep_calls {
 				     long index,
 				     long npages,
 				     unsigned long uaddr,
-				     int direction);
+				     enum dma_data_direction direction);
 	void		(*tce_free)(struct iommu_table *tbl,
 				    long index,
 				    long npages);
diff --git a/include/asm-ppc64/pci-bridge.h b/include/asm-ppc64/pci-bridge.h
index a092b9cae621..08ba3f2b89ba 100644
--- a/include/asm-ppc64/pci-bridge.h
+++ b/include/asm-ppc64/pci-bridge.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_PCI_BRIDGE_H
 #define _ASM_PCI_BRIDGE_H
 
+#include <linux/pci.h>
+
 /*
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
diff --git a/include/asm-ppc64/pci.h b/include/asm-ppc64/pci.h
index 9186c7d55e9d..032a2e6e8224 100644
--- a/include/asm-ppc64/pci.h
+++ b/include/asm-ppc64/pci.h
@@ -64,13 +64,13 @@ struct pci_dma_ops {
 				       void *vaddr, dma_addr_t dma_handle);
 
 	dma_addr_t	(*pci_map_single)(struct pci_dev *hwdev, void *ptr,
-					  size_t size, int direction);
+					  size_t size, enum dma_data_direction direction);
 	void		(*pci_unmap_single)(struct pci_dev *hwdev, dma_addr_t dma_addr,
-					    size_t size, int direction);
+					    size_t size, enum dma_data_direction direction);
 	int		(*pci_map_sg)(struct pci_dev *hwdev, struct scatterlist *sg,
-				      int nents, int direction);
+				      int nents, enum dma_data_direction direction);
 	void		(*pci_unmap_sg)(struct pci_dev *hwdev, struct scatterlist *sg,
-					int nents, int direction);
+					int nents, enum dma_data_direction direction);
 	int		(*pci_dma_supported)(struct pci_dev *hwdev, u64 mask);
 	int		(*pci_dac_dma_supported)(struct pci_dev *hwdev, u64 mask);
 };
@@ -92,25 +92,29 @@ static inline void pci_free_consistent(struct pci_dev *hwdev, size_t size,
 static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr,
 					size_t size, int direction)
 {
-	return pci_dma_ops.pci_map_single(hwdev, ptr, size, direction); 
+	return pci_dma_ops.pci_map_single(hwdev, ptr, size,
+			(enum dma_data_direction)direction);
 }
 
 static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr,
 				    size_t size, int direction)
 {
-	pci_dma_ops.pci_unmap_single(hwdev, dma_addr, size, direction);
+	pci_dma_ops.pci_unmap_single(hwdev, dma_addr, size,
+			(enum dma_data_direction)direction);
 }
 
 static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg,
 			     int nents, int direction)
 {
-	return pci_dma_ops.pci_map_sg(hwdev, sg, nents, direction);
+	return pci_dma_ops.pci_map_sg(hwdev, sg, nents,
+			(enum dma_data_direction)direction);
 }
 
 static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
 				int nents, int direction)
 {
-	pci_dma_ops.pci_unmap_sg(hwdev, sg, nents, direction);
+	pci_dma_ops.pci_unmap_sg(hwdev, sg, nents,
+			(enum dma_data_direction)direction);
 }
 
 static inline void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev,
-- 
cgit v1.2.3


From e1df56ff96e6b7be4a651dad58ba38cda5f0d8b3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:48:54 -0700
Subject: [PATCH] ppc64: Consolidate some of the iommu DMA mapping routines.

From: Stephen Rothwell <sfr@canb.auug.org.au>

This patch consolidates some of the iommu DMA mapping routines.
---
 arch/ppc64/kernel/iommu.c     | 125 +++++++++++++++++++++++++++++++++++++++---
 arch/ppc64/kernel/pci_iommu.c | 121 +++-------------------------------------
 arch/ppc64/kernel/vio.c       | 113 +++-----------------------------------
 include/asm-ppc64/iommu.h     |  34 ++++++------
 4 files changed, 149 insertions(+), 244 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/iommu.c b/arch/ppc64/kernel/iommu.c
index aa6b207cd321..fb321026ea72 100644
--- a/arch/ppc64/kernel/iommu.c
+++ b/arch/ppc64/kernel/iommu.c
@@ -140,7 +140,7 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, unsigned long np
 	return n;
 }
 
-dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
+static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
 		       unsigned int npages, enum dma_data_direction direction)
 {
 	unsigned long entry, flags;
@@ -206,7 +206,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 		__clear_bit(free_entry+i, tbl->it_map);
 }
 
-void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 
+static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 		unsigned int npages)
 {
 	unsigned long flags;
@@ -225,9 +225,9 @@ void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
 
-int iommu_alloc_sg(struct iommu_table *tbl, struct device *dev,
-		   struct scatterlist *sglist, int nelems,
-		   enum dma_data_direction direction)
+int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
+		struct scatterlist *sglist, int nelems,
+		enum dma_data_direction direction)
 {
 	dma_addr_t dma_next, dma_addr;
 	unsigned long flags;
@@ -235,6 +235,11 @@ int iommu_alloc_sg(struct iommu_table *tbl, struct device *dev,
 	int outcount;
 	unsigned long handle;
 
+	BUG_ON(direction == DMA_NONE);
+
+	if ((nelems == 0) || !tbl)
+		return 0;
+
 	outs = s = segstart = &sglist[0];
 	outcount = 1;
 	handle = 0;
@@ -349,11 +354,16 @@ int iommu_alloc_sg(struct iommu_table *tbl, struct device *dev,
 }
 
 
-void iommu_free_sg(struct iommu_table *tbl, struct scatterlist *sglist,
-		   int nelems)
+void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction)
 {
 	unsigned long flags;
 
+	BUG_ON(direction == DMA_NONE);
+
+	if (!tbl)
+		return;
+
 	spin_lock_irqsave(&(tbl->it_lock), flags);
 
 	while (nelems--) {
@@ -414,3 +424,104 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl)
 
 	return tbl;
 }
+
+/* Creates TCEs for a user provided buffer.  The user buffer must be
+ * contiguous real kernel storage (not vmalloc).  The address of the buffer
+ * passed here is the kernel (virtual) address of the buffer.  The buffer
+ * need not be page aligned, the dma_addr_t returned will point to the same
+ * byte within the page as vaddr.
+ */
+dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
+		size_t size, enum dma_data_direction direction)
+{
+	dma_addr_t dma_handle = DMA_ERROR_CODE;
+	unsigned long uaddr;
+	unsigned int npages;
+
+	BUG_ON(direction == DMA_NONE);
+
+	uaddr = (unsigned long)vaddr;
+	npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
+	npages >>= PAGE_SHIFT;
+
+	if (tbl) {
+		dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
+		if (dma_handle == DMA_ERROR_CODE) {
+			if (printk_ratelimit())  {
+				printk(KERN_INFO "iommu_alloc failed, "
+						"tbl %p vaddr %p npages %d\n",
+						tbl, vaddr, npages);
+			}
+		} else
+			dma_handle |= (uaddr & ~PAGE_MASK);
+	}
+
+	return dma_handle;
+}
+
+void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction direction)
+{
+	BUG_ON(direction == DMA_NONE);
+
+	if (tbl)
+		iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) -
+					(dma_handle & PAGE_MASK)) >> PAGE_SHIFT);
+}
+
+/* Allocates a contiguous real buffer and creates mappings over it.
+ * Returns the buffer's virtual address and sets dma_handle to the dma
+ * address of its first page; returns NULL (no dma_handle) on failure.
+ */
+void *iommu_alloc_consistent(struct iommu_table *tbl, size_t size,
+		dma_addr_t *dma_handle)
+{
+	void *ret = NULL;
+	dma_addr_t mapping;
+	unsigned int npages, order;
+
+	size = PAGE_ALIGN(size);
+	npages = size >> PAGE_SHIFT;
+	order = get_order(size);
+
+	/*
+	 * Client asked for way too much space.  This is checked later
+	 * anyway, but it is easier to debug here for the drivers than in
+	 * the tce tables.  Fail with NULL like the other error paths.
+	 */
+	if (order >= IOMAP_MAX_ORDER) {
+		printk("iommu_alloc_consistent size too large: 0x%lx\n", size);
+		return NULL;
+	}
+
+	if (!tbl)
+		return NULL;
+
+	/* Alloc enough pages (and possibly more) */
+	ret = (void *)__get_free_pages(GFP_ATOMIC, order);
+	if (!ret)
+		return NULL;
+	memset(ret, 0, size);
+
+	/* Set up tces to cover the allocated range */
+	mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
+	if (mapping == DMA_ERROR_CODE) {
+		free_pages((unsigned long)ret, order);
+		ret = NULL;
+	} else
+		*dma_handle = mapping;
+	return ret;
+}
+
+void iommu_free_consistent(struct iommu_table *tbl, size_t size,
+			 void *vaddr, dma_addr_t dma_handle)
+{
+	unsigned int npages;
+
+	if (tbl) {
+		size = PAGE_ALIGN(size);
+		npages = size >> PAGE_SHIFT;
+		iommu_free(tbl, dma_handle, npages);
+		free_pages((unsigned long)vaddr, get_order(size));
+	}
+}
diff --git a/arch/ppc64/kernel/pci_iommu.c b/arch/ppc64/kernel/pci_iommu.c
index 0d4da23394e7..4d99851fe815 100644
--- a/arch/ppc64/kernel/pci_iommu.c
+++ b/arch/ppc64/kernel/pci_iommu.c
@@ -43,8 +43,6 @@
 #include <asm/iSeries/iSeries_pci.h>
 #endif /* CONFIG_PPC_ISERIES */
 
-#define DBG(...)
-
 static inline struct iommu_table *devnode_table(struct pci_dev *dev)
 {
 	if (!dev)
@@ -69,67 +67,15 @@ static inline struct iommu_table *devnode_table(struct pci_dev *dev)
 static void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size,
 			   dma_addr_t *dma_handle)
 {
-	struct iommu_table *tbl;
-	void *ret = NULL;
-	dma_addr_t mapping;
-	unsigned int npages, order;
-
-	size = PAGE_ALIGN(size);
-	npages = size >> PAGE_SHIFT;
-	order = get_order(size);
-
- 	/* Client asked for way too much space.  This is checked later anyway */
-	/* It is easier to debug here for the drivers than in the tce tables.*/
-	if (order >= IOMAP_MAX_ORDER) {
-		printk("PCI_DMA: pci_alloc_consistent size too large: 0x%lx\n",
-			size);
-		return NULL;
-	}
-
-	tbl = devnode_table(hwdev); 
-
-	if (!tbl)
-		return NULL;
-
-	/* Alloc enough pages (and possibly more) */
-	ret = (void *)__get_free_pages(GFP_ATOMIC, order);
-
-	if (!ret)
-		return NULL;
-
-	memset(ret, 0, size);
-
-	/* Set up tces to cover the allocated range */
-	mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
-
-	if (mapping == DMA_ERROR_CODE) {
-		free_pages((unsigned long)ret, order);
-		ret = NULL;
-	} else
-		*dma_handle = mapping;
-
-	return ret;
+	return iommu_alloc_consistent(devnode_table(hwdev), size, dma_handle);
 }
 
-
 static void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size,
 			 void *vaddr, dma_addr_t dma_handle)
 {
-	struct iommu_table *tbl;
-	unsigned int npages;
-	
-	size = PAGE_ALIGN(size);
-	npages = size >> PAGE_SHIFT;
-
-	tbl = devnode_table(hwdev); 
-
-	if (tbl) {
-		iommu_free(tbl, dma_handle, npages);
-		free_pages((unsigned long)vaddr, get_order(size));
-	}
+	iommu_free_consistent(devnode_table(hwdev), size, vaddr, dma_handle);
 }
 
-
 /* Creates TCEs for a user provided buffer.  The user buffer must be 
  * contiguous real kernel storage (not vmalloc).  The address of the buffer
  * passed here is the kernel (virtual) address of the buffer.  The buffer
@@ -139,81 +85,28 @@ static void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size,
 static dma_addr_t pci_iommu_map_single(struct pci_dev *hwdev, void *vaddr,
 		size_t size, enum dma_data_direction direction)
 {
-	struct iommu_table * tbl;
-	dma_addr_t dma_handle = DMA_ERROR_CODE;
-	unsigned long uaddr;
-	unsigned int npages;
-
-	BUG_ON(direction == DMA_NONE);
-
-	uaddr = (unsigned long)vaddr;
-	npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
-	npages >>= PAGE_SHIFT;
-
-	tbl = devnode_table(hwdev); 
-
-	if (tbl) {
-		dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
-		if (dma_handle == DMA_ERROR_CODE) {
-			if (printk_ratelimit())  {
-				printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %p npages %d\n",
-				       tbl, vaddr, npages);
-			}
-		} else 
-			dma_handle |= (uaddr & ~PAGE_MASK);
-	}
-
-	return dma_handle;
+	return iommu_map_single(devnode_table(hwdev), vaddr, size, direction);
 }
 
 
 static void pci_iommu_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_handle,
 		size_t size, enum dma_data_direction direction)
 {
-	struct iommu_table *tbl;
-	unsigned int npages;
-	
-	BUG_ON(direction == DMA_NONE);
-
-	npages = (PAGE_ALIGN(dma_handle + size) - (dma_handle & PAGE_MASK))
-		>> PAGE_SHIFT;
-
-	tbl = devnode_table(hwdev); 
-
-	if (tbl) 
-		iommu_free(tbl, dma_handle, npages);
+	iommu_unmap_single(devnode_table(hwdev), dma_handle, size, direction);
 }
 
 
 static int pci_iommu_map_sg(struct pci_dev *pdev, struct scatterlist *sglist,
 		int nelems, enum dma_data_direction direction)
 {
-	struct iommu_table * tbl;
-
-	BUG_ON(direction == DMA_NONE);
-
-	if (nelems == 0)
-		return 0;
-
-	tbl = devnode_table(pdev); 
-	if (!tbl)
-		return 0;
-
-	return iommu_alloc_sg(tbl, &pdev->dev, sglist, nelems, direction);
+	return iommu_map_sg(&pdev->dev, devnode_table(pdev), sglist,
+			nelems, direction);
 }
 
 static void pci_iommu_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist,
 		int nelems, enum dma_data_direction direction)
 {
-	struct iommu_table *tbl;
-
-	BUG_ON(direction == DMA_NONE);
-
-	tbl = devnode_table(pdev); 
-	if (!tbl)
-		return;
-
-	iommu_free_sg(tbl, sglist, nelems);
+	iommu_unmap_sg(devnode_table(pdev), sglist, nelems, direction);
 }
 
 /* We support DMA to/from any memory page via the iommu */
diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c
index 530528a3fbfb..c4c16b8cd7a9 100644
--- a/arch/ppc64/kernel/vio.c
+++ b/arch/ppc64/kernel/vio.c
@@ -413,145 +413,46 @@ int vio_disable_interrupts(struct vio_dev *dev)
 }
 EXPORT_SYMBOL(vio_disable_interrupts);
 
-
 dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr,
 			  size_t size, enum dma_data_direction direction)
 {
-	struct iommu_table *tbl;
-	dma_addr_t dma_handle = DMA_ERROR_CODE;
-	unsigned long uaddr;
-	unsigned int npages;
-
-	BUG_ON(direction == DMA_NONE);
-
-	uaddr = (unsigned long)vaddr;
-	npages = PAGE_ALIGN( uaddr + size ) - ( uaddr & PAGE_MASK );
-	npages >>= PAGE_SHIFT;
-
-	tbl = dev->iommu_table;
-
-	if (tbl) {
-		dma_handle = iommu_alloc(tbl, vaddr, npages, direction);
-		dma_handle |= (uaddr & ~PAGE_MASK);
-	}
-
-	return dma_handle;
+	return iommu_map_single(dev->iommu_table, vaddr, size, direction);
 }
 EXPORT_SYMBOL(vio_map_single);
 
 void vio_unmap_single(struct vio_dev *dev, dma_addr_t dma_handle,
 		      size_t size, enum dma_data_direction direction)
 {
-	struct iommu_table * tbl;
-	unsigned int npages;
-
-	BUG_ON(direction == DMA_NONE);
-
-	npages = PAGE_ALIGN( dma_handle + size ) - ( dma_handle & PAGE_MASK );
-	npages >>= PAGE_SHIFT;
-
-	tbl = dev->iommu_table;
-	if(tbl)
-		iommu_free(tbl, dma_handle, npages);
+	iommu_unmap_single(dev->iommu_table, dma_handle, size, direction);
 }
 EXPORT_SYMBOL(vio_unmap_single);
 
 int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems,
 	       enum dma_data_direction direction)
 {
-	struct iommu_table *tbl;
-
-	BUG_ON(direction == DMA_NONE);
-
-	if (nelems == 0)
-		return 0;
-
-	tbl = vdev->iommu_table;
-	if (!tbl)
-		return 0;
-
-	return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, direction);
+	return iommu_map_sg(&vdev->dev, vdev->iommu_table, sglist,
+			nelems, direction);
 }
 EXPORT_SYMBOL(vio_map_sg);
 
 void vio_unmap_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems,
 		  enum dma_data_direction direction)
 {
-	struct iommu_table *tbl;
-
-	BUG_ON(direction == DMA_NONE);
-
-	tbl = vdev->iommu_table;
-	if (tbl)
-		iommu_free_sg(tbl, sglist, nelems);
+	iommu_unmap_sg(vdev->iommu_table, sglist, nelems, direction);
 }
 EXPORT_SYMBOL(vio_unmap_sg);
 
 void *vio_alloc_consistent(struct vio_dev *dev, size_t size,
 			   dma_addr_t *dma_handle)
 {
-	struct iommu_table * tbl;
-	void *ret = NULL;
-	unsigned int npages, order;
-	dma_addr_t tce;
-
-	size = PAGE_ALIGN(size);
-	npages = size >> PAGE_SHIFT;
-	order = get_order(size);
-
- 	/* Client asked for way to much space.  This is checked later anyway */
-	/* It is easier to debug here for the drivers than in the tce tables.*/
- 	if(order >= IOMAP_MAX_ORDER) {
- 		printk("VIO_DMA: vio_alloc_consistent size too large: 0x%lx \n", size);
- 		return NULL;
- 	}
-
-	tbl = dev->iommu_table;
-
-	if (tbl) {
-		/* Alloc enough pages (and possibly more) */
-		ret = (void *)__get_free_pages(GFP_ATOMIC, order);
-		if (ret) {
-			/* Page allocation succeeded */
-			memset(ret, 0, npages << PAGE_SHIFT);
-			/* Set up tces to cover the allocated range */
-			tce = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL);
-			if (tce == DMA_ERROR_CODE) {
-				PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: iommu_alloc failed\n" );
-				free_pages((unsigned long)ret, order);
-				ret = NULL;
-			} else {
-				*dma_handle = tce;
-			}
-		}
-		else PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: __get_free_pages failed for size = %d\n", size);
-	}
-	else PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: get_iommu_table failed for 0x%016lx\n", dev);
-
-	PPCDBG(PPCDBG_TCE, "\tvio_alloc_consistent: dma_handle = 0x%16.16lx\n", *dma_handle);
-	PPCDBG(PPCDBG_TCE, "\tvio_alloc_consistent: return     = 0x%16.16lx\n", ret);
-	return ret;
+	return iommu_alloc_consistent(dev->iommu_table, size, dma_handle);
 }
 EXPORT_SYMBOL(vio_alloc_consistent);
 
 void vio_free_consistent(struct vio_dev *dev, size_t size,
 			 void *vaddr, dma_addr_t dma_handle)
 {
-	struct iommu_table *tbl;
-	unsigned int npages;
-
-	PPCDBG(PPCDBG_TCE, "vio_free_consistent:\n");
-	PPCDBG(PPCDBG_TCE, "\tdev = 0x%16.16lx, size = 0x%16.16lx, dma_handle = 0x%16.16lx, vaddr = 0x%16.16lx\n", dev, size, dma_handle, vaddr);
-
-	size = PAGE_ALIGN(size);
-	npages = size >> PAGE_SHIFT;
-
-	tbl = dev->iommu_table;
-
-	if ( tbl ) {
-		iommu_free(tbl, dma_handle, npages);
-		free_pages((unsigned long)vaddr, get_order(size));
-	}
+	iommu_free_consistent(dev->iommu_table, size, vaddr, dma_handle);
 }
 EXPORT_SYMBOL(vio_free_consistent);
 
diff --git a/include/asm-ppc64/iommu.h b/include/asm-ppc64/iommu.h
index a2cc850ef10a..689b6adf383e 100644
--- a/include/asm-ppc64/iommu.h
+++ b/include/asm-ppc64/iommu.h
@@ -19,8 +19,8 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
  */
 
-#ifndef _PCI_DMA_H
-#define _PCI_DMA_H
+#ifndef _ASM_IOMMU_H
+#define _ASM_IOMMU_H
 
 #include <asm/types.h>
 #include <linux/spinlock.h>
@@ -131,20 +131,20 @@ extern void iommu_devnode_init(struct iSeries_Device_Node *dn);
  */
 extern struct iommu_table *iommu_init_table(struct iommu_table * tbl);
 
-/* allocates a range of tces and sets them to the pages  */
-extern dma_addr_t iommu_alloc(struct iommu_table *, void *page, 
-			      unsigned int numPages,
-			      enum dma_data_direction direction);
-extern void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 
-		       unsigned int npages);
-
-/* same with sg lists */
-extern int iommu_alloc_sg(struct iommu_table *table, struct device *dev,
-			  struct scatterlist *sglist, int nelems,
-			  enum dma_data_direction direction);
-extern void iommu_free_sg(struct iommu_table *tbl, struct scatterlist *sglist,
-			  int nelems);
-
+extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
+		struct scatterlist *sglist, int nelems,
+		enum dma_data_direction direction);
+extern void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
+		int nelems, enum dma_data_direction direction);
+
+extern void *iommu_alloc_consistent(struct iommu_table *tbl, size_t size,
+		dma_addr_t *dma_handle);
+extern void iommu_free_consistent(struct iommu_table *tbl, size_t size,
+		void *vaddr, dma_addr_t dma_handle);
+extern dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
+		size_t size, enum dma_data_direction direction);
+extern void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
+		size_t size, enum dma_data_direction direction);
 
 extern void tce_init_pSeries(void);
 extern void tce_init_iSeries(void);
@@ -154,4 +154,4 @@ extern void pci_dma_init_direct(void);
 
 extern int ppc64_iommu_off;
 
-#endif
+#endif /* _ASM_IOMMU_H */
-- 
cgit v1.2.3


From 53e8cdeb3bd9eea4ccd67de4a3b5b479680ca063 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:49:21 -0700
Subject: [PATCH] ppc64: replace vio_dma_mapping_error with dma_mapping_error
 everywhere.

From: Stephen Rothwell <sfr@canb.auug.org.au>

James Bottomley is right, this was a mistake.  This patch replaces
vio_dma_mapping_error with dma_mapping_error everywhere.
---
 drivers/net/ibmveth.c   | 16 ++++++++--------
 include/asm-ppc64/vio.h |  5 -----
 2 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'include')

diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c
index 6427a25c2719..3da41374a127 100644
--- a/drivers/net/ibmveth.c
+++ b/drivers/net/ibmveth.c
@@ -404,7 +404,7 @@ static inline void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter)
 static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 {
 	if(adapter->buffer_list_addr != NULL) {
-		if(!vio_dma_mapping_error(adapter->buffer_list_dma)) {
+		if(!dma_mapping_error(adapter->buffer_list_dma)) {
 			vio_unmap_single(adapter->vdev, adapter->buffer_list_dma, 4096, DMA_BIDIRECTIONAL);
 			adapter->buffer_list_dma = DMA_ERROR_CODE;
 		}
@@ -413,7 +413,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 	} 
 
 	if(adapter->filter_list_addr != NULL) {
-		if(!vio_dma_mapping_error(adapter->filter_list_dma)) {
+		if(!dma_mapping_error(adapter->filter_list_dma)) {
 			vio_unmap_single(adapter->vdev, adapter->filter_list_dma, 4096, DMA_BIDIRECTIONAL);
 			adapter->filter_list_dma = DMA_ERROR_CODE;
 		}
@@ -422,7 +422,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter)
 	}
 
 	if(adapter->rx_queue.queue_addr != NULL) {
-		if(!vio_dma_mapping_error(adapter->rx_queue.queue_dma)) {
+		if(!dma_mapping_error(adapter->rx_queue.queue_dma)) {
 			vio_unmap_single(adapter->vdev, adapter->rx_queue.queue_dma, adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL);
 			adapter->rx_queue.queue_dma = DMA_ERROR_CODE;
 		}
@@ -473,9 +473,9 @@ static int ibmveth_open(struct net_device *netdev)
 	adapter->filter_list_dma = vio_map_single(adapter->vdev, adapter->filter_list_addr, 4096, DMA_BIDIRECTIONAL);
 	adapter->rx_queue.queue_dma = vio_map_single(adapter->vdev, adapter->rx_queue.queue_addr, adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL);
 
-	if((vio_dma_mapping_error(adapter->buffer_list_dma) ) ||
-	   (vio_dma_mapping_error(adapter->filter_list_dma)) ||
-	   (vio_dma_mapping_error(adapter->rx_queue.queue_dma))) {
+	if((dma_mapping_error(adapter->buffer_list_dma) ) ||
+	   (dma_mapping_error(adapter->filter_list_dma)) ||
+	   (dma_mapping_error(adapter->rx_queue.queue_dma))) {
 		ibmveth_error_printk("unable to map filter or buffer list pages\n");
 		ibmveth_cleanup(adapter);
 		return -ENOMEM;
@@ -644,7 +644,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 	desc[0].fields.address = vio_map_single(adapter->vdev, skb->data, desc[0].fields.length, PCI_DMA_TODEVICE);
 	desc[0].fields.valid   = 1;
 
-	if(vio_dma_mapping_error(desc[0].fields.address)) {
+	if(dma_mapping_error(desc[0].fields.address)) {
 		ibmveth_error_printk("tx: unable to map initial fragment\n");
 		adapter->tx_map_failed++;
 		adapter->stats.tx_dropped++;
@@ -663,7 +663,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev)
 		desc[curfrag+1].fields.length = frag->size;
 		desc[curfrag+1].fields.valid  = 1;
 
-		if(vio_dma_mapping_error(desc[curfrag+1].fields.address)) {
+		if(dma_mapping_error(desc[curfrag+1].fields.address)) {
 			ibmveth_error_printk("tx: unable to map fragment %d\n", curfrag);
 			adapter->tx_map_failed++;
 			adapter->stats.tx_dropped++;
diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h
index 7fe3a23736e7..107201b25008 100644
--- a/include/asm-ppc64/vio.h
+++ b/include/asm-ppc64/vio.h
@@ -137,9 +137,4 @@ static inline struct vio_dev *to_vio_dev(struct device *dev)
 	return container_of(dev, struct vio_dev, dev);
 }
 
-static inline int vio_dma_mapping_error(dma_addr_t dma_addr)
-{
-	return dma_mapping_error(dma_addr);
-}
-
 #endif /* _ASM_VIO_H */
-- 
cgit v1.2.3


From c3a85f1fb88cfa30ab4af65348eaf4290233cac8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:49:46 -0700
Subject: [PATCH] ppc64: Oops cleanup

From: Anton Blanchard <anton@samba.org>

Oops cleanup:

- Move prototypes into system.h
- Move the debugger hooks into die; all the call sites were calling them.
- Handle bad values passed to prregs
---
 arch/ppc64/kernel/traps.c  | 50 +++++++++++++++++++++-------------------------
 arch/ppc64/mm/fault.c      |  8 +-------
 arch/ppc64/xmon/xmon.c     | 34 +++++++++++++++++++------------
 include/asm-ppc64/system.h |  5 +++++
 4 files changed, 50 insertions(+), 47 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/traps.c b/arch/ppc64/kernel/traps.c
index 6074cae369e9..82d9b8b24e61 100644
--- a/arch/ppc64/kernel/traps.c
+++ b/arch/ppc64/kernel/traps.c
@@ -37,9 +37,6 @@
 #include <asm/processor.h>
 #include <asm/ppcdebug.h>
 
-extern int fix_alignment(struct pt_regs *);
-extern void bad_page_fault(struct pt_regs *, unsigned long, int);
-
 #ifdef CONFIG_PPC_PSERIES
 /* This is true if we are using the firmware NMI handler (typically LPAR) */
 extern int fwnmi_active;
@@ -67,11 +64,17 @@ EXPORT_SYMBOL(__debugger_fault_handler);
 
 static spinlock_t die_lock = SPIN_LOCK_UNLOCKED;
 
-void die(const char *str, struct pt_regs *regs, long err)
+int die(const char *str, struct pt_regs *regs, long err)
 {
 	static int die_counter;
 	int nl = 0;
 
+	if (debugger_fault_handler(regs))
+		return 1;
+
+	if (debugger(regs))
+		return 1;
+
 	console_verbose();
 	spin_lock_irq(&die_lock);
 	bust_spinlocks(1);
@@ -126,15 +129,16 @@ void die(const char *str, struct pt_regs *regs, long err)
 		panic("Fatal exception");
 	}
 	do_exit(SIGSEGV);
+
+	return 0;
 }
 
 static void
 _exception(int signr, siginfo_t *info, struct pt_regs *regs)
 {
 	if (!user_mode(regs)) {
-		if (debugger(regs))
+		if (die("Exception in kernel mode", regs, signr))
 			return;
-		die("Exception in kernel mode", regs, signr);
 	}
 
 	force_sig_info(signr, info, current);
@@ -188,8 +192,7 @@ SystemResetException(struct pt_regs *regs)
 	}
 #endif
 
-	if (!debugger(regs))
-		die("System Reset", regs, 0);
+	die("System Reset", regs, 0);
 
 	/* Must die if the interrupt is not recoverable */
 	if (!(regs->msr & MSR_RI))
@@ -246,9 +249,6 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log err)
  *
  * On hardware prior to Power 4 these exceptions were asynchronous which
  * means we can't tell exactly where it occurred and so we can't recover.
- *
- * Note that the debugger should test RI=0 and warn the user that system
- * state has been corrupted.
  */
 void
 MachineCheckException(struct pt_regs *regs)
@@ -266,12 +266,11 @@ MachineCheckException(struct pt_regs *regs)
 	}
 #endif
 
-	if (debugger_fault_handler(regs))
-		return;
-	if (debugger(regs))
-		return;
+	die("Machine check", regs, 0);
 
-	die("Machine check in kernel mode", regs, 0);
+	/* Must die if the interrupt is not recoverable */
+	if (!(regs->msr & MSR_RI))
+		panic("Unrecoverable Machine check");
 }
 
 void
@@ -397,9 +396,6 @@ ProgramCheckException(struct pt_regs *regs)
 {
 	siginfo_t info;
 
-	if (debugger_fault_handler(regs))
-		return;
-
 	if (regs->msr & 0x100000) {
 		/* IEEE FP exception */
 
@@ -438,16 +434,18 @@ ProgramCheckException(struct pt_regs *regs)
 	}
 }
 
-void
-KernelFPUnavailableException(struct pt_regs *regs)
+void KernelFPUnavailableException(struct pt_regs *regs)
 {
-	die("Unrecoverable FP Unavailable Exception in Kernel", regs, 0);
+	printk(KERN_EMERG "Unrecoverable FP Unavailable Exception "
+			  "%lx at %lx\n", regs->trap, regs->nip);
+	die("Unrecoverable FP Unavailable Exception", regs, SIGABRT);
 }
 
-void
-KernelAltivecUnavailableException(struct pt_regs *regs)
+void KernelAltivecUnavailableException(struct pt_regs *regs)
 {
-	die("Unrecoverable VMX/Altivec Unavailable Exception in Kernel", regs, 0);
+	printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception "
+			  "%lx at %lx\n", regs->trap, regs->nip);
+	die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT);
 }
 
 void
@@ -539,7 +537,6 @@ void unrecoverable_exception(struct pt_regs *regs)
 {
 	printk(KERN_EMERG "Unrecoverable exception %lx at %lx\n",
 	       regs->trap, regs->nip);
-	debugger(regs);
 	die("Unrecoverable exception", regs, SIGABRT);
 }
 
@@ -551,7 +548,6 @@ void kernel_bad_stack(struct pt_regs *regs)
 {
 	printk(KERN_EMERG "Bad kernel stack pointer %lx at %lx\n",
 	       regs->gpr[1], regs->nip);
-	debugger(regs);
 	die("Bad kernel stack pointer", regs, SIGABRT);
 }
 
diff --git a/arch/ppc64/mm/fault.c b/arch/ppc64/mm/fault.c
index 4fd5100acff1..73712c143825 100644
--- a/arch/ppc64/mm/fault.c
+++ b/arch/ppc64/mm/fault.c
@@ -37,8 +37,6 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 
-void bad_page_fault(struct pt_regs *, unsigned long, int);
-
 /*
  * The error_code parameter is
  *  - DSISR for a non-SLB data access fault,
@@ -177,10 +175,8 @@ do_sigbus:
  * It is called from do_page_fault above and from some of the procedures
  * in traps.c.
  */
-void
-bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
+void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 {
-	extern void die(const char *, struct pt_regs *, long);
 	const struct exception_table_entry *entry;
 
 	/* Are we prepared to handle this fault?  */
@@ -190,7 +186,5 @@ bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 	}
 
 	/* kernel has accessed a bad area */
-	if (debugger(regs))
-		return;
 	die("Kernel access of bad area", regs, sig);
 }
diff --git a/arch/ppc64/xmon/xmon.c b/arch/ppc64/xmon/xmon.c
index 8bf490b348f1..90fe14bacc59 100644
--- a/arch/ppc64/xmon/xmon.c
+++ b/arch/ppc64/xmon/xmon.c
@@ -542,8 +542,7 @@ cmds(struct pt_regs *excp)
 			symbol_lookup();
 			break;
 		case 'r':
-			if (excp != NULL)
-				prregs(excp);	/* print regs */
+			prregs(excp);	/* print regs */
 			break;
 		case 'e':
 			if (excp == NULL)
@@ -966,8 +965,7 @@ static void backtrace(struct pt_regs *excp)
 
 spinlock_t exception_print_lock = SPIN_LOCK_UNLOCKED;
 
-void
-excprint(struct pt_regs *fp)
+void excprint(struct pt_regs *fp)
 {
 	unsigned long flags;
 
@@ -1002,21 +1000,31 @@ excprint(struct pt_regs *fp)
 	spin_unlock_irqrestore(&exception_print_lock, flags);
 }
 
-void
-prregs(struct pt_regs *fp)
+void prregs(struct pt_regs *fp)
 {
 	int n;
 	unsigned long base;
 
 	if (scanhex((void *)&base))
 		fp = (struct pt_regs *) base;
-	for (n = 0; n < 16; ++n)
-		printf("R%.2ld = %.16lx   R%.2ld = %.16lx\n", n, fp->gpr[n],
-		       n+16, fp->gpr[n+16]);
-	printf("pc  = %.16lx   msr = %.16lx\nlr  = %.16lx   cr  = %.16lx\n",
-	       fp->nip, fp->msr, fp->link, fp->ccr);
-	printf("ctr = %.16lx   xer = %.16lx   trap = %8lx\n",
-	       fp->ctr, fp->xer, fp->trap);
+
+	if (setjmp(bus_error_jmp) == 0) {
+		__debugger_fault_handler = handle_fault;
+		sync();
+		for (n = 0; n < 16; ++n)
+			printf("R%.2ld = %.16lx   R%.2ld = %.16lx\n", n,
+			       fp->gpr[n], n+16, fp->gpr[n+16]);
+		printf("pc  = %.16lx   msr = %.16lx\nlr  = %.16lx   "
+		       "cr  = %.16lx\n", fp->nip, fp->msr, fp->link, fp->ccr);
+		printf("ctr = %.16lx   xer = %.16lx   trap = %8lx\n",
+		       fp->ctr, fp->xer, fp->trap);
+
+		sync();
+		/* wait a little while to see if we get a machine check */
+		__delay(200);
+	} else {
+		printf("*** Error reading regs\n");
+	}
 }
 
 void
diff --git a/include/asm-ppc64/system.h b/include/asm-ppc64/system.h
index 9d732aa1256e..152a59acbc35 100644
--- a/include/asm-ppc64/system.h
+++ b/include/asm-ppc64/system.h
@@ -94,7 +94,12 @@ static inline int debugger_dabr_match(struct pt_regs *regs) { return 0; }
 static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; }
 #endif
 
+extern int fix_alignment(struct pt_regs *regs);
+extern void bad_page_fault(struct pt_regs *regs, unsigned long address,
+			   int sig);
 extern void show_regs(struct pt_regs * regs);
+extern int die(const char *str, struct pt_regs *regs, long err);
+
 extern void flush_instruction_cache(void);
 extern int _get_PVR(void);
 extern void giveup_fpu(struct task_struct *);
-- 
cgit v1.2.3


From b7ceb1452399ef59ab14868337d2d74a9b5c4c8d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:49:59 -0700
Subject: [PATCH] ppc64: Add smt_snooze_delay cpu sysfs attribute

From: Anton Blanchard <anton@samba.org>

Add smt_snooze_delay cpu sysfs attribute
---
 arch/ppc64/kernel/idle.c  |  7 ++--
 arch/ppc64/kernel/prom.c  | 46 ------------------------
 arch/ppc64/kernel/sysfs.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++
 include/asm-ppc64/naca.h  |  6 ++--
 4 files changed, 95 insertions(+), 53 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c
index b30aea273974..a9a501df397c 100644
--- a/arch/ppc64/kernel/idle.c
+++ b/arch/ppc64/kernel/idle.c
@@ -161,13 +161,14 @@ int default_idle(void)
 
 #ifdef CONFIG_PPC_PSERIES
 
-DECLARE_PER_CPU(smt_snooze_delay);
+DECLARE_PER_CPU(unsigned long, smt_snooze_delay);
 
 int dedicated_idle(void)
 {
 	long oldval;
 	struct paca_struct *lpaca = get_paca(), *ppaca;
 	unsigned long start_snooze;
+	unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay);
 
 	ppaca = &paca[smp_processor_id() ^ 1];
 
@@ -180,14 +181,14 @@ int dedicated_idle(void)
 		if (!oldval) {
 			set_thread_flag(TIF_POLLING_NRFLAG);
 			start_snooze = __get_tb() +
-				naca->smt_snooze_delay*tb_ticks_per_usec;
+				*smt_snooze_delay * tb_ticks_per_usec;
 			while (!need_resched()) {
 				/* need_resched could be 1 or 0 at this 
 				 * point.  If it is 0, set it to 0, so
 				 * an IPI/Prod is sent.  If it is 1, keep
 				 * it that way & schedule work.
 				 */
-				if (naca->smt_snooze_delay == 0 ||
+				if (*smt_snooze_delay == 0 ||
 				    __get_tb() < start_snooze) {
 					HMT_low(); /* Low thread priority */
 					continue;
diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c
index f1cfd43dd39c..6748b2244e88 100644
--- a/arch/ppc64/kernel/prom.c
+++ b/arch/ppc64/kernel/prom.c
@@ -1254,7 +1254,6 @@ smt_setup(void)
 {
 	char *p, *q;
 	char my_smt_enabled = SMT_DYNAMIC;
-	unsigned long my_smt_snooze_delay; 
 	ihandle prom_options = NULL;
 	char option[9];
 	unsigned long offset = reloc_offset();
@@ -1301,51 +1300,6 @@ smt_setup(void)
 	if (!found )
 		my_smt_enabled = SMT_DYNAMIC; /* default to on */
 
-	found = 0;
-	if (my_smt_enabled) {
-		if (strstr(RELOC(cmd_line), RELOC("smt-snooze-delay="))) {
-			for (q = RELOC(cmd_line); (p = strstr(q, RELOC("smt-snooze-delay="))) != 0; ) {
-				q = p + 17;
-				if (p > RELOC(cmd_line) && p[-1] != ' ')
-					continue;
-				found = 1;
-				/* Don't use simple_strtoul() because _ctype & others aren't RELOC'd */
-				my_smt_snooze_delay = 0;
-				while (*q >= '0' && *q <= '9') {
-					my_smt_snooze_delay = my_smt_snooze_delay * 10 + *q - '0';
-					q++;
-				}
-			}
-		}
-
-		if (!found) {
-			prom_options = (ihandle)call_prom(RELOC("finddevice"), 1, 1, RELOC("/options"));
-			if (prom_options != (ihandle) -1) {
-				call_prom(RELOC("getprop"), 
-					4, 1, prom_options,
-					RELOC("ibm,smt-snooze-delay"), 
-					option, 
-					sizeof(option));
-				if (option[0] != 0) {
-					found = 1;
-					/* Don't use simple_strtoul() because _ctype & others aren't RELOC'd */
-					my_smt_snooze_delay = 0;
-					q = option;
-					while (*q >= '0' && *q <= '9') {
-						my_smt_snooze_delay = my_smt_snooze_delay * 10 + *q - '0';
-						q++;
-					}
-				}
-			}
-		}
-
-		if (!found) {
-			my_smt_snooze_delay = 0; /* default value */
-		}
-	} else {
-		my_smt_snooze_delay = 0; /* default value */
-	}
-	_naca->smt_snooze_delay = my_smt_snooze_delay;
 	_naca->smt_state = my_smt_enabled;
 }
 
diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c
index 05ef5291a737..3bcbdec74195 100644
--- a/arch/ppc64/kernel/sysfs.c
+++ b/arch/ppc64/kernel/sysfs.c
@@ -9,6 +9,90 @@
 #include <asm/processor.h>
 #include <asm/cputable.h>
 #include <asm/hvcall.h>
+#include <asm/prom.h>
+
+
+/* SMT stuff */
+
+#ifndef CONFIG_PPC_ISERIES
+
+/* default to snooze disabled */
+DEFINE_PER_CPU(unsigned long, smt_snooze_delay);
+
+static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf,
+				      size_t count)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+	ssize_t ret;
+	unsigned long snooze;
+
+	ret = sscanf(buf, "%lu", &snooze);
+	if (ret != 1)
+		return -EINVAL;
+
+	per_cpu(smt_snooze_delay, cpu->sysdev.id) = snooze;
+
+	return count;
+}
+
+static ssize_t show_smt_snooze_delay(struct sys_device *dev, char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, sysdev);
+
+	return sprintf(buf, "%lu\n", per_cpu(smt_snooze_delay, cpu->sysdev.id));
+}
+
+static SYSDEV_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay,
+		   store_smt_snooze_delay);
+
+/* Only parse OF options if the matching cmdline option was not specified */
+static int smt_snooze_cmdline;
+
+/* Seed each cpu's smt_snooze_delay from the OF ibm,smt-snooze-delay prop */
+static int __init smt_setup(void)
+{
+	struct device_node *options;
+	unsigned int *val;
+	unsigned int cpu;
+
+	if (!(cur_cpu_spec->cpu_features & CPU_FTR_SMT))
+		return 1;
+
+	options = find_path_device("/options");
+	if (!options)
+		return 1;
+
+	val = (unsigned int *)get_property(options, "ibm,smt-snooze-delay",
+					   NULL);
+	if (!smt_snooze_cmdline && val) {
+		for_each_cpu(cpu)
+			per_cpu(smt_snooze_delay, cpu) = *val;
+	}
+	return 1;
+}
+__initcall(smt_setup);
+
+/* Handle "smt-snooze-delay=N": set the delay on all cpus, note cmdline use */
+static int __init setup_smt_snooze_delay(char *str)
+{
+	unsigned int cpu;
+	int snooze;
+
+	if (!(cur_cpu_spec->cpu_features & CPU_FTR_SMT))
+		return 1;
+
+	smt_snooze_cmdline = 1;
+
+	if (get_option(&str, &snooze)) {
+		for_each_cpu(cpu)
+			per_cpu(smt_snooze_delay, cpu) = snooze;
+	}
+	return 1;
+}
+__setup("smt-snooze-delay=", setup_smt_snooze_delay);
+
+#endif
+
 
 /* PMC stuff */
 
@@ -235,6 +319,11 @@ static int __init topology_init(void)
 		register_cpu_pmc(&c->sysdev);
 
 		sysdev_create_file(&c->sysdev, &attr_physical_id);
+
+#ifndef CONFIG_PPC_ISERIES
+		if (cur_cpu_spec->cpu_features & CPU_FTR_SMT)
+			sysdev_create_file(&c->sysdev, &attr_smt_snooze_delay);
+#endif
 	}
 
 	return 0;
diff --git a/include/asm-ppc64/naca.h b/include/asm-ppc64/naca.h
index b93cdf160dd3..a50189402420 100644
--- a/include/asm-ppc64/naca.h
+++ b/include/asm-ppc64/naca.h
@@ -37,12 +37,10 @@ struct naca_struct {
 	u32 dCacheL1LinesPerPage;	/* L1 d-cache lines / page   0x64 */
 	u32 iCacheL1LogLineSize;	/* L1 i-cache line size Log2 0x68 */
 	u32 iCacheL1LinesPerPage;	/* L1 i-cache lines / page   0x6c */
-	u64 smt_snooze_delay;           /* Delay (in usec) before    0x70 */
-                                        /* entering ST mode               */
-	u8  smt_state;                  /* 0 = SMT off               0x78 */
+	u8  smt_state;                  /* 0 = SMT off               0x70 */
 	                                /* 1 = SMT on                     */
 	                                /* 2 = SMT dynamic                */
-	u8  resv0[7];                   /* Reserved           0x70 - 0x7F */
+	u8  resv0[15];                  /* Reserved           0x71 - 0x7F */
 };
 
 extern struct naca_struct *naca;
-- 
cgit v1.2.3


From 9ed9e7e5eb0ab64f58ef41e36779cb96abc17076 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:50:11 -0700
Subject: [PATCH] ppc64: DMA API updates

From: Anton Blanchard <anton@samba.org>

DMA API updates, in particular adding the new cache flush interfaces.
---
 arch/ppc64/kernel/dma.c         | 24 --------------
 include/asm-ppc64/dma-mapping.h | 69 +++++++++++++++++++++++++++++++----------
 include/asm-ppc64/vio.h         | 17 ++--------
 3 files changed, 54 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/dma.c b/arch/ppc64/kernel/dma.c
index f81dcd1e8da1..dec55efcc873 100644
--- a/arch/ppc64/kernel/dma.c
+++ b/arch/ppc64/kernel/dma.c
@@ -131,27 +131,3 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries,
 		BUG();
 }
 EXPORT_SYMBOL(dma_unmap_sg);
-
-void dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size,
-		enum dma_data_direction direction)
-{
-	if (dev->bus == &pci_bus_type)
-		pci_dma_sync_single(to_pci_dev(dev), dma_handle, size, (int)direction);
-	else if (dev->bus == &vio_bus_type)
-		vio_dma_sync_single(to_vio_dev(dev), dma_handle, size, direction);
-	else
-		BUG();
-}
-EXPORT_SYMBOL(dma_sync_single);
-
-void dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems,
-		enum dma_data_direction direction)
-{
-	if (dev->bus == &pci_bus_type)
-		pci_dma_sync_sg(to_pci_dev(dev), sg, nelems, (int)direction);
-	else if (dev->bus == &vio_bus_type)
-		vio_dma_sync_sg(to_vio_dev(dev), sg, nelems, direction);
-	else
-		BUG();
-}
-EXPORT_SYMBOL(dma_sync_sg);
diff --git a/include/asm-ppc64/dma-mapping.h b/include/asm-ppc64/dma-mapping.h
index e12753cf8861..0cdc5346f6f8 100644
--- a/include/asm-ppc64/dma-mapping.h
+++ b/include/asm-ppc64/dma-mapping.h
@@ -36,10 +36,43 @@ extern int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 		enum dma_data_direction direction);
 extern void dma_unmap_sg(struct device *dev, struct scatterlist *sg,
 		int nhwentries, enum dma_data_direction direction);
-extern void dma_sync_single(struct device *dev, dma_addr_t dma_handle,
-		size_t size, enum dma_data_direction direction);
-extern void dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems,
-		enum dma_data_direction direction);
+
+static inline void
+dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
+			enum dma_data_direction direction)
+{
+	BUG_ON(direction == DMA_NONE);
+	/* nothing to do */
+}
+
+static inline void
+dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
+			   enum dma_data_direction direction)
+{
+	BUG_ON(direction == DMA_NONE);
+	/* nothing to do */
+}
+
+static inline void
+dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems,
+		    enum dma_data_direction direction)
+{
+	BUG_ON(direction == DMA_NONE);
+	/* nothing to do */
+}
+
+static inline void
+dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems,
+		       enum dma_data_direction direction)
+{
+	BUG_ON(direction == DMA_NONE);
+	/* nothing to do */
+}
+
+static inline int dma_mapping_error(dma_addr_t dma_addr)
+{
+	return (dma_addr == DMA_ERROR_CODE);
+}
 
 /* Now for the API extensions over the pci_ one */
 
@@ -56,27 +89,29 @@ dma_get_cache_alignment(void)
 }
 
 static inline void
-dma_sync_single_range(struct device *dev, dma_addr_t dma_handle,
-		      unsigned long offset, size_t size,
-		      enum dma_data_direction direction)
+dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle,
+			      unsigned long offset, size_t size,
+			      enum dma_data_direction direction)
 {
-	/* just sync everything, that's all the pci API can do */
-	dma_sync_single(dev, dma_handle, offset+size, direction);
+	BUG_ON(direction == DMA_NONE);
+	/* nothing to do */
 }
 
 static inline void
-dma_cache_sync(void *vaddr, size_t size,
-	       enum dma_data_direction direction)
+dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle,
+				 unsigned long offset, size_t size,
+				 enum dma_data_direction direction)
 {
-	/* could define this in terms of the dma_cache ... operations,
-	 * but if you get this on a platform, you should convert the platform
-	 * to using the generic device DMA API */
-	BUG();
+	BUG_ON(direction == DMA_NONE);
+	/* nothing to do */
 }
 
-static inline int dma_mapping_error(dma_addr_t dma_addr)
+static inline void
+dma_cache_sync(void *vaddr, size_t size,
+	       enum dma_data_direction direction)
 {
-	return (dma_addr == DMA_ERROR_CODE);
+	BUG_ON(direction == DMA_NONE);
+	/* nothing to do */
 }
 
 #endif	/* _ASM_DMA_MAPPING_H */
diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h
index 107201b25008..1ef9a270e7f7 100644
--- a/include/asm-ppc64/vio.h
+++ b/include/asm-ppc64/vio.h
@@ -78,23 +78,10 @@ static inline int vio_dma_supported(struct vio_dev *hwdev, u64 mask)
 		vio_map_single(dev, (page_address(page) + (off)), size, dir)
 #define vio_unmap_page(dev,addr,sz,dir) vio_unmap_single(dev,addr,sz,dir)
 
-
-static inline void vio_dma_sync_single(struct vio_dev *hwdev,
-				       dma_addr_t dma_handle, size_t size,
-				       enum dma_data_direction direction)
-{
-	BUG_ON(direction == DMA_NONE);
-	/* nothing to do */
-}
-
-static inline void vio_dma_sync_sg(struct vio_dev *hwdev,
-				   struct scatterlist *sg, int nelems,
-				   enum dma_data_direction direction)
+static inline int vio_set_dma_mask(struct vio_dev *dev, u64 mask)
 {
-	BUG_ON(direction == DMA_NONE);
-	/* nothing to do */
+	return -EIO;
 }
-static inline int vio_set_dma_mask(struct vio_dev *dev, u64 mask) { return -EIO; }
 
 extern struct bus_type vio_bus_type;
 
-- 
cgit v1.2.3


From ec19a28db34aeb29720c658e5ebf4e60ccee4f6d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:50:24 -0700
Subject: [PATCH] ppc64: Remove unused rtas functions

From: Joel Schopp <jschopp@austin.ibm.com>

I was looking at rtas serialization for reasons I won't go into here.
While wandering through the code I found that two functions were not
properly serialized.  phys_call_rtas and phys_call_rtas_display_status are
the functions.  After looking further they are redundant and not
used anywhere at all.
---
 arch/ppc64/kernel/rtas.c | 36 ------------------------------------
 include/asm-ppc64/rtas.h |  2 --
 2 files changed, 38 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/kernel/rtas.c b/arch/ppc64/kernel/rtas.c
index 4a27c3d8312c..ff0453726c9e 100644
--- a/arch/ppc64/kernel/rtas.c
+++ b/arch/ppc64/kernel/rtas.c
@@ -65,42 +65,6 @@ extern unsigned long reloc_offset(void);
 spinlock_t rtas_data_buf_lock = SPIN_LOCK_UNLOCKED;
 char rtas_data_buf[RTAS_DATA_BUF_SIZE]__page_aligned;
 
-void
-phys_call_rtas(int token, int nargs, int nret, ...)
-{
-	va_list list;
-	unsigned long offset = reloc_offset();
-	struct rtas_args *rtas = PTRRELOC(&(get_paca()->xRtas));
-	int i;
-
-	rtas->token = token;
-	rtas->nargs = nargs;
-	rtas->nret  = nret;
-	rtas->rets  = (rtas_arg_t *)PTRRELOC(&(rtas->args[nargs]));
-
-	va_start(list, nret);
-	for (i = 0; i < nargs; i++)
-	  rtas->args[i] = (rtas_arg_t)LONG_LSW(va_arg(list, ulong));
-	va_end(list);
-
-        enter_rtas(rtas);	
-}
-
-void
-phys_call_rtas_display_status(char c)
-{
-	unsigned long offset = reloc_offset();
-	struct rtas_args *rtas = PTRRELOC(&(get_paca()->xRtas));
-
-	rtas->token = 10;
-	rtas->nargs = 1;
-	rtas->nret  = 1;
-	rtas->rets  = (rtas_arg_t *)PTRRELOC(&(rtas->args[1]));
-	rtas->args[0] = (int)c;
-
-	enter_rtas(rtas);	
-}
-
 void
 call_rtas_display_status(char c)
 {
diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h
index 7f6139064c7c..47232af7b278 100644
--- a/include/asm-ppc64/rtas.h
+++ b/include/asm-ppc64/rtas.h
@@ -169,8 +169,6 @@ extern struct rtas_t rtas;
 extern void enter_rtas(struct rtas_args *);
 extern int rtas_token(const char *service);
 extern long rtas_call(int token, int, int, unsigned long *, ...);
-extern void phys_call_rtas(int, int, int, ...);
-extern void phys_call_rtas_display_status(char);
 extern void call_rtas_display_status(char);
 extern void rtas_restart(char *cmd);
 extern void rtas_power_off(void);
-- 
cgit v1.2.3


From a97de48b693b787fab0a47a1cf35de001ac50a6b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:51:02 -0700
Subject: [PATCH] ppc44x: fix memory leak

From: Matt Porter <mporter@kernel.crashing.org>

This fixes a memory leak when freeing pgds on PPC44x.
---
 arch/ppc/kernel/misc.S | 7 +++++--
 arch/ppc/mm/pgtable.c  | 4 ++--
 include/asm-ppc/page.h | 3 ++-
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S
index 3f9b6a206937..bb4a5ec77429 100644
--- a/arch/ppc/kernel/misc.S
+++ b/arch/ppc/kernel/misc.S
@@ -738,12 +738,15 @@ _GLOBAL(__flush_dcache_icache_phys)
 	blr
 
 /*
- * Clear a page using the dcbz instruction, which doesn't cause any
+ * Clear pages using the dcbz instruction, which doesn't cause any
  * memory traffic (except to write out any cache lines which get
  * displaced).  This only works on cacheable memory.
+ *
+ * void clear_pages(void *page, int order) ;
  */
-_GLOBAL(clear_page)
+_GLOBAL(clear_pages)
 	li	r0,4096/L1_CACHE_LINE_SIZE
+	slw	r0,r0,r4
 	mtctr	r0
 #ifdef CONFIG_8xx
 	li	r4, 0
diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c
index 78ea44090efa..b1b93fc18d4d 100644
--- a/arch/ppc/mm/pgtable.c
+++ b/arch/ppc/mm/pgtable.c
@@ -71,13 +71,13 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	pgd_t *ret;
 
 	if ((ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER)) != NULL)
-		clear_page(ret);
+		clear_pages(ret, PGDIR_ORDER);
 	return ret;
 }
 
 void pgd_free(pgd_t *pgd)
 {
-	free_page((unsigned long)pgd);
+	free_pages((unsigned long)pgd, PGDIR_ORDER);
 }
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
diff --git a/include/asm-ppc/page.h b/include/asm-ppc/page.h
index e47e77327ee5..57838e8e00f1 100644
--- a/include/asm-ppc/page.h
+++ b/include/asm-ppc/page.h
@@ -84,7 +84,8 @@ typedef unsigned long pgprot_t;
 #define PAGE_ALIGN(addr)	(((addr)+PAGE_SIZE-1)&PAGE_MASK)
 
 struct page;
-extern void clear_page(void *page);
+extern void clear_pages(void *page, int order);
+static inline void clear_page(void *page) { clear_pages(page, 0); }
 extern void copy_page(void *to, void *from);
 extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg);
 extern void copy_user_page(void *to, void *from, unsigned long vaddr,
-- 
cgit v1.2.3


From ed678f13aec6fdd86c952b05200f741aa473dba8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:51:16 -0700
Subject: [PATCH] Quota locking fixes

From: Jan Kara <jack@ucw.cz>

Change the locking rules in the quota code to fix lock ordering, especially
with respect to the journal lock.  Some unnecessary spinlocking is also
removed.  The main locking changes are: dqptr_sem and dqio_sem are acquired
only when a transaction is already started, and dqonoff_sem is acquired
before a transaction is started.  This change requires some callbacks into
ext3 (also implemented in this patch) to start a transaction before the locks
are acquired.
---
 fs/Kconfig               |   6 +-
 fs/dquot.c               | 204 ++++++++++++++++++++++++++---------------------
 fs/ext3/super.c          |  51 +++++++++---
 fs/inode.c               |  16 ++--
 include/linux/quotaops.h |  15 +---
 5 files changed, 165 insertions(+), 127 deletions(-)

(limited to 'include')

diff --git a/fs/Kconfig b/fs/Kconfig
index ef8e47fb1c39..c748a2ce35ee 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -417,7 +417,7 @@ config QFMT_V1
 	tristate "Old quota format support"
 	depends on QUOTA
 	help
-	  This quota format was (is) used by kernels earlier than 2.4.??. If
+	  This quota format was (is) used by kernels earlier than 2.4.22. If
 	  you have quota working and you don't want to convert to new quota
 	  format say Y here.
 
@@ -426,8 +426,8 @@ config QFMT_V2
 	depends on QUOTA
 	help
 	  This quota format allows using quotas with 32-bit UIDs/GIDs. If you
-	  need this functionality say Y here. Note that you will need latest
-	  quota utilities for new quota format with this kernel.
+	  need this functionality say Y here. Note that you will need recent
+	  quota utilities (>= 3.01) for new quota format with this kernel.
 
 config QUOTACTL
 	bool
diff --git a/fs/dquot.c b/fs/dquot.c
index b7b9b5c44277..e6b39e66207a 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -85,12 +85,31 @@
  * and quota formats and also dqstats structure containing statistics about the
  * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures
  * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes.
- * Note that we don't have to do the locking of i_blocks and i_bytes when the
- * quota is disabled - i_sem should serialize the access. dq_data_lock should
- * be always grabbed before dq_list_lock.
+ * Updates of i_blocks and i_bytes themselves are guarded by i_lock, which is
+ * acquired directly in inode_add_bytes() and inode_sub_bytes().
+ *
+ * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock
  *
  * Note that some things (eg. sb pointer, type, id) doesn't change during
  * the life of the dquot structure and so needn't to be protected by a lock
+ *
+ * Any operation working on dquots via inode pointers must hold dqptr_sem.  If
+ * the operation only reads pointers from the inode (or does not use them at
+ * all), the read lock is enough.  If pointers are altered, the function must
+ * hold the write lock.  If the operation holds a reference to a dquot in some
+ * other way (e.g. quotactl ops), it must be guarded by dqonoff_sem.
+ * This locking assures that:
+ *   a) update/access to dquot pointers in inode is serialized
+ *   b) everyone is guarded against invalidate_dquots()
+ *
+ * Each dquot has its dq_lock semaphore. Locked dquots might not be referenced
+ * from inodes (dquot_alloc_space() and such don't check the dq_lock).
+ * Currently dquot is locked only when it is being read to memory on the first
+ * dqget(). Write operations on dquots don't hold dq_lock as they copy data
+ * under dq_data_lock spinlock to internal buffers before writing.
+ *
+ * Lock ordering (including journal_lock) is as follows:
+ *  dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > dqio_sem
  */
 spinlock_t dq_list_lock = SPIN_LOCK_UNLOCKED;
 spinlock_t dq_data_lock = SPIN_LOCK_UNLOCKED;
@@ -169,23 +188,6 @@ static void put_quota_format(struct quota_format_type *fmt)
  * mechanism to locate a specific dquot.
  */
 
-/*
- * Note that any operation which operates on dquot data (ie. dq_dqb) must
- * hold dq_data_lock.
- *
- * Any operation working with dquots must hold dqptr_sem. If operation is
- * just reading pointers from inodes than read lock is enough. If pointers
- * are altered function must hold write lock.
- *
- * Locked dquots might not be referenced in inodes. Currently dquot it locked
- * only once in its existence - when it's being read to memory on first dqget()
- * and at that time it can't be referenced from inode. Write operations on
- * dquots don't hold dquot lock as they copy data to internal buffers before
- * writing anyway and copying as well as any data update should be atomic. Also
- * nobody can change used entries in dquot structure as this is done only when
- * quota is destroyed and invalidate_dquots() is called only when dq_count == 0.
- */
-
 static LIST_HEAD(inuse_list);
 static LIST_HEAD(free_dquots);
 static struct list_head dquot_hash[NR_DQHASH];
@@ -286,9 +288,9 @@ static int commit_dqblk(struct dquot *dquot)
 }
 
 /* Invalidate all dquots on the list. Note that this function is called after
- * quota is disabled so no new quota might be created. Because we hold dqptr_sem
- * for writing and pointers were already removed from inodes we actually know that
- * no quota for this sb+type should be held. */
+ * quota is disabled so no new quota might be created. Because we hold
+ * dqonoff_sem and pointers were already removed from inodes we actually know
+ * that no quota for this sb+type should be held. */
 static void invalidate_dquots(struct super_block *sb, int type)
 {
 	struct dquot *dquot;
@@ -302,12 +304,11 @@ static void invalidate_dquots(struct super_block *sb, int type)
 			continue;
 		if (dquot->dq_type != type)
 			continue;
-#ifdef __DQUOT_PARANOIA	
-		/* There should be no users of quota - we hold dqptr_sem for writing */
+#ifdef __DQUOT_PARANOIA
 		if (atomic_read(&dquot->dq_count))
 			BUG();
 #endif
-		/* Quota now have no users and it has been written on last dqput() */
+		/* Quota now has no users and it has been written on last dqput() */
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
@@ -323,7 +324,7 @@ static int vfs_quota_sync(struct super_block *sb, int type)
 	struct quota_info *dqopt = sb_dqopt(sb);
 	int cnt;
 
-	down_read(&dqopt->dqptr_sem);
+	down(&dqopt->dqonoff_sem);
 restart:
 	/* At this point any dirty dquot will definitely be written so we can clear
 	   dirty flag from info */
@@ -359,7 +360,7 @@ restart:
 	spin_lock(&dq_list_lock);
 	dqstats.syncs++;
 	spin_unlock(&dq_list_lock);
-	up_read(&dqopt->dqptr_sem);
+	up(&dqopt->dqonoff_sem);
 
 	return 0;
 }
@@ -402,7 +403,7 @@ static int shrink_dqcache_memory(int nr, unsigned int gfp_mask)
 /*
  * Put reference to dquot
  * NOTE: If you change this function please check whether dqput_blocks() works right...
- * MUST be called with dqptr_sem held
+ * MUST be called with either dqptr_sem or dqonoff_sem held
  */
 static void dqput(struct dquot *dquot)
 {
@@ -467,7 +468,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
 
 /*
  * Get reference to dquot
- * MUST be called with dqptr_sem held
+ * MUST be called with either dqptr_sem or dqonoff_sem held
  */
 static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
 {
@@ -528,7 +529,7 @@ static int dqinit_needed(struct inode *inode, int type)
 	return 0;
 }
 
-/* This routine is guarded by dqptr_sem semaphore */
+/* This routine is guarded by dqonoff_sem semaphore */
 static void add_dquot_ref(struct super_block *sb, int type)
 {
 	struct list_head *p;
@@ -594,7 +595,7 @@ put_it:
 
 /* Free list of dquots - called from inode.c */
 /* dquots are removed from inodes, no new references can be got so we are the only ones holding reference */
-void put_dquot_list(struct list_head *tofree_head)
+static void put_dquot_list(struct list_head *tofree_head)
 {
 	struct list_head *act_head;
 	struct dquot *dquot;
@@ -609,6 +610,20 @@ void put_dquot_list(struct list_head *tofree_head)
 	}
 }
 
+/* Function in inode.c - remove pointers to dquots in icache */
+extern void remove_dquot_ref(struct super_block *, int, struct list_head *);
+
+/* Detach all inodes' references to dquots of this type, then drop them */
+static void drop_dquot_ref(struct super_block *sb, int type)
+{
+	LIST_HEAD(tofree_head);
+
+	down_write(&sb_dqopt(sb)->dqptr_sem);
+	remove_dquot_ref(sb, type, &tofree_head);
+	up_write(&sb_dqopt(sb)->dqptr_sem);
+	put_dquot_list(&tofree_head);	/* runs after dqptr_sem is released */
+}
+
 static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number)
 {
 	dquot->dq_dqb.dqb_curinodes += number;
@@ -804,6 +819,9 @@ void dquot_initialize(struct inode *inode, int type)
 	unsigned int id = 0;
 	int cnt;
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode))
+		return;
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	/* Having dqptr_sem we know NOQUOTA flags can't be altered... */
 	if (IS_NOQUOTA(inode)) {
@@ -831,50 +849,23 @@ void dquot_initialize(struct inode *inode, int type)
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 }
 
-/*
- *	Remove references to quota from inode
- *	This function needs dqptr_sem for writing
- */
-static void dquot_drop_iupdate(struct inode *inode, struct dquot **to_drop)
-{
-	int cnt;
-
-	inode->i_flags &= ~S_QUOTA;
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		to_drop[cnt] = inode->i_dquot[cnt];
-		inode->i_dquot[cnt] = NODQUOT;
-	}
-}
-
 /*
  * 	Release all quotas referenced by inode
+ *	A transaction must already be running on entry
  */
 void dquot_drop(struct inode *inode)
 {
-	struct dquot *to_drop[MAXQUOTAS];
 	int cnt;
 
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	dquot_drop_iupdate(inode, to_drop);
+	inode->i_flags &= ~S_QUOTA;
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (inode->i_dquot[cnt] != NODQUOT) {
+			dqput(inode->i_dquot[cnt]);
+			inode->i_dquot[cnt] = NODQUOT;
+		}
+	}
 	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (to_drop[cnt] != NODQUOT)
-			dqput(to_drop[cnt]);
-}
-
-/*
- *	Release all quotas referenced by inode.
- *	This function assumes dqptr_sem for writing
- */
-void dquot_drop_nolock(struct inode *inode)
-{
-	struct dquot *to_drop[MAXQUOTAS];
-	int cnt;
-
-	dquot_drop_iupdate(inode, to_drop);
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
-		if (to_drop[cnt] != NODQUOT)
-			dqput(to_drop[cnt]);
 }
 
 /*
@@ -885,11 +876,17 @@ int dquot_alloc_space(struct inode *inode, qsize_t number, int warn)
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode)) {
+		inode_add_bytes(inode, number);
+		return QUOTA_OK;
+	}
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = NOWARN;
 
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	spin_lock(&dq_data_lock);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode))
 		goto add_bytes;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -921,9 +918,13 @@ int dquot_alloc_inode(const struct inode *inode, unsigned long number)
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode))
+		return QUOTA_OK;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
 		warntype[cnt] = NOWARN;
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode)) {
 		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 		return QUOTA_OK;
@@ -956,8 +957,14 @@ void dquot_free_space(struct inode *inode, qsize_t number)
 {
 	unsigned int cnt;
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode)) {
+		inode_sub_bytes(inode, number);
+		return;
+	}
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	spin_lock(&dq_data_lock);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode))
 		goto sub_bytes;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -978,7 +985,11 @@ void dquot_free_inode(const struct inode *inode, unsigned long number)
 {
 	unsigned int cnt;
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode))
+		return;
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode)) {
 		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 		return;
@@ -1007,14 +1018,20 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	    chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid;
 	char warntype[MAXQUOTAS];
 
+	/* Solve deadlock when we recurse when holding dqptr_sem... */
+	if (IS_NOQUOTA(inode))
+		return QUOTA_OK;
 	/* Clear the arrays */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		transfer_to[cnt] = transfer_from[cnt] = NODQUOT;
 		warntype[cnt] = NOWARN;
 	}
+	down(&sb_dqopt(inode->i_sb)->dqonoff_sem);
 	down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	/* Now recheck reliably when holding dqptr_sem */
 	if (IS_NOQUOTA(inode)) {	/* File without quota accounting? */
 		up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+		up(&sb_dqopt(inode->i_sb)->dqonoff_sem);
 		return QUOTA_OK;
 	}
 	/* First build the transfer_to list - here we can block on reading of dquots... */
@@ -1065,6 +1082,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	ret = QUOTA_OK;
 warn_put_all:
 	spin_unlock(&dq_data_lock);
+	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	flush_warnings(transfer_to, warntype);
 	
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1073,7 +1091,7 @@ warn_put_all:
 		if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT)
 			dqput(transfer_to[cnt]);
 	}
-	up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+	up(&sb_dqopt(inode->i_sb)->dqonoff_sem);
 	return ret;
 }
 
@@ -1121,9 +1139,6 @@ static inline void reset_enable_flags(struct quota_info *dqopt, int type)
 	}
 }
 
-/* Function in inode.c - remove pointers to dquots in icache */
-extern void remove_dquot_ref(struct super_block *, int);
-
 /*
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  */
@@ -1137,7 +1152,6 @@ int vfs_quota_off(struct super_block *sb, int type)
 
 	/* We need to serialize quota_off() for device */
 	down(&dqopt->dqonoff_sem);
-	down_write(&dqopt->dqptr_sem);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		if (type != -1 && cnt != type)
 			continue;
@@ -1146,7 +1160,7 @@ int vfs_quota_off(struct super_block *sb, int type)
 		reset_enable_flags(dqopt, cnt);
 
 		/* Note: these are blocking operations */
-		remove_dquot_ref(sb, cnt);
+		drop_dquot_ref(sb, cnt);
 		invalidate_dquots(sb, cnt);
 		/*
 		 * Now all dquots should be invalidated, all writes done so we should be only
@@ -1168,7 +1182,6 @@ int vfs_quota_off(struct super_block *sb, int type)
 		dqopt->info[cnt].dqi_bgrace = 0;
 		dqopt->ops[cnt] = NULL;
 	}
-	up_write(&dqopt->dqptr_sem);
 	up(&dqopt->dqonoff_sem);
 out:
 	return 0;
@@ -1180,7 +1193,8 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
 	struct inode *inode;
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct quota_format_type *fmt = find_quota_format(format_id);
-	int error;
+	int error, cnt;
+	struct dquot *to_drop[MAXQUOTAS];
 	unsigned int oldflags;
 
 	if (!fmt)
@@ -1202,7 +1216,6 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
 		goto out_f;
 
 	down(&dqopt->dqonoff_sem);
-	down_write(&dqopt->dqptr_sem);
 	if (sb_has_quota_enabled(sb, type)) {
 		error = -EBUSY;
 		goto out_lock;
@@ -1213,8 +1226,20 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
 	if (!fmt->qf_ops->check_quota_file(sb, type))
 		goto out_file_init;
 	/* We don't want quota and atime on quota files (deadlocks possible) */
-	dquot_drop_nolock(inode);
+	down_write(&dqopt->dqptr_sem);
 	inode->i_flags |= S_NOQUOTA | S_NOATIME;
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		to_drop[cnt] = inode->i_dquot[cnt];
+		inode->i_dquot[cnt] = NODQUOT;
+	}
+	inode->i_flags &= ~S_QUOTA;
+	up_write(&dqopt->dqptr_sem);
+	/* We must put dquots outside of dqptr_sem because we may need to
+	 * start transaction for write */
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (to_drop[cnt])
+			dqput(to_drop[cnt]);
+	}
 
 	dqopt->ops[type] = fmt->qf_ops;
 	dqopt->info[type].dqi_format = fmt;
@@ -1225,7 +1250,6 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path)
 	}
 	up(&dqopt->dqio_sem);
 	set_enable_flags(dqopt, type);
-	up_write(&dqopt->dqptr_sem);
 
 	add_dquot_ref(sb, type);
 	up(&dqopt->dqonoff_sem);
@@ -1268,14 +1292,14 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 {
 	struct dquot *dquot;
 
-	down_read(&sb_dqopt(sb)->dqptr_sem);
+	down(&sb_dqopt(sb)->dqonoff_sem);
 	if (!(dquot = dqget(sb, id, type))) {
-		up_read(&sb_dqopt(sb)->dqptr_sem);
+		up(&sb_dqopt(sb)->dqonoff_sem);
 		return -ESRCH;
 	}
 	do_get_dqblk(dquot, di);
 	dqput(dquot);
-	up_read(&sb_dqopt(sb)->dqptr_sem);
+	up(&sb_dqopt(sb)->dqonoff_sem);
 	return 0;
 }
 
@@ -1337,14 +1361,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
 {
 	struct dquot *dquot;
 
-	down_read(&sb_dqopt(sb)->dqptr_sem);
+	down(&sb_dqopt(sb)->dqonoff_sem);
 	if (!(dquot = dqget(sb, id, type))) {
-		up_read(&sb_dqopt(sb)->dqptr_sem);
+		up(&sb_dqopt(sb)->dqonoff_sem);
 		return -ESRCH;
 	}
 	do_set_dqblk(dquot, di);
 	dqput(dquot);
-	up_read(&sb_dqopt(sb)->dqptr_sem);
+	up(&sb_dqopt(sb)->dqonoff_sem);
 	return 0;
 }
 
@@ -1353,9 +1377,9 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
   
-	down_read(&sb_dqopt(sb)->dqptr_sem);
+	down(&sb_dqopt(sb)->dqonoff_sem);
 	if (!sb_has_quota_enabled(sb, type)) {
-		up_read(&sb_dqopt(sb)->dqptr_sem);
+		up(&sb_dqopt(sb)->dqonoff_sem);
 		return -ESRCH;
 	}
 	mi = sb_dqopt(sb)->info + type;
@@ -1365,7 +1389,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	ii->dqi_flags = mi->dqi_flags & DQF_MASK;
 	ii->dqi_valid = IIF_ALL;
 	spin_unlock(&dq_data_lock);
-	up_read(&sb_dqopt(sb)->dqptr_sem);
+	up(&sb_dqopt(sb)->dqonoff_sem);
 	return 0;
 }
 
@@ -1374,9 +1398,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
 
-	down_read(&sb_dqopt(sb)->dqptr_sem);
+	down(&sb_dqopt(sb)->dqonoff_sem);
 	if (!sb_has_quota_enabled(sb, type)) {
-		up_read(&sb_dqopt(sb)->dqptr_sem);
+		up(&sb_dqopt(sb)->dqonoff_sem);
 		return -ESRCH;
 	}
 	mi = sb_dqopt(sb)->info + type;
@@ -1389,7 +1413,7 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 		mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | (ii->dqi_flags & DQF_MASK);
 	mark_info_dirty(mi);
 	spin_unlock(&dq_data_lock);
-	up_read(&sb_dqopt(sb)->dqptr_sem);
+	up(&sb_dqopt(sb)->dqonoff_sem);
 	return 0;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index baf30c5045ec..e6ae6c9e0f46 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1958,6 +1958,18 @@ int ext3_statfs (struct super_block * sb, struct kstatfs * buf)
 #define EXT3_V0_QFMT_BLOCKS 27
 
 static int (*old_write_dquot)(struct dquot *dquot);
+static void (*old_drop_dquot)(struct inode *inode);
+
+/* Block count handed to ext3_journal_start() for quota format 'fmt' */
+static int fmt_to_blocks(int fmt)
+{
+	if (fmt == QFMT_VFS_OLD)
+		return EXT3_OLD_QFMT_BLOCKS;
+	if (fmt == QFMT_VFS_V0)
+		return EXT3_V0_QFMT_BLOCKS;
+	/* any other format id: use EXT3_MAX_TRANS_DATA */
+	return EXT3_MAX_TRANS_DATA;
+}
 
 static int ext3_write_dquot(struct dquot *dquot)
 {
@@ -1965,20 +1977,11 @@ static int ext3_write_dquot(struct dquot *dquot)
 	int ret;
 	int err;
 	handle_t *handle;
-	struct quota_info *dqops = sb_dqopt(dquot->dq_sb);
+	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
 	struct inode *qinode;
 
-	switch (dqops->info[dquot->dq_type].dqi_format->qf_fmt_id) {
-		case QFMT_VFS_OLD:
-			nblocks = EXT3_OLD_QFMT_BLOCKS;
-			break;
-		case QFMT_VFS_V0:
-			nblocks = EXT3_V0_QFMT_BLOCKS;
-			break;
-		default:
-			nblocks = EXT3_MAX_TRANS_DATA;
-	}
-	qinode = dqops->files[dquot->dq_type]->f_dentry->d_inode;
+	nblocks = fmt_to_blocks(dqopt->info[dquot->dq_type].dqi_format->qf_fmt_id);
+	qinode = dqopt->files[dquot->dq_type]->f_dentry->d_inode;
 	handle = ext3_journal_start(qinode, nblocks);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -1991,6 +1994,28 @@ static int ext3_write_dquot(struct dquot *dquot)
 out:
 	return ret;
 }
+
+/* dq_op->drop wrapper: run the generic drop inside a journal handle
+ * reserving 2x the dquot-write estimate of the first enabled format */
+static void ext3_drop_dquot(struct inode *inode)
+{
+	int nblocks = 0, type;	/* 0 blocks: no quota => no drop */
+	struct quota_info *dqopt = sb_dqopt(inode->i_sb);
+	handle_t *handle;
+
+	for (type = 0; type < MAXQUOTAS; type++) {
+		if (sb_has_quota_enabled(inode->i_sb, type))
+			break;
+	}
+	if (type < MAXQUOTAS)
+		nblocks = fmt_to_blocks(dqopt->info[type].dqi_format->qf_fmt_id);
+	handle = ext3_journal_start(inode, 2*nblocks);
+	/* Drop even if no handle could be started: returning early would
+	 * leave the inode's dquot references unreleased. */
+	old_drop_dquot(inode);
+	if (!IS_ERR(handle))
+		ext3_journal_stop(handle);
+}
 #endif
 
 static struct super_block *ext3_get_sb(struct file_system_type *fs_type,
@@ -2018,7 +2043,9 @@ static int __init init_ext3_fs(void)
 #ifdef CONFIG_QUOTA
 	init_dquot_operations(&ext3_qops);
 	old_write_dquot = ext3_qops.write_dquot;
+	old_drop_dquot = ext3_qops.drop;
 	ext3_qops.write_dquot = ext3_write_dquot;
+	ext3_qops.drop = ext3_drop_dquot;
 #endif
         err = register_filesystem(&ext3_fs_type);
 	if (err)
diff --git a/fs/inode.c b/fs/inode.c
index 01c5740aacdd..d367d4629f3e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1216,15 +1216,13 @@ EXPORT_SYMBOL(inode_needs_sync);
  */
 #ifdef CONFIG_QUOTA
 
-/* Functions back in dquot.c */
-void put_dquot_list(struct list_head *);
+/* Function back in dquot.c */
 int remove_inode_dquot_ref(struct inode *, int, struct list_head *);
 
-void remove_dquot_ref(struct super_block *sb, int type)
+void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head)
 {
 	struct inode *inode;
 	struct list_head *act_head;
-	LIST_HEAD(tofree_head);
 
 	if (!sb->dq_op)
 		return;	/* nothing to do */
@@ -1234,26 +1232,24 @@ void remove_dquot_ref(struct super_block *sb, int type)
 	list_for_each(act_head, &inode_in_use) {
 		inode = list_entry(act_head, struct inode, i_list);
 		if (inode->i_sb == sb && IS_QUOTAINIT(inode))
-			remove_inode_dquot_ref(inode, type, &tofree_head);
+			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
 	list_for_each(act_head, &inode_unused) {
 		inode = list_entry(act_head, struct inode, i_list);
 		if (inode->i_sb == sb && IS_QUOTAINIT(inode))
-			remove_inode_dquot_ref(inode, type, &tofree_head);
+			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
 	list_for_each(act_head, &sb->s_dirty) {
 		inode = list_entry(act_head, struct inode, i_list);
 		if (IS_QUOTAINIT(inode))
-			remove_inode_dquot_ref(inode, type, &tofree_head);
+			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
 	list_for_each(act_head, &sb->s_io) {
 		inode = list_entry(act_head, struct inode, i_list);
 		if (IS_QUOTAINIT(inode))
-			remove_inode_dquot_ref(inode, type, &tofree_head);
+			remove_inode_dquot_ref(inode, type, tofree_head);
 	}
 	spin_unlock(&inode_lock);
-
-	put_dquot_list(&tofree_head);
 }
 
 #endif
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 155c9a2af016..e5a9e6bed751 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -64,11 +64,8 @@ static __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t
 		if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA)
 			return 1;
 	}
-	else {
-		spin_lock(&dq_data_lock);
+	else
 		inode_add_bytes(inode, nr);
-		spin_unlock(&dq_data_lock);
-	}
 	return 0;
 }
 
@@ -87,11 +84,8 @@ static __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
 		if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA)
 			return 1;
 	}
-	else {
-		spin_lock(&dq_data_lock);
+	else
 		inode_add_bytes(inode, nr);
-		spin_unlock(&dq_data_lock);
-	}
 	return 0;
 }
 
@@ -117,11 +111,8 @@ static __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr)
 {
 	if (sb_any_quota_enabled(inode->i_sb))
 		inode->i_sb->dq_op->free_space(inode, nr);
-	else {
-		spin_lock(&dq_data_lock);
+	else
 		inode_sub_bytes(inode, nr);
-		spin_unlock(&dq_data_lock);
-	}
 }
 
 static __inline__ void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr)
-- 
cgit v1.2.3


From 94b1c3ebf78bd58c2f45b78f2c24c7c939c34a9e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:52:32 -0700
Subject: [PATCH] knfsd: Remove name_lookup.h that no one is using anymore.

From: NeilBrown <neilb@cse.unsw.edu.au>
---
 include/linux/sunrpc/name_lookup.h | 38 --------------------------------------
 1 file changed, 38 deletions(-)
 delete mode 100644 include/linux/sunrpc/name_lookup.h

(limited to 'include')

diff --git a/include/linux/sunrpc/name_lookup.h b/include/linux/sunrpc/name_lookup.h
deleted file mode 100644
index 0c97ec324ada..000000000000
--- a/include/linux/sunrpc/name_lookup.h
+++ /dev/null
@@ -1,38 +0,0 @@
-
-/*
- * map between user/group name and id for a given 'client' 
- */
-
-struct name_ent {
-	char name[20];
-};
-static inline int name_get_user(int uid, struct name_ent **namep)
-{
-	struct name_ent *n = kmalloc(sizeof(*n),GFP_KERNEL);
-	if (n) sprintf(n->name, "%d",uid);
-	*namep = n;
-	return n ? 0 : -ENOMEM;
-}
-static inline int name_get_group(int uid, struct name_ent **namep)
-{
-	struct name_ent *n = kmalloc(sizeof(*n),GFP_KERNEL);
-	if (n) sprintf(n->name, "%d",uid);
-	*namep = n;
-	return n ? 0 : -ENOMEM;
-}
-static inline int name_get_uid(char *name, int name_len, int *uidp)
-{
-	*uidp = simple_strtoul(name, NULL, 0);
-	return 0;
-}
-
-static inline int name_get_gid(char *name, int name_len, int *gidp)
-{
-	*gidp = simple_strtoul(name, NULL, 0);
-	return 0;
-}
-
-static inline void name_put(struct name_ent *ent) 
-{
-	kfree(ent);
-}
-- 
cgit v1.2.3


From c02c0886973521cd77904d8f07aa98d99c63cb3b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:52:44 -0700
Subject: [PATCH] knfsd: Add server-side support for the nfsv4
 mounted_on_fileid attribute.

From: NeilBrown <neilb@cse.unsw.edu.au>
---
 fs/nfsd/nfs4xdr.c         | 11 +++++++++++
 include/linux/nfs4.h      |  1 +
 include/linux/nfsd/nfsd.h |  2 +-
 3 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index d19b1c6b7f45..8908bfc17184 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1588,7 +1588,18 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
 		WRITE32(stat.mtime.tv_sec);
 		WRITE32(stat.mtime.tv_nsec);
 	}
+	if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) {
+		struct dentry *mnt_pnt, *mnt_root;
 
+		if ((buflen -= 8) < 0)
+                	goto out_resource;
+		mnt_root = exp->ex_mnt->mnt_root;
+		if (mnt_root->d_inode == dentry->d_inode) {
+			mnt_pnt = exp->ex_mnt->mnt_mountpoint;
+			WRITE64((u64) mnt_pnt->d_inode->i_ino);
+		} else
+                	WRITE64((u64) stat.ino);
+	}
 	*attrlenp = htonl((char *)p - (char *)attrlenp - 4);
 	*countp = p - buffer;
 	status = nfs_ok;
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index e8ea2239a213..520545881a52 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -274,6 +274,7 @@ enum lock_type4 {
 #define FATTR4_WORD1_TIME_METADATA      (1 << 20)
 #define FATTR4_WORD1_TIME_MODIFY        (1 << 21)
 #define FATTR4_WORD1_TIME_MODIFY_SET    (1 << 22)
+#define FATTR4_WORD1_MOUNTED_ON_FILEID  (1 << 23)
 
 #define NFSPROC4_NULL 0
 #define NFSPROC4_COMPOUND 1
diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h
index 6e6a66208308..418356558209 100644
--- a/include/linux/nfsd/nfsd.h
+++ b/include/linux/nfsd/nfsd.h
@@ -278,7 +278,7 @@ static inline int is_fsid(struct svc_fh *fh, struct knfsd_fh *reffh)
  | FATTR4_WORD1_SPACE_AVAIL     | FATTR4_WORD1_SPACE_FREE   | FATTR4_WORD1_SPACE_TOTAL      \
  | FATTR4_WORD1_SPACE_USED      | FATTR4_WORD1_TIME_ACCESS  | FATTR4_WORD1_TIME_ACCESS_SET  \
  | FATTR4_WORD1_TIME_CREATE     | FATTR4_WORD1_TIME_DELTA   | FATTR4_WORD1_TIME_METADATA    \
- | FATTR4_WORD1_TIME_MODIFY     | FATTR4_WORD1_TIME_MODIFY_SET)
+ | FATTR4_WORD1_TIME_MODIFY     | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID)
 
 /* These will return ERR_INVAL if specified in GETATTR or READDIR. */
 #define NFSD_WRITEONLY_ATTRS_WORD1							    \
-- 
cgit v1.2.3


From 238a06e203a96960843faec4ec8f553f453082b9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:53:09 -0700
Subject: [PATCH] knfsd: Export a symbol needed by auth_gss

From: NeilBrown <neilb@cse.unsw.edu.au>

From: "J. Bruce Fields" <bfields@fieldses.org>

Without this, compiling auth_gss as a module fails.
---
 include/linux/sunrpc/xdr.h | 1 +
 net/sunrpc/sunrpc_syms.c   | 1 +
 net/sunrpc/xdr.c           | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 0ccaff2cdee2..2b334dc19962 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -145,6 +145,7 @@ extern void _copy_from_pages(char *, struct page **, size_t, size_t);
 extern void xdr_buf_from_iov(struct iovec *, struct xdr_buf *);
 extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, int, int);
 extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, int);
+extern int read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len);
 
 /*
  * Helper structure for copying from an sk_buff.
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c
index 9061f6498cc4..1ae41edbb0f1 100644
--- a/net/sunrpc/sunrpc_syms.c
+++ b/net/sunrpc/sunrpc_syms.c
@@ -134,6 +134,7 @@ EXPORT_SYMBOL(xdr_read_pages);
 EXPORT_SYMBOL(xdr_buf_from_iov);
 EXPORT_SYMBOL(xdr_buf_subsegment);
 EXPORT_SYMBOL(xdr_buf_read_netobj);
+EXPORT_SYMBOL(read_bytes_from_xdr_buf);
 
 /* Debugging symbols */
 #ifdef RPC_DEBUG
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index accfdd9284df..cae451e8db8d 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -799,7 +799,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
 }
 
 /* obj is assumed to point to allocated memory of size at least len: */
-static int
+int
 read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len)
 {
 	struct xdr_buf subbuf;
-- 
cgit v1.2.3


From 9abdc6608d7c5e3cb09c05bd6c726d04dc59ace4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:53:24 -0700
Subject: [PATCH] knfsd: Add data integrity to server side gss

From: NeilBrown <neilb@cse.unsw.edu.au>

From: "J. Bruce Fields" <bfields@fieldses.org>

rpcsec_gss supports three security levels:

1.  authentication only: sign the header of each rpc request and response.

2. integrity: sign the header and body of each rpc request and response.

3.  privacy: sign the header and encrypt the body of each rpc request and
   response.

The first 2 are already supported on the client; this adds integrity support
on the server.
---
 include/linux/sunrpc/svcauth_gss.h  |   9 --
 net/sunrpc/auth_gss/gss_krb5_mech.c |   2 +
 net/sunrpc/auth_gss/svcauth_gss.c   | 172 ++++++++++++++++++++++++++++++++++--
 3 files changed, 168 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h
index 73ca6ef2c4a8..a444c9edb9e9 100644
--- a/include/linux/sunrpc/svcauth_gss.h
+++ b/include/linux/sunrpc/svcauth_gss.h
@@ -22,14 +22,5 @@
 int gss_svc_init(void);
 int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name);
 
-
-struct gss_svc_data {
-	/* decoded gss client cred: */
-	struct rpc_gss_wire_cred	clcred;
-	/* pointer to the beginning of the procedure-specific results, which
-	 * may be encrypted/checksummed in svcauth_gss_release: */
-	u32				*body_start;
-};
-
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 42ceee1907d7..57c074a06970 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -236,6 +236,8 @@ static int __init init_kerberos_module(void)
 	gss_register_triple(RPC_AUTH_GSS_KRB5I, gm, 0, RPC_GSS_SVC_INTEGRITY);
 	if (svcauth_gss_register_pseudoflavor(RPC_AUTH_GSS_KRB5, "krb5"))
 		printk("Failed to register %s with server!\n", "krb5");
+	if (svcauth_gss_register_pseudoflavor(RPC_AUTH_GSS_KRB5I, "krb5i"))
+		printk("Failed to register %s with server!\n", "krb5i");
 	gss_mech_put(gm);
 	return 0;
 }
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c
index 9e13aaa2bc79..2277667d3397 100644
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -670,6 +670,68 @@ out:
 	return stat;
 }
 
+/* Fetch the 32-bit network-order word at offset base, in host order */
+static inline int
+read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj)
+{
+	u32 be_word;
+	int err = read_bytes_from_xdr_buf(buf, base, &be_word, sizeof(*obj));
+
+	if (err == 0)
+		*obj = ntohl(be_word);
+	/* *obj is left untouched on failure */
+	return err;
+}
+
+/* Check the client's MIC over the integrity-protected request body and
+ * then the sequence number; returns 0 if both match, else -EINVAL.
+ * Sharing this with the client would be nice.  Obstacles:
+ *	The client shouldn't malloc(), would have to pass in own memory.
+ *	The server uses base of head iovec as read pointer, while the
+ *	client uses separate pointer. */
+static int
+unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx)
+{
+	int stat = -EINVAL;
+	u32 integ_len, maj_stat;
+	struct xdr_netobj mic = { .data = NULL };
+	struct xdr_buf integ_buf;
+
+	integ_len = ntohl(svc_getu32(&buf->head[0]));
+	if ((integ_len & 3) || integ_len > buf->len)
+		goto out;
+	/* lengths come off the wire: reject bad ones, don't BUG() */
+	if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len))
+		goto out;
+	if (read_u32_from_xdr_buf(buf, integ_len, &mic.len))
+		goto out;
+	if (mic.len > RPC_MAX_AUTH_SIZE)
+		goto out;
+	mic.data = kmalloc(mic.len, GFP_KERNEL);
+	if (!mic.data)
+		goto out;
+	if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len))
+		goto out;
+	maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL);
+	if (maj_stat != GSS_S_COMPLETE)
+		goto out;
+	if (ntohl(svc_getu32(&buf->head[0])) != seq)
+		goto out;
+	stat = 0;
+out:
+	kfree(mic.data);	/* NULL (a no-op) on early exits */
+	return stat;
+}
+
+struct gss_svc_data {
+	/* decoded gss client cred: */
+	struct rpc_gss_wire_cred	clcred;
+	/* start of the procedure-specific results, which may be
+	 * encrypted/checksummed in svcauth_gss_release; 0 if nothing to wrap */
+	u32				*body_start;
+	struct rsc			*rsci;	/* held ref; its mechctx signs the reply */
+};
+
 /*
  * Accept an rpcsec packet.
  * If context establishment, punt to user space
@@ -701,6 +763,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp)
 	if (!svcdata)
 		goto auth_err;
 	rqstp->rq_auth_data = svcdata;
+	svcdata->body_start = 0;
+	svcdata->rsci = NULL;
 	gc = &svcdata->clcred;
 
 	/* start of rpc packet is 7 u32's back from here:
@@ -754,9 +818,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp)
 		break;
 	case RPC_GSS_PROC_DATA:
 	case RPC_GSS_PROC_DESTROY:
-		/* integrity and privacy unsupported: */
-		if (gc->gc_svc != RPC_GSS_SVC_NONE)
-			goto auth_err;
 		*authp = rpcsec_gsserr_credproblem;
 		rsci = gss_svc_searchbyctx(&gc->gc_ctx);
 		if (!rsci)
@@ -841,10 +902,28 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp)
 		*authp = rpcsec_gsserr_ctxproblem;
 		if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq))
 			goto auth_err;
-		/* For use when wrapping: */
-		svcdata->body_start = resv->iov_base + 1;
 		rqstp->rq_cred = rsci->cred;
 		get_group_info(rsci->cred.cr_group_info);
+		*authp = rpc_autherr_badcred;
+		switch (gc->gc_svc) {
+		case RPC_GSS_SVC_NONE:
+			break;
+		case RPC_GSS_SVC_INTEGRITY:
+			if (unwrap_integ_data(&rqstp->rq_arg,
+					gc->gc_seq, rsci->mechctx))
+				goto auth_err;
+			svcdata->rsci = rsci;
+			cache_get(&rsci->h);
+			/* placeholders for length and seq. number: */
+			svcdata->body_start = resv->iov_base + resv->iov_len;
+			svc_putu32(resv, 0);
+			svc_putu32(resv, 0);
+			break;
+		case RPC_GSS_SVC_PRIVACY:
+			/* currently unsupported */
+		default:
+			goto auth_err;
+		}
 		ret = SVC_OK;
 		goto out;
 	}
@@ -867,14 +946,95 @@ out:
 static int
 svcauth_gss_release(struct svc_rqst *rqstp)
 {
+	struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data;
+	struct rpc_gss_wire_cred *gc = &gsd->clcred;
+	struct xdr_buf *resbuf = &rqstp->rq_res;
+	struct xdr_buf integ_buf;
+	struct xdr_netobj mic;
+	struct iovec *resv;
+	u32 *p;
+	int integ_offset, integ_len;
+	int stat = -EINVAL;
+
+	if (gc->gc_proc != RPC_GSS_PROC_DATA)
+		goto out;
+	/* Release can be called twice, but we only wrap once. */
+	if (gsd->body_start == 0)
+		goto out;
+	/* normally not set till svc_send, but we need it here: */
+	resbuf->len = resbuf->head[0].iov_len
+		+ resbuf->page_len + resbuf->tail[0].iov_len;
+	switch (gc->gc_svc) {
+	case RPC_GSS_SVC_NONE:
+		break;
+	case RPC_GSS_SVC_INTEGRITY:
+		p = gsd->body_start;
+		gsd->body_start = 0;
+		/* move accept_stat to right place: */
+		memcpy(p, p + 2, 4);
+		/* don't wrap in failure case: */
+		/* Note: counting on not getting here if call was not even
+		 * accepted! */
+		if (*p != rpc_success) {
+			resbuf->head[0].iov_len -= 2 * 4;
+			goto out;
+		}
+		p++;
+		integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base;
+		integ_len = resbuf->len - integ_offset;
+		BUG_ON(integ_len % 4);
+		*p++ = htonl(integ_len);
+		*p++ = htonl(gc->gc_seq);
+		if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset,
+					integ_len))
+			BUG();
+		if (resbuf->page_len == 0
+			&& resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE
+				< PAGE_SIZE) {
+			BUG_ON(resbuf->tail[0].iov_len);
+			/* Use head for everything */
+			resv = &resbuf->head[0];
+		} else if (resbuf->tail[0].iov_base == NULL) {
+			/* copied from nfsd4_encode_read */
+			svc_take_page(rqstp);
+			resbuf->tail[0].iov_base = page_address(rqstp
+					->rq_respages[rqstp->rq_resused-1]);
+			rqstp->rq_restailpage = rqstp->rq_resused-1;
+			resbuf->tail[0].iov_len = 0;
+			resv = &resbuf->tail[0];
+		} else {
+			resv = &resbuf->tail[0];
+		}
+		mic.data = (u8 *)resv->iov_base + resv->iov_len + 4;
+		if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic))
+			goto out_err;
+		svc_putu32(resv, htonl(mic.len));
+		memset(mic.data + mic.len, 0,
+				round_up_to_quad(mic.len) - mic.len);
+		resv->iov_len += XDR_QUADLEN(mic.len) << 2;
+		/* not strictly required: */
+		resbuf->len += XDR_QUADLEN(mic.len) << 2;
+		BUG_ON(resv->iov_len > PAGE_SIZE);
+		break;
+	case RPC_GSS_SVC_PRIVACY:
+	default:
+		goto out_err;
+	}
+
+out:
+	stat = 0;
+out_err:
 	if (rqstp->rq_client)
 		auth_domain_put(rqstp->rq_client);
 	rqstp->rq_client = NULL;
 	if (rqstp->rq_cred.cr_group_info)
 		put_group_info(rqstp->rq_cred.cr_group_info);
 	rqstp->rq_cred.cr_group_info = NULL;
+	if (gsd->rsci)
+		rsc_put(&gsd->rsci->h, &rsc_cache);
+	gsd->rsci = NULL;
 
-	return 0;
+	return stat;
 }
 
 static void
-- 
cgit v1.2.3


From c334f752d8e9d3847d4459d06f7544dea9a49923 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:53:50 -0700
Subject: [PATCH] posix message queues: code move

From: Manfred Spraul <manfred@colorfullife.com>

cleanup of sysv ipc as a preparation for posix message queues:

- replace !CONFIG_SYSVIPC wrappers for copy_semundo and exit_sem with
  static inline wrappers.  Now the whole ipc/util.c file is only used if
  CONFIG_SYSVIPC is set, use makefile magic instead of #ifdef.

- remove the prototypes for copy_semundo and exit_sem from kernel/fork.c;
  they belong in a header file.

- create a new msgutil.c with the helper functions for message queues.

- cleanup the helper functions: run Lindent, add __user tags.
---
 include/linux/msg.h |   3 --
 include/linux/sem.h |  17 ++++++-
 ipc/Makefile        |   4 +-
 ipc/msg.c           | 105 -------------------------------------------
 ipc/msgutil.c       | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 ipc/util.c          |  19 --------
 ipc/util.h          |  10 +++++
 kernel/fork.c       |   4 +-
 8 files changed, 155 insertions(+), 134 deletions(-)
 create mode 100644 ipc/msgutil.c

(limited to 'include')

diff --git a/include/linux/msg.h b/include/linux/msg.h
index b235e862a3dd..2c4c6aa643ff 100644
--- a/include/linux/msg.h
+++ b/include/linux/msg.h
@@ -74,9 +74,6 @@ struct msg_msg {
 	/* the actual message follows immediately */
 };
 
-#define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
-#define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
-
 /* one msq_queue structure for each present queue on the system */
 struct msg_queue {
 	struct kern_ipc_perm q_perm;
diff --git a/include/linux/sem.h b/include/linux/sem.h
index b337c509ac29..aaf45764a56e 100644
--- a/include/linux/sem.h
+++ b/include/linux/sem.h
@@ -134,7 +134,22 @@ struct sysv_sem {
 	struct sem_undo_list *undo_list;
 };
 
-void exit_sem(struct task_struct *p);
+#ifdef CONFIG_SYSVIPC
+
+extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
+extern void exit_sem(struct task_struct *tsk);
+
+#else
+static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
+{
+	return 0;
+}
+
+static inline void exit_sem(struct task_struct *tsk)
+{
+	return;
+}
+#endif
 
 #endif /* __KERNEL__ */
 
diff --git a/ipc/Makefile b/ipc/Makefile
index ccc6c64c2493..6cd32a30f03f 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -2,7 +2,5 @@
 # Makefile for the linux ipc.
 #
 
-obj-y   := util.o
-
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
-obj-$(CONFIG_SYSVIPC) += msg.o sem.o shm.o
+obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
diff --git a/ipc/msg.c b/ipc/msg.c
index 709ff71bf5c1..37e2d3bb17cb 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -51,11 +51,6 @@ struct msg_sender {
 	struct task_struct* tsk;
 };
 
-struct msg_msgseg {
-	struct msg_msgseg* next;
-	/* the next part of the message follows immediately */
-};
-
 #define SEARCH_ANY		1
 #define SEARCH_EQUAL		2
 #define SEARCH_NOTEQUAL		3
@@ -129,106 +124,6 @@ static int newque (key_t key, int msgflg)
 	return msg_buildid(id,msq->q_perm.seq);
 }
 
-static void free_msg(struct msg_msg* msg)
-{
-	struct msg_msgseg* seg;
-
-	security_msg_msg_free(msg);
-
-	seg = msg->next;
-	kfree(msg);
-	while(seg != NULL) {
-		struct msg_msgseg* tmp = seg->next;
-		kfree(seg);
-		seg = tmp;
-	}
-}
-
-static struct msg_msg* load_msg(void* src, int len)
-{
-	struct msg_msg* msg;
-	struct msg_msgseg** pseg;
-	int err;
-	int alen;
-
-	alen = len;
-	if(alen > DATALEN_MSG)
-		alen = DATALEN_MSG;
-
-	msg = (struct msg_msg *) kmalloc (sizeof(*msg) + alen, GFP_KERNEL);
-	if(msg==NULL)
-		return ERR_PTR(-ENOMEM);
-
-	msg->next = NULL;
-	msg->security = NULL;
-
-	if (copy_from_user(msg+1, src, alen)) {
-		err = -EFAULT;
-		goto out_err;
-	}
-
-	len -= alen;
-	src = ((char*)src)+alen;
-	pseg = &msg->next;
-	while(len > 0) {
-		struct msg_msgseg* seg;
-		alen = len;
-		if(alen > DATALEN_SEG)
-			alen = DATALEN_SEG;
-		seg = (struct msg_msgseg *) kmalloc (sizeof(*seg) + alen, GFP_KERNEL);
-		if(seg==NULL) {
-			err=-ENOMEM;
-			goto out_err;
-		}
-		*pseg = seg;
-		seg->next = NULL;
-		if(copy_from_user (seg+1, src, alen)) {
-			err = -EFAULT;
-			goto out_err;
-		}
-		pseg = &seg->next;
-		len -= alen;
-		src = ((char*)src)+alen;
-	}
-	
-	err = security_msg_msg_alloc(msg);
-	if (err)
-		goto out_err;
-
-	return msg;
-
-out_err:
-	free_msg(msg);
-	return ERR_PTR(err);
-}
-
-static int store_msg(void* dest, struct msg_msg* msg, int len)
-{
-	int alen;
-	struct msg_msgseg *seg;
-
-	alen = len;
-	if(alen > DATALEN_MSG)
-		alen = DATALEN_MSG;
-	if(copy_to_user (dest, msg+1, alen))
-		return -1;
-
-	len -= alen;
-	dest = ((char*)dest)+alen;
-	seg = msg->next;
-	while(len > 0) {
-		alen = len;
-		if(alen > DATALEN_SEG)
-			alen = DATALEN_SEG;
-		if(copy_to_user (dest, seg+1, alen))
-			return -1;
-		len -= alen;
-		dest = ((char*)dest)+alen;
-		seg=seg->next;
-	}
-	return 0;
-}
-
 static inline void ss_add(struct msg_queue* msq, struct msg_sender* mss)
 {
 	mss->tsk=current;
diff --git a/ipc/msgutil.c b/ipc/msgutil.c
new file mode 100644
index 000000000000..e48d777de2a3
--- /dev/null
+++ b/ipc/msgutil.c
@@ -0,0 +1,127 @@
+/*
+ * linux/ipc/msgutil.c
+ * Copyright (C) 1999, 2004 Manfred Spraul
+ *
+ * This file is released under GNU General Public Licence version 2 or
+ * (at your option) any later version.
+ *
+ * See the file COPYING for more details.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/security.h>
+#include <linux/slab.h>
+#include <linux/ipc.h>
+#include <asm/uaccess.h>
+
+#include "util.h"
+
+struct msg_msgseg {
+	struct msg_msgseg* next;
+	/* the next part of the message follows immediately */
+};
+
+#define DATALEN_MSG	(PAGE_SIZE-sizeof(struct msg_msg))
+#define DATALEN_SEG	(PAGE_SIZE-sizeof(struct msg_msgseg))
+
+struct msg_msg *load_msg(void __user *src, int len)
+{
+	struct msg_msg *msg;
+	struct msg_msgseg **pseg;
+	int err;
+	int alen;
+
+	alen = len;
+	if (alen > DATALEN_MSG)
+		alen = DATALEN_MSG;
+
+	msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
+	if (msg == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	msg->next = NULL;
+	msg->security = NULL;
+
+	if (copy_from_user(msg + 1, src, alen)) {
+		err = -EFAULT;
+		goto out_err;
+	}
+
+	len -= alen;
+	src = ((char *)src) + alen;
+	pseg = &msg->next;
+	while (len > 0) {
+		struct msg_msgseg *seg;
+		alen = len;
+		if (alen > DATALEN_SEG)
+			alen = DATALEN_SEG;
+		seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen,
+						 GFP_KERNEL);
+		if (seg == NULL) {
+			err = -ENOMEM;
+			goto out_err;
+		}
+		*pseg = seg;
+		seg->next = NULL;
+		if (copy_from_user(seg + 1, src, alen)) {
+			err = -EFAULT;
+			goto out_err;
+		}
+		pseg = &seg->next;
+		len -= alen;
+		src = ((char *)src) + alen;
+	}
+
+	err = security_msg_msg_alloc(msg);
+	if (err)
+		goto out_err;
+
+	return msg;
+
+out_err:
+	free_msg(msg);
+	return ERR_PTR(err);
+}
+
+int store_msg(void __user *dest, struct msg_msg *msg, int len)
+{
+	int alen;
+	struct msg_msgseg *seg;
+
+	alen = len;
+	if (alen > DATALEN_MSG)
+		alen = DATALEN_MSG;
+	if (copy_to_user(dest, msg + 1, alen))
+		return -1;
+
+	len -= alen;
+	dest = ((char *)dest) + alen;
+	seg = msg->next;
+	while (len > 0) {
+		alen = len;
+		if (alen > DATALEN_SEG)
+			alen = DATALEN_SEG;
+		if (copy_to_user(dest, seg + 1, alen))
+			return -1;
+		len -= alen;
+		dest = ((char *)dest) + alen;
+		seg = seg->next;
+	}
+	return 0;
+}
+
+void free_msg(struct msg_msg *msg)
+{
+	struct msg_msgseg *seg;
+
+	security_msg_msg_free(msg);
+
+	seg = msg->next;
+	kfree(msg);
+	while (seg != NULL) {
+		struct msg_msgseg *tmp = seg->next;
+		kfree(seg);
+		seg = tmp;
+	}
+}
diff --git a/ipc/util.c b/ipc/util.c
index 6d94883edae0..f74c5eef57d0 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -25,8 +25,6 @@
 #include <linux/rcupdate.h>
 #include <linux/workqueue.h>
 
-#if defined(CONFIG_SYSVIPC)
-
 #include "util.h"
 
 /**
@@ -531,20 +529,3 @@ int ipc_parse_version (int *cmd)
 }
 
 #endif /* __ia64__ */
-
-#else
-/*
- * Dummy functions when SYSV IPC isn't configured
- */
-
-int copy_semundo(unsigned long clone_flags, struct task_struct *tsk)
-{
-	return 0;
-}
-
-void exit_sem(struct task_struct *tsk)
-{
-	return;
-}
-
-#endif /* CONFIG_SYSVIPC */
diff --git a/ipc/util.h b/ipc/util.h
index 79c8fc901317..e6434942c097 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -4,6 +4,10 @@
  *
  * ipc helper functions (c) 1999 Manfred Spraul <manfreds@colorfullife.com>
  */
+
+#ifndef _IPC_UTIL_H
+#define _IPC_UTIL_H
+
 #define USHRT_MAX 0xffff
 #define SEQ_MULTIPLIER	(IPCMNI)
 
@@ -62,3 +66,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out);
 #else
 int ipc_parse_version (int *cmd);
 #endif
+
+extern void free_msg(struct msg_msg *msg);
+extern struct msg_msg *load_msg(void __user *src, int len);
+extern int store_msg(void __user *dest, struct msg_msg *msg, int len);
+
+#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b17a249c50d..a1f20cabbdd3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -21,6 +21,7 @@
 #include <linux/completion.h>
 #include <linux/namespace.h>
 #include <linux/personality.h>
+#include <linux/sem.h>
 #include <linux/file.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
@@ -39,9 +40,6 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
-extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk);
-extern void exit_sem(struct task_struct *tsk);
-
 /* The idle threads do not count..
  * Protected by write_lock_irq(&tasklist_lock)
  */
-- 
cgit v1.2.3


From c50142a5433ed504fff2b1af152f8f7628830dfb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:54:03 -0700
Subject: [PATCH] posix message queues: syscall stubs

From: Manfred Spraul <manfred@colorfullife.com>

Add -ENOSYS stubs for the posix message queue syscalls.  The API is a direct
mapping of the api from the unix spec, with two exceptions:

- mq_close() doesn't exist.  Message queue file descriptors can be closed
  with close().

- mq_notify(SIGEV_THREAD) cannot be implemented in the kernel.  The kernel
  returns a pollable file descriptor.  User space must poll (or read) this
  descriptor and call the notifier function if the file descriptor is
  signaled.
---
 arch/i386/kernel/entry.S  |  9 +++++++++
 include/asm-i386/unistd.h | 11 ++++++++++-
 include/linux/mqueue.h    | 36 ++++++++++++++++++++++++++++++++++++
 include/linux/syscalls.h  |  9 +++++++++
 kernel/sys.c              |  6 ++++++
 5 files changed, 70 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/mqueue.h

(limited to 'include')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 3024740ba84c..14e64d3ea25c 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -882,5 +882,14 @@ ENTRY(sys_call_table)
 	.long sys_utimes
  	.long sys_fadvise64_64
 	.long sys_ni_syscall	/* sys_vserver */
+	.long sys_ni_syscall	/* sys_mbind */
+	.long sys_ni_syscall	/* 275 sys_get_mempolicy */
+	.long sys_ni_syscall	/* sys_set_mempolicy */
+	.long sys_mq_open
+	.long sys_mq_unlink
+	.long sys_mq_timedsend
+	.long sys_mq_timedreceive	/* 280 */
+	.long sys_mq_notify
+	.long sys_mq_getsetattr
 
 syscall_table_size=(.-sys_call_table)
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index a2d58a99491e..620a232084f3 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -279,8 +279,17 @@
 #define __NR_utimes		271
 #define __NR_fadvise64_64	272
 #define __NR_vserver		273
+#define __NR_mbind		274
+#define __NR_get_mempolicy	275
+#define __NR_set_mempolicy	276
+#define __NR_mq_open 		277
+#define __NR_mq_unlink		(__NR_mq_open+1)
+#define __NR_mq_timedsend	(__NR_mq_open+2)
+#define __NR_mq_timedreceive	(__NR_mq_open+3)
+#define __NR_mq_notify		(__NR_mq_open+4)
+#define __NR_mq_getsetattr	(__NR_mq_open+5)
 
-#define NR_syscalls 274
+#define NR_syscalls 283
 
 /* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
 
diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
new file mode 100644
index 000000000000..c0c5fcc89f0e
--- /dev/null
+++ b/include/linux/mqueue.h
@@ -0,0 +1,36 @@
+/* Copyright (C) 2003 Krzysztof Benedyczak & Michal Wronski
+
+   This program is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   It is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with this software; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
+
+#ifndef _LINUX_MQUEUE_H
+#define _LINUX_MQUEUE_H
+
+#define MQ_PRIO_MAX 	32768
+
+typedef int mqd_t;
+
+struct mq_attr {
+	long	mq_flags;	/* message queue flags			*/
+	long	mq_maxmsg;	/* maximum number of messages		*/
+	long	mq_msgsize;	/* maximum message size			*/
+	long	mq_curmsgs;	/* number of messages currently queued	*/
+};
+
+#define NOTIFY_NONE	0
+#define NOTIFY_WOKENUP	1
+#define NOTIFY_REMOVED	2
+
+#endif
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index aaf87aeacafb..7ee5f67abb5f 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -48,6 +48,8 @@ struct timex;
 struct timezone;
 struct tms;
 struct utimbuf;
+typedef int mqd_t;
+struct mq_attr;
 
 #include <linux/config.h>
 #include <linux/types.h>
@@ -450,6 +452,13 @@ asmlinkage long sys_shmget(key_t key, size_t size, int flag);
 asmlinkage long sys_shmdt(char __user *shmaddr);
 asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
 
+asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr);
+asmlinkage long sys_mq_unlink(const char __user *name);
+asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
+asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
+asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
+asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
+
 asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn);
 asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
 				unsigned long off, unsigned long len,
diff --git a/kernel/sys.c b/kernel/sys.c
index bc498b12edcc..7d1bf5c57aca 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -260,6 +260,12 @@ cond_syscall(sys_msgctl)
 cond_syscall(sys_shmget)
 cond_syscall(sys_shmdt)
 cond_syscall(sys_shmctl)
+cond_syscall(sys_mq_open)
+cond_syscall(sys_mq_unlink)
+cond_syscall(sys_mq_timedsend)
+cond_syscall(sys_mq_timedreceive)
+cond_syscall(sys_mq_notify)
+cond_syscall(sys_mq_getsetattr)
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read)
-- 
cgit v1.2.3


From be94d44e818a56406016111fc48a1084b9f8e435 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:54:16 -0700
Subject: [PATCH] posix message queues: implementation

From: Manfred Spraul <manfred@colorfullife.com>

Actual implementation of the posix message queues, written by Krzysztof
Benedyczak and Michal Wronski.  The complete implementation is dependent on
CONFIG_POSIX_MQUEUE.

It passed the openposix test suite with two exceptions: one mq_unlink test
was bad and tested undefined behavior.  And Linux succeeds
mq_close(open(,,,)).  The spec mandates EBADF, but we have decided to ignore
that: we would have to add a new syscall just for the right error code.

The patch intentionally doesn't use all helpers from fs/libfs for kernel-only
filesystems: step 5 allows user space mounts of the file system.


Signal changes:

The patch redefines SI_MESGQ using __SI_CODE: The generic Linux ABI uses
a negative value (i.e.  from user) for SI_MESGQ, but the kernel internal
value must be positive to pass check_kill_value.  Additionally, the patch
adds support into copy_siginfo_to_user to copy the "new" signal type to
user space.


Changes in signal code caused by POSIX message queues patch:

General & rationale:

  mqueue-generated signals (only upon notification) must have si_code
  == SI_MESGQ.  In fact such a signal is sent from one process which
  caused notification (== sent message to empty message queue) to
  another which requested it.  Both processes can of course be unrelated
  in terms of uids/euids.  So SI_MESGQ signals must be classified as
  SI_FROMKERNEL to pass check_kill_permissions (needless to say, these
  signals ARE from the kernel).

  Signals generated by message queues notification need the same
  fields in siginfo struct's union _sifields as POSIX.1b signals and we
  can reuse its union entry.

  SI_MESGQ was previously defined to -3 in kernel and also in glibc.
  So in userspace SI_MESGQ must be still visible as -3.

Solution:

  SI_MESGQ is defined in the same style as SI_TIMER using __SI_CODE macro.

  Details:

    Fortunately copy_siginfo_to_user copies si_code as short.  So we
    can use the remaining part of the int value freely.  __SI_CODE does
    the work.  In the kernel SI_MESGQ is:

 		6<<16 | (-3 & 0xffff) which is > 0

    but what is copied to userspace is

 		(short) SI_MESGQ == -3

Actual changes:

  Changes in include/asm-generic/siginfo.h

  __SI_MESGQ added in siginfo.h to represent the inside-kernel prefix of
  SI_MESGQ.  SI_MESGQ is redefined from -3 to __SI_CODE(__SI_MESGQ, -3)

  Except for the mips architecture, these changes should be arch independent
  (asm-generic/siginfo.h is included in arch versions).  On mips
  SI_MESGQ is redefined to -4 in order to be compatible with IRIX.  But
  the same scheme can be used.

  Change in copy_siginfo_to_user: We only add one line to get the
  same copy semantics as for __SI_RT.

  This change isn't very portable - some archs have their own
  copy_siginfo_to_user.  All those should have a similar change (but
  possibly not a one-liner, as the __SI_RT case was sometimes ignored
  because it wasn't used yet, e.g.  see ia64 signal.c).

Update:
mq: only fail with invalid timespec if mq_timed{send,receive} needs to block
From: Jakub Jelinek <jakub@redhat.com>

POSIX requires EINVAL to be set if:
"The process or thread would have blocked, and the abs_timeout parameter
specified a nanoseconds field value less than zero or greater than or equal
to 1000 million."
but 2.6.5-mm3 returns -EINVAL even if the process or thread would not block
(if the queue is not empty for timedreceive or not full for timedsend).
---
 CREDITS                            |   17 +
 Documentation/filesystems/proc.txt |   25 +
 include/asm-generic/siginfo.h      |    4 +-
 init/Kconfig                       |   18 +
 ipc/Makefile                       |    2 +
 ipc/mqueue.c                       | 1165 ++++++++++++++++++++++++++++++++++++
 kernel/signal.c                    |    1 +
 7 files changed, 1231 insertions(+), 1 deletion(-)
 create mode 100644 ipc/mqueue.c

(limited to 'include')

diff --git a/CREDITS b/CREDITS
index dc9b943d10f1..52128c120f63 100644
--- a/CREDITS
+++ b/CREDITS
@@ -289,6 +289,15 @@ S: Via Delle Palme, 9
 S: Terni 05100
 S: Italy
 
+N: Krzysztof Benedyczak
+E: golbi@mat.uni.torun.pl
+W: http://www.mat.uni.torun.pl/~golbi
+D: POSIX message queues fs (with M. Wronski)
+S: ul. Podmiejska 52
+S: Radunica
+S: 83-000 Pruszcz Gdanski
+S: Poland
+
 N: Randolph Bentson
 E: bentson@grieg.seaslug.org
 W: http://www.aa.net/~bentson/
@@ -3485,6 +3494,14 @@ S: 12725 SW Millikan Way, Suite 400
 S: Beaverton, OR 97005
 S: USA
 
+N: Michal Wronski
+E: wrona@mat.uni.torun.pl
+W: http://www.mat.uni.torun.pl/~wrona
+D: POSIX message queues fs (with K. Benedyczak)
+S: ul. Teczowa 23/12
+S: 80-680 Gdansk-Sobieszewo
+S: Poland
+
 N: Frank Xia
 E: qx@math.columbia.edu
 D: Xiafs filesystem [defunct]
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 22fd3adcc96e..378722d5bb70 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -38,6 +38,7 @@ Table of Contents
   2.8	/proc/sys/net/ipv4 - IPV4 settings
   2.9	Appletalk
   2.10	IPX
+  2.11	/proc/sys/fs/mqueue - POSIX message queues filesystem
 
 ------------------------------------------------------------------------------
 Preface
@@ -1814,6 +1815,30 @@ The /proc/net/ipx_route  table  holds  a list of IPX routes. For each route it
 gives the  destination  network, the router node (or Directly) and the network
 address of the router (or Connected) for internal networks.
 
+2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem
+----------------------------------------------------------
+
+The "mqueue"  filesystem provides  the necessary kernel features to enable the
+creation of a  user space  library that  implements  the  POSIX message queues
+API (as noted by the  MSG tag in the  POSIX 1003.1-2001 version  of the System
+Interfaces specification.)
+
+The "mqueue" filesystem contains values for determining/setting  the amount of
+resources used by the file system.
+
+/proc/sys/fs/mqueue/queues_max is a read/write  file for  setting/getting  the
+maximum number of message queues allowed on the system.
+
+/proc/sys/fs/mqueue/msg_max  is  a  read/write file  for  setting/getting  the
+maximum number of messages in a queue value.  In fact it is the limiting value
+for another (user) limit which is set in mq_open invocation. This attribute of
+a queue must be less than or equal to msg_max.
+
+/proc/sys/fs/mqueue/msgsize_max is  a read/write  file for setting/getting the
+maximum  message size value (it is every  message queue's attribute set during
+its creation).
+
+
 ------------------------------------------------------------------------------
 Summary
 ------------------------------------------------------------------------------
diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h
index e95efd9e00c6..fe02b1a4d286 100644
--- a/include/asm-generic/siginfo.h
+++ b/include/asm-generic/siginfo.h
@@ -123,6 +123,7 @@ typedef struct siginfo {
 #define __SI_FAULT	(3 << 16)
 #define __SI_CHLD	(4 << 16)
 #define __SI_RT		(5 << 16)
+#define __SI_MESGQ	(6 << 16)
 #define __SI_CODE(T,N)	((T) | ((N) & 0xffff))
 #else
 #define __SI_KILL	0
@@ -131,6 +132,7 @@ typedef struct siginfo {
 #define __SI_FAULT	0
 #define __SI_CHLD	0
 #define __SI_RT		0
+#define __SI_MESGQ	0
 #define __SI_CODE(T,N)	(N)
 #endif
 
@@ -142,7 +144,7 @@ typedef struct siginfo {
 #define SI_KERNEL	0x80		/* sent by the kernel from somewhere */
 #define SI_QUEUE	-1		/* sent by sigqueue */
 #define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */
-#define SI_MESGQ	-3		/* sent by real time mesq state change */
+#define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */
 #define SI_ASYNCIO	-4		/* sent by AIO completion */
 #define SI_SIGIO	-5		/* sent by queued SIGIO */
 #define SI_TKILL	-6		/* sent by tkill system call */
diff --git a/init/Kconfig b/init/Kconfig
index c10fec8ebe9e..9eff25e8f6ed 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -90,6 +90,24 @@ config SYSVIPC
 	  section 6.4 of the Linux Programmer's Guide, available from
 	  <http://www.tldp.org/guides.html>.
 
+config POSIX_MQUEUE
+	bool "POSIX Message Queues"
+	depends on EXPERIMENTAL
+	---help---
+	  POSIX variant of message queues is a part of IPC. In POSIX message
+	  queues every message has a priority which determines the order in
+	  which processes receive it. If you want to compile and run
+	  programs written e.g. for Solaris with use of its POSIX message
+	  queues (functions mq_*) say Y here. To use this feature you will
+	  also need mqueue library, available from
+	  <http://www.mat.uni.torun.pl/~wrona/posix_ipc/>
+
+	  POSIX message queues are visible as a filesystem called 'mqueue'
+	  and can be mounted somewhere if you want to do filesystem
+	  operations on message queues.
+
+	  If unsure, say Y.
+
 config BSD_PROCESS_ACCT
 	bool "BSD Process Accounting"
 	help
diff --git a/ipc/Makefile b/ipc/Makefile
index 6cd32a30f03f..913790207d85 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -4,3 +4,5 @@
 
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
 obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
+obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o
+
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
new file mode 100644
index 000000000000..4de249718675
--- /dev/null
+++ b/ipc/mqueue.c
@@ -0,0 +1,1165 @@
+/*
+ * POSIX message queues filesystem for Linux.
+ *
+ * Copyright (C) 2003,2004  Krzysztof Benedyczak    (golbi@mat.uni.torun.pl)
+ *                          Michal Wronski          (wrona@mat.uni.torun.pl)
+ *
+ * Spinlocks:               Mohamed Abbas           (abbas.mohamed@intel.com)
+ * Lockless receive & send, fd based notify:
+ * 			    Manfred Spraul	    (manfred@colorfullife.com)
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/init.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/sysctl.h>
+#include <linux/poll.h>
+#include <linux/mqueue.h>
+#include <linux/msg.h>
+#include "util.h"
+
+#define MQUEUE_MAGIC	0x19800202
+#define DIRENT_SIZE	20
+#define FILENT_SIZE	80
+
+#define SEND		0
+#define RECV		1
+
+#define STATE_NONE	0
+#define STATE_PENDING	1
+#define STATE_READY	2
+
+#define NP_NONE		((void*)NOTIFY_NONE)
+#define NP_WOKENUP	((void*)NOTIFY_WOKENUP)
+#define NP_REMOVED	((void*)NOTIFY_REMOVED)
+/* used by sysctl */
+#define FS_MQUEUE 	1
+#define CTL_QUEUESMAX 	2
+#define CTL_MSGMAX 	3
+#define CTL_MSGSIZEMAX 	4
+
+/* default values */
+#define DFLT_QUEUESMAX	64	/* max number of message queues */
+#define DFLT_MSGMAX 	40	/* max number of messages in each queue */
+#define HARD_MSGMAX 	(131072/sizeof(void*))
+#define DFLT_MSGSIZEMAX 16384	/* max message size */
+
+struct ext_wait_queue {		/* queue of sleeping tasks */
+	struct task_struct *task;
+	struct list_head list;
+	struct msg_msg *msg;	/* ptr of loaded message */
+	int state;		/* one of STATE_* values */
+};
+
+struct mqueue_inode_info {
+	struct mq_attr attr;
+	struct msg_msg **messages;
+
+	pid_t notify_owner;	/* != 0 means notification registered */
+	struct sigevent notify;
+	struct file *notify_filp;
+
+	/* for tasks waiting for free space and messages, respectively */
+	struct ext_wait_queue e_wait_q[2];
+	wait_queue_head_t wait_q;
+
+	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
+	spinlock_t lock;
+	struct inode vfs_inode;
+};
+
+static struct inode_operations mqueue_dir_inode_operations;
+static struct file_operations mqueue_file_operations;
+static struct file_operations mqueue_notify_fops;
+static struct super_operations mqueue_super_ops;
+static void remove_notification(struct mqueue_inode_info *info);
+
+static spinlock_t mq_lock;
+static kmem_cache_t *mqueue_inode_cachep;
+static struct vfsmount *mqueue_mnt;
+
+static unsigned int queues_count;
+static unsigned int queues_max 	= DFLT_QUEUESMAX;
+static unsigned int msg_max 	= DFLT_MSGMAX;
+static unsigned int msgsize_max = DFLT_MSGSIZEMAX;
+
+static struct ctl_table_header * mq_sysctl_table;
+
+static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode)
+{
+	return container_of(inode, struct mqueue_inode_info, vfs_inode);
+}
+
+static struct inode *mqueue_get_inode(struct super_block *sb, int mode)
+{
+	struct inode *inode;
+
+	inode = new_inode(sb);
+	if (inode) {
+		inode->i_mode = mode;
+		inode->i_uid = current->fsuid;
+		inode->i_gid = current->fsgid;
+		inode->i_blksize = PAGE_CACHE_SIZE;
+		inode->i_blocks = 0;
+		inode->i_mtime = inode->i_ctime = inode->i_atime =
+				CURRENT_TIME;
+
+		if (S_ISREG(mode)) {
+			struct mqueue_inode_info *info;
+
+			inode->i_fop = &mqueue_file_operations;
+			inode->i_size = FILENT_SIZE;
+			/* mqueue specific info */
+			info = MQUEUE_I(inode);
+			spin_lock_init(&info->lock);
+			init_waitqueue_head(&info->wait_q);
+			INIT_LIST_HEAD(&info->e_wait_q[0].list);
+			INIT_LIST_HEAD(&info->e_wait_q[1].list);
+			info->notify_owner = 0;
+			info->qsize = 0;
+			info->attr.mq_curmsgs = 0;
+			info->messages = NULL;
+		} else if (S_ISDIR(mode)) {
+			inode->i_nlink++;
+			inode->i_op = &mqueue_dir_inode_operations;
+			inode->i_fop = &simple_dir_operations;
+		}
+	}
+	return inode;
+}
+
+static int mqueue_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+
+	sb->s_flags = MS_NOUSER;
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+	sb->s_magic = MQUEUE_MAGIC;
+	sb->s_op = &mqueue_super_ops;
+
+	inode = mqueue_get_inode(sb, S_IFDIR | S_IRWXUGO);
+	if (!inode)
+		return -ENOMEM;
+
+	sb->s_root = d_alloc_root(inode);
+	if (!sb->s_root) {
+		iput(inode);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static struct super_block *mqueue_get_sb(struct file_system_type *fs_type,
+					 int flags, const char *dev_name,
+					 void *data)
+{
+	return get_sb_single(fs_type, flags, data, mqueue_fill_super);
+}
+
+static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags)
+{
+	struct mqueue_inode_info *p = (struct mqueue_inode_info *) foo;
+
+	if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) ==
+		SLAB_CTOR_CONSTRUCTOR)
+		inode_init_once(&p->vfs_inode);
+}
+
+static struct inode *mqueue_alloc_inode(struct super_block *sb)
+{
+	struct mqueue_inode_info *ei;
+
+	ei = kmem_cache_alloc(mqueue_inode_cachep, SLAB_KERNEL);
+	if (!ei)
+		return NULL;
+	return &ei->vfs_inode;
+}
+
+static void mqueue_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode));
+}
+
+static void mqueue_delete_inode(struct inode *inode)
+{
+	struct mqueue_inode_info *info;
+	int i;
+
+	if (S_ISDIR(inode->i_mode)) {
+		clear_inode(inode);
+		return;
+	}
+	info = MQUEUE_I(inode);
+	spin_lock(&info->lock);
+	for (i = 0; i < info->attr.mq_curmsgs; i++)
+		free_msg(info->messages[i]);
+	kfree(info->messages);
+	spin_unlock(&info->lock);
+
+	clear_inode(inode);
+
+	spin_lock(&mq_lock);
+	queues_count--;
+	spin_unlock(&mq_lock);
+}
+
+static int mqueue_create(struct inode *dir, struct dentry *dentry,
+				int mode, struct nameidata *nd)
+{
+	struct inode *inode;
+	int error;
+
+	spin_lock(&mq_lock);
+	if (queues_count >= queues_max && !capable(CAP_SYS_RESOURCE)) {
+		error = -ENOSPC;
+		goto out_lock;
+	}
+	queues_count++;
+	spin_unlock(&mq_lock);
+
+	inode = mqueue_get_inode(dir->i_sb, mode);
+	if (!inode) {
+		error = -ENOMEM;
+		spin_lock(&mq_lock);
+		queues_count--;
+		goto out_lock;
+	}
+
+	d_instantiate(dentry, inode);
+	dget(dentry);
+	return 0;
+out_lock:
+	spin_unlock(&mq_lock);
+	return error;
+}
+
+static int mqueue_flush_file(struct file *filp)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+
+	spin_lock(&info->lock);
+	if (current->tgid == info->notify_owner)
+		remove_notification(info);
+
+	spin_unlock(&info->lock);
+	return 0;
+}
+
+/* Adds current to info->e_wait_q[sr] before element with smaller prio */
+static void wq_add(struct mqueue_inode_info *info, int sr,
+			struct ext_wait_queue *ewp)
+{
+	struct ext_wait_queue *walk;
+
+	ewp->task = current;
+
+	list_for_each_entry(walk, &info->e_wait_q[sr].list, list) {
+		if (walk->task->static_prio <= current->static_prio) {
+			list_add_tail(&ewp->list, &walk->list);
+			return;
+		}
+	}
+	list_add_tail(&ewp->list, &info->e_wait_q[sr].list);
+}
+
+/*
+ * Puts current task to sleep. Caller must hold queue lock. After return
+ * lock isn't held.
+ * sr: SEND or RECV
+ */
+static int wq_sleep(struct mqueue_inode_info *info, int sr,
+			long timeout, struct ext_wait_queue *ewp)
+{
+	int retval;
+	signed long time;
+
+	wq_add(info, sr, ewp);
+
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		spin_unlock(&info->lock);
+		time = schedule_timeout(timeout);
+
+		while (ewp->state == STATE_PENDING)
+			cpu_relax();
+
+		if (ewp->state == STATE_READY) {
+			retval = 0;
+			goto out;
+		}
+		spin_lock(&info->lock);
+		if (ewp->state == STATE_READY) {
+			retval = 0;
+			goto out_unlock;
+		}
+		if (signal_pending(current)) {
+			retval = -ERESTARTSYS;
+			break;
+		}
+		if (time == 0) {
+			retval = -ETIMEDOUT;
+			break;
+		}
+	}
+	list_del(&ewp->list);
+out_unlock:
+	spin_unlock(&info->lock);
+out:
+	return retval;
+}
+
+/*
+ * Returns waiting task that should be serviced first or NULL if none exists
+ */
+static struct ext_wait_queue *wq_get_first_waiter(
+		struct mqueue_inode_info *info, int sr)
+{
+	struct list_head *ptr;
+
+	ptr = info->e_wait_q[sr].list.prev;
+	if (ptr == &info->e_wait_q[sr].list)
+		return NULL;
+	return list_entry(ptr, struct ext_wait_queue, list);
+}
+
+/* Auxiliary functions to manipulate messages' list */
+
+/*
+ * Insert ptr into info->messages[] ordered by ascending m_type. Entries
+ * whose m_type is >= the new one are moved up one slot, so ptr lands below
+ * any existing message of equal m_type. Also accounts the message in
+ * mq_curmsgs and qsize.
+ */
+static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info)
+{
+	int slot = info->attr.mq_curmsgs;
+
+	while (slot > 0 && info->messages[slot - 1]->m_type >= ptr->m_type) {
+		info->messages[slot] = info->messages[slot - 1];
+		slot--;
+	}
+	info->messages[slot] = ptr;
+	info->attr.mq_curmsgs++;
+	info->qsize += ptr->m_ts;
+}
+
+/*
+ * Remove the last entry of info->messages[] and return it, subtracting its
+ * m_ts from info->qsize.
+ */
+static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
+{
+	struct msg_msg *last;
+
+	info->attr.mq_curmsgs--;
+	last = info->messages[info->attr.mq_curmsgs];
+	info->qsize -= last->m_ts;
+	return last;
+}
+
+/*
+ * Helper split out of sys_mq_timedsend only to keep that function short.
+ *
+ * Delivers the registered notification, if any, and then unregisters it.
+ */
+static void __do_notify(struct mqueue_inode_info *info)
+{
+	/* notification
+	 * Fires only when a process is registered AND the queue has just
+	 * gone from empty to not empty (mq_curmsgs == 1). sys_mq_timedsend
+	 * calls this only when no receiver was waiting synchronously. */
+	if (!info->notify_owner || info->attr.mq_curmsgs != 1)
+		return;
+
+	switch (info->notify.sigev_notify) {
+	case SIGEV_SIGNAL: {
+		/* sends signal */
+		struct siginfo sig_i;
+
+		sig_i.si_signo = info->notify.sigev_signo;
+		sig_i.si_errno = 0;
+		sig_i.si_code = SI_MESGQ;
+		sig_i.si_value = info->notify.sigev_value;
+		sig_i.si_pid = current->tgid;
+		sig_i.si_uid = current->uid;
+
+		kill_proc_info(info->notify.sigev_signo,
+			       &sig_i, info->notify_owner);
+		break;
+	}
+	case SIGEV_THREAD:
+		/* mark the notification file and wake its readers/pollers */
+		info->notify_filp->private_data = (void*)NP_WOKENUP;
+		wake_up(&info->wait_q);
+		break;
+	}
+	/* after notification unregisters process */
+	info->notify_owner = 0;
+}
+
+/*
+ * Convert the absolute timeout at u_arg into a relative timeout in jiffies.
+ *
+ * Returns MAX_SCHEDULE_TIMEOUT when u_arg is NULL, 0 when the deadline lies
+ * in the past, -EFAULT when the timespec cannot be copied in and -EINVAL
+ * when its fields are out of range.
+ */
+static long prepare_timeout(const struct timespec __user *u_arg)
+{
+	struct timespec ts, nowts;
+
+	if (!u_arg)
+		return MAX_SCHEDULE_TIMEOUT;
+
+	if (unlikely(copy_from_user(&ts, u_arg,
+				sizeof(struct timespec))))
+		return -EFAULT;
+
+	if (unlikely(ts.tv_nsec < 0 || ts.tv_sec < 0
+		|| ts.tv_nsec >= NSEC_PER_SEC))
+		return -EINVAL;
+
+	nowts = CURRENT_TIME;
+	/* first subtract as jiffies can't be too big */
+	ts.tv_sec -= nowts.tv_sec;
+	if (ts.tv_nsec < nowts.tv_nsec) {
+		/* borrow one second for the nanosecond subtraction */
+		ts.tv_nsec += NSEC_PER_SEC;
+		ts.tv_sec--;
+	}
+	ts.tv_nsec -= nowts.tv_nsec;
+	if (ts.tv_sec < 0)
+		return 0;
+
+	return timespec_to_jiffies(&ts) + 1;
+}
+
+/*
+ * File descriptor based notification, intended to be used to implement
+ * SIGEV_THREAD:
+ * SIGEV_THREAD means that a notification function should be called in the
+ * context of a new thread. The kernel can't do that. Therefore mq_notify
+ * calls with SIGEV_THREAD return a new file descriptor. A user space helper
+ * must create a new thread and then read from the given file descriptor.
+ * The read always returns one byte. If it's NOTIFY_WOKENUP, then it must
+ * call the notification function. If it's NOTIFY_REMOVED, then the
+ * notification was removed. The file descriptor supports poll, thus one
+ * supervisor thread can manage multiple message queue notifications.
+ *
+ * The implementation must support multiple outstanding notifications:
+ * It's possible that a new notification is added and signaled before user
+ * space calls mqueue_notify_read for the previous notification.
+ * Therefore the notification state is stored in the private_data field of
+ * the file descriptor.
+ */
+/*
+ * poll() handler of the notification file: registers on info->wait_q and
+ * reports the file readable once private_data is no longer NP_NONE.
+ */
+static unsigned int mqueue_notify_poll(struct file *filp,
+					struct poll_table_struct *poll_tab)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+
+	poll_wait(filp, &info->wait_q, poll_tab);
+
+	return (filp->private_data == NP_NONE) ? 0 : (POLLIN | POLLRDNORM);
+}
+
+/*
+ * read() handler of the notification file. Blocks until a notification state
+ * is stored in filp->private_data (unless O_NONBLOCK is set), then copies
+ * that state to user space as a single byte.
+ *
+ * Returns 1 after delivering the byte, 0 for a zero-length read or a read at
+ * a non-zero offset, -EAGAIN for a non-blocking read with nothing pending,
+ * -ERESTARTSYS when a signal interrupts the wait and -EFAULT when the byte
+ * cannot be stored.
+ */
+static ssize_t mqueue_notify_read(struct file *filp, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+	char result;
+
+	if (!count)
+		return 0;
+	if (*ppos != 0)
+		return 0;
+	spin_lock(&info->lock);
+	while (filp->private_data == NP_NONE) {
+		DEFINE_WAIT(wait);
+		if (filp->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			return -EAGAIN;
+		}
+		prepare_to_wait(&info->wait_q, &wait, TASK_INTERRUPTIBLE);
+		spin_unlock(&info->lock);
+		schedule();
+		finish_wait(&info->wait_q, &wait);
+		/* With a signal pending, schedule() returns at once for an
+		 * interruptible sleeper; without this check the loop would
+		 * spin until a notification arrives. The lock is not held
+		 * at this point. */
+		if (signal_pending(current))
+			return -ERESTARTSYS;
+		spin_lock(&info->lock);
+	}
+	spin_unlock(&info->lock);
+	result = (char)(unsigned long)filp->private_data;
+	if (put_user(result, buf))
+		return -EFAULT;
+	/* a second read on this descriptor returns 0 */
+	*ppos = 1;
+	return 1;
+}
+
+/*
+ * release() handler of the notification file: marks the file NP_REMOVED and,
+ * if the queue's notification is still registered through this file, drops
+ * that registration. Always returns 0.
+ */
+static int mqueue_notify_release(struct inode *inode, struct file *filp)
+{
+	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
+
+	spin_lock(&info->lock);
+	filp->private_data = NP_REMOVED;
+	if (info->notify_filp == filp && info->notify_owner)
+		info->notify_owner = 0;
+	spin_unlock(&info->lock);
+
+	return 0;
+}
+
+/*
+ * Unregister the current notification. For SIGEV_THREAD the notification
+ * file is marked NP_REMOVED and sleepers on info->wait_q are woken first.
+ * sys_mq_notify calls this with info->lock held.
+ */
+static void remove_notification(struct mqueue_inode_info *info)
+{
+	const int via_fd = (info->notify.sigev_notify == SIGEV_THREAD);
+
+	if (via_fd) {
+		info->notify_filp->private_data = NP_REMOVED;
+		wake_up(&info->wait_q);
+	}
+	info->notify_owner = 0;
+}
+
+/*
+ * Invoked when creating a new queue via sys_mq_open
+ *
+ * dir, dentry: parent directory and the dentry to create in it
+ * oflag, mode: open flags and creation mode of the mq_open call
+ * u_attr: optional user attributes; NULL selects DFLT_MSGMAX and
+ *         DFLT_MSGSIZEMAX
+ *
+ * Returns the opened file or an ERR_PTR() value (-EFAULT, -EINVAL, -ENOMEM,
+ * or the error of vfs_create()/dentry_open()). On failure the caller still
+ * owns the dentry and vfsmount references it passed in.
+ */
+static struct file *do_create(struct dentry *dir, struct dentry *dentry,
+			int oflag, mode_t mode, struct mq_attr __user *u_attr)
+{
+	struct file *filp;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+	struct msg_msg **msgs = NULL;
+	struct mq_attr attr;
+	int ret;
+
+	if (u_attr != NULL) {
+		if (copy_from_user(&attr, u_attr, sizeof(attr)))
+			return ERR_PTR(-EFAULT);
+
+		if (attr.mq_maxmsg <= 0 || attr.mq_msgsize <= 0)
+			return ERR_PTR(-EINVAL);
+		/* privileged callers are only bound by the hard limit */
+		if (capable(CAP_SYS_RESOURCE)) {
+			if (attr.mq_maxmsg > HARD_MSGMAX)
+				return ERR_PTR(-EINVAL);
+		} else {
+			if (attr.mq_maxmsg > msg_max ||
+					attr.mq_msgsize > msgsize_max)
+				return ERR_PTR(-EINVAL);
+		}
+	} else {
+		attr.mq_maxmsg = DFLT_MSGMAX;
+		attr.mq_msgsize = DFLT_MSGSIZEMAX;
+	}
+	/* allocate before creating the inode so -ENOMEM leaves no queue */
+	msgs = kmalloc(attr.mq_maxmsg * sizeof(*msgs), GFP_KERNEL);
+	if (!msgs)
+		return ERR_PTR(-ENOMEM);
+
+	ret = vfs_create(dir->d_inode, dentry, mode, NULL);
+	if (ret) {
+		kfree(msgs);
+		return ERR_PTR(ret);
+	}
+
+	inode = dentry->d_inode;
+	info = MQUEUE_I(inode);
+
+	info->attr.mq_maxmsg = attr.mq_maxmsg;
+	info->attr.mq_msgsize = attr.mq_msgsize;
+	/* NOTE(review): if inode creation already allocated info->messages,
+	 * that array is overwritten here without kfree() - confirm against
+	 * mqueue_get_inode(). */
+	info->messages = msgs;
+
+	/* dentry_open() consumes one dentry and one vfsmount reference, also
+	 * when it fails. Take them here so that the caller's own references
+	 * stay valid for its dput()/mntput() on the error path. */
+	dget(dentry);
+	mntget(mqueue_mnt);
+	filp = dentry_open(dentry, mqueue_mnt, oflag);
+	if (!IS_ERR(filp))
+		/* the file holds a mount reference now; drop the extra one
+		 * so the caller's mntget() remains the only one outstanding */
+		mntput(mqueue_mnt);
+
+	return filp;
+}
+
+/*
+ * Opens existing queue
+ *
+ * Returns the opened file, ERR_PTR(-EINVAL) for an access mode of
+ * O_RDWR | O_WRONLY, ERR_PTR(-EACCES) when permission() refuses the access,
+ * or the error of dentry_open(). On failure the caller still owns the dentry
+ * and vfsmount references it passed in.
+ */
+static struct file *do_open(struct dentry *dentry, int oflag)
+{
+static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE,
+					MAY_READ | MAY_WRITE };
+	struct file *filp;
+
+	/* also keeps the oflag2acc[] index below within bounds */
+	if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
+		return ERR_PTR(-EINVAL);
+
+	if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL))
+		return ERR_PTR(-EACCES);
+
+	/* dentry_open() consumes one dentry and one vfsmount reference, also
+	 * when it fails. Take them here so that the caller's own references
+	 * stay valid for its dput()/mntput() on the error path. */
+	dget(dentry);
+	mntget(mqueue_mnt);
+	filp = dentry_open(dentry, mqueue_mnt, oflag);
+	if (!IS_ERR(filp))
+		/* the file holds a mount reference now; drop the extra one
+		 * so the caller's mntget() remains the only one outstanding */
+		mntput(mqueue_mnt);
+
+	return filp;
+}
+
+/*
+ * mq_open system call: open, and with O_CREAT possibly create, the queue
+ * named u_name below the root of the internal mqueue mount.
+ *
+ * Returns the new file descriptor, or a negative errno: the error of
+ * getname(), get_unused_fd() or lookup_one_len(), -EEXIST for O_CREAT|O_EXCL
+ * on an existing queue, -ENOENT when the queue is missing without O_CREAT,
+ * or whatever do_open()/do_create() report.
+ */
+asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode,
+				struct mq_attr __user *u_attr)
+{
+	struct dentry *dentry;
+	struct file *filp;
+	char *name;
+	int fd, error;
+
+	if (IS_ERR(name = getname(u_name)))
+		return PTR_ERR(name);
+
+	fd = get_unused_fd();
+	if (fd < 0)
+		goto out_putname;
+
+	/* lookup and create both run under the mqueue root's i_sem */
+	down(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
+	if (IS_ERR(dentry)) {
+		error = PTR_ERR(dentry);
+		/* the descriptor reserved above must be released here too */
+		goto out_putfd;
+	}
+	mntget(mqueue_mnt);
+
+	if (oflag & O_CREAT) {
+		if (dentry->d_inode) {	/* entry already exists */
+			filp = (oflag & O_EXCL) ? ERR_PTR(-EEXIST) :
+					do_open(dentry, oflag);
+		} else {
+			filp = do_create(mqueue_mnt->mnt_root, dentry,
+						oflag, mode, u_attr);
+		}
+	} else
+		filp = (dentry->d_inode) ? do_open(dentry, oflag) :
+					ERR_PTR(-ENOENT);
+
+	dput(dentry);
+
+	if (IS_ERR(filp)) {
+		error = PTR_ERR(filp);
+		goto out_putmnt;
+	}
+
+	fd_install(fd, filp);
+	goto out_upsem;
+
+out_putmnt:
+	mntput(mqueue_mnt);
+out_putfd:
+	put_unused_fd(fd);
+	fd = error;
+out_upsem:
+	up(&mqueue_mnt->mnt_root->d_inode->i_sem);
+out_putname:
+	putname(name);
+	return fd;
+}
+
+/*
+ * mq_unlink system call: remove the queue named u_name from the internal
+ * mqueue mount.
+ *
+ * Returns the result of vfs_unlink(), or the error of getname() or
+ * lookup_one_len(), -ENOENT when no such queue exists, or -EACCES when
+ * permission() denies MAY_WRITE on it.
+ */
+asmlinkage long sys_mq_unlink(const char __user *u_name)
+{
+	struct inode *inode = NULL;
+	struct dentry *dentry;
+	char *name;
+	int err;
+
+	name = getname(u_name);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	down(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name));
+	if (IS_ERR(dentry)) {
+		err = PTR_ERR(dentry);
+		goto out_unlock;
+	}
+
+	err = -ENOENT;
+	if (!dentry->d_inode)
+		goto out_err;
+
+	err = -EACCES;
+	if (permission(dentry->d_inode, MAY_WRITE, NULL))
+		goto out_err;
+
+	/* hold an extra inode reference across the unlink; it is released
+	 * by the iput() below, after i_sem has been dropped */
+	inode = dentry->d_inode;
+	atomic_inc(&inode->i_count);
+
+	err = vfs_unlink(dentry->d_parent->d_inode, dentry);
+out_err:
+	dput(dentry);
+
+out_unlock:
+	up(&mqueue_mnt->mnt_root->d_inode->i_sem);
+	putname(name);
+	if (inode)
+		iput(inode);
+
+	return err;
+}
+
+/* Pipelined send and receive functions.
+ *
+ * If a receiver finds no waiting message, then it registers itself in the
+ * list of waiting receivers. A sender checks that list before adding the new
+ * message into the message array. If there is a waiting receiver, then it
+ * bypasses the message array and directly hands the message over to the
+ * receiver.
+ * The receiver accepts the message and returns without grabbing the queue
+ * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers
+ * are necessary. The same algorithm is used for sysv semaphores, see
+ * ipc/sem.c for more details.
+ *
+ * The same algorithm is used for senders.
+ */
+
+/* pipelined_send() - send a message directly to the task waiting in
+ * sys_mq_timedreceive() (without inserting message into a queue).
+ *
+ * The receiver sleeps in wq_sleep(), which busy-waits while the state is
+ * STATE_PENDING and returns without taking the queue lock once it reads
+ * STATE_READY. */
+static inline void pipelined_send(struct mqueue_inode_info *info,
+				  struct msg_msg *message,
+				  struct ext_wait_queue *receiver)
+{
+	receiver->msg = message;
+	/* unlink here: the receiver does not touch the list after a hand-over */
+	list_del(&receiver->list);
+	receiver->state = STATE_PENDING;
+	wake_up_process(receiver->task);
+	/* write barrier between the stores above and the final state store */
+	wmb();
+	receiver->state = STATE_READY;
+}
+
+/* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
+ * gets its message and put to the queue (we have one free place for sure).
+ *
+ * The sender sleeps in wq_sleep(), which busy-waits while the state is
+ * STATE_PENDING and returns without taking the queue lock once it reads
+ * STATE_READY. */
+static inline void pipelined_receive(struct mqueue_inode_info *info)
+{
+	struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND);
+
+	/* no sender is blocked on a full queue: nothing to do */
+	if (!sender)
+		return;
+
+	msg_insert(sender->msg, info);
+	/* unlink here: the sender does not touch the list after a hand-over */
+	list_del(&sender->list);
+	sender->state = STATE_PENDING;
+	wake_up_process(sender->task);
+	/* write barrier between the stores above and the final state store */
+	wmb();
+	sender->state = STATE_READY;
+}
+
+/*
+ * mq_timedsend system call: send msg_len bytes from u_msg_ptr with priority
+ * msg_prio to the queue open as mqdes.
+ *
+ * If the queue is full, the call fails with -EAGAIN for O_NONBLOCK
+ * descriptors, returns the error of prepare_timeout() for a bad
+ * u_abs_timeout, and otherwise sleeps in wq_sleep() until
+ * pipelined_receive() takes the message. If the queue has room and a
+ * receiver is already waiting, the message is handed to it directly.
+ *
+ * Returns 0 on success or a negative errno (-EINVAL, -EBADF, -EMSGSIZE,
+ * -EAGAIN, -ETIMEDOUT, -ERESTARTSYS or the error of load_msg()).
+ */
+asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr,
+	size_t msg_len, unsigned int msg_prio,
+	const struct timespec __user *u_abs_timeout)
+{
+	struct file *filp;
+	struct inode *inode;
+	struct ext_wait_queue wait;
+	struct ext_wait_queue *receiver;
+	struct msg_msg *msg_ptr;
+	struct mqueue_inode_info *info;
+	long timeout;
+	int ret;
+
+	if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX))
+		return -EINVAL;
+
+	/* a negative result is only reported if we actually have to wait */
+	timeout = prepare_timeout(u_abs_timeout);
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (unlikely(!filp))
+		goto out;
+
+	/* reject descriptors that do not belong to the mqueue filesystem */
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	if (unlikely((filp->f_flags & O_ACCMODE) == O_RDONLY))
+		goto out_fput;
+
+	if (unlikely(msg_len > info->attr.mq_msgsize)) {
+		ret = -EMSGSIZE;
+		goto out_fput;
+	}
+
+	/* First try to allocate memory, before doing anything with
+	 * existing queues. */
+	msg_ptr = load_msg((void *)u_msg_ptr, msg_len);
+	if (unlikely(IS_ERR(msg_ptr))) {
+		ret = PTR_ERR(msg_ptr);
+		goto out_fput;
+	}
+	msg_ptr->m_ts = msg_len;
+	msg_ptr->m_type = msg_prio;
+
+	spin_lock(&info->lock);
+
+	if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) {
+		if (filp->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			ret = -EAGAIN;
+		} else if (unlikely(timeout < 0)) {
+			spin_unlock(&info->lock);
+			ret = timeout;
+		} else {
+			wait.task = current;
+			wait.msg = (void *) msg_ptr;
+			wait.state = STATE_NONE;
+			/* wq_sleep() returns with info->lock released */
+			ret = wq_sleep(info, SEND, timeout, &wait);
+		}
+		/* On every failure above the message was not queued, so it
+		 * must be released here; freeing it only after wq_sleep()
+		 * leaked it on the -EAGAIN and bad-timeout paths. */
+		if (ret < 0)
+			free_msg(msg_ptr);
+	} else {
+		receiver = wq_get_first_waiter(info, RECV);
+		if (receiver) {
+			pipelined_send(info, msg_ptr, receiver);
+		} else {
+			/* adds message to the queue */
+			msg_insert(msg_ptr, info);
+			__do_notify(info);
+		}
+		inode->i_atime = inode->i_mtime = inode->i_ctime =
+				CURRENT_TIME;
+		spin_unlock(&info->lock);
+		ret = 0;
+	}
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+/*
+ * mq_timedreceive system call: take the last message of the queue's message
+ * array and copy it to u_msg_ptr (and its m_type to u_msg_prio, if given).
+ *
+ * If the queue is empty, the call fails with -EAGAIN for O_NONBLOCK
+ * descriptors, returns the error of prepare_timeout() for a bad
+ * u_abs_timeout, and otherwise sleeps in wq_sleep() until pipelined_send()
+ * hands a message over.
+ *
+ * Returns the message size (m_ts) on success or a negative errno (-EBADF,
+ * -EMSGSIZE, -EAGAIN, -ETIMEDOUT, -ERESTARTSYS, -EFAULT).
+ */
+asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr,
+	size_t msg_len, unsigned int __user *u_msg_prio,
+	const struct timespec __user *u_abs_timeout)
+{
+	long timeout;
+	ssize_t ret;
+	struct msg_msg *msg_ptr;
+	struct file *filp;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+	struct ext_wait_queue wait;
+
+	/* a negative result is only reported if we actually have to wait */
+	timeout = prepare_timeout(u_abs_timeout);
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (unlikely(!filp))
+		goto out;
+
+	/* reject descriptors that do not belong to the mqueue filesystem */
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	if (unlikely((filp->f_flags & O_ACCMODE) == O_WRONLY))
+		goto out_fput;
+
+	/* checks if buffer is big enough */
+	if (unlikely(msg_len < info->attr.mq_msgsize)) {
+		ret = -EMSGSIZE;
+		goto out_fput;
+	}
+
+	spin_lock(&info->lock);
+	if (info->attr.mq_curmsgs == 0) {
+		if (filp->f_flags & O_NONBLOCK) {
+			spin_unlock(&info->lock);
+			ret = -EAGAIN;
+			msg_ptr = NULL;
+		} else if (unlikely(timeout < 0)) {
+			spin_unlock(&info->lock);
+			ret = timeout;
+			msg_ptr = NULL;
+		} else {
+			wait.task = current;
+			wait.state = STATE_NONE;
+			/* wq_sleep() returns with info->lock released */
+			ret = wq_sleep(info, RECV, timeout, &wait);
+			/* only used below when ret == 0, i.e. after
+			 * pipelined_send() stored the message */
+			msg_ptr = wait.msg;
+		}
+	} else {
+		msg_ptr = msg_get(info);
+
+		inode->i_atime = inode->i_mtime = inode->i_ctime =
+				CURRENT_TIME;
+
+		/* There is now free space in queue. */
+		pipelined_receive(info);
+		spin_unlock(&info->lock);
+		ret = 0;
+	}
+	if (ret == 0) {
+		ret = msg_ptr->m_ts;
+
+		if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) ||
+			store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) {
+			ret = -EFAULT;
+		}
+		/* the message has left the queue; free it even on -EFAULT */
+		free_msg(msg_ptr);
+	}
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+/*
+ * Notes: the case when user wants us to deregister (with NULL as pointer
+ * or SIGEV_NONE) and he isn't currently owner of notification will be
+ * silently discarded. It isn't explicitly defined in the POSIX.
+ *
+ * Returns 0 for SIGEV_NONE and SIGEV_SIGNAL requests, the new notification
+ * file descriptor for SIGEV_THREAD, or a negative errno (-EFAULT, -EINVAL,
+ * -EBADF, -ENFILE, -EBUSY or the error of get_unused_fd()).
+ */
+asmlinkage long sys_mq_notify(mqd_t mqdes,
+				const struct sigevent __user *u_notification)
+{
+	int ret, fd;
+	struct file *filp, *nfilp;
+	struct inode *inode;
+	struct sigevent notification;
+	struct mqueue_inode_info *info;
+
+	if (u_notification == NULL) {
+		notification.sigev_notify = SIGEV_NONE;
+	} else {
+		if (copy_from_user(&notification, u_notification,
+					sizeof(struct sigevent)))
+			return -EFAULT;
+
+		if (unlikely(notification.sigev_notify != SIGEV_NONE &&
+			     notification.sigev_notify != SIGEV_SIGNAL &&
+			     notification.sigev_notify != SIGEV_THREAD))
+			return -EINVAL;
+		if (notification.sigev_notify == SIGEV_SIGNAL &&
+			(notification.sigev_signo < 0 ||
+			 notification.sigev_signo > _NSIG)) {
+			return -EINVAL;
+		}
+	}
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (!filp)
+		goto out;
+
+	/* reject descriptors that do not belong to the mqueue filesystem */
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	ret = 0;
+	if (notification.sigev_notify == SIGEV_THREAD) {
+		/* set up descriptor and file before taking the spinlock */
+		ret = get_unused_fd();
+		if (ret < 0)
+			goto out_fput;
+		fd = ret;
+		nfilp = get_empty_filp();
+		if (!nfilp) {
+			ret = -ENFILE;
+			goto out_dropfd;
+		}
+		nfilp->private_data = NP_NONE;
+		nfilp->f_op = &mqueue_notify_fops;
+		nfilp->f_vfsmnt = mntget(mqueue_mnt);
+		nfilp->f_dentry = dget(filp->f_dentry);
+		nfilp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+		nfilp->f_mode = FMODE_READ;
+	} else {
+		nfilp = NULL;
+		fd = -1;
+	}
+
+	spin_lock(&info->lock);
+
+	if (notification.sigev_notify == SIGEV_NONE) {
+		if (info->notify_owner == current->tgid) {
+			remove_notification(info);
+			inode->i_atime = inode->i_ctime = CURRENT_TIME;
+		}
+	} else if (info->notify_owner) {
+		ret = -EBUSY;
+	} else if (notification.sigev_notify == SIGEV_THREAD) {
+		info->notify_filp = nfilp;
+		fd_install(fd, nfilp);
+		ret = fd;
+		/* ownership moved to the fd table: skip the cleanup below */
+		fd = -1;
+		nfilp = NULL;
+		info->notify.sigev_notify = SIGEV_THREAD;
+		info->notify_owner = current->tgid;
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}  else {
+		info->notify.sigev_signo = notification.sigev_signo;
+		info->notify.sigev_value = notification.sigev_value;
+		info->notify.sigev_notify = SIGEV_SIGNAL;
+		info->notify_owner = current->tgid;
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}
+	spin_unlock(&info->lock);
+
+	if (nfilp) {
+		/* -EBUSY: the file was prepared but never installed. Undo
+		 * the references taken above by hand; fput() would also run
+		 * mqueue_notify_release() on a file that was never
+		 * registered with the queue. */
+		dput(nfilp->f_dentry);
+		mntput(nfilp->f_vfsmnt);
+		put_filp(nfilp);
+	}
+out_dropfd:
+	if (fd != -1)
+		put_unused_fd(fd);
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+/*
+ * Combined get/set of queue attributes for the descriptor mqdes.
+ *
+ * u_mqstat: if non-NULL, new attributes; only the O_NONBLOCK bit of
+ *           mq_flags is applied (to filp->f_flags)
+ * u_omqstat: if non-NULL, receives the attributes as they were before the
+ *            update, with mq_flags taken from filp->f_flags
+ *
+ * Returns 0 on success, -EFAULT when a user buffer cannot be accessed, or
+ * -EBADF when mqdes is not an open message queue descriptor.
+ */
+asmlinkage long sys_mq_getsetattr(mqd_t mqdes,
+			const struct mq_attr __user *u_mqstat,
+			struct mq_attr __user *u_omqstat)
+{
+	int ret;
+	struct mq_attr mqstat, omqstat;
+	struct file *filp;
+	struct inode *inode;
+	struct mqueue_inode_info *info;
+
+	if (u_mqstat != NULL) {
+		if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
+			return -EFAULT;
+	}
+
+	ret = -EBADF;
+	filp = fget(mqdes);
+	if (!filp)
+		goto out;
+
+	/* reject descriptors that do not belong to the mqueue filesystem */
+	inode = filp->f_dentry->d_inode;
+	if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb))
+		goto out_fput;
+	info = MQUEUE_I(inode);
+
+	spin_lock(&info->lock);
+
+	/* snapshot the old values before any update is applied */
+	omqstat = info->attr;
+	omqstat.mq_flags = filp->f_flags;
+	if (u_mqstat) {
+		if (mqstat.mq_flags & O_NONBLOCK)
+			filp->f_flags |= O_NONBLOCK;
+		else
+			filp->f_flags &= ~O_NONBLOCK;
+
+		inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	}
+
+	spin_unlock(&info->lock);
+
+	/* copy out only after the spinlock has been dropped */
+	ret = 0;
+	if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat,
+						sizeof(struct mq_attr)))
+		ret = -EFAULT;
+
+out_fput:
+	fput(filp);
+out:
+	return ret;
+}
+
+/* inode operations for mqueue directories: generic lookup/unlink helpers
+ * plus the filesystem's own create */
+static struct inode_operations mqueue_dir_inode_operations = {
+	.lookup = simple_lookup,
+	.create = mqueue_create,
+	.unlink = simple_unlink,
+};
+
+/* file operations table that only provides a flush hook */
+static struct file_operations mqueue_file_operations = {
+	.flush = mqueue_flush_file,
+};
+
+/* installed by sys_mq_notify on the SIGEV_THREAD notification file */
+static struct file_operations mqueue_notify_fops = {
+	.poll = mqueue_notify_poll,
+	.read = mqueue_notify_read,
+	.release = mqueue_notify_release,
+};
+
+
+/* superblock operations: inode allocation and teardown hooks */
+static struct super_operations mqueue_super_ops = {
+	.alloc_inode = mqueue_alloc_inode,
+	.destroy_inode = mqueue_destroy_inode,
+	.delete_inode = mqueue_delete_inode,
+	.drop_inode = generic_delete_inode,
+};
+
+/* filesystem type registered and mounted by init_mqueue_fs() */
+static struct file_system_type mqueue_fs_type = {
+	.name = "mqueue",
+	.get_sb = mqueue_get_sb,
+	.kill_sb = kill_anon_super,
+};
+
+/* bounds handed to proc_dointvec_minmax through extra1/extra2 below */
+static int msg_max_limit_min = DFLT_MSGMAX;
+static int msg_max_limit_max = HARD_MSGMAX;
+
+static int msg_maxsize_limit_min = DFLT_MSGSIZEMAX;
+static int msg_maxsize_limit_max = INT_MAX;
+
+/* leaf entries: queues_max, msg_max and msgsize_max */
+static ctl_table mq_sysctls[] = {
+	{
+		.ctl_name	= CTL_QUEUESMAX,
+		.procname	= "queues_max",
+		.data		= &queues_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_MSGMAX,
+		.procname	= "msg_max",
+		.data		= &msg_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &msg_max_limit_min,
+		.extra2		= &msg_max_limit_max,
+	},
+	{
+		.ctl_name	= CTL_MSGSIZEMAX,
+		.procname	= "msgsize_max",
+		.data		= &msgsize_max,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &msg_maxsize_limit_min,
+		.extra2		= &msg_maxsize_limit_max,
+	},
+	{ .ctl_name = 0 }
+};
+
+/* "mqueue" directory holding the entries above */
+static ctl_table mq_sysctl_dir[] = {
+	{
+		.ctl_name	= FS_MQUEUE,
+		.procname	= "mqueue",
+		.mode		= 0555,
+		.child		= mq_sysctls,
+	},
+	{ .ctl_name = 0 }
+};
+
+/* "fs" root entry; this is the table init_mqueue_fs() registers */
+static ctl_table mq_sysctl_root[] = {
+	{
+		.ctl_name	= CTL_FS,
+		.procname	= "fs",
+		.mode		= 0555,
+		.child		= mq_sysctl_dir,
+	},
+	{ .ctl_name = 0 }
+};
+
+/*
+ * Initcall: creates the inode cache, registers the sysctl tree and the
+ * filesystem type, then mounts the filesystem internally. A failing step
+ * unwinds the steps that already succeeded.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int __init init_mqueue_fs(void)
+{
+	int error;
+
+	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
+				sizeof(struct mqueue_inode_info), 0,
+				SLAB_HWCACHE_ALIGN, init_once, NULL);
+	if (!mqueue_inode_cachep)
+		return -ENOMEM;
+
+	error = -ENOMEM;
+	mq_sysctl_table = register_sysctl_table(mq_sysctl_root, 0);
+	if (mq_sysctl_table == NULL)
+		goto out_cache;
+
+	error = register_filesystem(&mqueue_fs_type);
+	if (error != 0)
+		goto out_sysctl;
+
+	mqueue_mnt = kern_mount(&mqueue_fs_type);
+	if (IS_ERR(mqueue_mnt)) {
+		error = PTR_ERR(mqueue_mnt);
+		goto out_filesystem;
+	}
+
+	/* internal initialization - not common for vfs */
+	queues_count = 0;
+	spin_lock_init(&mq_lock);
+
+	return 0;
+
+out_filesystem:
+	unregister_filesystem(&mqueue_fs_type);
+out_sysctl:
+	unregister_sysctl_table(mq_sysctl_table);
+out_cache:
+	if (kmem_cache_destroy(mqueue_inode_cachep))
+		printk(KERN_INFO
+			"mqueue_inode_cache: not all structures were freed\n");
+	return error;
+}
+
+__initcall(init_mqueue_fs);
diff --git a/kernel/signal.c b/kernel/signal.c
index 32992a71683b..e6b7904df68f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2047,6 +2047,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
 		err |= __put_user(from->si_stime, &to->si_stime);
 		break;
 	case __SI_RT: /* This is not generated by the kernel as of now. */
+	case __SI_MESGQ: /* But this is */
 		err |= __put_user(from->si_pid, &to->si_pid);
 		err |= __put_user(from->si_uid, &to->si_uid);
 		err |= __put_user(from->si_int, &to->si_int);
-- 
cgit v1.2.3


From f3ca8d5dd5c23594bda07893ae374bed7981d473 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:54:54 -0700
Subject: [PATCH] posix message queue update

From: Manfred Spraul <manfred@colorfullife.com>

My discussion with Ulrich had one result:

- mq_setattr can accept implementation defined flags.  Right now we have
  none, but we might add some later (e.g.  switch to CLOCK_MONOTONIC for
  mq_timed{send,receive} or something similar).  When we add flags, we
  might need the fields for additional information.  And they don't hurt.
  Therefore add four __reserved fields to mq_attr.

- fail mq_setattr if we get unknown flags - otherwise glibc can't detect
  if it's running on a future kernel that supports new features.

- use memset to initialize the mq_attr structure - theoretically we could
  leak kernel memory.

- Only set O_NONBLOCK in mq_attr, explicitly clear O_RDWR & friends.
  openposix uses getattr, attr |=O_NONBLOCK, setattr - a sane approach.
  Without clearing O_RDWR, this fails.

I've retested all openposix conformance tests with the new patch - the two
new FAILED tests check undefined behavior.  Note that I won't have net
access until Sunday - if the message queue patch breaks something important
either ask Krzysztof or drop it.

Ulrich had another good idea for SIGEV_THREAD, but I must think about it.
It would mean less complexity in glibc, but more code in the kernel.  I'm
not yet convinced that it's overall better.
---
 include/linux/mqueue.h | 1 +
 ipc/mqueue.c           | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
index c0c5fcc89f0e..535fe4b2f14b 100644
--- a/include/linux/mqueue.h
+++ b/include/linux/mqueue.h
@@ -27,6 +27,7 @@ struct mq_attr {
 	long	mq_maxmsg;	/* maximum number of messages		*/
 	long	mq_msgsize;	/* maximum message size			*/
 	long	mq_curmsgs;	/* number of messages currently queued	*/
+	long	__reserved[4];	/* ignored for input, zeroed for output */
 };
 
 #define NOTIFY_NONE	0
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index c9a3e652a026..b5f731781f56 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -121,7 +121,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode)
 			INIT_LIST_HEAD(&info->e_wait_q[1].list);
 			info->notify_owner = 0;
 			info->qsize = 0;
-			info->attr.mq_curmsgs = 0;
+			memset(&info->attr, 0, sizeof(info->attr));
 			info->attr.mq_maxmsg = DFLT_MSGMAX;
 			info->attr.mq_msgsize = DFLT_MSGSIZEMAX;
 			info->messages = kmalloc(DFLT_MSGMAX * sizeof(struct msg_msg *), GFP_KERNEL);
@@ -1082,6 +1082,8 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes,
 	if (u_mqstat != NULL) {
 		if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr)))
 			return -EFAULT;
+		if (mqstat.mq_flags & (~O_NONBLOCK))
+			return -EINVAL;
 	}
 
 	ret = -EBADF;
@@ -1097,7 +1099,7 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes,
 	spin_lock(&info->lock);
 
 	omqstat = info->attr;
-	omqstat.mq_flags = filp->f_flags;
+	omqstat.mq_flags = filp->f_flags & O_NONBLOCK;
 	if (u_mqstat) {
 		if (mqstat.mq_flags & O_NONBLOCK)
 			filp->f_flags |= O_NONBLOCK;
-- 
cgit v1.2.3


From ed6dcf4a49c1098e0701762f6cc52b194cb7f661 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:55:19 -0700
Subject: [PATCH] split netlink_unicast

From: Manfred Spraul <manfred@colorfullife.com>

The attached patch splits netlink_unicast into three steps:

- netlink_getsock{bypid,byfilp}: lookup the destination socket.

- netlink_attachskb: perform the nonblock checks, sleep if the socket
  queue is longer than the limit, etc.

- netlink_sendskb: actually send the skb.

jamal looked over it and didn't see a problem with the netlink change.  The
actual use from ipc/mqueue.c is still open (just send back whatever the C
library passed to mq_notify, add an nlmsghdr or perhaps even make it a
specialized netlink protocol), but the attached patch is independent of
the message queue change.

(acked by davem)
---
 include/linux/netlink.h  |   7 +++
 net/netlink/af_netlink.c | 120 +++++++++++++++++++++++++++++++++++++----------
 2 files changed, 101 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 4e5ea27305a2..e5e15ddadab5 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -119,6 +119,13 @@ extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code);
 extern int netlink_register_notifier(struct notifier_block *nb);
 extern int netlink_unregister_notifier(struct notifier_block *nb);
 
+/* finegrained unicast helpers: */
+struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid);
+struct sock *netlink_getsockbyfilp(struct file *filp);
+int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo);
+void netlink_detachskb(struct sock *sk, struct sk_buff *skb);
+int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol);
+
 /*
  *	skb should fit one page. This choice is good for headerless malloc.
  *
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 38c27b9bb70a..398cd03f2d7b 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -415,38 +415,65 @@ static void netlink_overrun(struct sock *sk)
 	}
 }
 
-int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
+struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid)
 {
-	struct sock *sk;
-	struct netlink_opt *nlk;
-	int len = skb->len;
 	int protocol = ssk->sk_protocol;
-	long timeo;
-        DECLARE_WAITQUEUE(wait, current);
-
-	timeo = sock_sndtimeo(ssk, nonblock);
+	struct sock *sock;
+	struct netlink_opt *nlk;
 
-retry:
-	sk = netlink_lookup(protocol, pid);
-	if (sk == NULL)
-		goto no_dst;
-	nlk = nlk_sk(sk);
+	sock = netlink_lookup(protocol, pid);
+	if (!sock)
+		return ERR_PTR(-ECONNREFUSED);
 
 	/* Don't bother queuing skb if kernel socket has no input function */
-        if (nlk->pid == 0 && !nlk->data_ready)
-        	goto no_dst;
+	nlk = nlk_sk(sock);
+	if (nlk->pid == 0 && !nlk->data_ready) {
+		sock_put(sock);
+		return ERR_PTR(-ECONNREFUSED);
+	}
+	return sock;
+}
+
+/*
+ * Map an open file to the AF_NETLINK sock behind it.
+ *
+ * Returns the sock with a reference taken via sock_hold(),
+ * ERR_PTR(-ENOTSOCK) when the file's inode is not a socket, or
+ * ERR_PTR(-EINVAL) when the socket is not of family AF_NETLINK.
+ */
+struct sock *netlink_getsockbyfilp(struct file *filp)
+{
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct socket *socket;
+	struct sock *sock;
+
+	if (!inode->i_sock)
+		return ERR_PTR(-ENOTSOCK);
+	socket = SOCKET_I(inode);
+	if (!socket)
+		return ERR_PTR(-ENOTSOCK);
+
+	sock = socket->sk;
+	if (sock->sk_family != AF_NETLINK)
+		return ERR_PTR(-EINVAL);
+
+	sock_hold(sock);
+	return sock;
+}
+
+/*
+ * Attach a skb to a netlink socket.
+ * The caller must hold a reference to the destination socket. On error, the
+ * reference is dropped. The skb is not sent to the destination, just all
+ * error checks are performed and memory in the queue is reserved.
+ * Return values:
+ * < 0: error. skb freed, reference to sock dropped.
+ * 0: continue
+ * 1: repeat lookup - reference dropped while waiting for socket memory.
+ */
+int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo)
+{
+	struct netlink_opt *nlk;
+
+	nlk = nlk_sk(sk);
 
 #ifdef NL_EMULATE_DEV
-	if (nlk->handler) {
-		skb_orphan(skb);
-		len = nlk->handler(protocol, skb);
-		sock_put(sk);
-		return len;
-	}
+	if (nlk->handler)
+		return 0;
 #endif
-
 	if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
 	    test_bit(0, &nlk->state)) {
+		DECLARE_WAITQUEUE(wait, current);
 		if (!timeo) {
 			if (!nlk->pid)
 				netlink_overrun(sk);
@@ -471,19 +498,60 @@ retry:
 			kfree_skb(skb);
 			return sock_intr_errno(timeo);
 		}
-		goto retry;
+		return 1;
 	}
-
 	skb_orphan(skb);
 	skb_set_owner_r(skb, sk);
+	return 0;
+}
+
+int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol)
+{
+	struct netlink_opt *nlk;
+	int len = skb->len;
+
+	nlk = nlk_sk(sk);
+#ifdef NL_EMULATE_DEV
+	if (nlk->handler) {
+		skb_orphan(skb);
+		len = nlk->handler(protocol, skb);
+		sock_put(sk);
+		return len;
+	}
+#endif
+
 	skb_queue_tail(&sk->sk_receive_queue, skb);
 	sk->sk_data_ready(sk, len);
 	sock_put(sk);
 	return len;
+}
 
-no_dst:
+void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
+{
 	kfree_skb(skb);
-	return -ECONNREFUSED;
+	sock_put(sk);
+}
+
+/*
+ * Send skb to the netlink socket that netlink_getsockbypid() finds for pid.
+ *
+ * Built from the fine-grained helpers: look up the destination, attach the
+ * skb with netlink_attachskb() (repeating the lookup when that returns 1),
+ * then deliver it with netlink_sendskb().
+ *
+ * Returns the result of netlink_sendskb(), the lookup error when there is
+ * no usable destination (skb freed here), or the negative error of
+ * netlink_attachskb().
+ */
+int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock)
+{
+	struct sock *sk;
+	int err;
+	long timeo;
+
+	timeo = sock_sndtimeo(ssk, nonblock);
+retry:
+	sk = netlink_getsockbypid(ssk, pid);
+	if (IS_ERR(sk)) {
+		kfree_skb(skb);
+		/* the error is encoded in sk, not in the skb just freed */
+		return PTR_ERR(sk);
+	}
+	err = netlink_attachskb(sk, skb, nonblock, timeo);
+	if (err == 1)
+		goto retry;
+	if (err)
+		return err;
+
+	return netlink_sendskb(sk, skb, ssk->sk_protocol);
+}
 
 static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
-- 
cgit v1.2.3


From 34b98f223bb21673f4cab2f5079a763c34a67946 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:55:32 -0700
Subject: [PATCH] posix message queues: send notifications via netlink

From: Manfred Spraul <manfred@colorfullife.com>

SIGEV_THREAD means that a given callback should be called in the context of a
new thread.  This must be done by the C library.  The kernel must deliver a
notice of the event to the C library when the callback should be called.

This patch switches to a new, simpler interface: User space creates a socket
with socket(PF_NETLINK, SOCK_RAW,0) and passes the fd to the mq_notify call
together with a cookie.  When the mq_notify() condition is satisfied, the
kernel "writes" the cookie to the socket.  User space then reads the cookie
and calls the appropriate callback.
---
 include/linux/mqueue.h |  16 ++++
 ipc/mqueue.c           | 254 +++++++++++++++++++++----------------------------
 2 files changed, 123 insertions(+), 147 deletions(-)

(limited to 'include')

diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
index 535fe4b2f14b..fdab3b8ee242 100644
--- a/include/linux/mqueue.h
+++ b/include/linux/mqueue.h
@@ -30,8 +30,24 @@ struct mq_attr {
 	long	__reserved[4];	/* ignored for input, zeroed for output */
 };
 
+/*
+ * SIGEV_THREAD implementation:
+ * SIGEV_THREAD must be implemented in user space. If SIGEV_THREAD is passed
+ * to mq_notify, then
+ * - sigev_signo must be the file descriptor of an AF_NETLINK socket. It's not
+ *   necessary that the socket is bound.
+ * - sigev_value.sival_ptr must point to a cookie that is NOTIFY_COOKIE_LEN
+ *   bytes long.
+ * If the notification is triggered, then the cookie is sent to the netlink
+ * socket. The last byte of the cookie is replaced with the NOTIFY_?? codes:
+ * NOTIFY_WOKENUP if the notification got triggered, NOTIFY_REMOVED if it was
+ * removed, either due to a close() on the message queue fd or due to a
+ * mq_notify() that removed the notification.
+ */
 #define NOTIFY_NONE	0
 #define NOTIFY_WOKENUP	1
 #define NOTIFY_REMOVED	2
 
+#define NOTIFY_COOKIE_LEN	32
+
 #endif
diff --git a/ipc/mqueue.c b/ipc/mqueue.c
index f0d78fefc28b..f81441d63564 100644
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -20,6 +20,9 @@
 #include <linux/poll.h>
 #include <linux/mqueue.h>
 #include <linux/msg.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <net/sock.h>
 #include "util.h"
 
 #define MQUEUE_MAGIC	0x19800202
@@ -33,9 +36,6 @@
 #define STATE_PENDING	1
 #define STATE_READY	2
 
-#define NP_NONE		((void*)NOTIFY_NONE)
-#define NP_WOKENUP	((void*)NOTIFY_WOKENUP)
-#define NP_REMOVED	((void*)NOTIFY_REMOVED)
 /* used by sysctl */
 #define FS_MQUEUE 	1
 #define CTL_QUEUESMAX 	2
@@ -48,6 +48,8 @@
 #define HARD_MSGMAX 	(131072/sizeof(void*))
 #define DFLT_MSGSIZEMAX 16384	/* max message size */
 
+#define NOTIFY_COOKIE_LEN	32
+
 struct ext_wait_queue {		/* queue of sleeping tasks */
 	struct task_struct *task;
 	struct list_head list;
@@ -56,25 +58,26 @@ struct ext_wait_queue {		/* queue of sleeping tasks */
 };
 
 struct mqueue_inode_info {
-	struct mq_attr attr;
+	spinlock_t lock;
+	struct inode vfs_inode;
+	wait_queue_head_t wait_q;
+
 	struct msg_msg **messages;
+	struct mq_attr attr;
 
-	pid_t notify_owner;	/* != 0 means notification registered */
-	struct sigevent notify;
-	struct file *notify_filp;
+	struct sigevent notify; /* notify.sigev_notify == SIGEV_NONE means */
+	pid_t notify_owner;	/*           no notification registered */
+	struct sock *notify_sock;
+	struct sk_buff *notify_cookie;
 
 	/* for tasks waiting for free space and messages, respectively */
 	struct ext_wait_queue e_wait_q[2];
-	wait_queue_head_t wait_q;
 
 	unsigned long qsize; /* size of queue in memory (sum of all msgs) */
-	spinlock_t lock;
-	struct inode vfs_inode;
 };
 
 static struct inode_operations mqueue_dir_inode_operations;
 static struct file_operations mqueue_file_operations;
-static struct file_operations mqueue_notify_fops;
 static struct super_operations mqueue_super_ops;
 static void remove_notification(struct mqueue_inode_info *info);
 
@@ -119,7 +122,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode)
 			init_waitqueue_head(&info->wait_q);
 			INIT_LIST_HEAD(&info->e_wait_q[0].list);
 			INIT_LIST_HEAD(&info->e_wait_q[1].list);
-			info->notify_owner = 0;
+			info->notify.sigev_notify = SIGEV_NONE;
 			info->qsize = 0;
 			memset(&info->attr, 0, sizeof(info->attr));
 			info->attr.mq_maxmsg = DFLT_MSGMAX;
@@ -283,10 +286,11 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 	snprintf(buffer, sizeof(buffer),
 			"QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n",
 			info->qsize,
-			info->notify_owner ? info->notify.sigev_notify : SIGEV_NONE,
-			(info->notify_owner && info->notify.sigev_notify == SIGEV_SIGNAL ) ?
+			info->notify.sigev_notify,
+			(info->notify.sigev_notify == SIGEV_SIGNAL ) ?
 				info->notify.sigev_signo : 0,
-			info->notify_owner);
+			(info->notify.sigev_notify != SIGEV_NONE) ?
+				info->notify_owner : 0);
 	spin_unlock(&info->lock);
 	buffer[sizeof(buffer)-1] = '\0';
 	slen = strlen(buffer)+1;
@@ -299,7 +303,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data,
 		count = slen - o;
 
 	if (copy_to_user(u_data, buffer + o, count))
-       		return -EFAULT;
+		return -EFAULT;
 
 	*off = o + count;
 	filp->f_dentry->d_inode->i_atime = filp->f_dentry->d_inode->i_ctime = CURRENT_TIME;
@@ -311,7 +315,8 @@ static int mqueue_flush_file(struct file *filp)
 	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
 
 	spin_lock(&info->lock);
-	if (current->tgid == info->notify_owner)
+	if (info->notify.sigev_notify != SIGEV_NONE &&
+			current->tgid == info->notify_owner)
 		remove_notification(info);
 
 	spin_unlock(&info->lock);
@@ -435,6 +440,11 @@ static inline struct msg_msg *msg_get(struct mqueue_inode_info *info)
 	return info->messages[info->attr.mq_curmsgs];
 }
 
+static inline void set_cookie(struct sk_buff *skb, char code)
+{
+	((char*)skb->data)[NOTIFY_COOKIE_LEN-1] = code;
+}
+
 /*
  * The next function is only to split too long sys_mq_timedsend
  */
@@ -445,7 +455,8 @@ static void __do_notify(struct mqueue_inode_info *info)
 	 * waiting synchronously for message AND state of queue changed from
 	 * empty to not empty. Here we are sure that no one is waiting
 	 * synchronously. */
-	if (info->notify_owner && info->attr.mq_curmsgs == 1) {
+	if (info->notify.sigev_notify != SIGEV_NONE &&
+			info->attr.mq_curmsgs == 1) {
 		/* sends signal */
 		if (info->notify.sigev_notify == SIGEV_SIGNAL) {
 			struct siginfo sig_i;
@@ -460,10 +471,12 @@ static void __do_notify(struct mqueue_inode_info *info)
 			kill_proc_info(info->notify.sigev_signo,
 				       &sig_i, info->notify_owner);
 		} else if (info->notify.sigev_notify == SIGEV_THREAD) {
-			info->notify_filp->private_data = (void*)NP_WOKENUP;
+			set_cookie(info->notify_cookie, NOTIFY_WOKENUP);
+			netlink_sendskb(info->notify_sock,
+					info->notify_cookie, 0);
 		}
 		/* after notification unregisters process */
-		info->notify_owner = 0;
+		info->notify.sigev_notify = SIGEV_NONE;
 	}
 	wake_up(&info->wait_q);
 }
@@ -499,90 +512,13 @@ static long prepare_timeout(const struct timespec __user *u_arg)
 	return timeout;
 }
 
-/*
- * File descriptor based notification, intended to be used to implement
- * SIGEV_THREAD:
- * SIGEV_THREAD means that a notification function should be called in the
- * context of a new thread. The kernel can't do that. Therefore mq_notify
- * calls with SIGEV_THREAD return a new file descriptor. A user space helper
- * must create a new thread and then read from the given file descriptor.
- * The read always returns one byte. If it's NOTIFY_WOKENUP, then it must
- * call the notification function. If it's NOTIFY_REMOVED, then the
- * notification was removed. The file descriptor supports poll, thus one
- * supervisor thread can manage multiple message queue notifications.
- *
- * The implementation must support multiple outstanding notifications:
- * It's possible that a new notification is added and signaled before user
- * space calls mqueue_notify_read for the previous notification.
- * Therefore the notification state is stored in the private_data field of
- * the file descriptor.
- */
-static unsigned int mqueue_notify_poll(struct file *filp,
-					struct poll_table_struct *poll_tab)
-{
-	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
-	int retval;
-
-	poll_wait(filp, &info->wait_q, poll_tab);
-
-	if (filp->private_data == NP_NONE)
-		retval = 0;
-	else
-		retval = POLLIN | POLLRDNORM;
-	return retval;
-}
-
-static ssize_t mqueue_notify_read(struct file *filp, char __user *buf,
-					size_t count, loff_t *ppos)
-{
-	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
-	char result;
-
-	if (!count)
-		return 0;
-	if (*ppos != 0)
-		return 0;
-	spin_lock(&info->lock);
-	while (filp->private_data == NP_NONE) {
-		DEFINE_WAIT(wait);
-		if (filp->f_flags & O_NONBLOCK) {
-			spin_unlock(&info->lock);
-			return -EAGAIN;
-		}
-		prepare_to_wait(&info->wait_q, &wait, TASK_INTERRUPTIBLE);
-		spin_unlock(&info->lock);
-		schedule();
-		finish_wait(&info->wait_q, &wait);
-		spin_lock(&info->lock);
-	}
-	spin_unlock(&info->lock);
-	result = (char)(unsigned long)filp->private_data;
-	if (put_user(result, buf))
-		return -EFAULT;
-	*ppos = 1;
-	return 1;
-}
-
-static int mqueue_notify_release(struct inode *inode, struct file *filp)
-{
-	struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode);
-
-	spin_lock(&info->lock);
-	if (info->notify_owner && info->notify_filp == filp)
-		info->notify_owner = 0;
-	filp->private_data = NP_REMOVED;
-	spin_unlock(&info->lock);
-
-	return 0;
-}
-
 static void remove_notification(struct mqueue_inode_info *info)
 {
 	if (info->notify.sigev_notify == SIGEV_THREAD) {
-		info->notify_filp->private_data = NP_REMOVED;
-		wake_up(&info->wait_q);
+		set_cookie(info->notify_cookie, NOTIFY_REMOVED);
+		netlink_sendskb(info->notify_sock, info->notify_cookie, 0);
 	}
-	info->notify_owner = 0;
+	info->notify.sigev_notify = SIGEV_NONE;
 }
 
 /*
@@ -780,7 +716,8 @@ out_unlock:
  */
 
 /* pipelined_send() - send a message directly to the task waiting in
- * sys_mq_timedreceive() (without inserting message into a queue). */
+ * sys_mq_timedreceive() (without inserting message into a queue).
+ */
 static inline void pipelined_send(struct mqueue_inode_info *info,
 				  struct msg_msg *message,
 				  struct ext_wait_queue *receiver)
@@ -978,12 +915,16 @@ out:
 asmlinkage long sys_mq_notify(mqd_t mqdes,
 				const struct sigevent __user *u_notification)
 {
-	int ret, fd;
-	struct file *filp, *nfilp;
+	int ret;
+	struct file *filp;
+	struct sock *sock;
 	struct inode *inode;
 	struct sigevent notification;
 	struct mqueue_inode_info *info;
+	struct sk_buff *nc;
 
+	nc = NULL;
+	sock = NULL;
 	if (u_notification == NULL) {
 		notification.sigev_notify = SIGEV_NONE;
 	} else {
@@ -1000,6 +941,44 @@ asmlinkage long sys_mq_notify(mqd_t mqdes,
 			 notification.sigev_signo > _NSIG)) {
 			return -EINVAL;
 		}
+		if (notification.sigev_notify == SIGEV_THREAD) {
+			/* create the notify skb */
+			nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL);
+			ret = -ENOMEM;
+			if (!nc)
+				goto out;
+			ret = -EFAULT;
+			if (copy_from_user(nc->data,
+					notification.sigev_value.sival_ptr,
+					NOTIFY_COOKIE_LEN)) {
+				goto out;
+			}
+
+			/* TODO: add a header? */
+			skb_put(nc, NOTIFY_COOKIE_LEN);
+			/* and attach it to the socket */
+retry:
+			filp = fget(notification.sigev_signo);
+			ret = -EBADF;
+			if (!filp)
+				goto out;
+			sock = netlink_getsockbyfilp(filp);
+			fput(filp);
+			if (IS_ERR(sock)) {
+				ret = PTR_ERR(sock);
+				sock = NULL;
+				goto out;
+			}
+
+			ret = netlink_attachskb(sock, nc, 0, MAX_SCHEDULE_TIMEOUT);
+			if (ret == 1)
+				goto retry;
+			if (ret) {
+				sock = NULL;
+				nc = NULL;
+				goto out;
+			}
+		}
 	}
 
 	ret = -EBADF;
@@ -1013,47 +992,33 @@ asmlinkage long sys_mq_notify(mqd_t mqdes,
 	info = MQUEUE_I(inode);
 
 	ret = 0;
-	if (notification.sigev_notify == SIGEV_THREAD) {
-		ret = get_unused_fd();
-		if (ret < 0)
-			goto out_fput;
-		fd = ret;
-		nfilp = get_empty_filp();
-		if (!nfilp) {
-			ret = -ENFILE;
-			goto out_dropfd;
-		}
-		nfilp->private_data = NP_NONE;
-		nfilp->f_op = &mqueue_notify_fops;
-		nfilp->f_vfsmnt = mntget(mqueue_mnt);
-		nfilp->f_dentry = dget(filp->f_dentry);
-		nfilp->f_mapping = filp->f_dentry->d_inode->i_mapping;
-		nfilp->f_flags = O_RDONLY;
-		nfilp->f_mode = FMODE_READ;
-	} else {
-		nfilp = NULL;
-		fd = -1;
-	}
-
 	spin_lock(&info->lock);
-
-	if (notification.sigev_notify == SIGEV_NONE) {
-		if (info->notify_owner == current->tgid) {
+	switch (notification.sigev_notify) {
+	case SIGEV_NONE:
+		if (info->notify.sigev_notify != SIGEV_NONE &&
+				info->notify_owner == current->tgid) {
 			remove_notification(info);
 			inode->i_atime = inode->i_ctime = CURRENT_TIME;
 		}
-	} else if (info->notify_owner) {
-		ret = -EBUSY;
-	} else if (notification.sigev_notify == SIGEV_THREAD) {
-		info->notify_filp = nfilp;
-		fd_install(fd, nfilp);
-		ret = fd;
-		fd = -1;
-		nfilp = NULL;
+		break;
+	case SIGEV_THREAD:
+		if (info->notify.sigev_notify != SIGEV_NONE) {
+			ret = -EBUSY;
+			break;
+		}
+		info->notify_sock = sock;
+		info->notify_cookie = nc;
+		sock = NULL;
+		nc = NULL;
 		info->notify.sigev_notify = SIGEV_THREAD;
 		info->notify_owner = current->tgid;
 		inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	}  else {
+		break;
+	case SIGEV_SIGNAL:
+		if (info->notify.sigev_notify != SIGEV_NONE) {
+			ret = -EBUSY;
+			break;
+		}
 		info->notify.sigev_signo = notification.sigev_signo;
 		info->notify.sigev_value = notification.sigev_value;
 		info->notify.sigev_notify = SIGEV_SIGNAL;
@@ -1061,12 +1026,14 @@ asmlinkage long sys_mq_notify(mqd_t mqdes,
 		inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	}
 	spin_unlock(&info->lock);
-out_dropfd:
-	if (fd != -1)
-		put_unused_fd(fd);
 out_fput:
 	fput(filp);
 out:
+	if (sock) {
+		netlink_detachskb(sock, nc);
+	} else if (nc) {
+		dev_kfree_skb(nc);
+	}
 	return ret;
 }
 
@@ -1135,13 +1102,6 @@ static struct file_operations mqueue_file_operations = {
 	.read = mqueue_read_file,
 };
 
-static struct file_operations mqueue_notify_fops = {
-	.poll = mqueue_notify_poll,
-	.read = mqueue_notify_read,
-	.release = mqueue_notify_release,
-};
-
-
 static struct super_operations mqueue_super_ops = {
 	.alloc_inode = mqueue_alloc_inode,
 	.destroy_inode = mqueue_destroy_inode,
-- 
cgit v1.2.3


From 87c22e8470366e81aa82bcbadaf147c4ecdfb182 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:55:45 -0700
Subject: [PATCH] compat emulation for posix message queues

From: Arnd Bergmann <arnd@arndb.de>

I have tested the code with the open posix test suite and found the same
four failures for both 64-bit and compat mode, most tests pass.  The patch
is against -mc1, but I guess it also applies to the other trees around.

What worries me more than mq_attr compatibility is the conversion of struct
sigevent, which might turn out really hard when more fields in there are
used.  AFAICS, the only other part in the kernel ABI is sys_timer_create(),
so maybe it's not too late to deprecate the current structure and create a
structure that can be used properly for compat syscalls.
---
 arch/ia64/ia32/ia32_signal.c     |   7 +-
 arch/mips/kernel/signal32.c      |   7 +-
 arch/s390/kernel/compat_signal.c |   5 +-
 arch/sparc64/kernel/signal32.c   |   7 +-
 arch/x86_64/ia32/ia32_signal.c   |   6 +-
 include/asm-ppc64/ppc32.h        |  14 ---
 include/linux/compat.h           |  17 ++++
 include/linux/mqueue.h           |   4 +-
 include/linux/posix_types.h      |   1 +
 include/linux/syscalls.h         |   1 -
 include/linux/types.h            |   1 +
 ipc/Makefile                     |   3 +-
 ipc/compat_mq.c                  | 196 +++++++++++++++++++++++++++++++++++++++
 kernel/sys.c                     |   5 +
 14 files changed, 251 insertions(+), 23 deletions(-)
 create mode 100644 ipc/compat_mq.c

(limited to 'include')

diff --git a/arch/ia64/ia32/ia32_signal.c b/arch/ia64/ia32/ia32_signal.c
index 8b1374c172b6..bb1e836fb227 100644
--- a/arch/ia64/ia32/ia32_signal.c
+++ b/arch/ia64/ia32/ia32_signal.c
@@ -114,7 +114,12 @@ copy_siginfo_from_user32 (siginfo_t *to, siginfo_t32 *from)
 			err |= __get_user(to->si_band, &from->si_band);
 			err |= __get_user(to->si_fd, &from->si_fd);
 			break;
-			/* case __SI_RT: This is not generated by the kernel as of now.  */
+		      case __SI_RT >> 16: /* This is not generated by the kernel as of now.  */
+		      case __SI_MESGQ >> 16:
+			err |= __get_user(to->si_pid, &from->si_pid);
+			err |= __get_user(to->si_uid, &from->si_uid);
+			err |= __get_user(to->si_int, &from->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c
index 5c1489f4fdc2..c52074f84300 100644
--- a/arch/mips/kernel/signal32.c
+++ b/arch/mips/kernel/signal32.c
@@ -358,7 +358,12 @@ static int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from)
 			err |= __put_user(from->si_band, &to->si_band);
 			err |= __put_user(from->si_fd, &to->si_fd);
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
+		case __SI_RT >> 16: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ >> 16:
+			err |= __put_user(from->si_pid, &to->si_pid);
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index 44fe6e477e92..373040404a5a 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -74,6 +74,10 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from)
 		err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE);
 	else {
 		switch (from->si_code >> 16) {
+		case __SI_RT >> 16: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ >> 16:
+			err |= __put_user(from->si_int, &to->si_int);
+			/* fallthrough */
 		case __SI_KILL >> 16:
 			err |= __put_user(from->si_pid, &to->si_pid);
 			err |= __put_user(from->si_uid, &to->si_uid);
@@ -96,7 +100,6 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from)
 			break;
 		default:
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
 		}
 	}
 	return err;
diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c
index cc3019d6dd65..e2f62a666d8c 100644
--- a/arch/sparc64/kernel/signal32.c
+++ b/arch/sparc64/kernel/signal32.c
@@ -129,7 +129,12 @@ int copy_siginfo_to_user32(siginfo_t32 __user *to, siginfo_t *from)
 			err |= __put_user(from->si_trapno, &to->si_trapno);
 			err |= __put_user((long)from->si_addr, &to->si_addr);
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
+		case __SI_RT >> 16: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ >> 16:
+			err |= __put_user(from->si_pid, &to->si_pid);
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c
index bce5fbc5be2c..1a828de6a55d 100644
--- a/arch/x86_64/ia32/ia32_signal.c
+++ b/arch/x86_64/ia32/ia32_signal.c
@@ -85,7 +85,11 @@ int ia32_copy_siginfo_to_user(siginfo_t32 __user *to, siginfo_t *from)
 			err |= __put_user(from->si_overrun, &to->si_overrun); 
 			err |= __put_user((u32)(u64)from->si_ptr, &to->si_ptr);
 			break;
-		/* case __SI_RT: This is not generated by the kernel as of now.  */
+		case __SI_RT >> 16: /* This is not generated by the kernel as of now.  */
+		case __SI_MESGQ >> 16:
+			err |= __put_user(from->si_uid, &to->si_uid);
+			err |= __put_user(from->si_int, &to->si_int);
+			break;
 		}
 	}
 	return err;
diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h
index 53865a8c4f8d..7338ea298a19 100644
--- a/include/asm-ppc64/ppc32.h
+++ b/include/asm-ppc64/ppc32.h
@@ -141,20 +141,6 @@ struct ucontext32 {
 	struct mcontext32	uc_mcontext;
 };
 
-typedef struct compat_sigevent {
-	compat_sigval_t sigev_value;
-	int sigev_signo;
-	int sigev_notify;
-	union {
-		int _pad[SIGEV_PAD_SIZE];
-		int _tid;
-		struct {
-			compat_uptr_t _function;
-			compat_uptr_t _attribute;
-		} _sigev_thread;
-	} _sigev_un;
-} compat_sigevent_t;
-
 struct ipc_kludge_32 {
 	unsigned int msgp;
 	int msgtyp;
diff --git a/include/linux/compat.h b/include/linux/compat.h
index 7b82209ab4ab..796204f59bd9 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -13,6 +13,7 @@
 #include <linux/sem.h>
 
 #include <asm/compat.h>
+#include <asm/siginfo.h>
 
 #define compat_jiffies_to_clock_t(x)	\
 		(((unsigned long)(x) * COMPAT_USER_HZ) / HZ)
@@ -90,6 +91,22 @@ typedef union compat_sigval {
 	compat_uptr_t	sival_ptr;
 } compat_sigval_t;
 
+typedef struct compat_sigevent {
+	compat_sigval_t sigev_value;
+	compat_int_t sigev_signo;
+	compat_int_t sigev_notify;
+	union {
+		compat_int_t _pad[SIGEV_PAD_SIZE];
+		compat_int_t _tid;
+
+		struct {
+			compat_uptr_t _function;
+			compat_uptr_t _attribute;
+		} _sigev_thread;
+	} _sigev_un;
+} compat_sigevent_t;
+
+
 long compat_sys_semctl(int first, int second, int third, void __user *uptr);
 long compat_sys_msgsnd(int first, int second, int third, void __user *uptr);
 long compat_sys_msgrcv(int first, int second, int msgtyp, int third,
diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h
index fdab3b8ee242..fc40b774b913 100644
--- a/include/linux/mqueue.h
+++ b/include/linux/mqueue.h
@@ -18,9 +18,9 @@
 #ifndef _LINUX_MQUEUE_H
 #define _LINUX_MQUEUE_H
 
-#define MQ_PRIO_MAX 	32768
+#include <linux/types.h>
 
-typedef int mqd_t;
+#define MQ_PRIO_MAX 	32768
 
 struct mq_attr {
 	long	mq_flags;	/* message queue flags			*/
diff --git a/include/linux/posix_types.h b/include/linux/posix_types.h
index 3ee2ed9de1db..f04c98cf44f3 100644
--- a/include/linux/posix_types.h
+++ b/include/linux/posix_types.h
@@ -42,6 +42,7 @@ typedef void (*__kernel_sighandler_t)(int);
 
 /* Type of a SYSV IPC key.  */
 typedef int __kernel_key_t;
+typedef int __kernel_mqd_t;
 
 #include <asm/posix_types.h>
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 7ee5f67abb5f..89ffe55898f2 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -48,7 +48,6 @@ struct timex;
 struct timezone;
 struct tms;
 struct utimbuf;
-typedef int mqd_t;
 struct mq_attr;
 
 #include <linux/config.h>
diff --git a/include/linux/types.h b/include/linux/types.h
index 3b407b06b48f..93f5f3653561 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -31,6 +31,7 @@ typedef __kernel_key_t		key_t;
 typedef __kernel_suseconds_t	suseconds_t;
 typedef __kernel_timer_t	timer_t;
 typedef __kernel_clockid_t	clockid_t;
+typedef __kernel_mqd_t		mqd_t;
 
 #ifdef __KERNEL__
 typedef __kernel_uid32_t	uid_t;
diff --git a/ipc/Makefile b/ipc/Makefile
index 913790207d85..0a6d626cd794 100644
--- a/ipc/Makefile
+++ b/ipc/Makefile
@@ -4,5 +4,6 @@
 
 obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o
 obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o
-obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o
+obj_mq-$(CONFIG_COMPAT) += compat_mq.o
+obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y)
 
diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c
new file mode 100644
index 000000000000..1520df89c424
--- /dev/null
+++ b/ipc/compat_mq.c
@@ -0,0 +1,196 @@
+/*
+ *  ipc/compat_mq.c
+ *    32 bit emulation for POSIX message queue system calls
+ *
+ *    Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation
+ *    Author: Arnd Bergmann <arnd@arndb.de>
+ */
+
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/mqueue.h>
+#include <linux/syscalls.h>
+
+#include <asm/uaccess.h>
+
+struct compat_mq_attr {
+	compat_long_t mq_flags;      /* message queue flags		     */
+	compat_long_t mq_maxmsg;     /* maximum number of messages	     */
+	compat_long_t mq_msgsize;    /* maximum message size		     */
+	compat_long_t mq_curmsgs;    /* number of messages currently queued  */
+	compat_long_t __reserved[4]; /* ignored for input, zeroed for output */
+};
+
+static inline int get_compat_mq_attr(struct mq_attr *attr,
+			const struct compat_mq_attr __user *uattr)
+{
+	if (verify_area(VERIFY_READ, uattr, sizeof *uattr))
+		return -EFAULT;
+
+	return __get_user(attr->mq_flags, &uattr->mq_flags)
+		| __get_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+		| __get_user(attr->mq_msgsize, &uattr->mq_msgsize)
+		| __get_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+static inline int put_compat_mq_attr(const struct mq_attr *attr,
+			struct compat_mq_attr __user *uattr)
+{
+	if (clear_user(uattr, sizeof *uattr))
+		return -EFAULT;
+
+	return __put_user(attr->mq_flags, &uattr->mq_flags)
+		| __put_user(attr->mq_maxmsg, &uattr->mq_maxmsg)
+		| __put_user(attr->mq_msgsize, &uattr->mq_msgsize)
+		| __put_user(attr->mq_curmsgs, &uattr->mq_curmsgs);
+}
+
+asmlinkage long compat_sys_mq_open(const char __user *u_name,
+			int oflag, compat_mode_t mode,
+			struct compat_mq_attr __user *u_attr)
+{
+	struct mq_attr attr;
+	mm_segment_t oldfs;
+	char *name;
+	long ret;
+
+	if ((oflag & O_CREAT) == 0 || !u_attr)	/* no attributes to convert */
+		return sys_mq_open(u_name, oflag, mode, NULL);
+
+	if (get_compat_mq_attr(&attr, u_attr))
+		return -EFAULT;
+
+	name = getname(u_name);
+	if (IS_ERR(name))
+		return PTR_ERR(name);
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);	/* name and attr are kernel copies from here on */
+	ret = sys_mq_open(name, oflag, mode, &attr);
+	set_fs(oldfs);
+
+	putname(name);
+	return ret;
+}
+
+static struct timespec __user *compat_prepare_timeout(
+			const struct compat_timespec __user *u_abs_timeout)
+{
+	struct timespec ts, __user *u_ts;
+
+	if (!u_abs_timeout)
+		return NULL;	/* no timeout supplied */
+
+	u_ts = compat_alloc_user_space(sizeof(*u_ts));
+	if (get_compat_timespec(&ts, u_abs_timeout)
+		|| copy_to_user(u_ts, &ts, sizeof(*u_ts)))
+		return ERR_PTR(-EFAULT);
+
+	return u_ts;
+}
+
+asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes,
+			const char __user *u_msg_ptr,
+			size_t msg_len, unsigned int msg_prio,
+			const struct compat_timespec __user *u_abs_timeout)
+{
+	struct timespec __user *u_ts;
+
+	u_ts = compat_prepare_timeout(u_abs_timeout);
+	if (IS_ERR(u_ts))
+		return -EFAULT;
+
+	return sys_mq_timedsend(mqdes, u_msg_ptr, msg_len,
+			msg_prio, u_ts);
+}
+
+asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes,
+			char __user *u_msg_ptr,
+			size_t msg_len, unsigned int __user *u_msg_prio,
+			const struct compat_timespec __user *u_abs_timeout)
+{
+	struct timespec __user *u_ts;
+
+	u_ts = compat_prepare_timeout(u_abs_timeout);
+	if (IS_ERR(u_ts))
+		return -EFAULT;
+
+	return sys_mq_timedreceive(mqdes, u_msg_ptr, msg_len,
+			u_msg_prio, u_ts);
+}
+
+static int get_compat_sigevent(struct sigevent *event,
+		const struct compat_sigevent __user *u_event)
+{
+	if (verify_area(VERIFY_READ, u_event, sizeof(*u_event)))
+		return -EFAULT;
+
+	return __get_user(event->sigev_value.sival_int,
+			  &u_event->sigev_value.sival_int)
+	     | __get_user(event->sigev_signo, &u_event->sigev_signo)
+	     | __get_user(event->sigev_notify, &u_event->sigev_notify)
+	     | __get_user(event->sigev_notify_thread_id,
+			  &u_event->sigev_notify_thread_id);
+}
+
+asmlinkage long compat_sys_mq_notify(mqd_t mqdes,
+			const struct compat_sigevent __user *u_notification)
+{
+	mm_segment_t oldfs;
+	struct sigevent notification;
+	char cookie[NOTIFY_COOKIE_LEN];
+	compat_uptr_t u_cookie;
+	long ret;
+
+	if (!u_notification)
+		return sys_mq_notify(mqdes, NULL);
+
+	if (get_compat_sigevent(&notification, u_notification))
+		return -EFAULT;
+
+	if (notification.sigev_notify == SIGEV_THREAD) {
+		u_cookie = (compat_uptr_t)notification.sigev_value.sival_int;
+		if (copy_from_user(cookie, compat_ptr(u_cookie),
+						NOTIFY_COOKIE_LEN)) {
+			return -EFAULT;
+		}
+		notification.sigev_value.sival_ptr = cookie;	/* kernel copy */
+	}
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_mq_notify(mqdes, &notification);
+	set_fs(oldfs);
+
+	return ret;
+}
+
+asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes,
+			const struct compat_mq_attr __user *u_mqstat,
+			struct compat_mq_attr __user *u_omqstat)
+{
+	struct mq_attr mqstat, omqstat;
+	struct mq_attr *p_mqstat = NULL, *p_omqstat = NULL;
+	mm_segment_t oldfs;
+	long ret;
+
+	if (u_mqstat) {
+		p_mqstat = &mqstat;
+		if (get_compat_mq_attr(p_mqstat, u_mqstat))
+			return -EFAULT;
+	}
+
+	if (u_omqstat)
+		p_omqstat = &omqstat;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);	/* both attr pointers are kernel stack copies */
+	ret = sys_mq_getsetattr(mqdes, p_mqstat, p_omqstat);
+	set_fs(oldfs);
+
+	if (ret)
+		return ret;
+
+	return (u_omqstat) ? put_compat_mq_attr(&omqstat, u_omqstat) : 0;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 7d1bf5c57aca..81f9e02f2071 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -266,6 +266,11 @@ cond_syscall(sys_mq_timedsend)
 cond_syscall(sys_mq_timedreceive)
 cond_syscall(sys_mq_notify)
 cond_syscall(sys_mq_getsetattr)
+cond_syscall(compat_sys_mq_open)
+cond_syscall(compat_sys_mq_timedsend)
+cond_syscall(compat_sys_mq_timedreceive)
+cond_syscall(compat_sys_mq_notify)
+cond_syscall(compat_sys_mq_getsetattr)
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read)
-- 
cgit v1.2.3


From 0ab2d6681c4e8502990523d46d928f37b764d52d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:56:34 -0700
Subject: [PATCH] IPMI driver updates

From: Corey Minyard <minyard@acm.org>

- Add support for messaging through an IPMI LAN interface, which is
  required for some system software that already exists on other IPMI
  drivers.  It also does some renaming and a lot of little cleanups.

- Add the "System Interface" driver.  The previous driver for system
  interfaces only supported the KCS interface, this driver supports all
  system interfaces defined in the IPMI standard.  It also does a much better
  job of handling ACPI and SMBIOS tables for detecting IPMI system
  interfaces.
---
 Documentation/IPMI.txt              |  218 +++-
 drivers/char/ipmi/Kconfig           |    8 +-
 drivers/char/ipmi/Makefile          |    9 +-
 drivers/char/ipmi/ipmi_bt_sm.c      |  513 +++++++++
 drivers/char/ipmi/ipmi_devintf.c    |  197 ++--
 drivers/char/ipmi/ipmi_kcs_intf.c   | 1305 ----------------------
 drivers/char/ipmi/ipmi_kcs_sm.c     |  156 +--
 drivers/char/ipmi/ipmi_kcs_sm.h     |   70 --
 drivers/char/ipmi/ipmi_msghandler.c | 1292 +++++++++++++++++++---
 drivers/char/ipmi/ipmi_si_intf.c    | 2052 +++++++++++++++++++++++++++++++++++
 drivers/char/ipmi/ipmi_si_sm.h      |  117 ++
 drivers/char/ipmi/ipmi_smic_sm.c    |  599 ++++++++++
 drivers/char/ipmi/ipmi_watchdog.c   |  122 +--
 include/linux/ipmi.h                |  131 ++-
 include/linux/ipmi_msgdefs.h        |   36 +-
 include/linux/ipmi_smi.h            |   14 +-
 16 files changed, 5013 insertions(+), 1826 deletions(-)
 create mode 100644 drivers/char/ipmi/ipmi_bt_sm.c
 delete mode 100644 drivers/char/ipmi/ipmi_kcs_intf.c
 delete mode 100644 drivers/char/ipmi/ipmi_kcs_sm.h
 create mode 100644 drivers/char/ipmi/ipmi_si_intf.c
 create mode 100644 drivers/char/ipmi/ipmi_si_sm.h
 create mode 100644 drivers/char/ipmi/ipmi_smic_sm.c

(limited to 'include')

diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt
index 825e83cb4acc..ec8a6fa2c34b 100644
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -22,6 +22,58 @@ are not familiar with IPMI itself, see the web site at
 http://www.intel.com/design/servers/ipmi/index.htm.  IPMI is a big
 subject and I can't cover it all here!
 
+Configuration
+-------------
+
+The LinuxIPMI driver is modular, which means you have to pick several
+things to have it work right depending on your hardware.  Most of
+these are available in the 'Character Devices' menu.
+
+No matter what, you must pick 'IPMI top-level message handler' to use
+IPMI.  What you do beyond that depends on your needs and hardware.
+
+The message handler does not provide any user-level interfaces.
+Kernel code (like the watchdog) can still use it.  If you need access
+from userland, you need to select 'Device interface for IPMI' if you
+want access through a device driver.  Another interface is also
+available, you may select 'IPMI sockets' in the 'Networking Support'
+main menu.  This provides a socket interface to IPMI.  You may select
+both of these at the same time, they will both work together.
+
+The driver interface depends on your hardware.  If you have a board
+with a standard interface (These will generally be either "KCS",
+"SMIC", or "BT", consult your hardware manual), choose the 'IPMI SI
+handler' option.  A driver also exists for direct I2C access to the
+IPMI management controller.  Some boards support this, but it is
+unknown if it will work on every board.  For this, choose 'IPMI SMBus
+handler', but be ready to try to do some figuring to see if it will
+work.
+
+There is also a KCS-only driver interface supplied, but it is
+deprecated in favor of the SI interface.
+
+You should generally enable ACPI on your system, as systems with IPMI
+should have ACPI tables describing them.
+
+If you have a standard interface and the board manufacturer has done
+their job correctly, the IPMI controller should be automatically
+detected (via ACPI or SMBIOS tables) and should just work.  Sadly, many
+boards do not have this information.  The driver attempts standard
+defaults, but they may not work.  If you fall into this situation, you
+need to read the section below named 'The SI Driver' on how to
+hand-configure your system.
+
+IPMI defines a standard watchdog timer.  You can enable this with the
+'IPMI Watchdog Timer' config option.  If you compile the driver into
+the kernel, then via a kernel command-line option you can have the
+watchdog timer start as soon as it initializes.  It also has a lot
+of other options, see the 'Watchdog' section below for more details.
+Note that you can also have the watchdog continue to run if it is
+closed (by default it is disabled on close).  Go into the 'Watchdog
+Cards' menu, enable 'Watchdog Timer Support', and enable the option
+'Disable watchdog shutdown on close'.
+
+
 Basic Design
 ------------
 
@@ -41,18 +93,30 @@ ipmi_devintf - This provides a userland IOCTL interface for the IPMI
 driver, each open file for this device ties in to the message handler
 as an IPMI user.
 
-ipmi_kcs_drv - A driver for the KCS SMI.  Most system have a KCS
-interface for IPMI.
+ipmi_si - A driver for various system interfaces.  This supports
+KCS, SMIC, and may support BT in the future.  Unless you have your own
+custom interface, you probably need to use this.
+
+ipmi_smb - A driver for accessing BMCs on the SMBus. It uses the
+I2C kernel driver's SMBus interfaces to send and receive IPMI messages
+over the SMBus.
+
+af_ipmi - A network socket interface to IPMI.  This doesn't take up
+a character device in your system.
 
+Note that the KCS-only interface has been removed.
 
 Much documentation for the interface is in the include files.  The
 IPMI include files are:
 
-ipmi.h - Contains the user interface and IOCTL interface for IPMI.
+net/af_ipmi.h - Contains the socket interface.
 
-ipmi_smi.h - Contains the interface for SMI drivers to use.
+linux/ipmi.h - Contains the user interface and IOCTL interface for IPMI.
 
-ipmi_msgdefs.h - General definitions for base IPMI messaging.
+linux/ipmi_smi.h - Contains the interface for system management interfaces
+(things that interface to IPMI controllers) to use.
+
+linux/ipmi_msgdefs.h - General definitions for base IPMI messaging.
 
 
 Addressing
@@ -260,70 +324,131 @@ they register with the message handler.  They are generally assigned
 in the order they register, although if an SMI unregisters and then
 another one registers, all bets are off.
 
-The ipmi_smi.h defines the interface for SMIs, see that for more
-details.
+The ipmi_smi.h defines the interface for management interfaces, see
+that for more details.
 
 
-The KCS Driver
---------------
+The SI Driver
+-------------
 
-The KCS driver allows up to 4 KCS interfaces to be configured in the
-system.  By default, the driver will register one KCS interface at the
-spec-specified I/O port 0xca2 without interrupts.  You can change this
-at module load time (for a module) with:
+The SI driver allows up to 4 KCS or SMIC interfaces to be configured
+in the system.  By default, it scans the ACPI tables for interfaces, and
+if it doesn't find any the driver will attempt to register one KCS
+interface at the spec-specified I/O port 0xca2 without interrupts.
+You can change this at module load time (for a module) with:
+
+  modprobe ipmi_si.o type=<type1>,<type2>....
+       ports=<port1>,<port2>... addrs=<addr1>,<addr2>...
+       irqs=<irq1>,<irq2>... trydefaults=[0|1]
+
+Each of these except si_trydefaults is a list, the first item for the
+first interface, second item for the second interface, etc.
 
-  insmod ipmi_kcs_drv.o kcs_ports=<port1>,<port2>... kcs_addrs=<addr1>,<addr2>
-       kcs_irqs=<irq1>,<irq2>... kcs_trydefaults=[0|1]
+The si_type may be either "kcs", "smic", or "bt".  If you leave it blank, it
+defaults to "kcs".
 
-The KCS driver supports two types of interfaces, ports (for I/O port
-based KCS interfaces) and memory addresses (for KCS interfaces in
-memory).  The driver will support both of them simultaneously, setting
-the port to zero (or just not specifying it) will allow the memory
-address to be used.  The port will override the memory address if it
-is specified and non-zero.  kcs_trydefaults sets whether the standard
-IPMI interface at 0xca2 and any interfaces specified by ACPE are
-tried.  By default, the driver tries it, set this value to zero to
-turn this off.
+If you specify si_addrs as non-zero for an interface, the driver will
+use the memory address given as the address of the device.  This
+overrides si_ports.
+
+If you specify si_ports as non-zero for an interface, the driver will
+use the I/O port given as the device address.
+
+If you specify si_irqs as non-zero for an interface, the driver will
+attempt to use the given interrupt for the device.
+
+si_trydefaults sets whether the standard IPMI interface at 0xca2 and
+any interfaces specified by ACPI are tried.  By default, the driver
+tries it, set this value to zero to turn this off.
 
 When compiled into the kernel, the addresses can be specified on the
 kernel command line as:
 
-  ipmi_kcs=<bmc1>:<irq1>,<bmc2>:<irq2>....,[nodefault]
+  ipmi_si.type=<type1>,<type2>...
+       ipmi_si.ports=<port1>,<port2>... ipmi_si.addrs=<addr1>,<addr2>...
+       ipmi_si.irqs=<irq1>,<irq2>... ipmi_si.trydefaults=[0|1]
 
-The <bmcx> values is either "p<port>" or "m<addr>" for port or memory
-addresses.  So for instance, a KCS interface at port 0xca2 using
-interrupt 9 and a memory interface at address 0xf9827341 with no
-interrupt would be specified "ipmi_kcs=p0xca2:9,m0xf9827341".
-If you specify zero for in irq or don't specify it, the driver will
-run polled unless the software can detect the interrupt to use in the
-ACPI tables.
+It works the same as the module parameters of the same names.
 
-By default, the driver will attempt to detect a KCS device at the
-spec-specified 0xca2 address and any address specified by ACPI.  If
-you want to turn this off, use the "nodefault" option.
+By default, the driver will attempt to detect any device specified by
+ACPI, and if none of those then a KCS device at the spec-specified
+0xca2.  If you want to turn this off, set the "trydefaults" option to
+false.
 
 If you have high-res timers compiled into the kernel, the driver will
 use them to provide much better performance.  Note that if you do not
 have high-res timers enabled in the kernel and you don't have
 interrupts enabled, the driver will run VERY slowly.  Don't blame me,
-the KCS interface sucks.
+these interfaces suck.
+
+
+The SMBus Driver
+----------------
+
+The SMBus driver allows up to 4 SMBus devices to be configured in the
+system.  By default, the driver will register any SMBus interfaces it finds
+in the I2C address range of 0x20 to 0x4f on any adapter.  You can change this
+at module load time (for a module) with:
+
+  modprobe ipmi_smb.o
+	addr=<adapter1>,<i2caddr1>[,<adapter2>,<i2caddr2>[,...]]
+	dbg=<flags1>,<flags2>...
+	[defaultprobe=0] [dbg_probe=1]
+
+The addresses are specified in pairs, the first is the adapter ID and the
+second is the I2C address on that adapter.
+
+The debug flags are bit flags for each BMC found, they are:
+IPMI messages: 1, driver state: 2, timing: 4, I2C probe: 8
+
+Setting smb_defaultprobe to zero disables the default probing of SMBus
+interfaces at address range 0x20 to 0x4f.  This means that only the
+BMCs specified on the smb_addr line will be detected.
+
+Setting smb_dbg_probe to 1 will enable debugging of the probing and
+detection process for BMCs on the SMBusses.
+
+Discovering the IPMI compliant BMC on the SMBus can cause devices
+on the I2C bus to fail. The SMBus driver writes a "Get Device ID" IPMI
+message as a block write to the I2C bus and waits for a response.
+This action can be detrimental to some I2C devices. It is highly recommended
+that the known I2C address be given to the SMBus driver in the smb_addr
+parameter. The default address range will not be used when a smb_addr
+parameter is provided.
+
+When compiled into the kernel, the addresses can be specified on the
+kernel command line as:
+
+  ipmi_smb.addr=<adapter1>,<i2caddr1>[,<adapter2>,<i2caddr2>[,...]]
+	ipmi_smb.dbg=<flags1>,<flags2>...
+	ipmi_smb.defaultprobe=0 ipmi_smb.dbg_probe=1
+
+These are the same options as on the module command line.
+
+Note that you might need some I2C changes if CONFIG_IPMI_PANIC_EVENT
+is enabled along with this, so the I2C driver knows to run to
+completion during sending a panic event.
 
 
 Other Pieces
 ------------
 
 Watchdog
+--------
 
 A watchdog timer is provided that implements the Linux-standard
 watchdog timer interface.  It has three module parameters that can be
 used to control it:
 
-  insmod ipmi_watchdog timeout=<t> pretimeout=<t> action=<action type>
-      preaction=<preaction type> preop=<preop type>
+  modprobe ipmi_watchdog timeout=<t> pretimeout=<t> action=<action type>
+      preaction=<preaction type> preop=<preop type> start_now=x
 
 The timeout is the number of seconds to the action, and the pretimeout
 is the amount of seconds before the reset that the pre-timeout panic will
-occur (if pretimeout is zero, then pretimeout will not be enabled).
+occur (if pretimeout is zero, then pretimeout will not be enabled).  Note
+that the pretimeout is the time before the final timeout.  So if the
+timeout is 50 seconds and the pretimeout is 10 seconds, then the pretimeout
+will occur in 40 seconds (10 seconds before the timeout).
 
 The action may be "reset", "power_cycle", or "power_off", and
 specifies what to do when the timer times out, and defaults to
@@ -344,16 +469,19 @@ When preop is set to "preop_give_data", one byte comes ready to read
 on the device when the pretimeout occurs.  Select and fasync work on
 the device, as well.
 
+If start_now is set to 1, the watchdog timer will start running as
+soon as the driver is loaded.
+
 When compiled into the kernel, the kernel command line is available
 for configuring the watchdog:
 
-  ipmi_wdog=<timeout>[,<pretimeout>[,<option>[,<options>....]]]
+  ipmi_watchdog.timeout=<t> ipmi_watchdog.pretimeout=<t>
+	ipmi_watchdog.action=<action type>
+	ipmi_watchdog.preaction=<preaction type>
+	ipmi_watchdog.preop=<preop type>
+	ipmi_watchdog.start_now=x
 
-The options are the actions and preaction above (if an option
-controlling the same thing is specified twice, the last is taken).  An
-options "start_now" is also there, if included, the watchdog will
-start running immediately when all the drivers are ready, it doesn't
-have to have a user hooked up to start it.
+The options are the same as the module parameter options.
 
 The watchdog will panic and start a 120 second reset timeout if it
 gets a pre-action.  During a panic or a reboot, the watchdog will
diff --git a/drivers/char/ipmi/Kconfig b/drivers/char/ipmi/Kconfig
index 9940b2dccbea..b632538fff36 100644
--- a/drivers/char/ipmi/Kconfig
+++ b/drivers/char/ipmi/Kconfig
@@ -43,11 +43,13 @@ config IPMI_DEVICE_INTERFACE
          This provides an IOCTL interface to the IPMI message handler so
 	 userland processes may use IPMI.  It supports poll() and select().
 
-config IPMI_KCS
-       tristate 'IPMI KCS handler'
+config IPMI_SI
+       tristate 'IPMI System Interface handler'
        depends on IPMI_HANDLER
        help
-         Provides a driver for a KCS-style interface to a BMC.
+         Provides a driver for System Interfaces (KCS, SMIC, BT).
+	 Currently, only KCS and SMIC are supported.  If
+	 you are using IPMI, you should probably say "y" here.
 
 config IPMI_WATCHDOG
        tristate 'IPMI Watchdog Timer'
diff --git a/drivers/char/ipmi/Makefile b/drivers/char/ipmi/Makefile
index 1f55b46a8188..b7d8230721a2 100644
--- a/drivers/char/ipmi/Makefile
+++ b/drivers/char/ipmi/Makefile
@@ -2,12 +2,13 @@
 # Makefile for the ipmi drivers.
 #
 
-ipmi_kcs_drv-objs := ipmi_kcs_sm.o ipmi_kcs_intf.o
+ipmi_si-objs := ipmi_si_intf.o ipmi_kcs_sm.o ipmi_smic_sm.o ipmi_bt_sm.o
 
 obj-$(CONFIG_IPMI_HANDLER) += ipmi_msghandler.o
 obj-$(CONFIG_IPMI_DEVICE_INTERFACE) += ipmi_devintf.o
-obj-$(CONFIG_IPMI_KCS) += ipmi_kcs_drv.o
+obj-$(CONFIG_IPMI_SI) += ipmi_si.o
 obj-$(CONFIG_IPMI_WATCHDOG) += ipmi_watchdog.o
 
-ipmi_kcs_drv.o:	$(ipmi_kcs_drv-objs)
-	$(LD) -r -o $@ $(ipmi_kcs_drv-objs) 
+ipmi_si.o:	$(ipmi_si-objs)
+	$(LD) -r -o $@ $(ipmi_si-objs)
+
diff --git a/drivers/char/ipmi/ipmi_bt_sm.c b/drivers/char/ipmi/ipmi_bt_sm.c
new file mode 100644
index 000000000000..622456a52e5c
--- /dev/null
+++ b/drivers/char/ipmi/ipmi_bt_sm.c
@@ -0,0 +1,513 @@
+/*
+ *  ipmi_bt_sm.c
+ *
+ *  The state machine for an Open IPMI BT sub-driver under ipmi_si.c, part
+ *  of the driver architecture at http://sourceforge.net/project/openipmi
+ *
+ *  Author:	Rocky Craig <first.last@hp.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.  */
+
+#include <linux/kernel.h> /* For printk. */
+#include <linux/string.h>
+#include <linux/ipmi_msgdefs.h>		/* for completion codes */
+#include "ipmi_si_sm.h"
+
+#define IPMI_BT_VERSION "v31"
+
+static int bt_debug = 0x00;	/* Production value 0, see following flags */
+
+#define	BT_DEBUG_ENABLE	1
+#define BT_DEBUG_MSG	2
+#define BT_DEBUG_STATES	4
+
+/* Typical "Get BT Capabilities" values are 2-3 retries, 5-10 seconds,
+   and 64 byte buffers.  However, one HP implementation wants 255 bytes of
+   buffer (with a documented message of 160 bytes) so go for the max.
+   Since the Open IPMI architecture is single-message oriented at this
+   stage, the queue depth of BT is of no concern. */
+
+#define BT_NORMAL_TIMEOUT	2000000	/* 2 seconds in microseconds */
+#define BT_RETRY_LIMIT		2
+#define BT_RESET_DELAY		6000000	/* 6 seconds after warm reset */
+
+enum bt_states {
+	BT_STATE_IDLE,
+	BT_STATE_XACTION_START,
+	BT_STATE_WRITE_BYTES,
+	BT_STATE_WRITE_END,
+	BT_STATE_WRITE_CONSUME,
+	BT_STATE_B2H_WAIT,
+	BT_STATE_READ_END,
+	BT_STATE_RESET1,		/* These must come last */
+	BT_STATE_RESET2,
+	BT_STATE_RESET3,
+	BT_STATE_RESTART,
+	BT_STATE_HOSED
+};
+
+struct si_sm_data {
+	enum bt_states	state;
+	enum bt_states	last_state;	/* assist printing and resets */
+	unsigned char	seq;		/* BT sequence number */
+	struct si_sm_io	*io;
+	unsigned char	write_data[IPMI_MAX_MSG_LENGTH + 2]; /* + len, seq */
+	int		write_count;	/* bytes queued in write_data */
+	unsigned char	read_data[IPMI_MAX_MSG_LENGTH];
+	int		read_count;	/* bytes held in read_data */
+	int		truncated;	/* response bad or too long for caller */
+	long		timeout;	/* microseconds left, see bt_event() */
+	unsigned int	error_retries;	/* end of "common" fields */
+	int		nonzero_status;	/* hung BMCs stay all 0 */
+};
+
+#define BT_CLR_WR_PTR	0x01	/* See IPMI 1.5 table 11.6.4 */
+#define BT_CLR_RD_PTR	0x02
+#define BT_H2B_ATN	0x04
+#define BT_B2H_ATN	0x08
+#define BT_SMS_ATN	0x10
+#define BT_OEM0		0x20
+#define BT_H_BUSY	0x40
+#define BT_B_BUSY	0x80
+
+/* Some bits are toggled on each write: write once to set it, once
+   more to clear it; writing a zero does nothing.  To absolutely
+   clear it, check its state and write if set.  This avoids the "get
+   current then use as mask" scheme to modify one bit.  Note that the
+   variable "bt" is hardcoded into these macros. */
+
+#define BT_STATUS	bt->io->inputb(bt->io, 0)
+#define BT_CONTROL(x)	bt->io->outputb(bt->io, 0, x)
+
+#define BMC2HOST	bt->io->inputb(bt->io, 1)
+#define HOST2BMC(x)	bt->io->outputb(bt->io, 1, x)
+
+#define BT_INTMASK_R	bt->io->inputb(bt->io, 2)
+#define BT_INTMASK_W(x)	bt->io->outputb(bt->io, 2, x)
+
+/* Convenience routines for debugging.  These are not multi-open safe!
+   Note the macros have hardcoded variables in them. */
+
+static char *state2txt(unsigned char state)
+{
+	switch (state) {
+		case BT_STATE_IDLE:		return("IDLE");
+		case BT_STATE_XACTION_START:	return("XACTION");
+		case BT_STATE_WRITE_BYTES:	return("WR_BYTES");
+		case BT_STATE_WRITE_END:	return("WR_END");
+		case BT_STATE_WRITE_CONSUME:	return("WR_CONSUME");
+		case BT_STATE_B2H_WAIT:		return("B2H_WAIT");
+		case BT_STATE_READ_END:		return("RD_END");
+		case BT_STATE_RESET1:		return("RESET1");
+		case BT_STATE_RESET2:		return("RESET2");
+		case BT_STATE_RESET3:		return("RESET3");
+		case BT_STATE_RESTART:		return("RESTART");
+		case BT_STATE_HOSED:		return("HOSED");
+	}
+	return("BAD STATE");
+}
+#define STATE2TXT state2txt(bt->state)
+
+static char *status2txt(unsigned char status, char *buf)
+{
+	strcpy(buf, "[ ");
+	if (status & BT_B_BUSY) strcat(buf, "B_BUSY ");
+	if (status & BT_H_BUSY) strcat(buf, "H_BUSY ");
+	if (status & BT_OEM0) strcat(buf, "OEM0 ");
+	if (status & BT_SMS_ATN) strcat(buf, "SMS ");
+	if (status & BT_B2H_ATN) strcat(buf, "B2H ");
+	if (status & BT_H2B_ATN) strcat(buf, "H2B ");
+	strcat(buf, "]");
+	return buf;
+}
+#define STATUS2TXT(buf) status2txt(status, buf)
+
+/* This will be called from within this module on a hosed condition */
+#define FIRST_SEQ	0
+static unsigned int bt_init_data(struct si_sm_data *bt, struct si_sm_io *io)
+{
+	bt->state = BT_STATE_IDLE;
+	bt->last_state = BT_STATE_IDLE;
+	bt->seq = FIRST_SEQ;
+	bt->io = io;
+	bt->write_count = 0;
+	bt->read_count = 0;
+	bt->error_retries = 0;
+	bt->nonzero_status = 0;
+	bt->truncated = 0;
+	bt->timeout = BT_NORMAL_TIMEOUT;
+	return 3; /* We claim 3 bytes of space; ought to check SPMI table */
+}
+
+static int bt_start_transaction(struct si_sm_data *bt,
+				unsigned char *data,
+				unsigned int size)
+{
+	unsigned int i;
+
+	if ((size < 2) || (size > IPMI_MAX_MSG_LENGTH)) return -1;
+
+	if ((bt->state != BT_STATE_IDLE) && (bt->state != BT_STATE_HOSED))
+		return -2;
+
+	if (bt_debug & BT_DEBUG_MSG) {
+    		printk(KERN_WARNING "+++++++++++++++++++++++++++++++++++++\n");
+		printk(KERN_WARNING "BT: write seq=0x%02X:", bt->seq);
+		for (i = 0; i < size; i ++) printk (" %02x", data[i]);
+		printk("\n");
+	}
+	bt->write_data[0] = size + 1;	/* all data plus seq byte */
+	bt->write_data[1] = *data;	/* NetFn/LUN */
+	bt->write_data[2] = bt->seq;
+	memcpy(bt->write_data + 3, data + 1, size - 1);
+	bt->write_count = size + 2;
+
+	bt->error_retries = 0;
+	bt->nonzero_status = 0;
+	bt->read_count = 0;
+	bt->truncated = 0;
+	bt->state = BT_STATE_XACTION_START;
+	bt->last_state = BT_STATE_IDLE;
+	bt->timeout = BT_NORMAL_TIMEOUT;
+	return 0;
+}
+
+/* After the upper state machine has been told SI_SM_TRANSACTION_COMPLETE
+   it calls this.  Strip out the length and seq bytes. */
+
+static int bt_get_result(struct si_sm_data *bt,
+			   unsigned char *data,
+			   unsigned int length)
+{
+	int i, msg_len;
+
+	msg_len = bt->read_count - 2;		/* account for length & seq */
+	/* Always NetFn, Cmd, cCode */
+	if (msg_len < 3 || msg_len > IPMI_MAX_MSG_LENGTH) {
+		printk(KERN_WARNING "BT results: bad msg_len = %d\n", msg_len);
+		data[0] = bt->write_data[1] | 0x4;	/* Kludge a response */
+		data[1] = bt->write_data[3];
+		data[2] = IPMI_ERR_UNSPECIFIED;
+		msg_len = 3;
+	} else {
+		data[0] = bt->read_data[1];
+		data[1] = bt->read_data[3];
+		if (length < msg_len) bt->truncated = 1;
+		if (bt->truncated) {	/* can be set in read_all_bytes() */
+			data[2] = IPMI_ERR_MSG_TRUNCATED;
+			msg_len = 3;
+		} else memcpy(data + 2, bt->read_data + 4, msg_len - 2);
+
+		if (bt_debug & BT_DEBUG_MSG) {
+			printk (KERN_WARNING "BT: res (raw)");
+			for (i = 0; i < msg_len; i++) printk(" %02x", data[i]);
+			printk ("\n");
+		}
+	}
+	bt->read_count = 0;	/* paranoia */
+	return msg_len;
+}
+
+/* This bit's functionality is optional */
+#define BT_BMC_HWRST	0x80
+
+static void reset_flags(struct si_sm_data *bt)
+{
+	if (BT_STATUS & BT_H_BUSY) BT_CONTROL(BT_H_BUSY);
+	if (BT_STATUS & BT_B_BUSY) BT_CONTROL(BT_B_BUSY);
+	BT_CONTROL(BT_CLR_WR_PTR);
+	BT_CONTROL(BT_SMS_ATN);
+	BT_INTMASK_W(BT_BMC_HWRST);
+#ifdef DEVELOPMENT_ONLY_NOT_FOR_PRODUCTION
+	if (BT_STATUS & BT_B2H_ATN) {
+		int i;
+		BT_CONTROL(BT_H_BUSY);
+		BT_CONTROL(BT_B2H_ATN);
+		BT_CONTROL(BT_CLR_RD_PTR);
+		for (i = 0; i < IPMI_MAX_MSG_LENGTH + 2; i++) BMC2HOST;
+		BT_CONTROL(BT_H_BUSY);
+	}
+#endif
+}
+
+static inline void write_all_bytes(struct si_sm_data *bt)
+{
+	int i;
+
+	if (bt_debug & BT_DEBUG_MSG) {
+    		printk(KERN_WARNING "BT: write %d bytes seq=0x%02X",
+			bt->write_count, bt->seq);
+		for (i = 0; i < bt->write_count; i++)
+			printk (" %02x", bt->write_data[i]);
+		printk ("\n");
+	}
+	for (i = 0; i < bt->write_count; i++) HOST2BMC(bt->write_data[i]);
+}
+
+static inline int read_all_bytes(struct si_sm_data *bt)
+{
+	int i;	/* an 8-bit index could never exceed a read_count of 255 */
+
+	bt->read_data[0] = BMC2HOST;
+	bt->read_count = bt->read_data[0];
+	if (bt_debug & BT_DEBUG_MSG)
+		printk(KERN_WARNING "BT: read %d bytes:", bt->read_count);
+
+	/* minimum: length, NetFn, Seq, Cmd, cCode == 5 total, or 4 more
+	   following the length byte. */
+	if (bt->read_count < 4 || bt->read_count >= IPMI_MAX_MSG_LENGTH) {
+		if (bt_debug & BT_DEBUG_MSG)
+			printk("bad length %d\n", bt->read_count);
+		bt->truncated = 1;
+		return 1;	/* let next XACTION START clean it up */
+	}
+	for (i = 1; i <= bt->read_count; i++) bt->read_data[i] = BMC2HOST;
+	bt->read_count++;	/* account for the length byte */
+
+	if (bt_debug & BT_DEBUG_MSG) {
+		for (i = 0; i < bt->read_count; i++)
+			printk (" %02x", bt->read_data[i]);
+		printk ("\n");
+	}
+	if (bt->seq != bt->write_data[2])	/* idiot check */
+		printk(KERN_WARNING "BT: internal error: sequence mismatch\n");
+
+	/* per the spec, the (NetFn, Seq, Cmd) tuples should match */
+	if ((bt->read_data[3] == bt->write_data[3]) &&		/* Cmd */
+		(bt->read_data[2] == bt->write_data[2]) &&	/* Sequence */
+		((bt->read_data[1] & 0xF8) == (bt->write_data[1] & 0xF8)))
+			return 1;	/* response belongs to our request */
+
+	if (bt_debug & BT_DEBUG_MSG) printk(KERN_WARNING "BT: bad packet: "
+		"want 0x(%02X, %02X, %02X) got (%02X, %02X, %02X)\n",
+		bt->write_data[1], bt->write_data[2], bt->write_data[3],
+		bt->read_data[1],  bt->read_data[2],  bt->read_data[3]);
+	return 0;	/* mismatch: bt_event() stays in B2H_WAIT */
+}
+
+/* Modifies bt->state appropriately, need to get into the bt_event() switch */
+
+static void error_recovery(struct si_sm_data *bt, char *reason)
+{
+	unsigned char status;
+	char buf[40]; /* For getting status */
+
+	bt->timeout = BT_NORMAL_TIMEOUT; /* various places want to retry */
+
+	status = BT_STATUS;
+	printk(KERN_WARNING "BT: %s in %s %s ", reason, STATE2TXT,
+	       STATUS2TXT(buf));
+
+	(bt->error_retries)++;
+	if (bt->error_retries > BT_RETRY_LIMIT) {
+		printk("retry limit (%d) exceeded\n", BT_RETRY_LIMIT);
+		bt->state = BT_STATE_HOSED;
+		if (!bt->nonzero_status)
+			printk(KERN_ERR "IPMI: BT stuck, try power cycle\n");
+		else if (bt->seq == FIRST_SEQ + BT_RETRY_LIMIT) {
+			/* most likely during insmod */
+			printk(KERN_WARNING "IPMI: BT reset (takes 5 secs)\n");
+        		bt->state = BT_STATE_RESET1;
+		}
+	return;
+	}
+
+	/* Sometimes the BMC queues get in an "off-by-one" state...*/
+	if ((bt->state == BT_STATE_B2H_WAIT) && (status & BT_B2H_ATN)) {
+    		printk("retry B2H_WAIT\n");
+		return;
+	}
+
+	printk("restart command\n");
+	bt->state = BT_STATE_RESTART;
+}
+
+/* Check the status and (possibly) advance the BT state machine.  The
+   default return is SI_SM_CALL_WITH_DELAY. */
+
+static enum si_sm_result bt_event(struct si_sm_data *bt, long time)
+{
+	unsigned char status;
+	char buf[40]; /* For getting status */
+	int i;
+
+	status = BT_STATUS;
+	bt->nonzero_status |= status;
+
+	if ((bt_debug & BT_DEBUG_STATES) && (bt->state != bt->last_state))
+		printk(KERN_WARNING "BT: %s %s TO=%ld - %ld \n",
+			STATE2TXT,
+			STATUS2TXT(buf),
+			bt->timeout,
+			time);
+	bt->last_state = bt->state;
+
+	if (bt->state == BT_STATE_HOSED) return SI_SM_HOSED;
+
+	if (bt->state != BT_STATE_IDLE) {	/* do timeout test */
+
+		/* Certain states, on error conditions, can lock up a CPU
+		   because they are effectively in an infinite loop with
+		   CALL_WITHOUT_DELAY (right back here with time == 0).
+		   Prevent infinite lockup by ALWAYS decrementing timeout. */
+
+    	/* FIXME: bt_event is sometimes called with time > BT_NORMAL_TIMEOUT
+              (noticed in ipmi_smic_sm.c January 2004) */
+
+		if ((time <= 0) || (time >= BT_NORMAL_TIMEOUT)) time = 100;
+		bt->timeout -= time;
+		if ((bt->timeout < 0) && (bt->state < BT_STATE_RESET1)) {
+			error_recovery(bt, "timed out");
+			return SI_SM_CALL_WITHOUT_DELAY;
+		}
+	}
+
+	switch (bt->state) {
+
+    	case BT_STATE_IDLE:	/* check for asynchronous messages */
+		if (status & BT_SMS_ATN) {
+			BT_CONTROL(BT_SMS_ATN);	/* clear it */
+			return SI_SM_ATTN;
+		}
+		return SI_SM_IDLE;
+
+	case BT_STATE_XACTION_START:
+		if (status & BT_H_BUSY) {
+			BT_CONTROL(BT_H_BUSY);
+			break;
+		}
+    		if (status & BT_B2H_ATN) break;
+		bt->state = BT_STATE_WRITE_BYTES;
+		return SI_SM_CALL_WITHOUT_DELAY;	/* for logging */
+
+	case BT_STATE_WRITE_BYTES:
+		if (status & (BT_B_BUSY | BT_H2B_ATN)) break;
+		BT_CONTROL(BT_CLR_WR_PTR);
+		write_all_bytes(bt);
+		BT_CONTROL(BT_H2B_ATN);	/* clears too fast to catch? */
+		bt->state = BT_STATE_WRITE_CONSUME;
+		return SI_SM_CALL_WITHOUT_DELAY; /* it MIGHT sail through */
+
+	case BT_STATE_WRITE_CONSUME: /* BMCs usually blow right thru here */
+        	if (status & (BT_H2B_ATN | BT_B_BUSY)) break;
+		bt->state = BT_STATE_B2H_WAIT;
+		/* fall through with status */
+
+	/* Stay in BT_STATE_B2H_WAIT until a packet matches.  However, spinning
+	   hard here, constantly reading status, seems to hold off the
+	   generation of B2H_ATN so ALWAYS return CALL_WITH_DELAY. */
+
+	case BT_STATE_B2H_WAIT:
+    		if (!(status & BT_B2H_ATN)) break;
+
+		/* Assume ordered, uncached writes: no need to wait */
+		if (!(status & BT_H_BUSY)) BT_CONTROL(BT_H_BUSY); /* set */
+		BT_CONTROL(BT_B2H_ATN);		/* clear it, ACK to the BMC */
+		BT_CONTROL(BT_CLR_RD_PTR);	/* reset the queue */
+		i = read_all_bytes(bt);
+		BT_CONTROL(BT_H_BUSY);		/* clear */
+		if (!i) break;			/* Try this state again */
+		bt->state = BT_STATE_READ_END;
+		return SI_SM_CALL_WITHOUT_DELAY;	/* for logging */
+
+    	case BT_STATE_READ_END:
+
+		/* I could wait on BT_H_BUSY to go clear for a truly clean
+		   exit.  However, this is already done in XACTION_START
+		   and the (possible) extra loop/status/possible wait affects
+		   performance.  So, as long as it works, just ignore H_BUSY */
+
+#ifdef MAKE_THIS_TRUE_IF_NECESSARY
+
+		if (status & BT_H_BUSY) break;
+#endif
+		bt->seq++;
+		bt->state = BT_STATE_IDLE;
+		return SI_SM_TRANSACTION_COMPLETE;
+
+	case BT_STATE_RESET1:
+    		reset_flags(bt);
+    		bt->timeout = BT_RESET_DELAY;;
+		bt->state = BT_STATE_RESET2;
+		break;
+
+	case BT_STATE_RESET2:		/* Send a soft reset */
+		BT_CONTROL(BT_CLR_WR_PTR);
+		HOST2BMC(3);		/* number of bytes following */
+		HOST2BMC(0x18);		/* NetFn/LUN == Application, LUN 0 */
+		HOST2BMC(42);		/* Sequence number */
+		HOST2BMC(3);		/* Cmd == Soft reset */
+		BT_CONTROL(BT_H2B_ATN);
+		bt->state = BT_STATE_RESET3;
+		break;
+
+	case BT_STATE_RESET3:
+		if (bt->timeout > 0) return SI_SM_CALL_WITH_DELAY;
+		bt->state = BT_STATE_RESTART;	/* printk in debug modes */
+		break;
+
+	case BT_STATE_RESTART:		/* don't reset retries! */
+		bt->write_data[2] = ++bt->seq;
+		bt->read_count = 0;
+		bt->nonzero_status = 0;
+		bt->timeout = BT_NORMAL_TIMEOUT;
+		bt->state = BT_STATE_XACTION_START;
+		break;
+
+	default:	/* HOSED is supposed to be caught much earlier */
+		error_recovery(bt, "internal logic error");
+		break;
+  	}
+  	return SI_SM_CALL_WITH_DELAY;
+}
+
+static int bt_detect(struct si_sm_data *bt)
+{
+	/* It's impossible for the BT status and interrupt registers to be
+	   all 1's, (assuming a properly functioning, self-initialized BMC)
+	   but that's what you get from reading a bogus address, so we
+	   test that first.  The calling routine uses negative logic. */
+
+	if ((BT_STATUS == 0xFF) && (BT_INTMASK_R == 0xFF)) return 1;
+	reset_flags(bt);
+	return 0;
+}
+
+static void bt_cleanup(struct si_sm_data *bt)
+{
+}
+
+static int bt_size(void)
+{
+	return sizeof(struct si_sm_data);
+}
+
+struct si_sm_handlers bt_smi_handlers =
+{
+	.version           = IPMI_BT_VERSION,
+	.init_data         = bt_init_data,
+	.start_transaction = bt_start_transaction,
+	.get_result        = bt_get_result,
+	.event             = bt_event,
+	.detect            = bt_detect,
+	.cleanup           = bt_cleanup,
+	.size              = bt_size,
+};
diff --git a/drivers/char/ipmi/ipmi_devintf.c b/drivers/char/ipmi/ipmi_devintf.c
index b69ff3d19284..afd1de325f93 100644
--- a/drivers/char/ipmi/ipmi_devintf.c
+++ b/drivers/char/ipmi/ipmi_devintf.c
@@ -33,6 +33,7 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/errno.h>
 #include <asm/system.h>
 #include <linux/sched.h>
@@ -44,6 +45,8 @@
 #include <asm/semaphore.h>
 #include <linux/init.h>
 
+#define IPMI_DEVINTF_VERSION "v31"
+
 struct ipmi_file_private
 {
 	ipmi_user_t          user;
@@ -53,6 +56,8 @@ struct ipmi_file_private
 	struct fasync_struct *fasync_queue;
 	wait_queue_head_t    wait;
 	struct semaphore     recv_sem;
+	int                  default_retries;
+	unsigned int         default_retry_time_ms;
 };
 
 static void file_receive_handler(struct ipmi_recv_msg *msg,
@@ -138,6 +143,10 @@ static int ipmi_open(struct inode *inode, struct file *file)
 	priv->fasync_queue = NULL;
 	sema_init(&(priv->recv_sem), 1);
 
+	/* Use the low-level defaults. */
+	priv->default_retries = -1;
+	priv->default_retry_time_ms = 0;
+
 	return 0;
 }
 
@@ -158,6 +167,63 @@ static int ipmi_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
+/* Copy the address and payload of a user request into kernel memory and
+   pass it to ipmi_request_settime(); returns its result or a -Exxx code. */
+static int handle_send_req(ipmi_user_t     user,
+			   struct ipmi_req *req,
+			   int             retries,
+			   unsigned int    retry_time_ms)
+{
+	int              rv;
+	struct ipmi_addr addr;
+	unsigned char    *msgdata;
+
+	if (req->addr_len > sizeof(struct ipmi_addr))
+		return -EINVAL;
+
+	if (copy_from_user(&addr, req->addr, req->addr_len))
+		return -EFAULT;
+
+	msgdata = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL);
+	if (!msgdata)
+		return -ENOMEM;
+
+	/* From here out we cannot return, we must jump to "out" for
+	   error exits to free msgdata. */
+	rv = ipmi_validate_addr(&addr, req->addr_len);
+	if (rv)
+		goto out;
+
+	if (req->msg.data != NULL) {
+		if (req->msg.data_len > IPMI_MAX_MSG_LENGTH) {
+			rv = -EMSGSIZE;
+			goto out;
+		}
+
+		/* Fill the kmalloc'ed buffer, not the pointer (&msgdata). */
+		if (copy_from_user(msgdata, req->msg.data,
+				   req->msg.data_len)) {
+			rv = -EFAULT;
+			goto out;
+		}
+	} else {
+		req->msg.data_len = 0;
+	}
+	req->msg.data = msgdata;
+
+	rv = ipmi_request_settime(user,
+				  &addr,
+				  req->msgid,
+				  &(req->msg),
+				  NULL,
+				  0,
+				  retries,
+				  retry_time_ms);
+ out:
+	kfree(msgdata);
+	return rv;
+}
+
 static int ipmi_ioctl(struct inode  *inode,
 		      struct file   *file,
 		      unsigned int  cmd,
@@ -170,54 +236,33 @@ static int ipmi_ioctl(struct inode  *inode,
 	{
 	case IPMICTL_SEND_COMMAND:
 	{
-		struct ipmi_req    req;
-		struct ipmi_addr   addr;
-		unsigned char msgdata[IPMI_MAX_MSG_LENGTH];
+		struct ipmi_req req;
 
 		if (copy_from_user(&req, (void *) data, sizeof(req))) {
 			rv = -EFAULT;
 			break;
 		}
 
-		if (req.addr_len > sizeof(struct ipmi_addr))
-		{
-			rv = -EINVAL;
-			break;
-		}
+		rv = handle_send_req(priv->user,
+				     &req,
+				     priv->default_retries,
+				     priv->default_retry_time_ms);
+		break;
+	}
+
+	case IPMICTL_SEND_COMMAND_SETTIME:
+	{
+		struct ipmi_req_settime req;
 
-		if (copy_from_user(&addr, req.addr, req.addr_len)) {
+		if (copy_from_user(&req, (void *) data, sizeof(req))) {
 			rv = -EFAULT;
 			break;
 		}
 
-		rv = ipmi_validate_addr(&addr, req.addr_len);
-		if (rv)
-			break;
-
-		if (req.msg.data != NULL) {
-			if (req.msg.data_len > IPMI_MAX_MSG_LENGTH) {
-				rv = -EMSGSIZE;
-				break;
-			}
-
-			if (copy_from_user(&msgdata,
-					   req.msg.data,
-					   req.msg.data_len))
-			{
-				rv = -EFAULT;
-				break;
-			}
-		} else {
-			req.msg.data_len = 0;
-		}
-
-		req.msg.data = msgdata;
-
-		rv = ipmi_request(priv->user,
-				  &addr,
-				  req.msgid,
-				  &(req.msg),
-				  0);
+		rv = handle_send_req(priv->user,
+				     &req.req,
+				     req.retries,
+				     req.retry_time_ms);
 		break;
 	}
 
@@ -416,7 +461,36 @@ static int ipmi_ioctl(struct inode  *inode,
 		rv = 0;
 		break;
 	}
+	case IPMICTL_SET_TIMING_PARMS_CMD:
+	{
+		struct ipmi_timing_parms parms;
+
+		if (copy_from_user(&parms, (void *) data, sizeof(parms))) {
+			rv = -EFAULT;
+			break;
+		}
+
+		priv->default_retries = parms.retries;
+		priv->default_retry_time_ms = parms.retry_time_ms;
+		rv = 0;
+		break;
+	}
+
+	case IPMICTL_GET_TIMING_PARMS_CMD:
+	{
+		struct ipmi_timing_parms parms;
+
+		parms.retries = priv->default_retries;
+		parms.retry_time_ms = priv->default_retry_time_ms;
 
+		if (copy_to_user((void *) data, &parms, sizeof(parms))) {
+			rv = -EFAULT;
+			break;
+		}
+
+		rv = 0;
+		break;
+	}
 	}
   
 	return rv;
@@ -435,29 +509,30 @@ static struct file_operations ipmi_fops = {
 #define DEVICE_NAME     "ipmidev"
 
 static int ipmi_major = 0;
-MODULE_PARM(ipmi_major, "i");
-
-#define MAX_DEVICES 10
+module_param(ipmi_major, int, 0);
+MODULE_PARM_DESC(ipmi_major, "Sets the major number of the IPMI device.  By"
+		 " default, or if you set it to zero, it will choose the next"
+		 " available device.  Setting it to -1 will disable the"
+		 " interface.  Other values will set the major device number"
+		 " to that value.");
 
 static void ipmi_new_smi(int if_num)
 {
-	if (if_num <= MAX_DEVICES) {
-		devfs_mk_cdev(MKDEV(ipmi_major, if_num),
-				S_IFCHR | S_IRUSR | S_IWUSR,
-				"ipmidev/%d", if_num);
-	}
+	devfs_mk_cdev(MKDEV(ipmi_major, if_num),
+		      S_IFCHR | S_IRUSR | S_IWUSR,
+		      "ipmidev/%d", if_num);
 }
 
 static void ipmi_smi_gone(int if_num)
 {
-	if (if_num <= MAX_DEVICES)
-		devfs_remove("ipmidev/%d", if_num);
+	devfs_remove("ipmidev/%d", if_num);
 }
 
 static struct ipmi_smi_watcher smi_watcher =
 {
-	.new_smi	= ipmi_new_smi,
-	.smi_gone	= ipmi_smi_gone,
+	.owner    = THIS_MODULE,
+	.new_smi  = ipmi_new_smi,
+	.smi_gone = ipmi_smi_gone,
 };
 
 static __init int init_ipmi_devintf(void)
@@ -467,6 +542,9 @@ static __init int init_ipmi_devintf(void)
 	if (ipmi_major < 0)
 		return -EINVAL;
 
+	printk(KERN_INFO "ipmi device interface version "
+	       IPMI_DEVINTF_VERSION "\n");
+
 	rv = register_chrdev(ipmi_major, DEVICE_NAME, &ipmi_fops);
 	if (rv < 0) {
 		printk(KERN_ERR "ipmi: can't get major %d\n", ipmi_major);
@@ -482,13 +560,10 @@ static __init int init_ipmi_devintf(void)
 	rv = ipmi_smi_watcher_register(&smi_watcher);
 	if (rv) {
 		unregister_chrdev(ipmi_major, DEVICE_NAME);
-		printk(KERN_WARNING "ipmi: can't register smi watcher");
+		printk(KERN_WARNING "ipmi: can't register smi watcher\n");
 		return rv;
 	}
 
-	printk(KERN_INFO "ipmi: device interface at char major %d\n",
-	       ipmi_major);
-
 	return 0;
 }
 module_init(init_ipmi_devintf);
@@ -500,21 +575,5 @@ static __exit void cleanup_ipmi(void)
 	unregister_chrdev(ipmi_major, DEVICE_NAME);
 }
 module_exit(cleanup_ipmi);
-#ifndef MODULE
-static __init int ipmi_setup (char *str)
-{
-	int x;
-
-	if (get_option (&str, &x)) {
-		/* ipmi=x sets the major number to x. */
-		ipmi_major = x;
-	} else if (!strcmp(str, "off")) {
-		ipmi_major = -1;
-	}
-
-	return 1;
-}
-#endif
 
-__setup("ipmi=", ipmi_setup);
 MODULE_LICENSE("GPL");
diff --git a/drivers/char/ipmi/ipmi_kcs_intf.c b/drivers/char/ipmi/ipmi_kcs_intf.c
deleted file mode 100644
index f215d7697160..000000000000
--- a/drivers/char/ipmi/ipmi_kcs_intf.c
+++ /dev/null
@@ -1,1305 +0,0 @@
-/*
- * ipmi_kcs_intf.c
- *
- * The interface to the IPMI driver for the KCS.
- *
- * Author: MontaVista Software, Inc.
- *         Corey Minyard <minyard@mvista.com>
- *         source@mvista.com
- *
- * Copyright 2002 MontaVista Software Inc.
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
- *
- *
- *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
- *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/*
- * This file holds the "policy" for the interface to the KCS state
- * machine.  It does the configuration, handles timers and interrupts,
- * and drives the real KCS state machine.
- */
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <asm/system.h>
-#include <linux/sched.h>
-#include <linux/timer.h>
-#include <linux/errno.h>
-#include <linux/spinlock.h>
-#include <linux/slab.h>
-#include <linux/delay.h>
-#include <linux/list.h>
-#include <linux/ioport.h>
-#ifdef CONFIG_HIGH_RES_TIMERS
-#include <linux/hrtime.h>
-#endif
-#include <linux/interrupt.h>
-#include <linux/rcupdate.h>
-#include <linux/ipmi_smi.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include "ipmi_kcs_sm.h"
-#include <linux/init.h>
-
-/* Measure times between events in the driver. */
-#undef DEBUG_TIMING
-
-/* Timing parameters.  Call every 10 ms when not doing anything,
-   otherwise call every KCS_SHORT_TIMEOUT_USEC microseconds. */
-#define KCS_TIMEOUT_TIME_USEC	10000
-#define KCS_USEC_PER_JIFFY	(1000000/HZ)
-#define KCS_TIMEOUT_JIFFIES	(KCS_TIMEOUT_TIME_USEC/KCS_USEC_PER_JIFFY)
-#define KCS_SHORT_TIMEOUT_USEC  250 /* .25ms when the SM request a
-                                       short timeout */
-
-#ifdef CONFIG_IPMI_KCS
-/* This forces a dependency to the config file for this option. */
-#endif
-
-enum kcs_intf_state {
-	KCS_NORMAL,
-	KCS_GETTING_FLAGS,
-	KCS_GETTING_EVENTS,
-	KCS_CLEARING_FLAGS,
-	KCS_CLEARING_FLAGS_THEN_SET_IRQ,
-	KCS_GETTING_MESSAGES,
-	KCS_ENABLE_INTERRUPTS1,
-	KCS_ENABLE_INTERRUPTS2
-	/* FIXME - add watchdog stuff. */
-};
-
-struct kcs_info
-{
-	ipmi_smi_t          intf;
-	struct kcs_data     *kcs_sm;
-	spinlock_t          kcs_lock;
-	spinlock_t          msg_lock;
-	struct list_head    xmit_msgs;
-	struct list_head    hp_xmit_msgs;
-	struct ipmi_smi_msg *curr_msg;
-	enum kcs_intf_state kcs_state;
-
-	/* Flags from the last GET_MSG_FLAGS command, used when an ATTN
-	   is set to hold the flags until we are done handling everything
-	   from the flags. */
-#define RECEIVE_MSG_AVAIL	0x01
-#define EVENT_MSG_BUFFER_FULL	0x02
-#define WDT_PRE_TIMEOUT_INT	0x08
-	unsigned char       msg_flags;
-
-	/* If set to true, this will request events the next time the
-	   state machine is idle. */
-	atomic_t            req_events;
-
-	/* If true, run the state machine to completion on every send
-	   call.  Generally used after a panic to make sure stuff goes
-	   out. */
-	int                 run_to_completion;
-
-	/* The I/O port of a KCS interface. */
-	int                 port;
-
-	/* zero if no irq; */
-	int                 irq;
-
-	/* The physical and remapped memory addresses of a KCS interface. */
-	unsigned long	    physaddr;
-	unsigned char	    *addr;
-
-	/* The timer for this kcs. */
-	struct timer_list   kcs_timer;
-
-	/* The time (in jiffies) the last timeout occurred at. */
-	unsigned long       last_timeout_jiffies;
-
-	/* Used to gracefully stop the timer without race conditions. */
-	volatile int        stop_operation;
-	volatile int        timer_stopped;
-
-	/* The driver will disable interrupts when it gets into a
-	   situation where it cannot handle messages due to lack of
-	   memory.  Once that situation clears up, it will re-enable
-	   interrupts. */
-	int                 interrupt_disabled;
-};
-
-static void kcs_restart_short_timer(struct kcs_info *kcs_info);
-
-static void deliver_recv_msg(struct kcs_info *kcs_info, struct ipmi_smi_msg *msg)
-{
-	/* Deliver the message to the upper layer with the lock
-           released. */
-	spin_unlock(&(kcs_info->kcs_lock));
-	ipmi_smi_msg_received(kcs_info->intf, msg);
-	spin_lock(&(kcs_info->kcs_lock));
-}
-
-static void return_hosed_msg(struct kcs_info *kcs_info)
-{
-	struct ipmi_smi_msg *msg = kcs_info->curr_msg;
-
-	/* Make it a reponse */
-	msg->rsp[0] = msg->data[0] | 4;
-	msg->rsp[1] = msg->data[1];
-	msg->rsp[2] = 0xFF; /* Unknown error. */
-	msg->rsp_size = 3;
-			
-	kcs_info->curr_msg = NULL;
-	deliver_recv_msg(kcs_info, msg);
-}
-
-static enum kcs_result start_next_msg(struct kcs_info *kcs_info)
-{
-	int              rv;
-	struct list_head *entry = NULL;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-#endif
-
-	/* No need to save flags, we aleady have interrupts off and we
-	   already hold the KCS lock. */
-	spin_lock(&(kcs_info->msg_lock));
-	
-	/* Pick the high priority queue first. */
-	if (! list_empty(&(kcs_info->hp_xmit_msgs))) {
-		entry = kcs_info->hp_xmit_msgs.next;
-	} else if (! list_empty(&(kcs_info->xmit_msgs))) {
-		entry = kcs_info->xmit_msgs.next;
-	}
-
-	if (!entry) {
-		kcs_info->curr_msg = NULL;
-		rv = KCS_SM_IDLE;
-	} else {
-		int err;
-
-		list_del(entry);
-		kcs_info->curr_msg = list_entry(entry,
-						struct ipmi_smi_msg,
-						link);
-#ifdef DEBUG_TIMING
-		do_gettimeofday(&t);
-		printk("**Start2: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-		err = start_kcs_transaction(kcs_info->kcs_sm,
-					   kcs_info->curr_msg->data,
-					   kcs_info->curr_msg->data_size);
-		if (err) {
-			return_hosed_msg(kcs_info);
-		}
-
-		rv = KCS_CALL_WITHOUT_DELAY;
-	}
-	spin_unlock(&(kcs_info->msg_lock));
-
-	return rv;
-}
-
-static void start_enable_irq(struct kcs_info *kcs_info)
-{
-	unsigned char msg[2];
-
-	/* If we are enabling interrupts, we have to tell the
-	   BMC to use them. */
-	msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-	msg[1] = IPMI_GET_BMC_GLOBAL_ENABLES_CMD;
-
-	start_kcs_transaction(kcs_info->kcs_sm, msg, 2);
-	kcs_info->kcs_state = KCS_ENABLE_INTERRUPTS1;
-}
-
-static void start_clear_flags(struct kcs_info *kcs_info)
-{
-	unsigned char msg[3];
-
-	/* Make sure the watchdog pre-timeout flag is not set at startup. */
-	msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-	msg[1] = IPMI_CLEAR_MSG_FLAGS_CMD;
-	msg[2] = WDT_PRE_TIMEOUT_INT;
-
-	start_kcs_transaction(kcs_info->kcs_sm, msg, 3);
-	kcs_info->kcs_state = KCS_CLEARING_FLAGS;
-}
-
-/* When we have a situtaion where we run out of memory and cannot
-   allocate messages, we just leave them in the BMC and run the system
-   polled until we can allocate some memory.  Once we have some
-   memory, we will re-enable the interrupt. */
-static inline void disable_kcs_irq(struct kcs_info *kcs_info)
-{
-	if ((kcs_info->irq) && (!kcs_info->interrupt_disabled)) {
-		disable_irq_nosync(kcs_info->irq);
-		kcs_info->interrupt_disabled = 1;
-	}
-}
-
-static inline void enable_kcs_irq(struct kcs_info *kcs_info)
-{
-	if ((kcs_info->irq) && (kcs_info->interrupt_disabled)) {
-		enable_irq(kcs_info->irq);
-		kcs_info->interrupt_disabled = 0;
-	}
-}
-
-static void handle_flags(struct kcs_info *kcs_info)
-{
-	if (kcs_info->msg_flags & WDT_PRE_TIMEOUT_INT) {
-		/* Watchdog pre-timeout */
-		start_clear_flags(kcs_info);
-		kcs_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT;
-		spin_unlock(&(kcs_info->kcs_lock));
-		ipmi_smi_watchdog_pretimeout(kcs_info->intf);
-		spin_lock(&(kcs_info->kcs_lock));
-	} else if (kcs_info->msg_flags & RECEIVE_MSG_AVAIL) {
-		/* Messages available. */
-		kcs_info->curr_msg = ipmi_alloc_smi_msg();
-		if (!kcs_info->curr_msg) {
-			disable_kcs_irq(kcs_info);
-			kcs_info->kcs_state = KCS_NORMAL;
-			return;
-		}
-		enable_kcs_irq(kcs_info);
-
-		kcs_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
-		kcs_info->curr_msg->data[1] = IPMI_GET_MSG_CMD;
-		kcs_info->curr_msg->data_size = 2;
-
-		start_kcs_transaction(kcs_info->kcs_sm,
-				      kcs_info->curr_msg->data,
-				      kcs_info->curr_msg->data_size);
-		kcs_info->kcs_state = KCS_GETTING_MESSAGES;
-	} else if (kcs_info->msg_flags & EVENT_MSG_BUFFER_FULL) {
-		/* Events available. */
-		kcs_info->curr_msg = ipmi_alloc_smi_msg();
-		if (!kcs_info->curr_msg) {
-			disable_kcs_irq(kcs_info);
-			kcs_info->kcs_state = KCS_NORMAL;
-			return;
-		}
-		enable_kcs_irq(kcs_info);
-
-		kcs_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
-		kcs_info->curr_msg->data[1] = IPMI_READ_EVENT_MSG_BUFFER_CMD;
-		kcs_info->curr_msg->data_size = 2;
-
-		start_kcs_transaction(kcs_info->kcs_sm,
-				      kcs_info->curr_msg->data,
-				      kcs_info->curr_msg->data_size);
-		kcs_info->kcs_state = KCS_GETTING_EVENTS;
-	} else {
-		kcs_info->kcs_state = KCS_NORMAL;
-	}
-}
-
-static void handle_transaction_done(struct kcs_info *kcs_info)
-{
-	struct ipmi_smi_msg *msg;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-
-	do_gettimeofday(&t);
-	printk("**Done: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-	switch (kcs_info->kcs_state) {
-	case KCS_NORMAL:
-		if (!kcs_info->curr_msg)
-			break;
-			
-		kcs_info->curr_msg->rsp_size
-			= kcs_get_result(kcs_info->kcs_sm,
-					 kcs_info->curr_msg->rsp,
-					 IPMI_MAX_MSG_LENGTH);
-		
-		/* Do this here becase deliver_recv_msg() releases the
-		   lock, and a new message can be put in during the
-		   time the lock is released. */
-		msg = kcs_info->curr_msg;
-		kcs_info->curr_msg = NULL;
-		deliver_recv_msg(kcs_info, msg);
-		break;
-		
-	case KCS_GETTING_FLAGS:
-	{
-		unsigned char msg[4];
-		unsigned int  len;
-
-		/* We got the flags from the KCS, now handle them. */
-		len = kcs_get_result(kcs_info->kcs_sm, msg, 4);
-		if (msg[2] != 0) {
-			/* Error fetching flags, just give up for
-			   now. */
-			kcs_info->kcs_state = KCS_NORMAL;
-		} else if (len < 3) {
-			/* Hmm, no flags.  That's technically illegal, but
-			   don't use uninitialized data. */
-			kcs_info->kcs_state = KCS_NORMAL;
-		} else {
-			kcs_info->msg_flags = msg[3];
-			handle_flags(kcs_info);
-		}
-		break;
-	}
-
-	case KCS_CLEARING_FLAGS:
-	case KCS_CLEARING_FLAGS_THEN_SET_IRQ:
-	{
-		unsigned char msg[3];
-
-		/* We cleared the flags. */
-		kcs_get_result(kcs_info->kcs_sm, msg, 3);
-		if (msg[2] != 0) {
-			/* Error clearing flags */
-			printk(KERN_WARNING
-			       "ipmi_kcs: Error clearing flags: %2.2x\n",
-			       msg[2]);
-		}
-		if (kcs_info->kcs_state == KCS_CLEARING_FLAGS_THEN_SET_IRQ)
-			start_enable_irq(kcs_info);
-		else
-			kcs_info->kcs_state = KCS_NORMAL;
-		break;
-	}
-
-	case KCS_GETTING_EVENTS:
-	{
-		kcs_info->curr_msg->rsp_size
-			= kcs_get_result(kcs_info->kcs_sm,
-					 kcs_info->curr_msg->rsp,
-					 IPMI_MAX_MSG_LENGTH);
-
-		/* Do this here becase deliver_recv_msg() releases the
-		   lock, and a new message can be put in during the
-		   time the lock is released. */
-		msg = kcs_info->curr_msg;
-		kcs_info->curr_msg = NULL;
-		if (msg->rsp[2] != 0) {
-			/* Error getting event, probably done. */
-			msg->done(msg);
-
-			/* Take off the event flag. */
-			kcs_info->msg_flags &= ~EVENT_MSG_BUFFER_FULL;
-		} else {
-			deliver_recv_msg(kcs_info, msg);
-		}
-		handle_flags(kcs_info);
-		break;
-	}
-
-	case KCS_GETTING_MESSAGES:
-	{
-		kcs_info->curr_msg->rsp_size
-			= kcs_get_result(kcs_info->kcs_sm,
-					 kcs_info->curr_msg->rsp,
-					 IPMI_MAX_MSG_LENGTH);
-
-		/* Do this here becase deliver_recv_msg() releases the
-		   lock, and a new message can be put in during the
-		   time the lock is released. */
-		msg = kcs_info->curr_msg;
-		kcs_info->curr_msg = NULL;
-		if (msg->rsp[2] != 0) {
-			/* Error getting event, probably done. */
-			msg->done(msg);
-
-			/* Take off the msg flag. */
-			kcs_info->msg_flags &= ~RECEIVE_MSG_AVAIL;
-		} else {
-			deliver_recv_msg(kcs_info, msg);
-		}
-		handle_flags(kcs_info);
-		break;
-	}
-
-	case KCS_ENABLE_INTERRUPTS1:
-	{
-		unsigned char msg[4];
-
-		/* We got the flags from the KCS, now handle them. */
-		kcs_get_result(kcs_info->kcs_sm, msg, 4);
-		if (msg[2] != 0) {
-			printk(KERN_WARNING
-			       "ipmi_kcs: Could not enable interrupts"
-			       ", failed get, using polled mode.\n");
-			kcs_info->kcs_state = KCS_NORMAL;
-		} else {
-			msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-			msg[1] = IPMI_SET_BMC_GLOBAL_ENABLES_CMD;
-			msg[2] = msg[3] | 1; /* enable msg queue int */
-			start_kcs_transaction(kcs_info->kcs_sm, msg,3);
-			kcs_info->kcs_state = KCS_ENABLE_INTERRUPTS2;
-		}
-		break;
-	}
-
-	case KCS_ENABLE_INTERRUPTS2:
-	{
-		unsigned char msg[4];
-
-		/* We got the flags from the KCS, now handle them. */
-		kcs_get_result(kcs_info->kcs_sm, msg, 4);
-		if (msg[2] != 0) {
-			printk(KERN_WARNING
-			       "ipmi_kcs: Could not enable interrupts"
-			       ", failed set, using polled mode.\n");
-		}
-		kcs_info->kcs_state = KCS_NORMAL;
-		break;
-	}
-	}
-}
-
-/* Called on timeouts and events.  Timeouts should pass the elapsed
-   time, interrupts should pass in zero. */
-static enum kcs_result kcs_event_handler(struct kcs_info *kcs_info, int time)
-{
-	enum kcs_result kcs_result;
-
- restart:
-	/* There used to be a loop here that waited a little while
-	   (around 25us) before giving up.  That turned out to be
-	   pointless, the minimum delays I was seeing were in the 300us
-	   range, which is far too long to wait in an interrupt.  So
-	   we just run until the state machine tells us something
-	   happened or it needs a delay. */
-	kcs_result = kcs_event(kcs_info->kcs_sm, time);
-	time = 0;
-	while (kcs_result == KCS_CALL_WITHOUT_DELAY)
-	{
-		kcs_result = kcs_event(kcs_info->kcs_sm, 0);
-	}
-
-	if (kcs_result == KCS_TRANSACTION_COMPLETE)
-	{
-		handle_transaction_done(kcs_info);
-		kcs_result = kcs_event(kcs_info->kcs_sm, 0);
-	}
-	else if (kcs_result == KCS_SM_HOSED)
-	{
-		if (kcs_info->curr_msg != NULL) {
-			/* If we were handling a user message, format
-                           a response to send to the upper layer to
-                           tell it about the error. */
-			return_hosed_msg(kcs_info);
-		}
-		kcs_result = kcs_event(kcs_info->kcs_sm, 0);
-		kcs_info->kcs_state = KCS_NORMAL;
-	}
-
-	/* We prefer handling attn over new messages. */
-	if (kcs_result == KCS_ATTN)
-	{
-		unsigned char msg[2];
-
-		/* Got a attn, send down a get message flags to see
-                   what's causing it.  It would be better to handle
-                   this in the upper layer, but due to the way
-                   interrupts work with the KCS, that's not really
-                   possible. */
-		msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-		msg[1] = IPMI_GET_MSG_FLAGS_CMD;
-
-		start_kcs_transaction(kcs_info->kcs_sm, msg, 2);
-		kcs_info->kcs_state = KCS_GETTING_FLAGS;
-		goto restart;
-	}
-
-	/* If we are currently idle, try to start the next message. */
-	if (kcs_result == KCS_SM_IDLE) {
-		kcs_result = start_next_msg(kcs_info);
-		if (kcs_result != KCS_SM_IDLE)
-			goto restart;
-        }
-
-	if ((kcs_result == KCS_SM_IDLE)
-	    && (atomic_read(&kcs_info->req_events)))
-	{
-		/* We are idle and the upper layer requested that I fetch
-		   events, so do so. */
-		unsigned char msg[2];
-
-		atomic_set(&kcs_info->req_events, 0);
-		msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
-		msg[1] = IPMI_GET_MSG_FLAGS_CMD;
-
-		start_kcs_transaction(kcs_info->kcs_sm, msg, 2);
-		kcs_info->kcs_state = KCS_GETTING_FLAGS;
-		goto restart;
-	}
-
-	return kcs_result;
-}
-
-static void sender(void                *send_info,
-		   struct ipmi_smi_msg *msg,
-		   int                 priority)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) send_info;
-	enum kcs_result result;
-	unsigned long   flags;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-#endif
-
-	spin_lock_irqsave(&(kcs_info->msg_lock), flags);
-#ifdef DEBUG_TIMING
-	do_gettimeofday(&t);
-	printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-
-	if (kcs_info->run_to_completion) {
-		/* If we are running to completion, then throw it in
-		   the list and run transactions until everything is
-		   clear.  Priority doesn't matter here. */
-		list_add_tail(&(msg->link), &(kcs_info->xmit_msgs));
-
-		/* We have to release the msg lock and claim the kcs
-		   lock in this case, because of race conditions. */
-		spin_unlock_irqrestore(&(kcs_info->msg_lock), flags);
-
-		spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-		result = kcs_event_handler(kcs_info, 0);
-		while (result != KCS_SM_IDLE) {
-			udelay(KCS_SHORT_TIMEOUT_USEC);
-			result = kcs_event_handler(kcs_info,
-						   KCS_SHORT_TIMEOUT_USEC);
-		}
-		spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-		return;
-	} else {
-		if (priority > 0) {
-			list_add_tail(&(msg->link), &(kcs_info->hp_xmit_msgs));
-		} else {
-			list_add_tail(&(msg->link), &(kcs_info->xmit_msgs));
-		}
-	}
-	spin_unlock_irqrestore(&(kcs_info->msg_lock), flags);
-
-	spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-	if ((kcs_info->kcs_state == KCS_NORMAL)
-	    && (kcs_info->curr_msg == NULL))
-	{
-		start_next_msg(kcs_info);
-		kcs_restart_short_timer(kcs_info);
-	}
-	spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-}
-
-static void set_run_to_completion(void *send_info, int i_run_to_completion)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) send_info;
-	enum kcs_result result;
-	unsigned long   flags;
-
-	spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-
-	kcs_info->run_to_completion = i_run_to_completion;
-	if (i_run_to_completion) {
-		result = kcs_event_handler(kcs_info, 0);
-		while (result != KCS_SM_IDLE) {
-			udelay(KCS_SHORT_TIMEOUT_USEC);
-			result = kcs_event_handler(kcs_info,
-						   KCS_SHORT_TIMEOUT_USEC);
-		}
-	}
-
-	spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-}
-
-static void request_events(void *send_info)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) send_info;
-
-	atomic_set(&kcs_info->req_events, 1);
-}
-
-static int initialized = 0;
-
-/* Must be called with interrupts off and with the kcs_lock held. */
-static void kcs_restart_short_timer(struct kcs_info *kcs_info)
-{
-	if (del_timer(&(kcs_info->kcs_timer))) {
-#ifdef CONFIG_HIGH_RES_TIMERS
-		unsigned long jiffies_now;
-
-		/* If we don't delete the timer, then it will go off
-		   immediately, anyway.  So we only process if we
-		   actually delete the timer. */
-
-		/* We already have irqsave on, so no need for it
-                   here. */
-		read_lock(&xtime_lock);
-		jiffies_now = jiffies;
-		kcs_info->kcs_timer.expires = jiffies_now;
-
-		kcs_info->kcs_timer.sub_expires
-			= quick_update_jiffies_sub(jiffies_now);
-		read_unlock(&xtime_lock);
-
-		kcs_info->kcs_timer.sub_expires
-			+= usec_to_arch_cycles(KCS_SHORT_TIMEOUT_USEC);
-		while (kcs_info->kcs_timer.sub_expires >= cycles_per_jiffies) {
-			kcs_info->kcs_timer.expires++;
-			kcs_info->kcs_timer.sub_expires -= cycles_per_jiffies;
-		}
-#else
-		kcs_info->kcs_timer.expires = jiffies + 1;
-#endif
-		add_timer(&(kcs_info->kcs_timer));
-	}
-}
-
-static void kcs_timeout(unsigned long data)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) data;
-	enum kcs_result kcs_result;
-	unsigned long   flags;
-	unsigned long   jiffies_now;
-	unsigned long   time_diff;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-#endif
-
-	if (kcs_info->stop_operation) {
-		kcs_info->timer_stopped = 1;
-		return;
-	}
-
-	spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-#ifdef DEBUG_TIMING
-	do_gettimeofday(&t);
-	printk("**Timer: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-	jiffies_now = jiffies;
-
-	time_diff = ((jiffies_now - kcs_info->last_timeout_jiffies)
-		     * KCS_USEC_PER_JIFFY);
-	kcs_result = kcs_event_handler(kcs_info, time_diff);
-
-	kcs_info->last_timeout_jiffies = jiffies_now;
-
-	if ((kcs_info->irq) && (! kcs_info->interrupt_disabled)) {
-		/* Running with interrupts, only do long timeouts. */
-		kcs_info->kcs_timer.expires = jiffies + KCS_TIMEOUT_JIFFIES;
-		goto do_add_timer;
-	}
-
-	/* If the state machine asks for a short delay, then shorten
-           the timer timeout. */
-#ifdef CONFIG_HIGH_RES_TIMERS
-	if (kcs_result == KCS_CALL_WITH_DELAY) {
-		kcs_info->kcs_timer.sub_expires
-			+= usec_to_arch_cycles(KCS_SHORT_TIMEOUT_USEC);
-		while (kcs_info->kcs_timer.sub_expires >= cycles_per_jiffies) {
-			kcs_info->kcs_timer.expires++;
-			kcs_info->kcs_timer.sub_expires -= cycles_per_jiffies;
-		}
-	} else {
-		kcs_info->kcs_timer.expires = jiffies + KCS_TIMEOUT_JIFFIES;
-		kcs_info->kcs_timer.sub_expires = 0;
-	}
-#else
-	/* If requested, take the shortest delay possible */
-	if (kcs_result == KCS_CALL_WITH_DELAY) {
-		kcs_info->kcs_timer.expires = jiffies + 1;
-	} else {
-		kcs_info->kcs_timer.expires = jiffies + KCS_TIMEOUT_JIFFIES;
-	}
-#endif
-
- do_add_timer:
-	add_timer(&(kcs_info->kcs_timer));
-	spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-}
-
-static irqreturn_t kcs_irq_handler(int irq, void *data, struct pt_regs *regs)
-{
-	struct kcs_info *kcs_info = (struct kcs_info *) data;
-	unsigned long   flags;
-#ifdef DEBUG_TIMING
-	struct timeval t;
-#endif
-
-	spin_lock_irqsave(&(kcs_info->kcs_lock), flags);
-	if (kcs_info->stop_operation)
-		goto out;
-
-#ifdef DEBUG_TIMING
-	do_gettimeofday(&t);
-	printk("**Interrupt: %d.%9.9d\n", t.tv_sec, t.tv_usec);
-#endif
-	kcs_event_handler(kcs_info, 0);
- out:
-	spin_unlock_irqrestore(&(kcs_info->kcs_lock), flags);
-	return IRQ_HANDLED;
-}
-
-static struct ipmi_smi_handlers handlers =
-{
-	.owner			= THIS_MODULE,
-	.sender			= sender,
-	.request_events		= request_events,
-	.set_run_to_completion	= set_run_to_completion,
-};
-
-static unsigned char ipmi_kcs_dev_rev;
-static unsigned char ipmi_kcs_fw_rev_major;
-static unsigned char ipmi_kcs_fw_rev_minor;
-static unsigned char ipmi_version_major;
-static unsigned char ipmi_version_minor;
-
-extern int kcs_dbg;
-static int ipmi_kcs_detect_hardware(unsigned int port,
-				    unsigned char *addr,
-				    struct kcs_data *data)
-{
-	unsigned char   msg[2];
-	unsigned char   resp[IPMI_MAX_MSG_LENGTH];
-	unsigned long   resp_len;
-	enum kcs_result kcs_result;
-
-	/* It's impossible for the KCS status register to be all 1's,
-	   (assuming a properly functioning, self-initialized BMC)
-	   but that's what you get from reading a bogus address, so we
-	   test that first. */
-
-	if (port) {
-		if (inb(port+1) == 0xff) return -ENODEV; 
-	} else { 
-		if (readb(addr+1) == 0xff) return -ENODEV; 
-	}
-
-	/* Do a Get Device ID command, since it comes back with some
-	   useful info. */
-	msg[0] = IPMI_NETFN_APP_REQUEST << 2;
-	msg[1] = IPMI_GET_DEVICE_ID_CMD;
-	start_kcs_transaction(data, msg, 2);
-	
-	kcs_result = kcs_event(data, 0);
-	for (;;)
-	{
-		if (kcs_result == KCS_CALL_WITH_DELAY) {
-			udelay(100);
-			kcs_result = kcs_event(data, 100);
-		}
-		else if (kcs_result == KCS_CALL_WITHOUT_DELAY)
-		{
-			kcs_result = kcs_event(data, 0);
-		}
-		else
-			break;
-	}
-	if (kcs_result == KCS_SM_HOSED) {
-		/* We couldn't get the state machine to run, so whatever's at
-		   the port is probably not an IPMI KCS interface. */
-		return -ENODEV;
-	}
-	/* Otherwise, we got some data. */
-	resp_len = kcs_get_result(data, resp, IPMI_MAX_MSG_LENGTH);
-	if (resp_len < 6)
-		/* That's odd, it should be longer. */
-		return -EINVAL;
-	
-	if ((resp[1] != IPMI_GET_DEVICE_ID_CMD) || (resp[2] != 0))
-		/* That's odd, it shouldn't be able to fail. */
-		return -EINVAL;
-	
-	ipmi_kcs_dev_rev = resp[4] & 0xf;
-	ipmi_kcs_fw_rev_major = resp[5] & 0x7f;
-	ipmi_kcs_fw_rev_minor = resp[6];
-	ipmi_version_major = resp[7] & 0xf;
-	ipmi_version_minor = resp[7] >> 4;
-
-	return 0;
-}
-
-/* There can be 4 IO ports passed in (with or without IRQs), 4 addresses,
-   a default IO port, and 1 ACPI/SPMI address.  That sets KCS_MAX_DRIVERS */
-
-#define KCS_MAX_PARMS 4
-#define KCS_MAX_DRIVERS ((KCS_MAX_PARMS * 2) + 2)
-static struct kcs_info *kcs_infos[KCS_MAX_DRIVERS] =
-{ NULL, NULL, NULL, NULL };
-
-#define DEVICE_NAME "ipmi_kcs"
-
-#define DEFAULT_IO_PORT 0xca2
-
-static int kcs_trydefaults = 1;
-static unsigned long kcs_addrs[KCS_MAX_PARMS] = { 0, 0, 0, 0 };
-static int kcs_ports[KCS_MAX_PARMS] = { 0, 0, 0, 0 };
-static int kcs_irqs[KCS_MAX_PARMS] = { 0, 0, 0, 0 };
-
-MODULE_PARM(kcs_trydefaults, "i");
-MODULE_PARM(kcs_addrs, "1-4l");
-MODULE_PARM(kcs_irqs, "1-4i");
-MODULE_PARM(kcs_ports, "1-4i");
-
-/* Returns 0 if initialized, or negative on an error. */
-static int init_one_kcs(int kcs_port, 
-			int irq, 
-			unsigned long kcs_physaddr,
-			struct kcs_info **kcs)
-{
-	int		rv;
-	struct kcs_info *new_kcs;
-
-	/* Did anything get passed in at all?  Both == zero disables the
-	   driver. */
-
-	if (!(kcs_port || kcs_physaddr)) 
-		return -ENODEV;
-	
-	/* Only initialize a port OR a physical address on this call.
-	   Also, IRQs can go with either ports or addresses. */
-
-	if (kcs_port && kcs_physaddr)
-		return -EINVAL;
-
-	new_kcs = kmalloc(sizeof(*new_kcs), GFP_KERNEL);
-	if (!new_kcs) {
-		printk(KERN_ERR "ipmi_kcs: out of memory\n");
-		return -ENOMEM;
-	}
-
-	/* So we know not to free it unless we have allocated one. */
-	new_kcs->kcs_sm = NULL;
-
-	new_kcs->addr = NULL;
-	new_kcs->physaddr = kcs_physaddr;
-	new_kcs->port = kcs_port;
-
-	if (kcs_port) {
-		if (request_region(kcs_port, 2, DEVICE_NAME) == NULL) {
-			kfree(new_kcs);
-			printk(KERN_ERR 
-			       "ipmi_kcs: can't reserve port @ 0x%4.4x\n",
-		       	       kcs_port);
-			return -EIO;
-		}
-	} else {
-		if (request_mem_region(kcs_physaddr, 2, DEVICE_NAME) == NULL) {
-			kfree(new_kcs);
-			printk(KERN_ERR 
-			       "ipmi_kcs: can't reserve memory @ 0x%lx\n",
-		       	       kcs_physaddr);
-			return -EIO;
-		}
-		if ((new_kcs->addr = ioremap(kcs_physaddr, 2)) == NULL) {
-			kfree(new_kcs);
-			printk(KERN_ERR 
-			       "ipmi_kcs: can't remap memory at 0x%lx\n",
-		       	       kcs_physaddr);
-			return -EIO;
-		}
-	}
-
-	new_kcs->kcs_sm = kmalloc(kcs_size(), GFP_KERNEL);
-	if (!new_kcs->kcs_sm) {
-		printk(KERN_ERR "ipmi_kcs: out of memory\n");
-		rv = -ENOMEM;
-		goto out_err;
-	}
-	init_kcs_data(new_kcs->kcs_sm, kcs_port, new_kcs->addr);
-	spin_lock_init(&(new_kcs->kcs_lock));
-	spin_lock_init(&(new_kcs->msg_lock));
-
-	rv = ipmi_kcs_detect_hardware(kcs_port, new_kcs->addr, new_kcs->kcs_sm);
-	if (rv) {
-		if (kcs_port) 
-			printk(KERN_ERR 
-			       "ipmi_kcs: No KCS @ port 0x%4.4x\n", 
-			       kcs_port);
-		else
-			printk(KERN_ERR 
-			       "ipmi_kcs: No KCS @ addr 0x%lx\n", 
-			       kcs_physaddr);
-		goto out_err;
-	}
-
-	if (irq != 0) {
-		rv = request_irq(irq,
-				 kcs_irq_handler,
-				 SA_INTERRUPT,
-				 DEVICE_NAME,
-				 new_kcs);
-		if (rv) {
-			printk(KERN_WARNING
-			       "ipmi_kcs: %s unable to claim interrupt %d,"
-			       " running polled\n",
-			       DEVICE_NAME, irq);
-			irq = 0;
-		}
-	}
-	new_kcs->irq = irq;
-
-	INIT_LIST_HEAD(&(new_kcs->xmit_msgs));
-	INIT_LIST_HEAD(&(new_kcs->hp_xmit_msgs));
-	new_kcs->curr_msg = NULL;
-	atomic_set(&new_kcs->req_events, 0);
-	new_kcs->run_to_completion = 0;
-
-	start_clear_flags(new_kcs);
-
-	if (irq) {
-		new_kcs->kcs_state = KCS_CLEARING_FLAGS_THEN_SET_IRQ;
-
-		printk(KERN_INFO 
-		       "ipmi_kcs: Acquiring BMC @ port=0x%x irq=%d\n",
-		       kcs_port, irq);
-
-	} else {
-		if (kcs_port)
-			printk(KERN_INFO 
-			       "ipmi_kcs: Acquiring BMC @ port=0x%x\n",
-		       	       kcs_port);
-		else
-			printk(KERN_INFO 
-			       "ipmi_kcs: Acquiring BMC @ addr=0x%lx\n",
-		       	       kcs_physaddr);
-	}
-
-	rv = ipmi_register_smi(&handlers,
-			       new_kcs,
-			       ipmi_version_major,
-			       ipmi_version_minor,
-			       &(new_kcs->intf));
-	if (rv) {
-		free_irq(irq, new_kcs);
-		printk(KERN_ERR 
-		       "ipmi_kcs: Unable to register device: error %d\n",
-		       rv);
-		goto out_err;
-	}
-
-	new_kcs->interrupt_disabled = 0;
-	new_kcs->timer_stopped = 0;
-	new_kcs->stop_operation = 0;
-
-	init_timer(&(new_kcs->kcs_timer));
-	new_kcs->kcs_timer.data = (long) new_kcs;
-	new_kcs->kcs_timer.function = kcs_timeout;
-	new_kcs->last_timeout_jiffies = jiffies;
-	new_kcs->kcs_timer.expires = jiffies + KCS_TIMEOUT_JIFFIES;
-	add_timer(&(new_kcs->kcs_timer));
-
-	*kcs = new_kcs;
-
-	return 0;
-
- out_err:
-	if (kcs_port) 
-		release_region (kcs_port, 2);
-	if (new_kcs->addr) 
-		iounmap(new_kcs->addr);
-	if (kcs_physaddr) 
-		release_mem_region(kcs_physaddr, 2);
-	if (new_kcs->kcs_sm)
-		kfree(new_kcs->kcs_sm);
-	kfree(new_kcs);
-	return rv;
-}
-
-#ifdef CONFIG_ACPI_INTERPRETER
-
-#include <linux/acpi.h>
-
-struct SPMITable {
-	s8      Signature[4];
-	u32     Length;
-	u8      Revision;
-	u8      Checksum;
-	s8      OEMID[6];
-	s8      OEMTableID[8];
-	s8      OEMRevision[4];
-	s8      CreatorID[4];
-	s8      CreatorRevision[4];
-	u8      InterfaceType[2];
-	s16     SpecificationRevision;
-
-	/*
-	 * Bit 0 - SCI interrupt supported
-	 * Bit 1 - I/O APIC/SAPIC
-	 */
-	u8      InterruptType;
-
-	/* If bit 0 of InterruptType is set, then this is the SCI
-	   interrupt in the GPEx_STS register. */
-	u8      GPE;
-
-	s16     Reserved;
-
-	/* If bit 1 of InterruptType is set, then this is the I/O
-	   APIC/SAPIC interrupt. */
-	u32     GlobalSystemInterrupt;
-
-	/* The actual register address. */
-	struct acpi_generic_address addr;
-
-	u8      UID[4];
-
-	s8      spmi_id[1]; /* A '\0' terminated array starts here. */
-};
-
-static int acpi_find_bmc(unsigned long *physaddr, int *port)
-{
-	acpi_status          status;
-	struct SPMITable     *spmi;
-	
-	status = acpi_get_firmware_table("SPMI", 1,
-					 ACPI_LOGICAL_ADDRESSING,
-					 (struct acpi_table_header **) &spmi);
-	if (status != AE_OK)
-		goto not_found;
-
-	if (spmi->InterfaceType[0] != 1)
-		/* Not IPMI. */
-		goto not_found;
-
-	if (spmi->InterfaceType[1] != 1)
-		/* Not KCS. */
-		goto not_found;
-
-	if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) {
-		*physaddr = spmi->addr.address;
-		printk("ipmi_kcs_intf: Found ACPI-specified state machine"
-		       " at memory address 0x%lx\n",
-		       (unsigned long) spmi->addr.address);
-	} else if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
-		*port = spmi->addr.address;
-		printk("ipmi_kcs_intf: Found ACPI-specified state machine"
-		       " at I/O address 0x%lx\n",
-		       (unsigned long) spmi->addr.address);
-	} else
-		goto not_found; /* Not an address type we recognise. */
-
-	return 0;
-
- not_found:
-	return -ENODEV;
-}
-#endif
-
-static __init int init_ipmi_kcs(void)
-{
-	int		rv = 0;
-	int		pos = 0;
-	int		i = 0;
-#ifdef CONFIG_ACPI_INTERPRETER
-	unsigned long	physaddr = 0;
-	int             port = 0;
-#endif
-
-	if (initialized)
-		return 0;
-	initialized = 1;
-
-	/* First do the "command-line" parameters */
-
-	for (i=0; i < KCS_MAX_PARMS; i++) {
-		rv = init_one_kcs(kcs_ports[i], 
-				  kcs_irqs[i], 
-				  0, 
-				  &(kcs_infos[pos]));
-		if (rv == 0)
-			pos++;
-
-		rv = init_one_kcs(0, 
-				  kcs_irqs[i], 
-				  kcs_addrs[i], 
-				  &(kcs_infos[pos]));
-		if (rv == 0)
-			pos++;
-	}
-
-	/* Only try the defaults if enabled and resources are available
-	   (because they weren't already specified above). */
-
-	if (kcs_trydefaults && (pos == 0)) {
-		rv = -EINVAL;
-#ifdef CONFIG_ACPI_INTERPRETER
-		if (rv && (physaddr = acpi_find_bmc(&physaddr, &port) == 0)) {
-			rv = init_one_kcs(port, 
-					  0, 
-					  physaddr, 
-					  &(kcs_infos[pos]));
-			if (rv == 0)
-				pos++;
-		}
-#endif
-		if (rv) {
-			rv = init_one_kcs(DEFAULT_IO_PORT, 
-					  0, 
-					  0, 
-					  &(kcs_infos[pos]));
-			if (rv == 0)
-				pos++;
-		}
-	}
-
-	if (kcs_infos[0] == NULL) {
-		printk("ipmi_kcs: Unable to find any KCS interfaces\n");
-		return -ENODEV;
-	} 
-
-	return 0;
-}
-module_init(init_ipmi_kcs);
-
-#ifdef MODULE
-void __exit cleanup_one_kcs(struct kcs_info *to_clean)
-{
-	int           rv;
-	unsigned long flags;
-
-	if (! to_clean)
-		return;
-
-	/* Tell the timer and interrupt handlers that we are shutting
-	   down. */
-	spin_lock_irqsave(&(to_clean->kcs_lock), flags);
-	spin_lock(&(to_clean->msg_lock));
-
-	to_clean->stop_operation = 1;
-
-	if (to_clean->irq != 0)
-		free_irq(to_clean->irq, to_clean);
-	if (to_clean->port) {
-		printk(KERN_INFO 
-		       "ipmi_kcs: Releasing BMC @ port=0x%x\n",
-		       to_clean->port);
-		release_region (to_clean->port, 2);
-	}
-	if (to_clean->addr) {
-		printk(KERN_INFO 
-		       "ipmi_kcs: Releasing BMC @ addr=0x%lx\n",
-		       to_clean->physaddr);
-		iounmap(to_clean->addr);
-		release_mem_region(to_clean->physaddr, 2);
-	}
-
-	spin_unlock(&(to_clean->msg_lock));
-	spin_unlock_irqrestore(&(to_clean->kcs_lock), flags);
-
-	/* Wait until we know that we are out of any interrupt
-	   handlers might have been running before we freed the
-	   interrupt. */
-	synchronize_kernel();
-
-	/* Wait for the timer to stop.  This avoids problems with race
-	   conditions removing the timer here. */
-	while (!to_clean->timer_stopped) {
-		schedule_timeout(1);
-	}
-
-	rv = ipmi_unregister_smi(to_clean->intf);
-	if (rv) {
-		printk(KERN_ERR 
-		       "ipmi_kcs: Unable to unregister device: errno=%d\n",
-		       rv);
-	}
-
-	initialized = 0;
-
-	kfree(to_clean->kcs_sm);
-	kfree(to_clean);
-}
-
-static __exit void cleanup_ipmi_kcs(void)
-{
-	int i;
-
-	if (!initialized)
-		return;
-
-	for (i=0; i<KCS_MAX_DRIVERS; i++) {
-		cleanup_one_kcs(kcs_infos[i]);
-	}
-}
-module_exit(cleanup_ipmi_kcs);
-#else
-
-/* Unfortunately, cmdline::get_options() only returns integers, not
-   longs.  Since we need ulongs (64-bit physical addresses) parse the 
-   comma-separated list manually.  Arguments can be one of these forms:
-   m0xaabbccddeeff	A physical memory address without an IRQ
-   m0xaabbccddeeff:cc	A physical memory address with an IRQ
-   p0xaabb		An IO port without an IRQ
-   p0xaabb:cc		An IO port with an IRQ
-   nodefaults		Suppress trying the default IO port or ACPI address 
-
-   For example, to pass one IO port with an IRQ, one address, and 
-   suppress the use of the default IO port and ACPI address,
-   use this option string: ipmi_kcs=p0xCA2:5,m0xFF5B0022,nodefaults
-
-   Remember, ipmi_kcs_setup() is passed the string after the equal sign. */
-
-static int __init ipmi_kcs_setup(char *str)
-{
-	unsigned long val;
-	char *cur, *colon;
-	int pos;
-
-	pos = 0;
-	
-	cur = strsep(&str, ",");
-	while ((cur) && (*cur) && (pos < KCS_MAX_PARMS)) {
-		switch (*cur) {
-		case 'n':
-			if (strcmp(cur, "nodefaults") == 0)
-				kcs_trydefaults = 0;
-			else
-				printk(KERN_INFO 
-				       "ipmi_kcs: bad parameter value %s\n",
-				       cur);
-			break;
-		
-		case 'm':
-		case 'p':
-			val = simple_strtoul(cur + 1,
-					     &colon,
-					     0);
-			if (*cur == 'p')
-				kcs_ports[pos] = val;
-			else
-				kcs_addrs[pos] = val;
-			if (*colon == ':') {
-				val = simple_strtoul(colon + 1,
-						     &colon,
-						     0);
-				kcs_irqs[pos] = val;
-			}
-			pos++;
-			break;
-
-		default:
-			printk(KERN_INFO 
-			       "ipmi_kcs: bad parameter value %s\n",
-			       cur);
-		}
-		cur = strsep(&str, ",");
-	}
-
-	return 1;
-}
-__setup("ipmi_kcs=", ipmi_kcs_setup);
-#endif
-
-MODULE_LICENSE("GPL");
diff --git a/drivers/char/ipmi/ipmi_kcs_sm.c b/drivers/char/ipmi/ipmi_kcs_sm.c
index 29b14608e292..f4dd321e9638 100644
--- a/drivers/char/ipmi/ipmi_kcs_sm.c
+++ b/drivers/char/ipmi/ipmi_kcs_sm.c
@@ -37,13 +37,12 @@
  * that document.
  */
 
-#include <linux/types.h>
+#include <linux/kernel.h> /* For printk. */
+#include <linux/string.h>
+#include <linux/ipmi_msgdefs.h>		/* for completion codes */
+#include "ipmi_si_sm.h"
 
-#include <asm/io.h>
-#include <asm/string.h>		/* Gets rid of memcpy warning */
-#include <asm/system.h>
-
-#include "ipmi_kcs_sm.h"
+#define IPMI_KCS_VERSION "v31"
 
 /* Set this if you want a printout of why the state machine was hosed
    when it gets hosed. */
@@ -95,32 +94,28 @@ enum kcs_states {
 #define OBF_RETRY_TIMEOUT 1000000
 #define MAX_ERROR_RETRIES 10
 
-#define IPMI_ERR_MSG_TRUNCATED	0xc6
-#define IPMI_ERR_UNSPECIFIED	0xff
-
-struct kcs_data
+struct si_sm_data
 {
-	enum kcs_states state;
-	unsigned int    port;
-	unsigned char	*addr;
-	unsigned char   write_data[MAX_KCS_WRITE_SIZE];
-	int             write_pos;
-	int             write_count;
-	int             orig_write_count;
-	unsigned char   read_data[MAX_KCS_READ_SIZE];
-	int             read_pos;
-	int	        truncated;
+	enum kcs_states  state;
+	struct si_sm_io *io;
+	unsigned char    write_data[MAX_KCS_WRITE_SIZE];
+	int              write_pos;
+	int              write_count;
+	int              orig_write_count;
+	unsigned char    read_data[MAX_KCS_READ_SIZE];
+	int              read_pos;
+	int	         truncated;
 
 	unsigned int  error_retries;
 	long          ibf_timeout;
 	long          obf_timeout;
 };
 
-void init_kcs_data(struct kcs_data *kcs, unsigned int port, unsigned char *addr)
+static unsigned int init_kcs_data(struct si_sm_data *kcs,
+				  struct si_sm_io *io)
 {
 	kcs->state = KCS_IDLE;
-	kcs->port = port;
-	kcs->addr = addr;
+	kcs->io = io;
 	kcs->write_pos = 0;
 	kcs->write_count = 0;
 	kcs->orig_write_count = 0;
@@ -129,40 +124,29 @@ void init_kcs_data(struct kcs_data *kcs, unsigned int port, unsigned char *addr)
 	kcs->truncated = 0;
 	kcs->ibf_timeout = IBF_RETRY_TIMEOUT;
 	kcs->obf_timeout = OBF_RETRY_TIMEOUT;
-}
 
-/* Remember, init_one_kcs() insured port and addr can't both be set */
+	/* Reserve 2 I/O bytes. */
+	return 2;
+}
 
-static inline unsigned char read_status(struct kcs_data *kcs)
+static inline unsigned char read_status(struct si_sm_data *kcs)
 {
-        if (kcs->port)
-		return inb(kcs->port + 1);
-        else
-		return readb(kcs->addr + 1);
+	return kcs->io->inputb(kcs->io, 1);
 }
 
-static inline unsigned char read_data(struct kcs_data *kcs)
+static inline unsigned char read_data(struct si_sm_data *kcs)
 {
-        if (kcs->port)
-		return inb(kcs->port + 0);
-        else
-		return readb(kcs->addr + 0);
+	return kcs->io->inputb(kcs->io, 0);
 }
 
-static inline void write_cmd(struct kcs_data *kcs, unsigned char data)
+static inline void write_cmd(struct si_sm_data *kcs, unsigned char data)
 {
-        if (kcs->port)
-		outb(data, kcs->port + 1);
-        else
-		writeb(data, kcs->addr + 1);
+	kcs->io->outputb(kcs->io, 1, data);
 }
 
-static inline void write_data(struct kcs_data *kcs, unsigned char data)
+static inline void write_data(struct si_sm_data *kcs, unsigned char data)
 {
-        if (kcs->port)
-		outb(data, kcs->port + 0);
-        else
-		writeb(data, kcs->addr + 0);
+	kcs->io->outputb(kcs->io, 0, data);
 }
 
 /* Control codes. */
@@ -182,14 +166,14 @@ static inline void write_data(struct kcs_data *kcs, unsigned char data)
 #define GET_STATUS_OBF(status) ((status) & 0x01)
 
 
-static inline void write_next_byte(struct kcs_data *kcs)
+static inline void write_next_byte(struct si_sm_data *kcs)
 {
 	write_data(kcs, kcs->write_data[kcs->write_pos]);
 	(kcs->write_pos)++;
 	(kcs->write_count)--;
 }
 
-static inline void start_error_recovery(struct kcs_data *kcs, char *reason)
+static inline void start_error_recovery(struct si_sm_data *kcs, char *reason)
 {
 	(kcs->error_retries)++;
 	if (kcs->error_retries > MAX_ERROR_RETRIES) {
@@ -202,7 +186,7 @@ static inline void start_error_recovery(struct kcs_data *kcs, char *reason)
 	}
 }
 
-static inline void read_next_byte(struct kcs_data *kcs)
+static inline void read_next_byte(struct si_sm_data *kcs)
 {
 	if (kcs->read_pos >= MAX_KCS_READ_SIZE) {
 		/* Throw the data away and mark it truncated. */
@@ -215,9 +199,8 @@ static inline void read_next_byte(struct kcs_data *kcs)
 	write_data(kcs, KCS_READ_BYTE);
 }
 
-static inline int check_ibf(struct kcs_data *kcs,
-			    unsigned char   status,
-			    long            time)
+static inline int check_ibf(struct si_sm_data *kcs, unsigned char status,
+			    long time)
 {
 	if (GET_STATUS_IBF(status)) {
 		kcs->ibf_timeout -= time;
@@ -232,9 +215,8 @@ static inline int check_ibf(struct kcs_data *kcs,
 	return 1;
 }
 
-static inline int check_obf(struct kcs_data *kcs,
-			    unsigned char   status,
-			    long            time)
+static inline int check_obf(struct si_sm_data *kcs, unsigned char status,
+			    long time)
 {
 	if (! GET_STATUS_OBF(status)) {
 		kcs->obf_timeout -= time;
@@ -248,13 +230,13 @@ static inline int check_obf(struct kcs_data *kcs,
 	return 1;
 }
 
-static void clear_obf(struct kcs_data *kcs, unsigned char status)
+static void clear_obf(struct si_sm_data *kcs, unsigned char status)
 {
 	if (GET_STATUS_OBF(status))
 		read_data(kcs);
 }
 
-static void restart_kcs_transaction(struct kcs_data *kcs)
+static void restart_kcs_transaction(struct si_sm_data *kcs)
 {
 	kcs->write_count = kcs->orig_write_count;
 	kcs->write_pos = 0;
@@ -265,7 +247,8 @@ static void restart_kcs_transaction(struct kcs_data *kcs)
 	write_cmd(kcs, KCS_WRITE_START);
 }
 
-int start_kcs_transaction(struct kcs_data *kcs, char *data, unsigned int size)
+static int start_kcs_transaction(struct si_sm_data *kcs, unsigned char *data,
+				 unsigned int size)
 {
 	if ((size < 2) || (size > MAX_KCS_WRITE_SIZE)) {
 		return -1;
@@ -287,7 +270,8 @@ int start_kcs_transaction(struct kcs_data *kcs, char *data, unsigned int size)
 	return 0;
 }
 
-int kcs_get_result(struct kcs_data *kcs, unsigned char *data, int length)
+static int get_kcs_result(struct si_sm_data *kcs, unsigned char *data,
+			  unsigned int length)
 {
 	if (length < kcs->read_pos) {
 		kcs->read_pos = length;
@@ -316,7 +300,7 @@ int kcs_get_result(struct kcs_data *kcs, unsigned char *data, int length)
 /* This implements the state machine defined in the IPMI manual, see
    that for details on how this works.  Divide that flowchart into
    sections delimited by "Wait for IBF" and this will become clear. */
-enum kcs_result kcs_event(struct kcs_data *kcs, long time)
+static enum si_sm_result kcs_event(struct si_sm_data *kcs, long time)
 {
 	unsigned char status;
 	unsigned char state;
@@ -328,7 +312,7 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 #endif
 	/* All states wait for ibf, so just do it here. */
 	if (!check_ibf(kcs, status, time))
-		return KCS_CALL_WITH_DELAY;
+		return SI_SM_CALL_WITH_DELAY;
 
 	/* Just about everything looks at the KCS state, so grab that, too. */
 	state = GET_STATUS_STATE(status);
@@ -339,9 +323,9 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 		clear_obf(kcs, status);
 
 		if (GET_STATUS_ATN(status))
-			return KCS_ATTN;
+			return SI_SM_ATTN;
 		else
-			return KCS_SM_IDLE;
+			return SI_SM_IDLE;
 
 	case KCS_START_OP:
 		if (state != KCS_IDLE) {
@@ -408,7 +392,7 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 
 		if (state == KCS_READ_STATE) {
 			if (! check_obf(kcs, status, time))
-				return KCS_CALL_WITH_DELAY;
+				return SI_SM_CALL_WITH_DELAY;
 			read_next_byte(kcs);
 		} else {
 			/* We don't implement this exactly like the state
@@ -421,7 +405,7 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 			clear_obf(kcs, status);
 			kcs->orig_write_count = 0;
 			kcs->state = KCS_IDLE;
-			return KCS_TRANSACTION_COMPLETE;
+			return SI_SM_TRANSACTION_COMPLETE;
 		}
 		break;
 
@@ -444,7 +428,7 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 			break;
 		}
 		if (! check_obf(kcs, status, time))
-			return KCS_CALL_WITH_DELAY;
+			return SI_SM_CALL_WITH_DELAY;
 
 		clear_obf(kcs, status);
 		write_data(kcs, KCS_READ_BYTE);
@@ -459,14 +443,14 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 		}
 
 		if (! check_obf(kcs, status, time))
-			return KCS_CALL_WITH_DELAY;
+			return SI_SM_CALL_WITH_DELAY;
 
 		clear_obf(kcs, status);
 		if (kcs->orig_write_count) {
 			restart_kcs_transaction(kcs);
 		} else {
 			kcs->state = KCS_IDLE;
-			return KCS_TRANSACTION_COMPLETE;
+			return SI_SM_TRANSACTION_COMPLETE;
 		}
 		break;
 			
@@ -475,14 +459,42 @@ enum kcs_result kcs_event(struct kcs_data *kcs, long time)
 	}
 
 	if (kcs->state == KCS_HOSED) {
-		init_kcs_data(kcs, kcs->port, kcs->addr);
-		return KCS_SM_HOSED;
+		init_kcs_data(kcs, kcs->io);
+		return SI_SM_HOSED;
 	}
 
-	return KCS_CALL_WITHOUT_DELAY;
+	return SI_SM_CALL_WITHOUT_DELAY;
 }
 
-int kcs_size(void)
+static int kcs_size(void)
 {
-	return sizeof(struct kcs_data);
+	return sizeof(struct si_sm_data);
 }
+
+static int kcs_detect(struct si_sm_data *kcs)
+{
+	/* It's impossible for the KCS status register to be all 1's,
+	   (assuming a properly functioning, self-initialized BMC)
+	   but that's what you get from reading a bogus address, so
+	   a status of 0xff makes us return 1, anything else 0. */
+	if (read_status(kcs) == 0xff)
+		return 1;
+
+	return 0;
+}
+
+static void kcs_cleanup(struct si_sm_data *kcs)
+{
+}
+
+struct si_sm_handlers kcs_smi_handlers =
+{
+	.version           = IPMI_KCS_VERSION,
+	.init_data         = init_kcs_data,
+	.start_transaction = start_kcs_transaction,
+	.get_result        = get_kcs_result,
+	.event             = kcs_event,
+	.detect            = kcs_detect,
+	.cleanup           = kcs_cleanup,
+	.size              = kcs_size,
+};
diff --git a/drivers/char/ipmi/ipmi_kcs_sm.h b/drivers/char/ipmi/ipmi_kcs_sm.h
deleted file mode 100644
index 81cf952f6314..000000000000
--- a/drivers/char/ipmi/ipmi_kcs_sm.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * ipmi_kcs_sm.h
- *
- * State machine for handling IPMI KCS interfaces.
- *
- * Author: MontaVista Software, Inc.
- *         Corey Minyard <minyard@mvista.com>
- *         source@mvista.com
- *
- * Copyright 2002 MontaVista Software Inc.
- *
- *  This program is free software; you can redistribute it and/or modify it
- *  under the terms of the GNU General Public License as published by the
- *  Free Software Foundation; either version 2 of the License, or (at your
- *  option) any later version.
- *
- *
- *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
- *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
- *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
- *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
- *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- *  You should have received a copy of the GNU General Public License along
- *  with this program; if not, write to the Free Software Foundation, Inc.,
- *  675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-struct kcs_data;
-
-void init_kcs_data(struct kcs_data *kcs,
-		   unsigned int    port,
-		   unsigned char   *addr);
-
-/* Start a new transaction in the state machine.  This will return -2
-   if the state machine is not idle, -1 if the size is invalid (to
-   large or too small), or 0 if the transaction is successfully
-   completed. */
-int start_kcs_transaction(struct kcs_data *kcs, char *data, unsigned int size);
-
-/* Return the results after the transaction.  This will return -1 if
-   the buffer is too small, zero if no transaction is present, or the
-   actual length of the result data. */
-int kcs_get_result(struct kcs_data *kcs, unsigned char *data, int length);
-
-enum kcs_result
-{
-	KCS_CALL_WITHOUT_DELAY, /* Call the driver again immediately */
-	KCS_CALL_WITH_DELAY,	/* Delay some before calling again. */
-	KCS_TRANSACTION_COMPLETE, /* A transaction is finished. */
-	KCS_SM_IDLE,		/* The SM is in idle state. */
-	KCS_SM_HOSED,		/* The hardware violated the state machine. */
-	KCS_ATTN		/* The hardware is asserting attn and the
-				   state machine is idle. */
-};
-
-/* Call this periodically (for a polled interface) or upon receiving
-   an interrupt (for a interrupt-driven interface).  If interrupt
-   driven, you should probably poll this periodically when not in idle
-   state.  This should be called with the time that passed since the
-   last call, if it is significant.  Time is in microseconds. */
-enum kcs_result kcs_event(struct kcs_data *kcs, long time);
-
-/* Return the size of the KCS structure in bytes. */
-int kcs_size(void);
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index ebbd8032fa9a..c1e4abf1463b 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -44,16 +44,21 @@
 #include <linux/ipmi_smi.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
+#include <linux/proc_fs.h>
+
+#define IPMI_MSGHANDLER_VERSION "v31"
 
 struct ipmi_recv_msg *ipmi_alloc_recv_msg(void);
 static int ipmi_init_msghandler(void);
 
 static int initialized = 0;
 
+static struct proc_dir_entry *proc_ipmi_root = NULL;
+
 #define MAX_EVENTS_IN_QUEUE	25
 
 /* Don't let a message sit in a queue forever, always time it with at lest
-   the max message timer. */
+   the max message timer.  This is in milliseconds. */
 #define MAX_MSG_TIMEOUT		60000
 
 struct ipmi_user
@@ -82,7 +87,8 @@ struct cmd_rcvr
 
 struct seq_table
 {
-	int                  inuse : 1;
+	unsigned int         inuse : 1;
+	unsigned int         broadcast : 1;
 
 	unsigned long        timeout;
 	unsigned long        orig_timeout;
@@ -111,10 +117,19 @@ struct seq_table
 
 #define NEXT_SEQID(seqid) (((seqid) + 1) & 0x3fffff)
 
+struct ipmi_channel
+{
+	unsigned char medium;
+	unsigned char protocol;
+};
 
 #define IPMI_IPMB_NUM_SEQ	64
+#define IPMI_MAX_CHANNELS       8
 struct ipmi_smi
 {
+	/* What interface number are we? */
+	int intf_num;
+
 	/* The list of upper layers that are using me.  We read-lock
            this when delivering messages to the upper layer to keep
            the user from going away while we are processing the
@@ -123,6 +138,9 @@ struct ipmi_smi
 	rwlock_t                users_lock;
 	struct list_head        users;
 
+	/* Used for wake ups at startup. */
+	wait_queue_head_t waitq;
+
 	/* The IPMI version of the BMC on the other end. */
 	unsigned char       version_major;
 	unsigned char       version_minor;
@@ -182,6 +200,86 @@ struct ipmi_smi
 	   it.  Note that the message will still be freed by the
 	   caller.  This only works on the system interface. */
 	void (*null_user_handler)(ipmi_smi_t intf, struct ipmi_smi_msg *msg);
+
+	/* When we are scanning the channels for an SMI, this will
+	   tell which channel we are scanning. */
+	int curr_channel;
+
+	/* Channel information */
+	struct ipmi_channel channels[IPMI_MAX_CHANNELS];
+
+	/* Proc FS stuff. */
+	struct proc_dir_entry *proc_dir;
+	char                  proc_dir_name[10];
+
+	spinlock_t   counter_lock; /* For making counters atomic. */
+
+	/* Commands we got that were invalid. */
+	unsigned int sent_invalid_commands;
+
+	/* Commands we sent to the MC. */
+	unsigned int sent_local_commands;
+	/* Responses from the MC that were delivered to a user. */
+	unsigned int handled_local_responses;
+	/* Responses from the MC that were not delivered to a user. */
+	unsigned int unhandled_local_responses;
+
+	/* Commands we sent out to the IPMB bus. */
+	unsigned int sent_ipmb_commands;
+	/* Commands sent on the IPMB that had errors on the SEND CMD */
+	unsigned int sent_ipmb_command_errs;
+	/* Each retransmit increments this count. */
+	unsigned int retransmitted_ipmb_commands;
+	/* When a message times out (runs out of retransmits) this is
+           incremented. */
+	unsigned int timed_out_ipmb_commands;
+
+	/* This is like above, but for broadcasts.  Broadcasts are
+           *not* included in the above count (they are expected to
+           time out). */
+	unsigned int timed_out_ipmb_broadcasts;
+
+	/* Responses I have sent to the IPMB bus. */
+	unsigned int sent_ipmb_responses;
+
+	/* The response was delivered to the user. */
+	unsigned int handled_ipmb_responses;
+	/* The response had invalid data in it. */
+	unsigned int invalid_ipmb_responses;
+	/* The response didn't have anyone waiting for it. */
+	unsigned int unhandled_ipmb_responses;
+
+	/* Commands we sent out to the LAN. */
+	unsigned int sent_lan_commands;
+	/* Commands sent on the LAN that had errors on the SEND CMD */
+	unsigned int sent_lan_command_errs;
+	/* Each retransmit increments this count. */
+	unsigned int retransmitted_lan_commands;
+	/* When a message times out (runs out of retransmits) this is
+           incremented. */
+	unsigned int timed_out_lan_commands;
+
+	/* Responses I have sent to the LAN. */
+	unsigned int sent_lan_responses;
+
+	/* The response was delivered to the user. */
+	unsigned int handled_lan_responses;
+	/* The response had invalid data in it. */
+	unsigned int invalid_lan_responses;
+	/* The response didn't have anyone waiting for it. */
+	unsigned int unhandled_lan_responses;
+
+	/* The command was delivered to the user. */
+	unsigned int handled_commands;
+	/* The command had invalid data in it. */
+	unsigned int invalid_commands;
+	/* The command didn't have anyone waiting for it. */
+	unsigned int unhandled_commands;
+
+	/* Invalid data in an event. */
+	unsigned int invalid_events;
+	/* Events that were received with the proper format. */
+	unsigned int events;
 };
 
 int
@@ -264,6 +362,21 @@ int ipmi_smi_watcher_unregister(struct ipmi_smi_watcher *watcher)
 	return 0;
 }
 
+static void
+call_smi_watchers(int i)
+{
+	struct ipmi_smi_watcher *w;
+
+	down_read(&smi_watchers_sem);
+	list_for_each_entry(w, &smi_watchers, link) {
+		if (try_module_get(w->owner)) {
+			w->new_smi(i);
+			module_put(w->owner);
+		}
+	}
+	up_read(&smi_watchers_sem);
+}
+
 int
 ipmi_addr_equal(struct ipmi_addr *addr1, struct ipmi_addr *addr2)
 {
@@ -293,6 +406,19 @@ ipmi_addr_equal(struct ipmi_addr *addr1, struct ipmi_addr *addr2)
 			&& (ipmb_addr1->lun == ipmb_addr2->lun));
 	}
 
+	if (addr1->addr_type == IPMI_LAN_ADDR_TYPE) {
+		struct ipmi_lan_addr *lan_addr1
+			= (struct ipmi_lan_addr *) addr1;
+		struct ipmi_lan_addr *lan_addr2
+		    = (struct ipmi_lan_addr *) addr2;
+
+		return ((lan_addr1->remote_SWID == lan_addr2->remote_SWID)
+			&& (lan_addr1->local_SWID == lan_addr2->local_SWID)
+			&& (lan_addr1->session_handle
+			    == lan_addr2->session_handle)
+			&& (lan_addr1->lun == lan_addr2->lun));
+	}
+
 	return 1;
 }
 
@@ -322,6 +448,13 @@ int ipmi_validate_addr(struct ipmi_addr *addr, int len)
 		return 0;
 	}
 
+	if (addr->addr_type == IPMI_LAN_ADDR_TYPE) {
+		if (len < sizeof(struct ipmi_lan_addr)) {
+			return -EINVAL;
+		}
+		return 0;
+	}
+
 	return -EINVAL;
 }
 
@@ -341,7 +474,7 @@ unsigned int ipmi_addr_length(int addr_type)
 
 static void deliver_response(struct ipmi_recv_msg *msg)
 {
-    msg->user->handler->ipmi_recv_hndl(msg, msg->user->handler_data);
+	msg->user->handler->ipmi_recv_hndl(msg, msg->user->handler_data);
 }
 
 /* Find the next sequence number not being used and add the given
@@ -351,6 +484,7 @@ static int intf_next_seq(ipmi_smi_t           intf,
 			 struct ipmi_recv_msg *recv_msg,
 			 unsigned long        timeout,
 			 int                  retries,
+			 int                  broadcast,
 			 unsigned char        *seq,
 			 long                 *seqid)
 {
@@ -373,6 +507,7 @@ static int intf_next_seq(ipmi_smi_t           intf,
 		intf->seq_table[i].timeout = MAX_MSG_TIMEOUT;
 		intf->seq_table[i].orig_timeout = timeout;
 		intf->seq_table[i].retries_left = retries;
+		intf->seq_table[i].broadcast = broadcast;
 		intf->seq_table[i].inuse = 1;
 		intf->seq_table[i].seqid = NEXT_SEQID(intf->seq_table[i].seqid);
 		*seq = i;
@@ -425,8 +560,8 @@ static int intf_find_seq(ipmi_smi_t           intf,
 
 
 /* Start the timer for a specific sequence table entry. */
-static int intf_start_seq_timer(ipmi_smi_t           intf,
-				long                 msgid)
+static int intf_start_seq_timer(ipmi_smi_t intf,
+				long       msgid)
 {
 	int           rv = -ENODEV;
 	unsigned long flags;
@@ -451,6 +586,46 @@ static int intf_start_seq_timer(ipmi_smi_t           intf,
 	return rv;
 }
 
+/* Got an error for the send message for a specific sequence number. */
+static int intf_err_seq(ipmi_smi_t   intf,
+			long         msgid,
+			unsigned int err)
+{
+	int                  rv = -ENODEV;
+	unsigned long        flags;
+	unsigned char        seq;
+	unsigned long        seqid;
+	struct ipmi_recv_msg *msg = NULL;
+
+
+	GET_SEQ_FROM_MSGID(msgid, seq, seqid);
+
+	spin_lock_irqsave(&(intf->seq_lock), flags);
+	/* We do this verification because the user can be deleted
+           while a message is outstanding. */
+	if ((intf->seq_table[seq].inuse)
+	    && (intf->seq_table[seq].seqid == seqid))
+	{
+		struct seq_table *ent = &(intf->seq_table[seq]);
+
+		ent->inuse = 0;
+		msg = ent->recv_msg;
+		rv = 0;
+	}
+	spin_unlock_irqrestore(&(intf->seq_lock), flags);
+
+	if (msg) {
+		msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
+		msg->msg_data[0] = err;
+		msg->msg.netfn |= 1; /* Convert to a response. */
+		msg->msg.data_len = 1;
+		msg->msg.data = msg->msg_data;
+		deliver_response(msg);
+	}
+
+	return rv;
+}
+
 
 int ipmi_create_user(unsigned int          if_num,
 		     struct ipmi_user_hndl *handler,
@@ -523,15 +698,14 @@ static int ipmi_destroy_user_nolock(ipmi_user_t user)
 {
 	int              rv = -ENODEV;
 	ipmi_user_t      t_user;
-	struct list_head *entry, *entry2;
+	struct cmd_rcvr  *rcvr, *rcvr2;
 	int              i;
 	unsigned long    flags;
 
 	/* Find the user and delete them from the list. */
-	list_for_each(entry, &(user->intf->users)) {
-		t_user = list_entry(entry, struct ipmi_user, link);
+	list_for_each_entry(t_user, &(user->intf->users), link) {
 		if (t_user == user) {
-			list_del(entry);
+			list_del(&t_user->link);
 			rv = 0;
 			break;
 		}
@@ -554,11 +728,9 @@ static int ipmi_destroy_user_nolock(ipmi_user_t user)
 
 	/* Remove the user from the command receiver's table. */
 	write_lock_irqsave(&(user->intf->cmd_rcvr_lock), flags);
-	list_for_each_safe(entry, entry2, &(user->intf->cmd_rcvrs)) {
-		struct cmd_rcvr *rcvr;
-		rcvr = list_entry(entry, struct cmd_rcvr, link);
+	list_for_each_entry_safe(rcvr, rcvr2, &(user->intf->cmd_rcvrs), link) {
 		if (rcvr->user == user) {
-			list_del(entry);
+			list_del(&rcvr->link);
 			kfree(rcvr);
 		}
 	}
@@ -621,8 +793,7 @@ unsigned char ipmi_get_my_LUN(ipmi_user_t user)
 int ipmi_set_gets_events(ipmi_user_t user, int val)
 {
 	unsigned long         flags;
-	struct list_head      *e, *e2;
-	struct ipmi_recv_msg  *msg;
+	struct ipmi_recv_msg  *msg, *msg2;
 
 	read_lock(&(user->intf->users_lock));
 	spin_lock_irqsave(&(user->intf->events_lock), flags);
@@ -630,9 +801,8 @@ int ipmi_set_gets_events(ipmi_user_t user, int val)
 
 	if (val) {
 		/* Deliver any queued events. */
-		list_for_each_safe(e, e2, &(user->intf->waiting_events)) {
-			msg = list_entry(e, struct ipmi_recv_msg, link);
-			list_del(e);
+		list_for_each_entry_safe(msg, msg2, &(user->intf->waiting_events), link) {
+			list_del(&msg->link);
 			msg->user = user;
 			deliver_response(msg);
 		}
@@ -648,7 +818,7 @@ int ipmi_register_for_cmd(ipmi_user_t   user,
 			  unsigned char netfn,
 			  unsigned char cmd)
 {
-	struct list_head *entry;
+	struct cmd_rcvr  *cmp;
 	unsigned long    flags;
 	struct cmd_rcvr  *rcvr;
 	int              rv = 0;
@@ -666,9 +836,7 @@ int ipmi_register_for_cmd(ipmi_user_t   user,
 	}
 
 	/* Make sure the command/netfn is not already registered. */
-	list_for_each(entry, &(user->intf->cmd_rcvrs)) {
-		struct cmd_rcvr *cmp;
-		cmp = list_entry(entry, struct cmd_rcvr, link);
+	list_for_each_entry(cmp, &(user->intf->cmd_rcvrs), link) {
 		if ((cmp->netfn == netfn) && (cmp->cmd == cmd)) {
 			rv = -EBUSY;
 			break;
@@ -695,7 +863,6 @@ int ipmi_unregister_for_cmd(ipmi_user_t   user,
 			    unsigned char netfn,
 			    unsigned char cmd)
 {
-	struct list_head *entry;
 	unsigned long    flags;
 	struct cmd_rcvr  *rcvr;
 	int              rv = -ENOENT;
@@ -703,11 +870,10 @@ int ipmi_unregister_for_cmd(ipmi_user_t   user,
 	read_lock(&(user->intf->users_lock));
 	write_lock_irqsave(&(user->intf->cmd_rcvr_lock), flags);
 	/* Make sure the command/netfn is not already registered. */
-	list_for_each(entry, &(user->intf->cmd_rcvrs)) {
-		rcvr = list_entry(entry, struct cmd_rcvr, link);
+	list_for_each_entry(rcvr, &(user->intf->cmd_rcvrs), link) {
 		if ((rcvr->netfn == netfn) && (rcvr->cmd == cmd)) {
 			rv = 0;
-			list_del(entry);
+			list_del(&rcvr->link);
 			kfree(rcvr);
 			break;
 		}
@@ -771,6 +937,43 @@ static inline void format_ipmb_msg(struct ipmi_smi_msg   *smi_msg,
 	smi_msg->msgid = msgid;
 }
 
+static inline void format_lan_msg(struct ipmi_smi_msg   *smi_msg,
+				  struct ipmi_msg       *msg,
+				  struct ipmi_lan_addr  *lan_addr,
+				  long                  msgid,
+				  unsigned char         ipmb_seq,
+				  unsigned char         source_lun)
+{
+	/* Send Message command header, then the encapsulated header. */
+	smi_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
+	smi_msg->data[1] = IPMI_SEND_MSG_CMD;
+	smi_msg->data[2] = lan_addr->channel;
+	smi_msg->data[3] = lan_addr->session_handle;
+	smi_msg->data[4] = lan_addr->remote_SWID;
+	smi_msg->data[5] = (msg->netfn << 2) | (lan_addr->lun & 0x3);
+	smi_msg->data[6] = ipmb_checksum(&(smi_msg->data[4]), 2);
+	smi_msg->data[7] = lan_addr->local_SWID;
+	smi_msg->data[8] = (ipmb_seq << 2) | source_lun;
+	smi_msg->data[9] = msg->cmd;
+
+	/* Now tack on the data to the message. */
+	if (msg->data_len > 0)
+		memcpy(&(smi_msg->data[10]), msg->data,
+		       msg->data_len);
+	smi_msg->data_size = msg->data_len + 10;
+
+	/* The trailing checksum covers data[7] through the payload. */
+	smi_msg->data[smi_msg->data_size]
+		= ipmb_checksum(&(smi_msg->data[7]),
+				smi_msg->data_size-7);
+
+	/* Account for the trailing checksum byte; unlike IPMB there is
+	   no broadcast offset to add here. */
+	smi_msg->data_size += 1;
+
+	smi_msg->msgid = msgid;
+}
+
 /* Separate from ipmi_request so that the user does not have to be
    supplied in certain circumstances (mainly at panic time).  If
    messages are supplied, they will be freed, even if an error
@@ -780,11 +983,14 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 				 struct ipmi_addr     *addr,
 				 long                 msgid,
 				 struct ipmi_msg      *msg,
+				 void                 *user_msg_data,
 				 void                 *supplied_smi,
 				 struct ipmi_recv_msg *supplied_recv,
 				 int                  priority,
 				 unsigned char        source_address,
-				 unsigned char        source_lun)
+				 unsigned char        source_lun,
+				 int                  retries,
+				 unsigned int         retry_time_ms)
 {
 	int                  rv = 0;
 	struct ipmi_smi_msg  *smi_msg;
@@ -800,6 +1006,7 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 			return -ENOMEM;
 		}
 	}
+	recv_msg->user_msg_data = user_msg_data;
 
 	if (supplied_smi) {
 		smi_msg = (struct ipmi_smi_msg *) supplied_smi;
@@ -811,11 +1018,6 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		}
 	}
 
-	if (addr->channel > IPMI_NUM_CHANNELS) {
-	    rv = -EINVAL;
-	    goto out_err;
-	}
-
 	recv_msg->user = user;
 	recv_msg->msgid = msgid;
 	/* Store the message to send in the receive message so timeout
@@ -825,10 +1027,20 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 	if (addr->addr_type == IPMI_SYSTEM_INTERFACE_ADDR_TYPE) {
 		struct ipmi_system_interface_addr *smi_addr;
 
+		if (msg->netfn & 1) {
+			/* Responses are not allowed to the SMI. */
+			rv = -EINVAL;
+			goto out_err;
+		}
 
 		smi_addr = (struct ipmi_system_interface_addr *) addr;
-		if (smi_addr->lun > 3)
-			return -EINVAL;
+		if (smi_addr->lun > 3) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
 
 		memcpy(&recv_msg->addr, smi_addr, sizeof(*smi_addr));
 
@@ -839,11 +1051,17 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		{
 			/* We don't let the user do these, since we manage
 			   the sequence numbers. */
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EINVAL;
 			goto out_err;
 		}
 
 		if ((msg->data_len + 2) > IPMI_MAX_MSG_LENGTH) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EMSGSIZE;
 			goto out_err;
 		}
@@ -855,41 +1073,69 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		if (msg->data_len > 0)
 			memcpy(&(smi_msg->data[2]), msg->data, msg->data_len);
 		smi_msg->data_size = msg->data_len + 2;
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->sent_local_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 	} else if ((addr->addr_type == IPMI_IPMB_ADDR_TYPE)
 		   || (addr->addr_type == IPMI_IPMB_BROADCAST_ADDR_TYPE))
 	{
 		struct ipmi_ipmb_addr *ipmb_addr;
 		unsigned char         ipmb_seq;
 		long                  seqid;
-		int                   broadcast;
-		int                   retries;
+		int                   broadcast = 0;
+
+		if (addr->channel > IPMI_NUM_CHANNELS) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
 
-		if (addr == NULL) {
+		if (intf->channels[addr->channel].medium
+		    != IPMI_CHANNEL_MEDIUM_IPMB)
+		{
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EINVAL;
 			goto out_err;
 		}
 
+		if (retries < 0) {
+		    if (addr->addr_type == IPMI_IPMB_BROADCAST_ADDR_TYPE)
+			retries = 0; /* Don't retry broadcasts. */
+		    else
+			retries = 4;
+		}
 		if (addr->addr_type == IPMI_IPMB_BROADCAST_ADDR_TYPE) {
 		    /* Broadcasts add a zero at the beginning of the
 		       message, but otherwise is the same as an IPMB
 		       address. */
 		    addr->addr_type = IPMI_IPMB_ADDR_TYPE;
 		    broadcast = 1;
-		    retries = 0; /* Don't retry broadcasts. */
-		} else {
-		    broadcast = 0;
-		    retries = 4;
 		}
 
+
+		/* Default to 1 second retries. */
+		if (retry_time_ms == 0)
+		    retry_time_ms = 1000;
+
 		/* 9 for the header and 1 for the checksum, plus
                    possibly one for the broadcast. */
 		if ((msg->data_len + 10 + broadcast) > IPMI_MAX_MSG_LENGTH) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EMSGSIZE;
 			goto out_err;
 		}
 
 		ipmb_addr = (struct ipmi_ipmb_addr *) addr;
 		if (ipmb_addr->lun > 3) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			rv = -EINVAL;
 			goto out_err;
 		}
@@ -899,21 +1145,32 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		if (recv_msg->msg.netfn & 0x1) {
 			/* It's a response, so use the user's sequence
                            from msgid. */
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_ipmb_responses++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
 			format_ipmb_msg(smi_msg, msg, ipmb_addr, msgid,
 					msgid, broadcast,
 					source_address, source_lun);
+
+			/* Save the receive message so we can use it
+			   to deliver the response. */
+			smi_msg->user_data = recv_msg;
 		} else {
 			/* It's a command, so get a sequence for it. */
 
 			spin_lock_irqsave(&(intf->seq_lock), flags);
 
+			spin_lock(&intf->counter_lock);
+			intf->sent_ipmb_commands++;
+			spin_unlock(&intf->counter_lock);
+
 			/* Create a sequence number with a 1 second
                            timeout and 4 retries. */
-			/* FIXME - magic number for the timeout. */
 			rv = intf_next_seq(intf,
 					   recv_msg,
-					   1000,
+					   retry_time_ms,
 					   retries,
+					   broadcast,
 					   &ipmb_seq,
 					   &seqid);
 			if (rv) {
@@ -939,6 +1196,117 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 			recv_msg->msg.data = recv_msg->msg_data;
 			recv_msg->msg.data_len = smi_msg->data_size;
 
+			/* We don't unlock until here, because we need
+                           to copy the completed message into the
+                           recv_msg before we release the lock.
+                           Otherwise, race conditions may bite us.  I
+                           know that's pretty paranoid, but I prefer
+                           to be correct. */
+			spin_unlock_irqrestore(&(intf->seq_lock), flags);
+		}
+	} else if (addr->addr_type == IPMI_LAN_ADDR_TYPE) {
+		struct ipmi_lan_addr  *lan_addr;
+		unsigned char         ipmb_seq;
+		long                  seqid;
+
+		if (addr->channel > IPMI_NUM_CHANNELS) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
+
+		if ((intf->channels[addr->channel].medium
+		    != IPMI_CHANNEL_MEDIUM_8023LAN)
+		    && (intf->channels[addr->channel].medium
+			!= IPMI_CHANNEL_MEDIUM_ASYNC))
+		{
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
+
+		retries = 4;
+
+		/* Default to 1 second retries. */
+		if (retry_time_ms == 0)
+		    retry_time_ms = 1000;
+
+		/* 11 for the header and 1 for the checksum. */
+		if ((msg->data_len + 12) > IPMI_MAX_MSG_LENGTH) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EMSGSIZE;
+			goto out_err;
+		}
+
+		lan_addr = (struct ipmi_lan_addr *) addr;
+		if (lan_addr->lun > 3) {
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_invalid_commands++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			rv = -EINVAL;
+			goto out_err;
+		}
+
+		memcpy(&recv_msg->addr, lan_addr, sizeof(*lan_addr));
+
+		if (recv_msg->msg.netfn & 0x1) {
+			/* It's a response, so use the user's sequence
+                           from msgid. */
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			intf->sent_lan_responses++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			format_lan_msg(smi_msg, msg, lan_addr, msgid,
+				       msgid, source_lun);
+
+			/* Save the receive message so we can use it
+			   to deliver the response. */
+			smi_msg->user_data = recv_msg;
+		} else {
+			/* It's a command, so get a sequence for it. */
+
+			spin_lock_irqsave(&(intf->seq_lock), flags);
+
+			spin_lock(&intf->counter_lock);
+			intf->sent_lan_commands++;
+			spin_unlock(&intf->counter_lock);
+
+			/* Create a sequence number with a 1 second
+                           timeout and 4 retries. */
+			rv = intf_next_seq(intf,
+					   recv_msg,
+					   retry_time_ms,
+					   retries,
+					   0,
+					   &ipmb_seq,
+					   &seqid);
+			if (rv) {
+				/* We have used up all the sequence numbers,
+				   probably, so abort. */
+				spin_unlock_irqrestore(&(intf->seq_lock),
+						       flags);
+				goto out_err;
+			}
+
+			/* Store the sequence number in the message,
+                           so that when the send message response
+                           comes back we can start the timer. */
+			format_lan_msg(smi_msg, msg, lan_addr,
+				       STORE_SEQ_IN_MSGID(ipmb_seq, seqid),
+				       ipmb_seq, source_lun);
+
+			/* Copy the message into the recv message data, so we
+			   can retransmit it later if necessary. */
+			memcpy(recv_msg->msg_data, smi_msg->data,
+			       smi_msg->data_size);
+			recv_msg->msg.data = recv_msg->msg_data;
+			recv_msg->msg.data_len = smi_msg->data_size;
+
 			/* We don't unlock until here, because we need
                            to copy the completed message into the
                            recv_msg before we release the lock.
@@ -949,16 +1317,19 @@ static inline int i_ipmi_request(ipmi_user_t          user,
 		}
 	} else {
 	    /* Unknown address type. */
-	    rv = -EINVAL;
-	    goto out_err;
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->sent_invalid_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+		rv = -EINVAL;
+		goto out_err;
 	}
 
 #if DEBUG_MSGING
 	{
-	    int m;
-	    for (m=0; m<smi_msg->data_size; m++)
-		printk(" %2.2x", smi_msg->data[m]);
-	    printk("\n");
+		int m;
+		for (m=0; m<smi_msg->data_size; m++)
+			printk(" %2.2x", smi_msg->data[m]);
+		printk("\n");
 	}
 #endif
 	intf->handlers->sender(intf->send_info, smi_msg, priority);
@@ -975,6 +1346,7 @@ int ipmi_request(ipmi_user_t      user,
 		 struct ipmi_addr *addr,
 		 long             msgid,
 		 struct ipmi_msg  *msg,
+		 void             *user_msg_data,
 		 int              priority)
 {
 	return i_ipmi_request(user,
@@ -982,16 +1354,42 @@ int ipmi_request(ipmi_user_t      user,
 			      addr,
 			      msgid,
 			      msg,
+			      user_msg_data,
+			      NULL, NULL,
+			      priority,
+			      user->intf->my_address,
+			      user->intf->my_lun,
+			      -1, 0);
+}
+
+int ipmi_request_settime(ipmi_user_t      user,
+			 struct ipmi_addr *addr,
+			 long             msgid,
+			 struct ipmi_msg  *msg,
+			 void             *user_msg_data,
+			 int              priority,
+			 int              retries, /* < 0: use default */
+			 unsigned int     retry_time_ms) /* 0: use 1000 ms */
+{
+	return i_ipmi_request(user,
+			      user->intf,
+			      addr,
+			      msgid,
+			      msg,
+			      user_msg_data,
 			      NULL, NULL,
 			      priority,
 			      user->intf->my_address,
-			      user->intf->my_lun);
+			      user->intf->my_lun,
+			      retries,
+			      retry_time_ms);
 }
 
 int ipmi_request_supply_msgs(ipmi_user_t          user,
 			     struct ipmi_addr     *addr,
 			     long                 msgid,
 			     struct ipmi_msg      *msg,
+			     void                 *user_msg_data,
 			     void                 *supplied_smi,
 			     struct ipmi_recv_msg *supplied_recv,
 			     int                  priority)
@@ -1001,17 +1399,20 @@ int ipmi_request_supply_msgs(ipmi_user_t          user,
 			      addr,
 			      msgid,
 			      msg,
+			      user_msg_data,
 			      supplied_smi,
 			      supplied_recv,
 			      priority,
 			      user->intf->my_address,
-			      user->intf->my_lun);
+			      user->intf->my_lun,
+			      -1, 0);
 }
 
 int ipmi_request_with_source(ipmi_user_t      user,
 			     struct ipmi_addr *addr,
 			     long             msgid,
 			     struct ipmi_msg  *msg,
+			     void             *user_msg_data,
 			     int              priority,
 			     unsigned char    source_address,
 			     unsigned char    source_lun)
@@ -1021,10 +1422,215 @@ int ipmi_request_with_source(ipmi_user_t      user,
 			      addr,
 			      msgid,
 			      msg,
+			      user_msg_data,
 			      NULL, NULL,
 			      priority,
 			      source_address,
-			      source_lun);
+			      source_lun,
+			      -1, 0);
+}
+
+/* proc read handler: prints the interface's my_address in hex. */
+static int ipmb_file_read_proc(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	ipmi_smi_t smi = data;
+
+	return sprintf(page, "%x\n", smi->my_address);
+}
+
+/* proc read handler: prints the interface version as "major.minor". */
+static int version_file_read_proc(char *page, char **start, off_t off,
+				  int count, int *eof, void *data)
+{
+	ipmi_smi_t smi = data;
+
+	return sprintf(page, "%d.%d\n", smi->version_major,
+		       smi->version_minor);
+}
+
+/* proc read handler: one "name: value" line per interface counter. */
+static int stat_file_read_proc(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	ipmi_smi_t smi = data;
+	char       *p = page;
+
+	p += sprintf(p, "sent_invalid_commands:       %d\n",
+		     smi->sent_invalid_commands);
+	p += sprintf(p, "sent_local_commands:         %d\n",
+		     smi->sent_local_commands);
+	p += sprintf(p, "handled_local_responses:     %d\n",
+		     smi->handled_local_responses);
+	p += sprintf(p, "unhandled_local_responses:   %d\n",
+		     smi->unhandled_local_responses);
+	p += sprintf(p, "sent_ipmb_commands:          %d\n",
+		     smi->sent_ipmb_commands);
+	p += sprintf(p, "sent_ipmb_command_errs:      %d\n",
+		     smi->sent_ipmb_command_errs);
+	p += sprintf(p, "retransmitted_ipmb_commands: %d\n",
+		     smi->retransmitted_ipmb_commands);
+	p += sprintf(p, "timed_out_ipmb_commands:     %d\n",
+		     smi->timed_out_ipmb_commands);
+	p += sprintf(p, "timed_out_ipmb_broadcasts:   %d\n",
+		     smi->timed_out_ipmb_broadcasts);
+	p += sprintf(p, "sent_ipmb_responses:         %d\n",
+		     smi->sent_ipmb_responses);
+	p += sprintf(p, "handled_ipmb_responses:      %d\n",
+		     smi->handled_ipmb_responses);
+	p += sprintf(p, "invalid_ipmb_responses:      %d\n",
+		     smi->invalid_ipmb_responses);
+	p += sprintf(p, "unhandled_ipmb_responses:    %d\n",
+		     smi->unhandled_ipmb_responses);
+	p += sprintf(p, "sent_lan_commands:           %d\n",
+		     smi->sent_lan_commands);
+	p += sprintf(p, "sent_lan_command_errs:       %d\n",
+		     smi->sent_lan_command_errs);
+	p += sprintf(p, "retransmitted_lan_commands:  %d\n",
+		     smi->retransmitted_lan_commands);
+	p += sprintf(p, "timed_out_lan_commands:      %d\n",
+		     smi->timed_out_lan_commands);
+	p += sprintf(p, "sent_lan_responses:          %d\n",
+		     smi->sent_lan_responses);
+	p += sprintf(p, "handled_lan_responses:       %d\n",
+		     smi->handled_lan_responses);
+	p += sprintf(p, "invalid_lan_responses:       %d\n",
+		     smi->invalid_lan_responses);
+	p += sprintf(p, "unhandled_lan_responses:     %d\n",
+		     smi->unhandled_lan_responses);
+	p += sprintf(p, "handled_commands:            %d\n",
+		     smi->handled_commands);
+	p += sprintf(p, "invalid_commands:            %d\n",
+		     smi->invalid_commands);
+	p += sprintf(p, "unhandled_commands:          %d\n",
+		     smi->unhandled_commands);
+	p += sprintf(p, "invalid_events:              %d\n",
+		     smi->invalid_events);
+	p += sprintf(p, "events:                      %d\n",
+		     smi->events);
+	return p - page;
+}
+
+/* Create proc entry "name" under smi's proc directory and attach the
+   handlers, data and owner.  Returns 0, or -ENOMEM if creation fails. */
+int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
+			    read_proc_t *read_proc, write_proc_t *write_proc,
+			    void *data, struct module *owner)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry(name, 0, smi->proc_dir);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->nlink = 1;
+	entry->data = data;
+	entry->read_proc = read_proc;
+	entry->write_proc = write_proc;
+	entry->owner = owner;
+
+	return 0;
+}
+
+/* Create the per-interface proc directory, named after num, and
+   add the "stats", "ipmb" and "version" entries to it.  Stops at
+   the first failure and returns its error code; returns 0 when
+   everything was created. */
+static int add_proc_entries(ipmi_smi_t smi, int num)
+{
+	int err;
+
+	sprintf(smi->proc_dir_name, "%d", num);
+	smi->proc_dir = proc_mkdir(smi->proc_dir_name, proc_ipmi_root);
+	if (!smi->proc_dir)
+		return -ENOMEM;
+
+	smi->proc_dir->owner = THIS_MODULE;
+
+	err = ipmi_smi_add_proc_entry(smi, "stats", stat_file_read_proc,
+				      NULL, smi, THIS_MODULE);
+	if (err)
+		return err;
+
+	err = ipmi_smi_add_proc_entry(smi, "ipmb", ipmb_file_read_proc,
+				      NULL, smi, THIS_MODULE);
+	if (err)
+		return err;
+
+	return ipmi_smi_add_proc_entry(smi, "version",
+				       version_file_read_proc,
+				       NULL, smi, THIS_MODULE);
+}
+
+/* Send a Get Channel Info request for channel chan to the BMC channel
+   of the system interface, with no user and -1/0 retry arguments. */
+static int
+send_channel_info_cmd(ipmi_smi_t intf, int chan)
+{
+	struct ipmi_system_interface_addr bmc_addr;
+	struct ipmi_msg                   req;
+	unsigned char                     payload[1];
+
+	payload[0] = chan;
+	req.netfn = IPMI_NETFN_APP_REQUEST;
+	req.cmd = IPMI_GET_CHANNEL_INFO_CMD;
+	req.data = payload;
+	req.data_len = 1;
+
+	bmc_addr.addr_type = IPMI_SYSTEM_INTERFACE_ADDR_TYPE;
+	bmc_addr.channel = IPMI_BMC_CHANNEL;
+	bmc_addr.lun = 0;
+
+	return i_ipmi_request(NULL, intf,
+			      (struct ipmi_addr *) &bmc_addr,
+			      0, &req,
+			      NULL,		/* user_msg_data */
+			      NULL, NULL,	/* no supplied messages */
+			      0,		/* priority */
+			      intf->my_address, intf->my_lun,
+			      -1, 0);
+}
+
+/* Response handler for the channel scan: record the medium and
+   protocol of the current channel, then query the next channel or
+   wake the waiter once IPMI_MAX_CHANNELS channels have been tried. */
+static void
+channel_handler(ipmi_smi_t intf, struct ipmi_smi_msg *msg)
+{
+	int rv = 0;
+	int chan;
+
+	if ((msg->rsp[0] == (IPMI_NETFN_APP_RESPONSE << 2))
+	    && (msg->rsp[1] == IPMI_GET_CHANNEL_INFO_CMD))
+	{
+		/* It's the one we want.  Record the channel info unless
+		   the response is too short or carries an error
+		   completion code; either way, just go on. */
+		if ((msg->rsp_size >= 6) && (msg->rsp[2] == 0)) {
+			chan = intf->curr_channel;
+			intf->channels[chan].medium = msg->rsp[4] & 0x7f;
+			intf->channels[chan].protocol = msg->rsp[5] & 0x1f;
+		}
+
+		/* Move on to the next channel, or finish the scan. */
+		intf->curr_channel++;
+		if (intf->curr_channel >= IPMI_MAX_CHANNELS)
+			wake_up(&intf->waitq);
+		else
+			rv = send_channel_info_cmd(intf, intf->curr_channel);
+
+		if (rv) {
+			/* Got an error somehow, just give up. */
+			intf->curr_channel = IPMI_MAX_CHANNELS;
+			wake_up(&intf->waitq);
+
+			/* Adjacent literals concatenate: keep the space. */
+			printk(KERN_WARNING "ipmi_msghandler: Error sending"
+			       " channel information: 0x%x\n",
+			       rv);
+		}
+	}
 }
 
 int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
@@ -1036,7 +1642,6 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	int              i, j;
 	int              rv;
 	ipmi_smi_t       new_intf;
-	struct list_head *entry;
 	unsigned long    flags;
 
 
@@ -1055,12 +1660,16 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 	new_intf = kmalloc(sizeof(*new_intf), GFP_KERNEL);
 	if (!new_intf)
 		return -ENOMEM;
+	memset(new_intf, 0, sizeof(*new_intf));
+
+	new_intf->proc_dir = NULL;
 
 	rv = -ENOMEM;
 
 	down_write(&interfaces_sem);
 	for (i=0; i<MAX_IPMI_INTERFACES; i++) {
 		if (ipmi_interfaces[i] == NULL) {
+			new_intf->intf_num = i;
 			new_intf->version_major = version_major;
 			new_intf->version_minor = version_minor;
 			new_intf->my_address = IPMI_BMC_SLAVE_ADDR;
@@ -1081,9 +1690,12 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 			INIT_LIST_HEAD(&(new_intf->waiting_events));
 			new_intf->waiting_events_count = 0;
 			rwlock_init(&(new_intf->cmd_rcvr_lock));
+			init_waitqueue_head(&new_intf->waitq);
 			INIT_LIST_HEAD(&(new_intf->cmd_rcvrs));
 			new_intf->all_cmd_rcvr = NULL;
 
+			spin_lock_init(&(new_intf->counter_lock));
+
 			spin_lock_irqsave(&interfaces_lock, flags);
 			ipmi_interfaces[i] = new_intf;
 			spin_unlock_irqrestore(&interfaces_lock, flags);
@@ -1096,46 +1708,71 @@ int ipmi_register_smi(struct ipmi_smi_handlers *handlers,
 
 	downgrade_write(&interfaces_sem);
 
+	if (rv == 0)
+		rv = add_proc_entries(*intf, i);
+
 	if (rv == 0) {
-		/* Call all the watcher interfaces to tell them that a
-		   new interface is available. */
-		down_read(&smi_watchers_sem);
-		list_for_each(entry, &smi_watchers) {
-			struct ipmi_smi_watcher *w;
-			w = list_entry(entry, struct ipmi_smi_watcher, link);
-			w->new_smi(i);
-		}
-		up_read(&smi_watchers_sem);
+		if ((version_major > 1)
+		    || ((version_major == 1) && (version_minor >= 5)))
+		{
+			/* Start scanning the channels to see what is
+			   available. */
+			(*intf)->null_user_handler = channel_handler;
+			(*intf)->curr_channel = 0;
+			rv = send_channel_info_cmd(*intf, 0);
+			if (rv)
+				goto out;
+
+			/* Wait for the channel info to be read. */
+			up_read(&interfaces_sem);
+			wait_event((*intf)->waitq,
+				   ((*intf)->curr_channel>=IPMI_MAX_CHANNELS));
+			down_read(&interfaces_sem);
+
+			if (ipmi_interfaces[i] != new_intf)
+				/* Well, it went away.  Just return. */
+				goto out;
+		} else {
+			/* Assume a single IPMB channel at zero. */
+			(*intf)->channels[0].medium = IPMI_CHANNEL_MEDIUM_IPMB;
+			(*intf)->channels[0].protocol
+				= IPMI_CHANNEL_PROTOCOL_IPMB;
+  		}
+
+		/* Call all the watcher interfaces to tell
+		   them that a new interface is available. */
+		call_smi_watchers(i);
 	}
 
+ out:
 	up_read(&interfaces_sem);
 
-	if (rv)
+	if (rv) {
+		if (new_intf->proc_dir)
+			remove_proc_entry(new_intf->proc_dir_name,
+					  proc_ipmi_root);
 		kfree(new_intf);
+	}
 
 	return rv;
 }
 
 static void free_recv_msg_list(struct list_head *q)
 {
-	struct list_head     *entry, *entry2;
-	struct ipmi_recv_msg *msg;
+	struct ipmi_recv_msg *msg, *msg2; /* msg2: cursor for _safe walk */
 
-	list_for_each_safe(entry, entry2, q) {
-		msg = list_entry(entry, struct ipmi_recv_msg, link);
-		list_del(entry);
+	list_for_each_entry_safe(msg, msg2, q, link) {
+		list_del(&msg->link);
 		ipmi_free_recv_msg(msg);
 	}
 }
 
 static void free_cmd_rcvr_list(struct list_head *q)
 {
-	struct list_head *entry, *entry2;
-	struct cmd_rcvr  *rcvr;
+	struct cmd_rcvr  *rcvr, *rcvr2; /* rcvr2: cursor for _safe walk */
 
-	list_for_each_safe(entry, entry2, q) {
-		rcvr = list_entry(entry, struct cmd_rcvr, link);
-		list_del(entry);
+	list_for_each_entry_safe(rcvr, rcvr2, q, link) {
+		list_del(&rcvr->link);
 		kfree(rcvr);
 	}
 }
@@ -1159,16 +1796,18 @@ static void clean_up_interface_data(ipmi_smi_t intf)
 
 int ipmi_unregister_smi(ipmi_smi_t intf)
 {
-	int              rv = -ENODEV;
-	int              i;
-	struct list_head *entry;
-	unsigned long    flags;
+	int                     rv = -ENODEV;
+	int                     i;
+	struct ipmi_smi_watcher *w;
+	unsigned long           flags;
 
 	down_write(&interfaces_sem);
 	if (list_empty(&(intf->users)))
 	{
 		for (i=0; i<MAX_IPMI_INTERFACES; i++) {
 			if (ipmi_interfaces[i] == intf) {
+				remove_proc_entry(intf->proc_dir_name,
+						  proc_ipmi_root);
 				spin_lock_irqsave(&interfaces_lock, flags);
 				ipmi_interfaces[i] = NULL;
 				clean_up_interface_data(intf);
@@ -1191,11 +1830,7 @@ int ipmi_unregister_smi(ipmi_smi_t intf)
 	/* Call all the watcher interfaces to tell them that
 	   an interface is gone. */
 	down_read(&smi_watchers_sem);
-	list_for_each(entry, &smi_watchers) {
-		struct ipmi_smi_watcher *w;
-		w = list_entry(entry,
-			       struct ipmi_smi_watcher,
-			       link);
+	list_for_each_entry(w, &smi_watchers, link) {
 		w->smi_gone(i);
 	}
 	up_read(&smi_watchers_sem);
@@ -1203,20 +1838,28 @@ int ipmi_unregister_smi(ipmi_smi_t intf)
 	return 0;
 }
 
-static int handle_get_msg_rsp(ipmi_smi_t          intf,
-			      struct ipmi_smi_msg *msg)
+static int handle_ipmb_get_msg_rsp(ipmi_smi_t          intf,
+				   struct ipmi_smi_msg *msg)
 {
 	struct ipmi_ipmb_addr ipmb_addr;
 	struct ipmi_recv_msg  *recv_msg;
+	unsigned long         flags;
 
 	
-	if (msg->rsp_size < 11)
+	/* This is 11, not 10, because the response must contain a
+	 * completion code. */
+	if (msg->rsp_size < 11) {
 		/* Message not big enough, just ignore it. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_ipmb_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		return 0;
+	}
 
-	if (msg->rsp[2] != 0)
+	if (msg->rsp[2] != 0) {
 		/* An error getting the response, just ignore it. */
 		return 0;
+	}
 
 	ipmb_addr.addr_type = IPMI_IPMB_ADDR_TYPE;
 	ipmb_addr.slave_addr = msg->rsp[6];
@@ -1235,6 +1878,9 @@ static int handle_get_msg_rsp(ipmi_smi_t          intf,
 	{
 		/* We were unable to find the sequence number,
 		   so just nuke the message. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_ipmb_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		return 0;
 	}
 
@@ -1248,26 +1894,33 @@ static int handle_get_msg_rsp(ipmi_smi_t          intf,
 	recv_msg->msg.data = recv_msg->msg_data;
 	recv_msg->msg.data_len = msg->rsp_size - 10;
 	recv_msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
+	spin_lock_irqsave(&intf->counter_lock, flags);
+	intf->handled_ipmb_responses++;
+	spin_unlock_irqrestore(&intf->counter_lock, flags);
 	deliver_response(recv_msg);
 
 	return 0;
 }
 
-static int handle_get_msg_cmd(ipmi_smi_t          intf,
-			      struct ipmi_smi_msg *msg)
+static int handle_ipmb_get_msg_cmd(ipmi_smi_t          intf,
+				   struct ipmi_smi_msg *msg)
 {
-	struct list_head *entry;
 	struct cmd_rcvr       *rcvr;
-	int              rv = 0;
-	unsigned char    netfn;
-	unsigned char    cmd;
-	ipmi_user_t      user = NULL;
+	int                   rv = 0;
+	unsigned char         netfn;
+	unsigned char         cmd;
+	ipmi_user_t           user = NULL;
 	struct ipmi_ipmb_addr *ipmb_addr;
 	struct ipmi_recv_msg  *recv_msg;
+	unsigned long         flags;
 
-	if (msg->rsp_size < 10)
+	if (msg->rsp_size < 10) {
 		/* Message not big enough, just ignore it. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		return 0;
+	}
 
 	if (msg->rsp[2] != 0) {
 		/* An error getting the response, just ignore it. */
@@ -1283,8 +1936,7 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
 		user = intf->all_cmd_rcvr;
 	} else {
 		/* Find the command/netfn. */
-		list_for_each(entry, &(intf->cmd_rcvrs)) {
-			rcvr = list_entry(entry, struct cmd_rcvr, link);
+		list_for_each_entry(rcvr, &(intf->cmd_rcvrs), link) {
 			if ((rcvr->netfn == netfn) && (rcvr->cmd == cmd)) {
 				user = rcvr->user;
 				break;
@@ -1295,6 +1947,10 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
 
 	if (user == NULL) {
 		/* We didn't find a user, deliver an error response. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+
 		msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
 		msg->data[1] = IPMI_SEND_MSG_CMD;
 		msg->data[2] = msg->rsp[3];
@@ -1309,12 +1965,25 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
 		msg->data[10] = ipmb_checksum(&(msg->data[6]), 4);
 		msg->data_size = 11;
 
+#if DEBUG_MSGING
+	{
+		int m;
+		printk("Invalid command:");
+		for (m=0; m<msg->data_size; m++)
+			printk(" %2.2x", msg->data[m]);
+		printk("\n");
+	}
+#endif
 		intf->handlers->sender(intf->send_info, msg, 0);
 
 		rv = -1; /* We used the message, so return the value that
 			    causes it to not be freed or queued. */
 	} else {
 		/* Deliver the message to the user. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->handled_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+
 		recv_msg = ipmi_alloc_recv_msg();
 		if (! recv_msg) {
 			/* We couldn't allocate memory for the
@@ -1322,18 +1991,24 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
                            later. */
 			rv = 1;
 		} else {
+			/* Extract the source address from the data. */
 			ipmb_addr = (struct ipmi_ipmb_addr *) &recv_msg->addr;
 			ipmb_addr->addr_type = IPMI_IPMB_ADDR_TYPE;
 			ipmb_addr->slave_addr = msg->rsp[6];
 			ipmb_addr->lun = msg->rsp[7] & 3;
-			ipmb_addr->channel = msg->rsp[3];
+			ipmb_addr->channel = msg->rsp[3] & 0xf;
 
+			/* Extract the rest of the message information
+			   from the IPMB header.*/
 			recv_msg->user = user;
 			recv_msg->recv_type = IPMI_CMD_RECV_TYPE;
 			recv_msg->msgid = msg->rsp[7] >> 2;
 			recv_msg->msg.netfn = msg->rsp[4] >> 2;
 			recv_msg->msg.cmd = msg->rsp[8];
 			recv_msg->msg.data = recv_msg->msg_data;
+
+			/* We chop off 10, not 9 bytes because the checksum
+			   at the end also needs to be removed. */
 			recv_msg->msg.data_len = msg->rsp_size - 10;
 			memcpy(recv_msg->msg_data,
 			       &(msg->rsp[9]),
@@ -1345,6 +2020,169 @@ static int handle_get_msg_cmd(ipmi_smi_t          intf,
 	return rv;
 }
 
+static int handle_lan_get_msg_rsp(ipmi_smi_t          intf,
+				  struct ipmi_smi_msg *msg)
+{
+	struct ipmi_lan_addr  lan_addr;
+	struct ipmi_recv_msg  *recv_msg;
+	unsigned long         flags;
+
+	/* Match a LAN-format response to its request; always returns 0. */
+	/* This is 13, not 12, because the response must contain a
+	 * completion code. */
+	if (msg->rsp_size < 13) {
+		/* Message not big enough, just ignore it. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_lan_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+		return 0;
+	}
+
+	if (msg->rsp[2] != 0) {
+		/* An error getting the response, just ignore it. */
+		return 0;
+	}
+
+	lan_addr.addr_type = IPMI_LAN_ADDR_TYPE;
+	lan_addr.session_handle = msg->rsp[4];
+	lan_addr.remote_SWID = msg->rsp[8];
+	lan_addr.local_SWID = msg->rsp[5];
+	lan_addr.channel = msg->rsp[3] & 0x0f;	/* low nibble */
+	lan_addr.privilege = msg->rsp[3] >> 4;	/* high nibble */
+	lan_addr.lun = msg->rsp[9] & 3;
+
+	/* It's a response from a remote entity.  Look up the sequence
+	   number and handle the response. */
+	if (intf_find_seq(intf,
+			  msg->rsp[9] >> 2,
+			  msg->rsp[3] & 0x0f,
+			  msg->rsp[10],
+			  (msg->rsp[6] >> 2) & (~1), /* netfn, low bit cleared */
+			  (struct ipmi_addr *) &(lan_addr),
+			  &recv_msg))
+	{
+		/* We were unable to find the sequence number,
+		   so just nuke the message. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_lan_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+		return 0;
+	}
+
+	memcpy(recv_msg->msg_data,
+	       &(msg->rsp[11]),
+	       msg->rsp_size - 11); /* NOTE(review): data_len + 1; confirm */
+	/* The other fields matched, so no need to set them, except
+           for netfn, which needs to be the response that was
+           returned, not the request value. */
+	recv_msg->msg.netfn = msg->rsp[6] >> 2;
+	recv_msg->msg.data = recv_msg->msg_data;
+	recv_msg->msg.data_len = msg->rsp_size - 12;
+	recv_msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
+	spin_lock_irqsave(&intf->counter_lock, flags);
+	intf->handled_lan_responses++;
+	spin_unlock_irqrestore(&intf->counter_lock, flags);
+	deliver_response(recv_msg);
+
+	return 0;
+}
+
+static int handle_lan_get_msg_cmd(ipmi_smi_t          intf,
+				  struct ipmi_smi_msg *msg)
+{
+	struct cmd_rcvr       *rcvr;
+	int                   rv = 0;
+	unsigned char         netfn;
+	unsigned char         cmd;
+	ipmi_user_t           user = NULL;
+	struct ipmi_lan_addr  *lan_addr;
+	struct ipmi_recv_msg  *recv_msg;
+	unsigned long         flags;
+	/* Hand a LAN-format command to its registered receiver, if any. */
+	if (msg->rsp_size < 12) {
+		/* Message not big enough, just ignore it. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+		return 0;
+	}
+
+	if (msg->rsp[2] != 0) {
+		/* An error getting the response, just ignore it. */
+		return 0;
+	}
+
+	netfn = msg->rsp[6] >> 2;
+	cmd = msg->rsp[10];
+
+	read_lock(&(intf->cmd_rcvr_lock));
+
+	if (intf->all_cmd_rcvr) {
+		user = intf->all_cmd_rcvr;	/* takes every command */
+	} else {
+		/* Find the command/netfn. */
+		list_for_each_entry(rcvr, &(intf->cmd_rcvrs), link) {
+			if ((rcvr->netfn == netfn) && (rcvr->cmd == cmd)) {
+				user = rcvr->user;
+				break;
+			}
+		}
+	}
+	read_unlock(&(intf->cmd_rcvr_lock));
+
+	if (user == NULL) {
+		/* We didn't find a user; only count the command. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+
+		rv = 0; /* Don't do anything with these messages, just
+			   allow them to be freed. */
+	} else {
+		/* Deliver the message to the user. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->handled_commands++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
+
+		recv_msg = ipmi_alloc_recv_msg();
+		if (! recv_msg) {
+			/* We couldn't allocate memory for the
+                           message, so requeue it for handling
+                           later. */
+			rv = 1;
+		} else {
+			/* Extract the source address from the data. */
+			lan_addr = (struct ipmi_lan_addr *) &recv_msg->addr;
+			lan_addr->addr_type = IPMI_LAN_ADDR_TYPE;
+			lan_addr->session_handle = msg->rsp[4];
+			lan_addr->remote_SWID = msg->rsp[8];
+			lan_addr->local_SWID = msg->rsp[5];
+			lan_addr->lun = msg->rsp[9] & 3;
+			lan_addr->channel = msg->rsp[3] & 0xf;
+			lan_addr->privilege = msg->rsp[3] >> 4;
+
+			/* Extract the rest of the message information
+			   from the message header. */
+			recv_msg->user = user;
+			recv_msg->recv_type = IPMI_CMD_RECV_TYPE;
+			recv_msg->msgid = msg->rsp[9] >> 2;
+			recv_msg->msg.netfn = msg->rsp[6] >> 2;
+			recv_msg->msg.cmd = msg->rsp[10];
+			recv_msg->msg.data = recv_msg->msg_data;
+
+			/* We chop off 12, not 11 bytes because the checksum
+			   at the end also needs to be removed. */
+			recv_msg->msg.data_len = msg->rsp_size - 12;
+			memcpy(recv_msg->msg_data,
+			       &(msg->rsp[11]),
+			       msg->rsp_size - 12);
+			deliver_response(recv_msg);
+		}
+	}
+
+	return rv;
+}
+
 static void copy_event_into_recv_msg(struct ipmi_recv_msg *recv_msg,
 				     struct ipmi_smi_msg  *msg)
 {
@@ -1368,9 +2206,8 @@ static void copy_event_into_recv_msg(struct ipmi_recv_msg *recv_msg,
 static int handle_read_event_rsp(ipmi_smi_t          intf,
 				 struct ipmi_smi_msg *msg)
 {
-	struct ipmi_recv_msg *recv_msg;
+	struct ipmi_recv_msg *recv_msg, *recv_msg2;
 	struct list_head     msgs;
-	struct list_head     *entry, *entry2;
 	ipmi_user_t          user;
 	int                  rv = 0;
 	int                  deliver_count = 0;
@@ -1378,6 +2215,9 @@ static int handle_read_event_rsp(ipmi_smi_t          intf,
 
 	if (msg->rsp_size < 19) {
 		/* Message is too small to be an IPMB event. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->invalid_events++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		return 0;
 	}
 
@@ -1390,21 +2230,20 @@ static int handle_read_event_rsp(ipmi_smi_t          intf,
 
 	spin_lock_irqsave(&(intf->events_lock), flags);
 
+	spin_lock(&intf->counter_lock);
+	intf->events++;
+	spin_unlock(&intf->counter_lock);
+
 	/* Allocate and fill in one message for every user that is getting
 	   events. */
-	list_for_each(entry, &(intf->users)) {
-		user = list_entry(entry, struct ipmi_user, link);
-
+	list_for_each_entry(user, &(intf->users), link) {
 		if (! user->gets_events)
 			continue;
 
 		recv_msg = ipmi_alloc_recv_msg();
 		if (! recv_msg) {
-			list_for_each_safe(entry, entry2, &msgs) {
-				recv_msg = list_entry(entry,
-						      struct ipmi_recv_msg,
-						      link);
-				list_del(entry);
+			list_for_each_entry_safe(recv_msg, recv_msg2, &msgs, link) {
+				list_del(&recv_msg->link);
 				ipmi_free_recv_msg(recv_msg);
 			}
 			/* We couldn't allocate memory for the
@@ -1423,11 +2262,8 @@ static int handle_read_event_rsp(ipmi_smi_t          intf,
 
 	if (deliver_count) {
 		/* Now deliver all the messages. */
-		list_for_each_safe(entry, entry2, &msgs) {
-			recv_msg = list_entry(entry,
-					      struct ipmi_recv_msg,
-					      link);
-			list_del(entry);
+		list_for_each_entry_safe(recv_msg, recv_msg2, &msgs, link) {
+			list_del(&recv_msg->link);
 			deliver_response(recv_msg);
 		}
 	} else if (intf->waiting_events_count < MAX_EVENTS_IN_QUEUE) {
@@ -1462,15 +2298,14 @@ static int handle_bmc_rsp(ipmi_smi_t          intf,
 {
 	struct ipmi_recv_msg *recv_msg;
 	int                  found = 0;
-	struct list_head     *entry;
+	struct ipmi_user     *user;
+	unsigned long        flags;
 
 	recv_msg = (struct ipmi_recv_msg *) msg->user_data;
 
 	/* Make sure the user still exists. */
-	list_for_each(entry, &(intf->users)) {
-		if (list_entry(entry, struct ipmi_user, link)
-		    == recv_msg->user)
-		{
+	list_for_each_entry(user, &(intf->users), link) {
+		if (user == recv_msg->user) {
 			/* Found it, so we can deliver it */
 			found = 1;
 			break;
@@ -1482,10 +2317,16 @@ static int handle_bmc_rsp(ipmi_smi_t          intf,
 		if (!recv_msg->user && intf->null_user_handler)
 			intf->null_user_handler(intf, msg);
 		/* The user for the message went away, so give up. */
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->unhandled_local_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		ipmi_free_recv_msg(recv_msg);
 	} else {
 		struct ipmi_system_interface_addr *smi_addr;
 
+		spin_lock_irqsave(&intf->counter_lock, flags);
+		intf->handled_local_responses++;
+		spin_unlock_irqrestore(&intf->counter_lock, flags);
 		recv_msg->recv_type = IPMI_RESPONSE_RECV_TYPE;
 		recv_msg->msgid = msg->msgid;
 		smi_addr = ((struct ipmi_system_interface_addr *)
@@ -1513,28 +2354,86 @@ static int handle_new_recv_msg(ipmi_smi_t          intf,
 			       struct ipmi_smi_msg *msg)
 {
 	int requeue;
+	int chan;
 
+#if DEBUG_MSGING
+	int m;
+	printk("Recv:");
+	for (m=0; m<msg->rsp_size; m++)
+		printk(" %2.2x", msg->rsp[m]);
+	printk("\n");
+#endif
 	if (msg->rsp_size < 2) {
 		/* Message is too small to be correct. */
 		requeue = 0;
-	} else if (msg->rsp[1] == IPMI_GET_MSG_CMD) {
-#if DEBUG_MSGING
-		int m;
-		printk("Response:");
-		for (m=0; m<msg->rsp_size; m++)
-			printk(" %2.2x", msg->rsp[m]);
-		printk("\n");
-#endif
+	} else if ((msg->rsp[0] == ((IPMI_NETFN_APP_REQUEST|1) << 2))
+		   && (msg->rsp[1] == IPMI_SEND_MSG_CMD)
+		   && (msg->user_data != NULL))
+	{
+		/* It's a response to a response we sent.  For this we
+		   deliver a send message response to the user. */
+		struct ipmi_recv_msg *recv_msg = msg->user_data;
+
+		requeue = 0;
+		if (msg->rsp_size < 2)
+			/* Message is too small to be correct. */
+			goto out;
+
+		chan = msg->data[2] & 0x0f;
+		if (chan >= IPMI_MAX_CHANNELS)
+			/* Invalid channel number */
+			goto out;
+
+		if (recv_msg) {
+			recv_msg->recv_type = IPMI_RESPONSE_RESPONSE_TYPE;
+			recv_msg->msg.data = recv_msg->msg_data;
+			recv_msg->msg.data_len = 1;
+			recv_msg->msg_data[0] = msg->rsp[2];
+			deliver_response(recv_msg);
+		}
+	} else if ((msg->rsp[0] == ((IPMI_NETFN_APP_REQUEST|1) << 2))
+		   && (msg->rsp[1] == IPMI_GET_MSG_CMD))
+	{
 		/* It's from the receive queue. */
-		if (msg->rsp[4] & 0x04) {
-			/* It's a response, so find the
-			   requesting message and send it up. */
-			requeue = handle_get_msg_rsp(intf, msg);
-		} else {
-			/* It's a command to the SMS from some other
-			   entity.  Handle that. */
-			requeue = handle_get_msg_cmd(intf, msg);
+		chan = msg->rsp[3] & 0xf;
+		if (chan >= IPMI_MAX_CHANNELS) {
+			/* Invalid channel number */
+			requeue = 0;
+			goto out;
+		}
+
+		switch (intf->channels[chan].medium) {
+		case IPMI_CHANNEL_MEDIUM_IPMB:
+			if (msg->rsp[4] & 0x04) {
+				/* It's a response, so find the
+				   requesting message and send it up. */
+				requeue = handle_ipmb_get_msg_rsp(intf, msg);
+			} else {
+				/* It's a command to the SMS from some other
+				   entity.  Handle that. */
+				requeue = handle_ipmb_get_msg_cmd(intf, msg);
+			}
+			break;
+
+		case IPMI_CHANNEL_MEDIUM_8023LAN:
+		case IPMI_CHANNEL_MEDIUM_ASYNC:
+			if (msg->rsp[6] & 0x04) {
+				/* It's a response, so find the
+				   requesting message and send it up. */
+				requeue = handle_lan_get_msg_rsp(intf, msg);
+			} else {
+				/* It's a command to the SMS from some other
+				   entity.  Handle that. */
+				requeue = handle_lan_get_msg_cmd(intf, msg);
+			}
+			break;
+
+		default:
+			/* We don't handle the channel type, so just
+			 * free the message. */
+			requeue = 0;
 		}
+
 	} else if (msg->rsp[1] == IPMI_READ_EVENT_MSG_BUFFER_CMD) {
 		/* It's an asyncronous event. */
 		requeue = handle_read_event_rsp(intf, msg);
@@ -1543,6 +2442,7 @@ static int handle_new_recv_msg(ipmi_smi_t          intf,
 		requeue = handle_bmc_rsp(intf, msg);
 	}
 
+ out:
 	return requeue;
 }
 
@@ -1558,10 +2458,43 @@ void ipmi_smi_msg_received(ipmi_smi_t          intf,
 	   working on it. */
 	read_lock(&(intf->users_lock));
 
-	if ((msg->data_size >= 2) && (msg->data[1] == IPMI_SEND_MSG_CMD)) {
-		/* This is the local response to a send, start the
-                   timer for these. */
-		intf_start_seq_timer(intf, msg->msgid);
+	if ((msg->data_size >= 2)
+	    && (msg->data[0] == (IPMI_NETFN_APP_REQUEST << 2))
+	    && (msg->data[1] == IPMI_SEND_MSG_CMD)
+	    && (msg->user_data == NULL)) {
+		/* This is the local response to a command send, start
+                   the timer for these.  The user_data will not be
+                   NULL if this is a response send, and we will let
+                   response sends just go through. */
+
+		/* Check for errors, if we get certain errors (ones
+                   that mean basically we can try again later), we
+                   ignore them and start the timer.  Otherwise we
+                   report the error immediately. */
+		if ((msg->rsp_size >= 3) && (msg->rsp[2] != 0)
+		    && (msg->rsp[2] != IPMI_NODE_BUSY_ERR)
+		    && (msg->rsp[2] != IPMI_LOST_ARBITRATION_ERR))
+		{
+			int chan = msg->rsp[3] & 0xf;
+
+			/* Got an error sending the message, handle it. */
+			spin_lock_irqsave(&intf->counter_lock, flags);
+			if (chan >= IPMI_MAX_CHANNELS)
+				; /* This shouldn't happen */
+			else if ((intf->channels[chan].medium
+				  == IPMI_CHANNEL_MEDIUM_8023LAN)
+				 || (intf->channels[chan].medium
+				     == IPMI_CHANNEL_MEDIUM_ASYNC))
+				intf->sent_lan_command_errs++;
+			else
+				intf->sent_ipmb_command_errs++;
+			spin_unlock_irqrestore(&intf->counter_lock, flags);
+			intf_err_seq(intf, msg->msgid, msg->rsp[2]);
+		} else {
+			/* The message was sent, start the timer. */
+			intf_start_seq_timer(intf, msg->msgid);
+		}
+
 		ipmi_free_smi_msg(msg);
 		goto out_unlock;
 	}
@@ -1593,13 +2526,10 @@ void ipmi_smi_msg_received(ipmi_smi_t          intf,
 
 void ipmi_smi_watchdog_pretimeout(ipmi_smi_t intf)
 {
-	struct list_head *entry;
-	ipmi_user_t      user;
+	ipmi_user_t user;
 
 	read_lock(&(intf->users_lock));
-	list_for_each(entry, &(intf->users)) {
-		user = list_entry(entry, struct ipmi_user, link);
-
+	list_for_each_entry(user, &(intf->users), link) {
 		if (! user->handler->ipmi_watchdog_pretimeout)
 			continue;
 
@@ -1657,10 +2587,9 @@ ipmi_timeout_handler(long timeout_period)
 {
 	ipmi_smi_t           intf;
 	struct list_head     timeouts;
-	struct ipmi_recv_msg *msg;
-	struct ipmi_smi_msg  *smi_msg;
+	struct ipmi_recv_msg *msg, *msg2;
+	struct ipmi_smi_msg  *smi_msg, *smi_msg2;
 	unsigned long        flags;
-	struct list_head     *entry, *entry2;
 	int                  i, j;
 
 	INIT_LIST_HEAD(&timeouts);
@@ -1675,10 +2604,9 @@ ipmi_timeout_handler(long timeout_period)
 
 		/* See if any waiting messages need to be processed. */
 		spin_lock_irqsave(&(intf->waiting_msgs_lock), flags);
-		list_for_each_safe(entry, entry2, &(intf->waiting_msgs)) {
-			smi_msg = list_entry(entry, struct ipmi_smi_msg, link);
+		list_for_each_entry_safe(smi_msg, smi_msg2, &(intf->waiting_msgs), link) {
 			if (! handle_new_recv_msg(intf, smi_msg)) {
-				list_del(entry);
+				list_del(&smi_msg->link);
 				ipmi_free_smi_msg(smi_msg);
 			} else {
 				/* To preserve message order, quit if we
@@ -1706,6 +2634,15 @@ ipmi_timeout_handler(long timeout_period)
 				ent->inuse = 0;
 				msg = ent->recv_msg;
 				list_add_tail(&(msg->link), &timeouts);
+				spin_lock(&intf->counter_lock);
+				if (ent->broadcast)
+					intf->timed_out_ipmb_broadcasts++;
+				else if (ent->recv_msg->addr.addr_type
+					 == IPMI_LAN_ADDR_TYPE)
+					intf->timed_out_lan_commands++;
+				else
+					intf->timed_out_ipmb_commands++;
+				spin_unlock(&intf->counter_lock);
 			} else {
 				/* More retries, send again. */
 
@@ -1715,12 +2652,18 @@ ipmi_timeout_handler(long timeout_period)
 				ent->retries_left--;
 				send_from_recv_msg(intf, ent->recv_msg, NULL,
 						   j, ent->seqid);
+				spin_lock(&intf->counter_lock);
+				if (ent->recv_msg->addr.addr_type
+				    == IPMI_LAN_ADDR_TYPE)
+					intf->retransmitted_lan_commands++;
+				else
+					intf->retransmitted_ipmb_commands++;
+				spin_unlock(&intf->counter_lock);
 			}
 		}
 		spin_unlock_irqrestore(&(intf->seq_lock), flags);
 
-		list_for_each_safe(entry, entry2, &timeouts) {
-			msg = list_entry(entry, struct ipmi_recv_msg, link);
+		list_for_each_entry_safe(msg, msg2, &timeouts, link) {
 			handle_msg_timeout(msg);
 		}
 
@@ -1747,13 +2690,16 @@ static void ipmi_request_event(void)
 
 static struct timer_list ipmi_timer;
 
-/* Call every 100 ms. */
+/* Call every ~100 ms. */
 #define IPMI_TIMEOUT_TIME	100
-#define IPMI_TIMEOUT_JIFFIES	((IPMI_TIMEOUT_TIME * HZ)/1000)
 
-/* Request events from the queue every second.  Hopefully, in the
-   future, IPMI will add a way to know immediately if an event is
-   in the queue. */
+/* How many jiffies does it take to get to the timeout time. */
+#define IPMI_TIMEOUT_JIFFIES	((IPMI_TIMEOUT_TIME * HZ) / 1000)
+
+/* Request events from the queue every second (this is the number of
+   IPMI_TIMEOUT_TIMES between event requests).  Hopefully, in the
+   future, IPMI will add a way to know immediately if an event is in
+   the queue and this silliness can go away. */
 #define IPMI_REQUEST_EV_TIME	(1000 / (IPMI_TIMEOUT_TIME))
 
 static volatile int stop_operation = 0;
@@ -1796,6 +2742,7 @@ struct ipmi_smi_msg *ipmi_alloc_smi_msg(void)
 	rv = kmalloc(sizeof(struct ipmi_smi_msg), GFP_ATOMIC);
 	if (rv) {
 		rv->done = free_smi_msg;
+		rv->user_data = NULL;
 		atomic_inc(&smi_msg_inuse_count);
 	}
 	return rv;
@@ -1907,11 +2854,13 @@ static void send_panic_events(char *str)
 			       &addr,
 			       0,
 			       &msg,
+			       NULL,
 			       &smi_msg,
 			       &recv_msg,
 			       0,
 			       intf->my_address,
-			       intf->my_lun);
+			       intf->my_lun,
+			       0, 1); /* Don't retry, and don't wait. */
 	}
 
 #ifdef CONFIG_IPMI_PANIC_STRING
@@ -1951,11 +2900,13 @@ static void send_panic_events(char *str)
 			       &addr,
 			       0,
 			       &msg,
+			       NULL,
 			       &smi_msg,
 			       &recv_msg,
 			       0,
 			       intf->my_address,
-			       intf->my_lun);
+			       intf->my_lun,
+			       0, 1); /* Don't retry, and don't wait. */
 
 		if (intf->local_event_generator) {
 			/* Request the event receiver from the local MC. */
@@ -1969,11 +2920,13 @@ static void send_panic_events(char *str)
 				       &addr,
 				       0,
 				       &msg,
+				       NULL,
 				       &smi_msg,
 				       &recv_msg,
 				       0,
 				       intf->my_address,
-				       intf->my_lun);
+				       intf->my_lun,
+				       0, 1); /* no retry, and no wait. */
 		}
 		intf->null_user_handler = NULL;
 
@@ -2029,11 +2982,13 @@ static void send_panic_events(char *str)
 				       &addr,
 				       0,
 				       &msg,
+				       NULL,
 				       &smi_msg,
 				       &recv_msg,
 				       0,
 				       intf->my_address,
-				       intf->my_lun);
+				       intf->my_lun,
+				       0, 1); /* no retry, and no wait. */
 		}
 	}	
 #endif /* CONFIG_IPMI_PANIC_STRING */
@@ -2075,7 +3030,6 @@ static struct notifier_block panic_block = {
 	200   /* priority: INT_MAX >= x >= 0 */
 };
 
-
 static __init int ipmi_init_msghandler(void)
 {
 	int i;
@@ -2083,10 +3037,21 @@ static __init int ipmi_init_msghandler(void)
 	if (initialized)
 		return 0;
 
+	printk(KERN_INFO "ipmi message handler version "
+	       IPMI_MSGHANDLER_VERSION "\n");
+
 	for (i=0; i<MAX_IPMI_INTERFACES; i++) {
 		ipmi_interfaces[i] = NULL;
 	}
 
+	proc_ipmi_root = proc_mkdir("ipmi", 0);
+	if (!proc_ipmi_root) {
+	    printk("Unable to create IPMI proc dir");
+	    return -ENOMEM;
+	}
+
+	proc_ipmi_root->owner = THIS_MODULE;
+
 	init_timer(&ipmi_timer);
 	ipmi_timer.data = 0;
 	ipmi_timer.function = ipmi_timeout;
@@ -2097,8 +3062,6 @@ static __init int ipmi_init_msghandler(void)
 
 	initialized = 1;
 
-	printk(KERN_INFO "ipmi: message handler initialized\n");
-
 	return 0;
 }
 
@@ -2118,9 +3081,12 @@ static __exit void cleanup_ipmi(void)
 	   problems with race conditions removing the timer here. */
 	stop_operation = 1;
 	while (!timer_stopped) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(1);
 	}
 
+	remove_proc_entry(proc_ipmi_root->name, &proc_root);
+
 	initialized = 0;
 
 	/* Check for buffer leaks. */
@@ -2143,6 +3109,7 @@ EXPORT_SYMBOL(ipmi_create_user);
 EXPORT_SYMBOL(ipmi_destroy_user);
 EXPORT_SYMBOL(ipmi_get_version);
 EXPORT_SYMBOL(ipmi_request);
+EXPORT_SYMBOL(ipmi_request_settime);
 EXPORT_SYMBOL(ipmi_request_supply_msgs);
 EXPORT_SYMBOL(ipmi_request_with_source);
 EXPORT_SYMBOL(ipmi_register_smi);
@@ -2164,3 +3131,4 @@ EXPORT_SYMBOL(ipmi_set_my_address);
 EXPORT_SYMBOL(ipmi_get_my_address);
 EXPORT_SYMBOL(ipmi_set_my_LUN);
 EXPORT_SYMBOL(ipmi_get_my_LUN);
+EXPORT_SYMBOL(ipmi_smi_add_proc_entry);
diff --git a/drivers/char/ipmi/ipmi_si_intf.c b/drivers/char/ipmi/ipmi_si_intf.c
new file mode 100644
index 000000000000..42b7e5d22de9
--- /dev/null
+++ b/drivers/char/ipmi/ipmi_si_intf.c
@@ -0,0 +1,2052 @@
+/*
+ * ipmi_si_intf.c
+ *
+ * The interface to the IPMI driver for the system interfaces (KCS, SMIC,
+ * BT).
+ *
+ * Author: MontaVista Software, Inc.
+ *         Corey Minyard <minyard@mvista.com>
+ *         source@mvista.com
+ *
+ * Copyright 2002 MontaVista Software Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * This file holds the "policy" for the interface to the SMI state
+ * machine.  It does the configuration, handles timers and interrupts,
+ * and drives the real SMI state machine.
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <asm/system.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+#include <linux/ioport.h>
+#ifdef CONFIG_HIGH_RES_TIMERS
+#include <linux/hrtime.h>
+# if defined(schedule_next_int)
+/* Old high-res timer code, do translations. */
+#  define get_arch_cycles(a) quick_update_jiffies_sub(a)
+#  define arch_cycles_per_jiffy cycles_per_jiffies
+# endif
+static inline void add_usec_to_timer(struct timer_list *t, long v)
+{
+	/* Add v usec, then carry whole jiffies out of sub_expires. */
+	t->sub_expires += nsec_to_arch_cycle(v * 1000);
+	while (t->sub_expires >= arch_cycles_per_jiffy) {
+		t->sub_expires -= arch_cycles_per_jiffy;
+		t->expires++;
+	}
+}
+#endif
+#include <linux/interrupt.h>
+#include <linux/rcupdate.h>
+#include <linux/ipmi_smi.h>
+#include <asm/io.h>
+#include "ipmi_si_sm.h"
+#include <linux/init.h>
+
+#define IPMI_SI_VERSION "v31"
+
+/* Measure times between events in the driver. */
+#undef DEBUG_TIMING
+
+/* Call every 10 ms. */
+#define SI_TIMEOUT_TIME_USEC	10000
+#define SI_USEC_PER_JIFFY	(1000000/HZ)
+#define SI_TIMEOUT_JIFFIES	(SI_TIMEOUT_TIME_USEC/SI_USEC_PER_JIFFY)
+#define SI_SHORT_TIMEOUT_USEC  250 /* .25ms when the SM requests a
+                                       short timeout */
+
+enum si_intf_state {
+	SI_NORMAL,		/* results of curr_msg go to the upper layer */
+	SI_GETTING_FLAGS,	/* result is parsed as the message flags */
+	SI_GETTING_EVENTS,	/* Read Event Message Buffer outstanding */
+	SI_CLEARING_FLAGS,	/* Clear Message Flags outstanding */
+	SI_CLEARING_FLAGS_THEN_SET_IRQ, /* as above, then start_enable_irq() */
+	SI_GETTING_MESSAGES,	/* Get Message outstanding */
+	SI_ENABLE_INTERRUPTS1,	/* Get BMC Global Enables outstanding */
+	SI_ENABLE_INTERRUPTS2	/* NOTE(review): presumably the set step */
+	/* FIXME - add watchdog stuff. */
+};
+
+enum si_type {
+    SI_KCS, SI_SMIC, SI_BT /* the system interface kinds this file drives */
+};
+
+struct smi_info
+{
+	ipmi_smi_t             intf;      /* handle given to the upper layer */
+	struct si_sm_data      *si_sm;    /* state passed to the handlers */
+	struct si_sm_handlers  *handlers; /* state machine operations */
+	enum si_type           si_type;
+	spinlock_t             si_lock;   /* dropped around upper-layer calls */
+	spinlock_t             msg_lock;  /* held while dequeuing xmit lists */
+	struct list_head       xmit_msgs;
+	struct list_head       hp_xmit_msgs; /* picked before xmit_msgs */
+	struct ipmi_smi_msg    *curr_msg; /* message in progress, or NULL */
+	enum si_intf_state     si_state;
+
+	/* Used to handle the various types of I/O that can occur with
+           IPMI */
+	struct si_sm_io io;
+	int (*io_setup)(struct smi_info *info);
+	void (*io_cleanup)(struct smi_info *info);
+	int (*irq_setup)(struct smi_info *info);
+	void (*irq_cleanup)(struct smi_info *info);
+	unsigned int io_size;
+
+	/* Flags from the last GET_MSG_FLAGS command, used when an ATTN
+	   is set to hold the flags until we are done handling everything
+	   from the flags. */
+#define RECEIVE_MSG_AVAIL	0x01
+#define EVENT_MSG_BUFFER_FULL	0x02
+#define WDT_PRE_TIMEOUT_INT	0x08
+	unsigned char       msg_flags;
+
+	/* If set to true, this will request events the next time the
+	   state machine is idle. */
+	atomic_t            req_events;
+
+	/* If true, run the state machine to completion on every send
+	   call.  Generally used after a panic to make sure stuff goes
+	   out. */
+	int                 run_to_completion;
+
+	/* The I/O port of an SI interface. */
+	int                 port;
+
+	/* Zero if no irq. */
+	int                 irq;
+
+	/* The timer for this si. */
+	struct timer_list   si_timer;
+
+	/* The time (in jiffies) the last timeout occurred at. */
+	unsigned long       last_timeout_jiffies;
+
+	/* Used to gracefully stop the timer without race conditions. */
+	volatile int        stop_operation;
+	volatile int        timer_stopped;
+
+	/* The driver will disable interrupts when it gets into a
+	   situation where it cannot handle messages due to lack of
+	   memory.  Once that situation clears up, it will re-enable
+	   interrupts. */
+	int interrupt_disabled;
+
+	unsigned char ipmi_si_dev_rev;
+	unsigned char ipmi_si_fw_rev_major;
+	unsigned char ipmi_si_fw_rev_minor;
+	unsigned char ipmi_version_major;
+	unsigned char ipmi_version_minor;
+
+	/* Counters and things for the proc filesystem. */
+	spinlock_t count_lock; /* taken around updates of the counters below */
+	unsigned long short_timeouts;
+	unsigned long long_timeouts;
+	unsigned long timeout_restarts;
+	unsigned long idles;
+	unsigned long interrupts;
+	unsigned long attentions;
+	unsigned long flag_fetches;
+	unsigned long hosed_count;
+	unsigned long complete_transactions;
+	unsigned long events;
+	unsigned long watchdog_pretimeouts;
+	unsigned long incoming_messages;
+};
+
+static void si_restart_short_timer(struct smi_info *smi_info);
+
+static void deliver_recv_msg(struct smi_info *smi_info,
+			     struct ipmi_smi_msg *msg)
+{
+	/* Hand the message to the upper layer, dropping si_lock for
+	   the duration of the call and retaking it afterwards. */
+	spin_unlock(&smi_info->si_lock);
+	ipmi_smi_msg_received(smi_info->intf, msg);
+	spin_lock(&smi_info->si_lock);
+}
+
+static void return_hosed_msg(struct smi_info *smi_info)
+{
+	struct ipmi_smi_msg *hosed = smi_info->curr_msg;
+
+	/* Turn the request header into a response header. */
+	hosed->rsp[0] = hosed->data[0] | 4;
+	hosed->rsp[1] = hosed->data[1];
+	hosed->rsp[2] = 0xFF; /* Unknown error. */
+	hosed->rsp_size = 3;
+
+	smi_info->curr_msg = NULL;
+	deliver_recv_msg(smi_info, hosed);
+}
+
+static enum si_sm_result start_next_msg(struct smi_info *smi_info)
+{
+	int              rv;
+	struct list_head *entry = NULL;
+#ifdef DEBUG_TIMING
+	struct timeval t;
+#endif
+	/* Start the next queued message, or report SI_SM_IDLE if none. */
+	/* No need to save flags, we already have interrupts off and we
+	   already hold the SMI lock. */
+	spin_lock(&(smi_info->msg_lock));
+
+	/* Pick the high priority queue first. */
+	if (! list_empty(&(smi_info->hp_xmit_msgs))) {
+		entry = smi_info->hp_xmit_msgs.next;
+	} else if (! list_empty(&(smi_info->xmit_msgs))) {
+		entry = smi_info->xmit_msgs.next;
+	}
+
+	if (!entry) {
+		smi_info->curr_msg = NULL;
+		rv = SI_SM_IDLE;
+	} else {
+		int err;
+
+		list_del(entry);
+		smi_info->curr_msg = list_entry(entry,
+						struct ipmi_smi_msg,
+						link);
+#ifdef DEBUG_TIMING
+		do_gettimeofday(&t);
+		printk("**Start2: %ld.%9.9ld\n", t.tv_sec, t.tv_usec);
+#endif
+		err = smi_info->handlers->start_transaction(
+			smi_info->si_sm,
+			smi_info->curr_msg->data,
+			smi_info->curr_msg->data_size);
+		if (err) { /* rejected: return it as an error response */
+			return_hosed_msg(smi_info);
+		}
+
+		rv = SI_SM_CALL_WITHOUT_DELAY;
+	}
+	spin_unlock(&(smi_info->msg_lock));
+
+	return rv;
+}
+
+static void start_enable_irq(struct smi_info *smi_info)
+{
+	/* Enabling interrupts means telling the BMC to use them;
+	   that starts with a Get BMC Global Enables request. */
+	unsigned char req[2] = {
+		(IPMI_NETFN_APP_REQUEST << 2),
+		IPMI_GET_BMC_GLOBAL_ENABLES_CMD
+	};
+
+	smi_info->handlers->start_transaction(smi_info->si_sm, req, 2);
+	smi_info->si_state = SI_ENABLE_INTERRUPTS1;
+}
+
+static void start_clear_flags(struct smi_info *smi_info)
+{
+	/* Ask the BMC to clear the watchdog pre-timeout flag. */
+	unsigned char req[3] = {
+		(IPMI_NETFN_APP_REQUEST << 2),
+		IPMI_CLEAR_MSG_FLAGS_CMD,
+		WDT_PRE_TIMEOUT_INT
+	};
+
+	smi_info->handlers->start_transaction(smi_info->si_sm, req, 3);
+	smi_info->si_state = SI_CLEARING_FLAGS;
+}
+
+/* When we run out of memory and cannot allocate messages, the
+   messages are left in the BMC and the system is run polled until
+   some memory can be allocated.  Once memory is available again,
+   the interrupt is re-enabled. */
+static inline void disable_si_irq(struct smi_info *smi_info)
+{
+	if (!smi_info->irq || smi_info->interrupt_disabled)
+		return;
+	disable_irq_nosync(smi_info->irq);
+	smi_info->interrupt_disabled = 1;
+}
+
+static inline void enable_si_irq(struct smi_info *smi_info)
+{
+	if (!smi_info->irq || !smi_info->interrupt_disabled)
+		return;
+	enable_irq(smi_info->irq);
+	smi_info->interrupt_disabled = 0;
+}
+
+static void handle_flags(struct smi_info *smi_info)
+{
+	if (smi_info->msg_flags & WDT_PRE_TIMEOUT_INT) {
+		/* Checked first: watchdog pre-timeout. */
+		spin_lock(&smi_info->count_lock);
+		smi_info->watchdog_pretimeouts++;
+		spin_unlock(&smi_info->count_lock);
+
+		start_clear_flags(smi_info);
+		smi_info->msg_flags &= ~WDT_PRE_TIMEOUT_INT;
+		spin_unlock(&(smi_info->si_lock)); /* call up unlocked */
+		ipmi_smi_watchdog_pretimeout(smi_info->intf);
+		spin_lock(&(smi_info->si_lock));
+	} else if (smi_info->msg_flags & RECEIVE_MSG_AVAIL) {
+		/* Messages available; start a Get Message request. */
+		smi_info->curr_msg = ipmi_alloc_smi_msg();
+		if (!smi_info->curr_msg) {
+			disable_si_irq(smi_info); /* no memory: go polled */
+			smi_info->si_state = SI_NORMAL;
+			return;
+		}
+		enable_si_irq(smi_info);
+
+		smi_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
+		smi_info->curr_msg->data[1] = IPMI_GET_MSG_CMD;
+		smi_info->curr_msg->data_size = 2;
+
+		smi_info->handlers->start_transaction(
+			smi_info->si_sm,
+			smi_info->curr_msg->data,
+			smi_info->curr_msg->data_size);
+		smi_info->si_state = SI_GETTING_MESSAGES;
+	} else if (smi_info->msg_flags & EVENT_MSG_BUFFER_FULL) {
+		/* Events available; start a Read Event Message Buffer. */
+		smi_info->curr_msg = ipmi_alloc_smi_msg();
+		if (!smi_info->curr_msg) {
+			disable_si_irq(smi_info); /* no memory: go polled */
+			smi_info->si_state = SI_NORMAL;
+			return;
+		}
+		enable_si_irq(smi_info);
+
+		smi_info->curr_msg->data[0] = (IPMI_NETFN_APP_REQUEST << 2);
+		smi_info->curr_msg->data[1] = IPMI_READ_EVENT_MSG_BUFFER_CMD;
+		smi_info->curr_msg->data_size = 2;
+
+		smi_info->handlers->start_transaction(
+			smi_info->si_sm,
+			smi_info->curr_msg->data,
+			smi_info->curr_msg->data_size);
+		smi_info->si_state = SI_GETTING_EVENTS;
+	} else {
+		smi_info->si_state = SI_NORMAL; /* none of the three flags set */
+	}
+}
+
+/*
+ * Called when the low-level state machine reports that the current
+ * transaction has completed.  Fetches the result with get_result() and
+ * advances smi_info->si_state according to what the transaction was
+ * for (normal message, flag fetch, flag clear, event/message read, or
+ * one of the two interrupt-enable steps).
+ *
+ * Response buffers are laid out with the completion code in byte 2;
+ * a non-zero completion code is treated as an error in every state.
+ */
+static void handle_transaction_done(struct smi_info *smi_info)
+{
+	struct ipmi_smi_msg *msg;
+#ifdef DEBUG_TIMING
+	struct timeval t;
+
+	do_gettimeofday(&t);
+	printk("**Done: %d.%9.9d\n", t.tv_sec, t.tv_usec);
+#endif
+	switch (smi_info->si_state) {
+	case SI_NORMAL:
+		if (!smi_info->curr_msg)
+			break;
+
+		smi_info->curr_msg->rsp_size
+			= smi_info->handlers->get_result(
+				smi_info->si_sm,
+				smi_info->curr_msg->rsp,
+				IPMI_MAX_MSG_LENGTH);
+
+		/* Do this here because deliver_recv_msg() releases the
+		   lock, and a new message can be put in during the
+		   time the lock is released. */
+		msg = smi_info->curr_msg;
+		smi_info->curr_msg = NULL;
+		deliver_recv_msg(smi_info, msg);
+		break;
+
+	case SI_GETTING_FLAGS:
+	{
+		unsigned char msg[4];
+		unsigned int  len;
+
+		/* We got the flags from the SMI, now handle them. */
+		len = smi_info->handlers->get_result(smi_info->si_sm, msg, 4);
+		if ((len < 3) || (msg[2] != 0)) {
+			/* Either the response is too short to carry a
+			   completion code (don't read msg[2] in that
+			   case, it was never written) or there was an
+			   error fetching flags; just give up for now. */
+			smi_info->si_state = SI_NORMAL;
+		} else if (len < 4) {
+			/* Hmm, no flags.  That's technically illegal, but
+			   don't use uninitialized data: the flags live
+			   in msg[3], so all four bytes are required. */
+			smi_info->si_state = SI_NORMAL;
+		} else {
+			smi_info->msg_flags = msg[3];
+			handle_flags(smi_info);
+		}
+		break;
+	}
+
+	case SI_CLEARING_FLAGS:
+	case SI_CLEARING_FLAGS_THEN_SET_IRQ:
+	{
+		unsigned char msg[3];
+
+		/* We cleared the flags. */
+		smi_info->handlers->get_result(smi_info->si_sm, msg, 3);
+		if (msg[2] != 0) {
+			/* Error clearing flags */
+			printk(KERN_WARNING
+			       "ipmi_si: Error clearing flags: %2.2x\n",
+			       msg[2]);
+		}
+		if (smi_info->si_state == SI_CLEARING_FLAGS_THEN_SET_IRQ)
+			start_enable_irq(smi_info);
+		else
+			smi_info->si_state = SI_NORMAL;
+		break;
+	}
+
+	case SI_GETTING_EVENTS:
+	{
+		smi_info->curr_msg->rsp_size
+			= smi_info->handlers->get_result(
+				smi_info->si_sm,
+				smi_info->curr_msg->rsp,
+				IPMI_MAX_MSG_LENGTH);
+
+		/* Do this here because deliver_recv_msg() releases the
+		   lock, and a new message can be put in during the
+		   time the lock is released. */
+		msg = smi_info->curr_msg;
+		smi_info->curr_msg = NULL;
+		if (msg->rsp[2] != 0) {
+			/* Error getting event, probably done. */
+			msg->done(msg);
+
+			/* Take off the event flag. */
+			smi_info->msg_flags &= ~EVENT_MSG_BUFFER_FULL;
+		} else {
+			spin_lock(&smi_info->count_lock);
+			smi_info->events++;
+			spin_unlock(&smi_info->count_lock);
+
+			deliver_recv_msg(smi_info, msg);
+		}
+		/* See whether any other flag still needs service. */
+		handle_flags(smi_info);
+		break;
+	}
+
+	case SI_GETTING_MESSAGES:
+	{
+		smi_info->curr_msg->rsp_size
+			= smi_info->handlers->get_result(
+				smi_info->si_sm,
+				smi_info->curr_msg->rsp,
+				IPMI_MAX_MSG_LENGTH);
+
+		/* Do this here because deliver_recv_msg() releases the
+		   lock, and a new message can be put in during the
+		   time the lock is released. */
+		msg = smi_info->curr_msg;
+		smi_info->curr_msg = NULL;
+		if (msg->rsp[2] != 0) {
+			/* Error getting message, probably done. */
+			msg->done(msg);
+
+			/* Take off the msg flag. */
+			smi_info->msg_flags &= ~RECEIVE_MSG_AVAIL;
+		} else {
+			spin_lock(&smi_info->count_lock);
+			smi_info->incoming_messages++;
+			spin_unlock(&smi_info->count_lock);
+
+			deliver_recv_msg(smi_info, msg);
+		}
+		/* See whether any other flag still needs service. */
+		handle_flags(smi_info);
+		break;
+	}
+
+	case SI_ENABLE_INTERRUPTS1:
+	{
+		unsigned char msg[4];
+
+		/* First step of enabling interrupts: the response to
+		   the "get" has come back; on success, write the value
+		   back with bit 0 set. */
+		smi_info->handlers->get_result(smi_info->si_sm, msg, 4);
+		if (msg[2] != 0) {
+			printk(KERN_WARNING
+			       "ipmi_si: Could not enable interrupts"
+			       ", failed get, using polled mode.\n");
+			smi_info->si_state = SI_NORMAL;
+		} else {
+			msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
+			msg[1] = IPMI_SET_BMC_GLOBAL_ENABLES_CMD;
+			msg[2] = msg[3] | 1; /* enable msg queue int */
+			smi_info->handlers->start_transaction(
+				smi_info->si_sm, msg, 3);
+			smi_info->si_state = SI_ENABLE_INTERRUPTS2;
+		}
+		break;
+	}
+
+	case SI_ENABLE_INTERRUPTS2:
+	{
+		unsigned char msg[4];
+
+		/* Second step of enabling interrupts: check the
+		   result of the "set" and go back to normal either
+		   way. */
+		smi_info->handlers->get_result(smi_info->si_sm, msg, 4);
+		if (msg[2] != 0) {
+			printk(KERN_WARNING
+			       "ipmi_si: Could not enable interrupts"
+			       ", failed set, using polled mode.\n");
+		}
+		smi_info->si_state = SI_NORMAL;
+		break;
+	}
+	}
+}
+
+/* Called on timeouts and events.  Timeouts should pass the elapsed
+   time, interrupts should pass in zero.
+
+   Drives the low-level state machine through handlers->event() and
+   reacts to what it reports: a completed transaction, a hosed
+   interface, an attention request, or idle.  Returns the last result
+   reported by the state machine (or by start_next_msg() when idle).
+
+   NOTE(review): every caller visible in this file holds si_lock around
+   the call - presumably that is required; confirm. */
+static enum si_sm_result smi_event_handler(struct smi_info *smi_info,
+					   int time)
+{
+	enum si_sm_result si_sm_result;
+
+ restart:
+	/* There used to be a loop here that waited a little while
+	   (around 25us) before giving up.  That turned out to be
+	   pointless, the minimum delays I was seeing were in the 300us
+	   range, which is far too long to wait in an interrupt.  So
+	   we just run until the state machine tells us something
+	   happened or it needs a delay. */
+	si_sm_result = smi_info->handlers->event(smi_info->si_sm, time);
+	/* The elapsed time is only reported once; any pass through
+	   "restart" after this uses zero. */
+	time = 0;
+	while (si_sm_result == SI_SM_CALL_WITHOUT_DELAY)
+	{
+		si_sm_result = smi_info->handlers->event(smi_info->si_sm, 0);
+	}
+
+	if (si_sm_result == SI_SM_TRANSACTION_COMPLETE)
+	{
+		spin_lock(&smi_info->count_lock);
+		smi_info->complete_transactions++;
+		spin_unlock(&smi_info->count_lock);
+
+		handle_transaction_done(smi_info);
+		/* Poll again so the checks below see the state after
+		   the completion was handled. */
+		si_sm_result = smi_info->handlers->event(smi_info->si_sm, 0);
+	}
+	else if (si_sm_result == SI_SM_HOSED)
+	{
+		spin_lock(&smi_info->count_lock);
+		smi_info->hosed_count++;
+		spin_unlock(&smi_info->count_lock);
+
+		if (smi_info->curr_msg != NULL) {
+			/* If we were handling a user message, format
+			   a response to send to the upper layer to
+			   tell it about the error. */
+			return_hosed_msg(smi_info);
+		}
+		si_sm_result = smi_info->handlers->event(smi_info->si_sm, 0);
+		smi_info->si_state = SI_NORMAL;
+	}
+
+	/* We prefer handling attn over new messages. */
+	if (si_sm_result == SI_SM_ATTN)
+	{
+		unsigned char msg[2];
+
+		spin_lock(&smi_info->count_lock);
+		smi_info->attentions++;
+		spin_unlock(&smi_info->count_lock);
+
+		/* Got a attn, send down a get message flags to see
+		   what's causing it.  It would be better to handle
+		   this in the upper layer, but due to the way
+		   interrupts work with the SMI, that's not really
+		   possible. */
+		msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
+		msg[1] = IPMI_GET_MSG_FLAGS_CMD;
+
+		smi_info->handlers->start_transaction(
+			smi_info->si_sm, msg, 2);
+		smi_info->si_state = SI_GETTING_FLAGS;
+		goto restart;
+	}
+
+	/* If we are currently idle, try to start the next message. */
+	if (si_sm_result == SI_SM_IDLE) {
+		spin_lock(&smi_info->count_lock);
+		smi_info->idles++;
+		spin_unlock(&smi_info->count_lock);
+
+		si_sm_result = start_next_msg(smi_info);
+		if (si_sm_result != SI_SM_IDLE)
+			goto restart;
+        }
+
+	if ((si_sm_result == SI_SM_IDLE)
+	    && (atomic_read(&smi_info->req_events)))
+	{
+		/* We are idle and the upper layer requested that I fetch
+		   events, so do so. */
+		unsigned char msg[2];
+
+		spin_lock(&smi_info->count_lock);
+		smi_info->flag_fetches++;
+		spin_unlock(&smi_info->count_lock);
+
+		/* Consume the request before starting the fetch. */
+		atomic_set(&smi_info->req_events, 0);
+		msg[0] = (IPMI_NETFN_APP_REQUEST << 2);
+		msg[1] = IPMI_GET_MSG_FLAGS_CMD;
+
+		smi_info->handlers->start_transaction(
+			smi_info->si_sm, msg, 2);
+		smi_info->si_state = SI_GETTING_FLAGS;
+		goto restart;
+	}
+
+	return si_sm_result;
+}
+
+/*
+ * Queue a message for transmission (installed as the .sender hook of
+ * the handlers table in this file).
+ *
+ * send_info: the struct smi_info for the interface.
+ * msg:       the message to queue; it is linked through msg->link.
+ * priority:  values greater than zero select the high-priority queue.
+ *            Ignored in run-to-completion mode.
+ *
+ * In run-to-completion mode the message is queued and the state
+ * machine is then polled, with udelay() between polls, until it
+ * reports idle.  Otherwise the message is queued and, if the interface
+ * is idle with no current message, the next message is started.
+ */
+static void sender(void                *send_info,
+		   struct ipmi_smi_msg *msg,
+		   int                 priority)
+{
+	struct smi_info   *smi_info = send_info;
+	enum si_sm_result result;
+	unsigned long     flags;
+#ifdef DEBUG_TIMING
+	struct timeval    t;
+#endif
+
+	/* msg_lock covers the two transmit queues. */
+	spin_lock_irqsave(&(smi_info->msg_lock), flags);
+#ifdef DEBUG_TIMING
+	do_gettimeofday(&t);
+	printk("**Enqueue: %d.%9.9d\n", t.tv_sec, t.tv_usec);
+#endif
+
+	if (smi_info->run_to_completion) {
+		/* If we are running to completion, then throw it in
+		   the list and run transactions until everything is
+		   clear.  Priority doesn't matter here. */
+		list_add_tail(&(msg->link), &(smi_info->xmit_msgs));
+
+		/* We have to release the msg lock and claim the smi
+		   lock in this case, because of race conditions. */
+		spin_unlock_irqrestore(&(smi_info->msg_lock), flags);
+
+		spin_lock_irqsave(&(smi_info->si_lock), flags);
+		result = smi_event_handler(smi_info, 0);
+		/* Busy-wait; each pass reports the delay just spent
+		   as elapsed time. */
+		while (result != SI_SM_IDLE) {
+			udelay(SI_SHORT_TIMEOUT_USEC);
+			result = smi_event_handler(smi_info,
+						   SI_SHORT_TIMEOUT_USEC);
+		}
+		spin_unlock_irqrestore(&(smi_info->si_lock), flags);
+		return;
+	} else {
+		if (priority > 0) {
+			list_add_tail(&(msg->link), &(smi_info->hp_xmit_msgs));
+		} else {
+			list_add_tail(&(msg->link), &(smi_info->xmit_msgs));
+		}
+	}
+	spin_unlock_irqrestore(&(smi_info->msg_lock), flags);
+
+	/* Only kick the interface if nothing is in progress;
+	   otherwise the queued message is left for later. */
+	spin_lock_irqsave(&(smi_info->si_lock), flags);
+	if ((smi_info->si_state == SI_NORMAL)
+	    && (smi_info->curr_msg == NULL))
+	{
+		start_next_msg(smi_info);
+		si_restart_short_timer(smi_info);
+	}
+	spin_unlock_irqrestore(&(smi_info->si_lock), flags);
+}
+
+/*
+ * Switch run-to-completion mode on or off for the interface passed in
+ * send_info.  When switching it on, poll the state machine under
+ * si_lock, delaying SI_SHORT_TIMEOUT_USEC between polls, until it
+ * reports idle.
+ */
+static void set_run_to_completion(void *send_info, int i_run_to_completion)
+{
+	struct smi_info *si = send_info;
+	unsigned long   irq_flags;
+
+	spin_lock_irqsave(&(si->si_lock), irq_flags);
+
+	si->run_to_completion = i_run_to_completion;
+	if (i_run_to_completion) {
+		enum si_sm_result rc;
+
+		/* First poll reports no elapsed time; every later
+		   poll reports the delay that was just spent. */
+		for (rc = smi_event_handler(si, 0);
+		     rc != SI_SM_IDLE;
+		     rc = smi_event_handler(si, SI_SHORT_TIMEOUT_USEC))
+			udelay(SI_SHORT_TIMEOUT_USEC);
+	}
+
+	spin_unlock_irqrestore(&(si->si_lock), irq_flags);
+}
+
+/*
+ * Upper-layer hook: flag that events should be fetched.  Only sets
+ * req_events; smi_event_handler() acts on it when the interface is
+ * idle.
+ */
+static void request_events(void *send_info)
+{
+	atomic_set(&((struct smi_info *) send_info)->req_events, 1);
+}
+
+static int initialized = 0;
+
+/* Must be called with interrupts off and with the si_lock held.
+
+   With CONFIG_HIGH_RES_TIMERS, re-arms a pending si_timer so that it
+   fires SI_SHORT_TIMEOUT_USEC from now and counts the restart.
+   Without high-resolution timers this is a no-op. */
+static void si_restart_short_timer(struct smi_info *smi_info)
+{
+#if defined(CONFIG_HIGH_RES_TIMERS)
+	unsigned long flags;
+	unsigned long jiffies_now;
+
+	if (del_timer(&(smi_info->si_timer))) {
+		/* If we don't delete the timer, then it will go off
+		   immediately, anyway.  So we only process if we
+		   actually delete the timer. */
+
+		/* We already have irqsave on, so no need for it
+		   here. */
+		read_lock(&xtime_lock);
+		jiffies_now = jiffies;
+		smi_info->si_timer.expires = jiffies_now;
+		smi_info->si_timer.sub_expires = get_arch_cycles(jiffies_now);
+		/* Release xtime_lock as soon as the consistent
+		   jiffies/cycles pair has been read, exactly as
+		   smi_timeout() does; leaving it held would keep the
+		   read lock forever. */
+		read_unlock(&xtime_lock);
+
+		add_usec_to_timer(&smi_info->si_timer, SI_SHORT_TIMEOUT_USEC);
+
+		add_timer(&(smi_info->si_timer));
+		spin_lock_irqsave(&smi_info->count_lock, flags);
+		smi_info->timeout_restarts++;
+		spin_unlock_irqrestore(&smi_info->count_lock, flags);
+	}
+#endif
+}
+
+/*
+ * Timer callback for si_timer.  data is the struct smi_info pointer
+ * cast to unsigned long.
+ *
+ * Runs the event handler with the time elapsed since the previous
+ * timeout, then re-arms the timer: a long timeout when interrupts are
+ * in use or the state machine is not asking for a delay, a short one
+ * when it returned SI_SM_CALL_WITH_DELAY.  If stop_operation is set
+ * the timer is not re-armed and timer_stopped is set instead.
+ */
+static void smi_timeout(unsigned long data)
+{
+	struct smi_info   *smi_info = (struct smi_info *) data;
+	enum si_sm_result smi_result;
+	unsigned long     flags;
+	unsigned long     jiffies_now;
+	unsigned long     time_diff;
+#ifdef DEBUG_TIMING
+	struct timeval    t;
+#endif
+
+	if (smi_info->stop_operation) {
+		smi_info->timer_stopped = 1;
+		return;
+	}
+
+	spin_lock_irqsave(&(smi_info->si_lock), flags);
+#ifdef DEBUG_TIMING
+	do_gettimeofday(&t);
+	printk("**Timer: %d.%9.9d\n", t.tv_sec, t.tv_usec);
+#endif
+	/* Elapsed time since the last timeout, converted from jiffies
+	   to microseconds. */
+	jiffies_now = jiffies;
+	time_diff = ((jiffies_now - smi_info->last_timeout_jiffies)
+		     * SI_USEC_PER_JIFFY);
+	smi_result = smi_event_handler(smi_info, time_diff);
+
+	spin_unlock_irqrestore(&(smi_info->si_lock), flags);
+
+	smi_info->last_timeout_jiffies = jiffies_now;
+
+	if ((smi_info->irq) && (! smi_info->interrupt_disabled)) {
+		/* Running with interrupts, only do long timeouts. */
+		smi_info->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES;
+		spin_lock_irqsave(&smi_info->count_lock, flags);
+		smi_info->long_timeouts++;
+		spin_unlock_irqrestore(&smi_info->count_lock, flags);
+		goto do_add_timer;
+	}
+
+	/* If the state machine asks for a short delay, then shorten
+	   the timer timeout. */
+	if (smi_result == SI_SM_CALL_WITH_DELAY) {
+		spin_lock_irqsave(&smi_info->count_lock, flags);
+		smi_info->short_timeouts++;
+		spin_unlock_irqrestore(&smi_info->count_lock, flags);
+#if defined(CONFIG_HIGH_RES_TIMERS)
+		/* Sub-jiffy expiry: now plus SI_SHORT_TIMEOUT_USEC. */
+		read_lock(&xtime_lock);
+                smi_info->si_timer.expires = jiffies;
+                smi_info->si_timer.sub_expires
+                        = get_arch_cycles(smi_info->si_timer.expires);
+                read_unlock(&xtime_lock);
+		add_usec_to_timer(&smi_info->si_timer, SI_SHORT_TIMEOUT_USEC);
+#else
+		/* Without high-res timers the shortest delay is one
+		   jiffy. */
+		smi_info->si_timer.expires = jiffies + 1;
+#endif
+	} else {
+		spin_lock_irqsave(&smi_info->count_lock, flags);
+		smi_info->long_timeouts++;
+		spin_unlock_irqrestore(&smi_info->count_lock, flags);
+		smi_info->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES;
+#if defined(CONFIG_HIGH_RES_TIMERS)
+		smi_info->si_timer.sub_expires = 0;
+#endif
+	}
+
+ do_add_timer:
+	add_timer(&(smi_info->si_timer));
+}
+
+/*
+ * Interrupt handler registered by std_irq_setup().  data is the
+ * struct smi_info for the interface.  Counts the interrupt and, unless
+ * the interface is being stopped, runs the event handler with zero
+ * elapsed time.  Always reports IRQ_HANDLED.
+ */
+static irqreturn_t si_irq_handler(int irq, void *data, struct pt_regs *regs)
+{
+	struct smi_info *si = data;
+	unsigned long   irq_flags;
+#ifdef DEBUG_TIMING
+	struct timeval  t;
+#endif
+
+	spin_lock_irqsave(&(si->si_lock), irq_flags);
+
+	spin_lock(&si->count_lock);
+	si->interrupts++;
+	spin_unlock(&si->count_lock);
+
+	if (!si->stop_operation) {
+#ifdef DEBUG_TIMING
+		do_gettimeofday(&t);
+		printk("**Interrupt: %d.%9.9d\n", t.tv_sec, t.tv_usec);
+#endif
+		smi_event_handler(si, 0);
+	}
+
+	spin_unlock_irqrestore(&(si->si_lock), irq_flags);
+	return IRQ_HANDLED;
+}
+
+/* Callback table pointing at this file's sender(), request_events()
+   and set_run_to_completion(). */
+static struct ipmi_smi_handlers handlers =
+{
+	.owner                  = THIS_MODULE,
+	.sender			= sender,
+	.request_events		= request_events,
+	.set_run_to_completion  = set_run_to_completion
+};
+
+/* There can be 4 IO ports passed in (with or without IRQs), 4 addresses,
+   a default IO port, and 1 ACPI/SPMI address.  That sets SI_MAX_DRIVERS */
+
+#define SI_MAX_PARMS 4
+#define SI_MAX_DRIVERS ((SI_MAX_PARMS * 2) + 2)
+/* Only the first four entries are spelled out; as static storage the
+   rest of the array starts out zeroed as well. */
+static struct smi_info *smi_infos[SI_MAX_DRIVERS] =
+{ NULL, NULL, NULL, NULL };
+
+/* Name used when claiming I/O regions, memory regions and IRQs. */
+#define DEVICE_NAME "ipmi_si"
+
+#define DEFAULT_KCS_IO_PORT 0xca2
+#define DEFAULT_SMIC_IO_PORT 0xca9
+#define DEFAULT_BT_IO_PORT   0xe4
+
+/* Per-interface settings, indexed by interface number.  They are
+   filled from the module parameters below and also written by the
+   probing code in this file (ACPI, SMBIOS, PCI). */
+static int           si_trydefaults = 1;
+static char          *si_type[SI_MAX_PARMS] = { NULL, NULL, NULL, NULL };
+#define MAX_SI_TYPE_STR 30
+static char          si_type_str[MAX_SI_TYPE_STR];
+static unsigned long addrs[SI_MAX_PARMS] = { 0, 0, 0, 0 };
+static int num_addrs = 0;
+static unsigned int  ports[SI_MAX_PARMS] = { 0, 0, 0, 0 };
+static int num_ports = 0;
+static int           irqs[SI_MAX_PARMS] = { 0, 0, 0, 0 };
+static int num_irqs = 0;
+
+
+module_param_named(trydefaults, si_trydefaults, bool, 0);
+MODULE_PARM_DESC(trydefaults, "Setting this to 'false' will disable the"
+		 " default scan of the KCS and SMIC interface at the standard"
+		 " address");
+/* NOTE(review): the parameter is registered as "type" but the help
+   text below gives the example as "si_type=..." - confirm which is
+   intended. */
+module_param_string(type, si_type_str, MAX_SI_TYPE_STR, 0);
+MODULE_PARM_DESC(type, "Defines the type of each interface, each"
+		 " interface separated by commas.  The types are 'kcs',"
+		 " 'smic', and 'bt'.  For example si_type=kcs,bt will set"
+		 " the first interface to kcs and the second to bt");
+module_param_array(addrs, long, num_addrs, 0);
+MODULE_PARM_DESC(addrs, "Sets the memory address of each interface, the"
+		 " addresses separated by commas.  Only use if an interface"
+		 " is in memory.  Otherwise, set it to zero or leave"
+		 " it blank.");
+module_param_array(ports, int, num_ports, 0);
+MODULE_PARM_DESC(ports, "Sets the port address of each interface, the"
+		 " addresses separated by commas.  Only use if an interface"
+		 " is a port.  Otherwise, set it to zero or leave"
+		 " it blank.");
+module_param_array(irqs, int, num_irqs, 0);
+MODULE_PARM_DESC(irqs, "Sets the interrupt of each interface, the"
+		 " addresses separated by commas.  Only use if an interface"
+		 " has an interrupt.  Otherwise, set it to zero or leave"
+		 " it blank.");
+
+
+/* The ACPI symbol was misspelled here; it must match the
+   CONFIG_ACPI_INTERPRETER test that guards the ACPI probing code. */
+#if defined(CONFIG_ACPI_INTERPRETER) || defined(CONFIG_X86) || defined(CONFIG_PCI)
+#define IPMI_MEM_ADDR_SPACE 1
+#define IPMI_IO_ADDR_SPACE  2
+/*
+ * Check whether an interface at base_addr in the given address space
+ * is already configured.
+ *
+ * intf:       index of the caller's own slot, which is skipped; pass
+ *             -1 to compare against every slot.
+ * addr_space: IPMI_MEM_ADDR_SPACE (compared with addrs[]) or
+ *             IPMI_IO_ADDR_SPACE (compared with ports[]).
+ *
+ * Returns 0 if another slot already uses the address, 1 otherwise.
+ * The scan stops at the first slot whose si_type is still NULL.
+ */
+static int is_new_interface(int intf, u8 addr_space, unsigned long base_addr)
+{
+	int i;
+
+	for (i = 0; i < SI_MAX_PARMS; ++i) {
+		/* Don't check our address. */
+		if (i == intf)
+			continue;
+		if (si_type[i] != NULL) {
+			if ((addr_space == IPMI_MEM_ADDR_SPACE &&
+			     base_addr == addrs[i]) ||
+			    (addr_space == IPMI_IO_ADDR_SPACE &&
+			     base_addr == ports[i]))
+				return 0;
+		}
+		else
+			break;
+	}
+
+	return 1;
+}
+#endif
+
+/*
+ * Claim the interface's ordinary interrupt line, if it has one.
+ * Returns 0 when there is no irq or the request succeeded.  On failure
+ * the request_irq() error is returned and info->irq is cleared so the
+ * interface runs polled.
+ */
+static int std_irq_setup(struct smi_info *info)
+{
+	int err;
+
+	if (info->irq == 0)
+		return 0;
+
+	err = request_irq(info->irq, si_irq_handler, SA_INTERRUPT,
+			  DEVICE_NAME, info);
+	if (!err) {
+		printk("  Using irq %d\n", info->irq);
+		return 0;
+	}
+
+	printk(KERN_WARNING
+	       "ipmi_si: %s unable to claim interrupt %d,"
+	       " running polled\n",
+	       DEVICE_NAME, info->irq);
+	info->irq = 0;
+	return err;
+}
+
+/* Release the interrupt line claimed by std_irq_setup(), if any. */
+static void std_irq_cleanup(struct smi_info *info)
+{
+	if (info->irq)
+		free_irq(info->irq, info);
+}
+
+/* Read one byte from the I/O port at (base port + offset); io->info
+   points at the base port number. */
+static unsigned char port_inb(struct si_sm_io *io, unsigned int offset)
+{
+	unsigned int *base = io->info;
+	unsigned int port = *base + offset;
+
+	return inb(port);
+}
+
+/* Write byte b to the I/O port at (base port + offset); io->info
+   points at the base port number. */
+static void port_outb(struct si_sm_io *io, unsigned int offset,
+		      unsigned char b)
+{
+	unsigned int *base = io->info;
+	unsigned int port = *base + offset;
+
+	outb(b, port);
+}
+
+/*
+ * Reserve the interface's I/O port range (info->io_size ports starting
+ * at the port that info->io.info points to).
+ * Returns -ENODEV if no port is configured, -EIO if the region cannot
+ * be reserved, 0 on success.
+ */
+static int port_setup(struct smi_info *info)
+{
+	unsigned int *port = info->io.info;
+
+	if (port == NULL || *port == 0)
+		return -ENODEV;
+
+	if (!request_region(*port, info->io_size, DEVICE_NAME))
+		return -EIO;
+
+	return 0;
+}
+
+/* Release the I/O port range, if one was configured, and free the
+   smi_info itself. */
+static void port_cleanup(struct smi_info *info)
+{
+	unsigned int *port = info->io.info;
+
+	if (port != NULL && *port != 0)
+		release_region (*port, info->io_size);
+
+	kfree(info);
+}
+
+/*
+ * Build an smi_info for the I/O port configured in ports[intf_num].
+ *
+ * Returns -ENODEV if no port is configured for this slot or another
+ * slot already uses it, -ENOMEM if allocation fails, and 0 with
+ * *new_info set on success.  A slot with no type configured defaults
+ * to "kcs".
+ */
+static int try_init_port(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info *si;
+	unsigned int    port = ports[intf_num];
+
+	if (port == 0)
+		return -ENODEV;
+
+	if (!is_new_interface(intf_num, IPMI_IO_ADDR_SPACE, port))
+		return -ENODEV;
+
+	si = kmalloc(sizeof(*si), GFP_KERNEL);
+	if (si == NULL) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (1)\n");
+		return -ENOMEM;
+	}
+	memset(si, 0, sizeof(*si));
+
+	/* Port-mapped accessors; the state machine reads the base
+	   port through io.info. */
+	si->io.info = &(ports[intf_num]);
+	si->io.addr = NULL;
+	si->io.inputb = port_inb;
+	si->io.outputb = port_outb;
+	si->io_setup = port_setup;
+	si->io_cleanup = port_cleanup;
+	si->irq = 0;
+	si->irq_setup = NULL;
+
+	if (si_type[intf_num] == NULL)
+		si_type[intf_num] = "kcs";
+
+	*new_info = si;
+
+	printk("ipmi_si: Trying \"%s\" at I/O port 0x%x\n",
+	       si_type[intf_num], port);
+	return 0;
+}
+
+/* Read one byte from the memory-mapped register at io->addr + offset. */
+static unsigned char mem_inb(struct si_sm_io *io, unsigned int offset)
+{
+	unsigned char val = readb((io->addr)+offset);
+
+	return val;
+}
+
+/* Write byte b to the memory-mapped register at io->addr + offset. */
+static void mem_outb(struct si_sm_io *io, unsigned int offset,
+		     unsigned char b)
+{
+	writeb(b, (io->addr)+offset);
+	return;
+}
+
+/*
+ * Reserve and map the interface's memory region (info->io_size bytes
+ * at the physical address that info->io.info points to).  The mapping
+ * is stored in info->io.addr.
+ * Returns -ENODEV if no address is configured, -EIO if the region
+ * cannot be reserved or mapped, 0 on success.
+ */
+static int mem_setup(struct smi_info *info)
+{
+	unsigned long *base = info->io.info;
+
+	if (base == NULL || *base == 0)
+		return -ENODEV;
+
+	if (!request_mem_region(*base, info->io_size, DEVICE_NAME))
+		return -EIO;
+
+	info->io.addr = ioremap(*base, info->io_size);
+	if (info->io.addr != NULL)
+		return 0;
+
+	/* Mapping failed: give the reservation back. */
+	release_mem_region(*base, info->io_size);
+	return -EIO;
+}
+
+/* Undo mem_setup() if a mapping exists, then free the smi_info
+   itself. */
+static void mem_cleanup(struct smi_info *info)
+{
+	if (info->io.addr != NULL) {
+		unsigned long *base = info->io.info;
+
+		iounmap(info->io.addr);
+		release_mem_region(*base, info->io_size);
+	}
+
+	kfree(info);
+}
+
+/*
+ * Build an smi_info for the memory address configured in
+ * addrs[intf_num].
+ *
+ * Returns -ENODEV if no address is configured for this slot or another
+ * slot already uses it, -ENOMEM if allocation fails, and 0 with
+ * *new_info set on success.  A slot with no type configured defaults
+ * to "kcs".
+ */
+static int try_init_mem(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info *info;
+
+	if (!addrs[intf_num])
+		return -ENODEV;
+
+	if (!is_new_interface(intf_num, IPMI_MEM_ADDR_SPACE,
+			      addrs[intf_num]))
+		return -ENODEV;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (2)\n");
+		return -ENOMEM;
+	}
+	memset(info, 0, sizeof(*info));
+
+	info->io_setup = mem_setup;
+	info->io_cleanup = mem_cleanup;
+	info->io.inputb = mem_inb;
+	info->io.outputb = mem_outb;
+	/* mem_setup() and mem_cleanup() dereference io.info as an
+	   unsigned long *, so it has to point at the table entry
+	   rather than hold the address value itself. */
+	info->io.info = &(addrs[intf_num]);
+	info->io.addr = NULL;
+	info->irq = 0;
+	info->irq_setup = NULL;
+	*new_info = info;
+
+	if (si_type[intf_num] == NULL)
+		si_type[intf_num] = "kcs";
+
+	printk("ipmi_si: Trying \"%s\" at memory address 0x%lx\n",
+	       si_type[intf_num], addrs[intf_num]);
+	return 0;
+}
+
+
+#ifdef CONFIG_ACPI_INTERPRETER
+
+#include <linux/acpi.h>
+
+/* Once we get an ACPI failure, we don't try any more, because we go
+   through the tables sequentially.  Once we don't find a table, there
+   are no more. */
+static int acpi_failure = 0;
+
+/* For GPE-type interrupts.  context is the struct smi_info for the
+   interface.  Counts the interrupt and, unless the interface is being
+   stopped, runs the event handler with zero elapsed time. */
+void ipmi_acpi_gpe(void *context)
+{
+	struct smi_info *si = context;
+	unsigned long   irq_flags;
+#ifdef DEBUG_TIMING
+	struct timeval t;
+#endif
+
+	spin_lock_irqsave(&(si->si_lock), irq_flags);
+
+	spin_lock(&si->count_lock);
+	si->interrupts++;
+	spin_unlock(&si->count_lock);
+
+	if (!si->stop_operation) {
+#ifdef DEBUG_TIMING
+		do_gettimeofday(&t);
+		printk("**ACPI_GPE: %d.%9.9d\n", t.tv_sec, t.tv_usec);
+#endif
+		smi_event_handler(si, 0);
+	}
+
+	spin_unlock_irqrestore(&(si->si_lock), irq_flags);
+}
+
+/*
+ * Install ipmi_acpi_gpe() as the handler for the GPE number held in
+ * info->irq.  Returns 0 when there is no GPE or the handler was
+ * installed; on failure clears info->irq (polled operation) and
+ * returns -EINVAL.
+ */
+static int acpi_gpe_irq_setup(struct smi_info *info)
+{
+	acpi_status rc;
+
+	if (info->irq == 0)
+		return 0;
+
+	/* FIXME - is level triggered right? */
+	rc = acpi_install_gpe_handler(NULL, info->irq,
+				      ACPI_GPE_LEVEL_TRIGGERED,
+				      ipmi_acpi_gpe, info);
+	if (rc == AE_OK) {
+		printk("  Using ACPI GPE %d\n", info->irq);
+		return 0;
+	}
+
+	printk(KERN_WARNING
+	       "ipmi_si: %s unable to claim ACPI GPE %d,"
+	       " running polled\n",
+	       DEVICE_NAME, info->irq);
+	info->irq = 0;
+	return -EINVAL;
+}
+
+/* Remove the GPE handler installed by acpi_gpe_irq_setup(), if any. */
+static void acpi_gpe_irq_cleanup(struct smi_info *info)
+{
+	if (info->irq)
+		acpi_remove_gpe_handler(NULL, info->irq, ipmi_acpi_gpe);
+}
+
+/*
+ * Layout of the ACPI "SPMI" table that try_init_acpi() reads.
+ *
+ * Defined at
+ * http://h21007.www2.hp.com/dspp/files/unprotected/devresource/Docs/TechPapers/IA64/hpspmi.pdf
+ */
+struct SPMITable {
+	s8	Signature[4];
+	u32	Length;
+	u8	Revision;
+	u8	Checksum;
+	s8	OEMID[6];
+	s8	OEMTableID[8];
+	s8	OEMRevision[4];
+	s8	CreatorID[4];
+	s8	CreatorRevision[4];
+	/* try_init_acpi() maps 1 to "kcs", 2 to "smic", 3 to "bt". */
+	u8	InterfaceType;
+	/* try_init_acpi() rejects the table unless this is 1. */
+	u8	IPMIlegacy;
+	s16	SpecificationRevision;
+
+	/*
+	 * Bit 0 - SCI interrupt supported
+	 * Bit 1 - I/O APIC/SAPIC
+	 */
+	u8	InterruptType;
+
+	/* If bit 0 of InterruptType is set, then this is the SCI
+	   interrupt in the GPEx_STS register. */
+	u8	GPE;
+
+	s16	Reserved;
+
+	/* If bit 1 of InterruptType is set, then this is the I/O
+	   APIC/SAPIC interrupt. */
+	u32	GlobalSystemInterrupt;
+
+	/* The actual register address. */
+	struct acpi_generic_address addr;
+
+	u8	UID[4];
+
+	s8      spmi_id[1]; /* A '\0' terminated array starts here. */
+};
+
+/*
+ * Try to build an smi_info from ACPI SPMI table instance intf_num+1.
+ *
+ * Returns 0 with *new_info set on success.  Returns -ENODEV if ACPI
+ * probing has already failed, the table is missing, IPMIlegacy is not
+ * 1, or the address is already in use; -EIO for an unknown interface
+ * type or address space; -ENOMEM if allocation fails.
+ *
+ * On the way this writes si_type[intf_num] and either addrs[intf_num]
+ * or ports[intf_num].
+ */
+static int try_init_acpi(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info  *info;
+	acpi_status      status;
+	struct SPMITable *spmi;
+	char             *io_type;
+	u8 		 addr_space;
+
+	if (acpi_failure)
+		return -ENODEV;
+
+	status = acpi_get_firmware_table("SPMI", intf_num+1,
+					 ACPI_LOGICAL_ADDRESSING,
+					 (struct acpi_table_header **) &spmi);
+	if (status != AE_OK) {
+		/* Remember the failure so later calls return early. */
+		acpi_failure = 1;
+		return -ENODEV;
+	}
+
+	if (spmi->IPMIlegacy != 1) {
+	    printk(KERN_INFO "IPMI: Bad SPMI legacy %d\n", spmi->IPMIlegacy);
+  	    return -ENODEV;
+	}
+
+	/* For the duplicate check, anything that is not system memory
+	   is treated as I/O space; unknown spaces are rejected further
+	   down. */
+	if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY)
+		addr_space = IPMI_MEM_ADDR_SPACE;
+	else
+		addr_space = IPMI_IO_ADDR_SPACE;
+	if (!is_new_interface(-1, addr_space, spmi->addr.address))
+		return -ENODEV;
+
+	/* Figure out the interface type. */
+	switch (spmi->InterfaceType)
+	{
+	case 1:	/* KCS */
+		si_type[intf_num] = "kcs";
+		break;
+
+	case 2:	/* SMIC */
+		si_type[intf_num] = "smic";
+		break;
+
+	case 3:	/* BT */
+		si_type[intf_num] = "bt";
+		break;
+
+	default:
+		printk(KERN_INFO "ipmi_si: Unknown ACPI/SPMI SI type %d\n",
+			spmi->InterfaceType);
+		return -EIO;
+	}
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (3)\n");
+		return -ENOMEM;
+	}
+	memset(info, 0, sizeof(*info));
+
+	/* Bit 0 (GPE) takes precedence over bit 1 (APIC/SAPIC) when
+	   both are set. */
+	if (spmi->InterruptType & 1) {
+		/* We've got a GPE interrupt. */
+		info->irq = spmi->GPE;
+		info->irq_setup = acpi_gpe_irq_setup;
+		info->irq_cleanup = acpi_gpe_irq_cleanup;
+	} else if (spmi->InterruptType & 2) {
+		/* We've got an APIC/SAPIC interrupt. */
+		info->irq = spmi->GlobalSystemInterrupt;
+		info->irq_setup = std_irq_setup;
+		info->irq_cleanup = std_irq_cleanup;
+	} else {
+		/* Use the default interrupt setting. */
+		info->irq = 0;
+		info->irq_setup = NULL;
+	}
+
+	if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_MEMORY) {
+		io_type = "memory";
+		info->io_setup = mem_setup;
+		info->io_cleanup = mem_cleanup;
+		addrs[intf_num] = spmi->addr.address;
+		info->io.inputb = mem_inb;
+		info->io.outputb = mem_outb;
+		info->io.info = &(addrs[intf_num]);
+	} else if (spmi->addr.address_space_id == ACPI_ADR_SPACE_SYSTEM_IO) {
+		io_type = "I/O";
+		info->io_setup = port_setup;
+		info->io_cleanup = port_cleanup;
+		ports[intf_num] = spmi->addr.address;
+		info->io.inputb = port_inb;
+		info->io.outputb = port_outb;
+		info->io.info = &(ports[intf_num]);
+	} else {
+		kfree(info);
+		printk("ipmi_si: Unknown ACPI I/O Address type\n");
+		return -EIO;
+	}
+
+	*new_info = info;
+
+	printk("ipmi_si: ACPI/SPMI specifies \"%s\" %s SI @ 0x%lx\n",
+	       si_type[intf_num], io_type, (unsigned long) spmi->addr.address);
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_X86
+
+/* IPMI interface description that decode_dmi() extracts from an
+   SMBIOS record. */
+typedef struct dmi_ipmi_data
+{
+	u8   		type;		/* interface type code */
+	u8   		addr_space;	/* IPMI_MEM_ADDR_SPACE or IPMI_IO_ADDR_SPACE */
+	unsigned long	base_addr;
+	u8   		irq;
+}dmi_ipmi_data_t;
+
+/* Header found at the start of each structure in the DMI table. */
+typedef struct dmi_header
+{
+	u8	type;
+	u8	length;		/* dmi_table() advances by this many bytes */
+	u16	handle;
+}dmi_header_t;
+
+/*
+ * Extract the IPMI interface description from the DMI structure at dm
+ * into *ipmi_data.
+ *
+ * The interface type is taken from byte 0x04, the base address from
+ * byte 0x08 onward and the irq from byte 0x11.
+ *
+ * Returns 0 if the address is not already in use by another
+ * interface.  Otherwise zeroes *ipmi_data and returns -1.
+ *
+ * NOTE(review): dm->length is not checked before reading byte 0x11 -
+ * presumably callers only pass complete type 38 records; confirm.
+ */
+static int decode_dmi(dmi_header_t *dm, dmi_ipmi_data_t *ipmi_data)
+{
+	u8		*data = (u8 *)dm;
+	unsigned long  	base_addr;
+
+	ipmi_data->type = data[0x04];
+
+	/* Copied bytewise, so the field may sit at any alignment.  The
+	   number of bytes taken is sizeof(unsigned long). */
+	memcpy(&base_addr,&data[0x08],sizeof(unsigned long));
+	/* Bit 0 of the base address selects the address space. */
+	if (base_addr & 1) {
+		/* I/O */
+		base_addr &= 0xFFFE;
+		ipmi_data->addr_space = IPMI_IO_ADDR_SPACE;
+	}
+	else {
+		/* Memory */
+		ipmi_data->addr_space = IPMI_MEM_ADDR_SPACE;
+	}
+
+	ipmi_data->base_addr = base_addr;
+	ipmi_data->irq = data[0x11];
+
+	if (is_new_interface(-1, ipmi_data->addr_space,ipmi_data->base_addr))
+	    return 0;
+
+	/* Already configured elsewhere: hand back an empty result. */
+	memset(ipmi_data,0,sizeof(dmi_ipmi_data_t));
+
+	return -1;
+}
+
+/*
+ * Map the DMI table (len bytes at physical address base) and walk its
+ * structures looking for a type 38 record that decode_dmi() accepts.
+ *
+ * Returns 0 with *ipmi_data filled in when one is found, -1 if the
+ * table cannot be mapped or no acceptable record exists.
+ *
+ * NOTE(review): the counter starts at 1 and the loop runs while
+ * i < num, so at most num-1 structures are visited - confirm whether
+ * that is intended.
+ */
+static int dmi_table(u32 base, int len, int num,
+	dmi_ipmi_data_t *ipmi_data)
+{
+	u8 		  *buf;
+	struct dmi_header *dm;
+	u8 		  *data;
+	int 		  i=1;
+	int		  status=-1;
+
+	buf = ioremap(base, len);
+	if(buf==NULL)
+		return -1;
+
+	data = buf;
+
+	while(i<num && (data - buf) < len)
+	{
+		dm=(dmi_header_t *)data;
+
+		/* Stop if the formatted part of this structure would
+		   reach the end of the table. */
+		if((data-buf+dm->length) >= len)
+        		break;
+
+		if (dm->type == 38) {
+			if (decode_dmi(dm, ipmi_data) == 0) {
+				status = 0;
+				break;
+			}
+		}
+
+		/* Skip the formatted part, then scan forward to the
+		   two consecutive zero bytes and step past them. */
+	        data+=dm->length;
+		while((data-buf) < len && (*data || data[1]))
+			data++;
+		data+=2;
+		i++;
+	}
+	iounmap(buf);
+
+	return status;
+}
+
+/* Returns non-zero when the first 15 bytes of buf sum to zero
+   modulo 256. */
+inline static int dmi_checksum(u8 *buf)
+{
+	u8  total = 0;
+	u8  *end = buf + 15;
+
+	while (buf < end)
+		total += *buf++;
+
+	return (total == 0);
+}
+
+/*
+ * Scan addresses 0xF0000 up to 0xFFFFF in 16 byte steps for a "_DMI_"
+ * anchor whose 15 bytes pass dmi_checksum(), and search the table each
+ * anchor describes with dmi_table().
+ *
+ * Returns 0 with *ipmi_data filled in as soon as a table yields an
+ * IPMI record, -1 if none does.  Always -1 under CONFIG_SIMNOW.
+ */
+static int dmi_iterator(dmi_ipmi_data_t *ipmi_data)
+{
+	u8   buf[15];
+	u32  fp=0xF0000;
+
+#ifdef CONFIG_SIMNOW
+	return -1;
+#endif
+
+	while(fp < 0xFFFFF)
+	{
+		isa_memcpy_fromio(buf, fp, 15);
+		if(memcmp(buf, "_DMI_", 5)==0 && dmi_checksum(buf))
+		{
+			/* Little-endian fields of the anchor: structure
+			   count in bytes 12-13, table length in bytes
+			   6-7, table address in bytes 8-11. */
+			u16 num=buf[13]<<8|buf[12];
+			u16 len=buf[7]<<8|buf[6];
+			u32 base=buf[11]<<24|buf[10]<<16|buf[9]<<8|buf[8];
+
+			if(dmi_table(base, len, num, ipmi_data) == 0)
+				return 0;
+		}
+		fp+=16;
+	}
+
+	return -1;
+}
+
+/*
+ * Try to build an smi_info from the IPMI record found in the SMBIOS
+ * (DMI) table.
+ *
+ * Returns 0 with *new_info set on success, -ENODEV if no usable record
+ * is found, -EIO for an unknown interface type or address space, and
+ * -ENOMEM if allocation fails.
+ *
+ * On the way this writes si_type[intf_num], irqs[intf_num] and either
+ * addrs[intf_num] or ports[intf_num].
+ */
+static int try_init_smbios(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info   *info;
+	dmi_ipmi_data_t   ipmi_data;
+	char              *io_type;
+	int               status;
+
+	status = dmi_iterator(&ipmi_data);
+
+	if (status < 0)
+		return -ENODEV;
+
+	switch(ipmi_data.type) {
+		case 0x01: /* KCS */
+			si_type[intf_num] = "kcs";
+			break;
+		case 0x02: /* SMIC */
+			si_type[intf_num] = "smic";
+			break;
+		case 0x03: /* BT */
+			si_type[intf_num] = "bt";
+			break;
+		default:
+			printk("ipmi_si: Unknown SMBIOS SI type.\n");
+			return -EIO;
+	}
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (4)\n");
+		return -ENOMEM;
+	}
+	memset(info, 0, sizeof(*info));
+
+	/* addr_space is stored by decode_dmi() using the IPMI_*_ADDR_SPACE
+	   constants, so compare against the same names rather than
+	   their numeric values. */
+	if (ipmi_data.addr_space == IPMI_MEM_ADDR_SPACE) {
+		io_type = "memory";
+		info->io_setup = mem_setup;
+		info->io_cleanup = mem_cleanup;
+		addrs[intf_num] = ipmi_data.base_addr;
+		info->io.inputb = mem_inb;
+		info->io.outputb = mem_outb;
+		info->io.info = &(addrs[intf_num]);
+	} else if (ipmi_data.addr_space == IPMI_IO_ADDR_SPACE) {
+		io_type = "I/O";
+		info->io_setup = port_setup;
+		info->io_cleanup = port_cleanup;
+		ports[intf_num] = ipmi_data.base_addr;
+		info->io.inputb = port_inb;
+		info->io.outputb = port_outb;
+		info->io.info = &(ports[intf_num]);
+	} else {
+		kfree(info);
+		printk("ipmi_si: Unknown SMBIOS I/O Address type.\n");
+		return -EIO;
+	}
+
+	irqs[intf_num] = ipmi_data.irq;
+
+	*new_info = info;
+
+	printk("ipmi_si: Found SMBIOS-specified state machine at %s"
+	       " address 0x%lx\n",
+	       io_type, (unsigned long)ipmi_data.base_addr);
+	return 0;
+}
+#endif /* CONFIG_X86 */
+
+#ifdef CONFIG_PCI
+
+#define PCI_ERMC_CLASSCODE  0x0C0700
+#define PCI_HP_VENDOR_ID    0x103C
+#define PCI_MMC_DEVICE_ID   0x121A
+#define PCI_MMC_ADDR_CW     0x10
+
+/* Avoid more than one attempt to probe pci smic. */
+static int pci_smic_checked = 0;
+
+/*
+ * Probe once for an HP PCI device carrying a port-mapped SMIC
+ * interface and build an smi_info for it.
+ *
+ * The device is matched either by vendor/device ID or, failing that,
+ * by class code with an HP subsystem vendor (fe_rmc is set in that
+ * second case).
+ *
+ * Returns 0 with *new_info set on success, -ENODEV if the probe has
+ * already been done, no device matches, the config read fails, the
+ * device is memory mapped or the port is already in use, and -ENOMEM
+ * if allocation fails.
+ *
+ * On success this writes ports[intf_num], irqs[intf_num] and
+ * si_type[intf_num].
+ */
+static int find_pci_smic(int intf_num, struct smi_info **new_info)
+{
+	struct smi_info  *info;
+	int              error;
+	struct pci_dev   *pci_dev = NULL;
+	u16    		 base_addr;
+	int              fe_rmc = 0;
+
+	if (pci_smic_checked)
+		return -ENODEV;
+
+	pci_smic_checked = 1;
+
+	if ((pci_dev = pci_find_device(PCI_HP_VENDOR_ID, PCI_MMC_DEVICE_ID,
+				       NULL)))
+		;
+	else if ((pci_dev = pci_find_class(PCI_ERMC_CLASSCODE, NULL)) &&
+		 pci_dev->subsystem_vendor == PCI_HP_VENDOR_ID)
+		fe_rmc = 1;
+	else
+		return -ENODEV;
+
+	error = pci_read_config_word(pci_dev, PCI_MMC_ADDR_CW, &base_addr);
+	if (error)
+	{
+		printk(KERN_ERR
+		       "ipmi_si: pci_read_config_word() failed (%d).\n",
+		       error);
+		return -ENODEV;
+	}
+
+	/* Bit 0: 1 specifies programmed I/O, 0 specifies memory mapped I/O */
+	if (!(base_addr & 0x0001))
+	{
+		printk(KERN_ERR
+		       "ipmi_si: memory mapped I/O not supported for PCI"
+		       " smic.\n");
+		return -ENODEV;
+	}
+
+	base_addr &= 0xFFFE;
+	/* NOTE(review): the increment is applied when fe_rmc is NOT
+	   set, while the comment below speaks of eRMC - confirm which
+	   device really needs the +1. */
+	if (!fe_rmc)
+		/* Data register starts at base address + 1 in eRMC */
+		++base_addr;
+
+	if (!is_new_interface(-1, IPMI_IO_ADDR_SPACE, base_addr))
+	    return -ENODEV;
+
+	info = kmalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		printk(KERN_ERR "ipmi_si: Could not allocate SI data (5)\n");
+		return -ENOMEM;
+	}
+	memset(info, 0, sizeof(*info));
+
+	info->io_setup = port_setup;
+	info->io_cleanup = port_cleanup;
+	ports[intf_num] = base_addr;
+	info->io.inputb = port_inb;
+	info->io.outputb = port_outb;
+	info->io.info = &(ports[intf_num]);
+
+	*new_info = info;
+
+	irqs[intf_num] = pci_dev->irq;
+	si_type[intf_num] = "smic";
+
+	printk("ipmi_si: Found PCI SMIC at I/O address 0x%lx\n",
+		(long unsigned int) base_addr);
+
+	return 0;
+}
+#endif /* CONFIG_PCI */
+
+/*
+ * Try the bus-discovery probes in turn.  Currently only the PCI SMIC
+ * probe exists, and only with CONFIG_PCI.  Returns 0 with *new_info
+ * set if a probe found an interface, -ENODEV otherwise.
+ */
+static int try_init_plug_and_play(int intf_num, struct smi_info **new_info)
+{
+#ifdef CONFIG_PCI
+	if (!find_pci_smic(intf_num, new_info))
+		return 0;
+#endif
+	/* Include other methods here. */
+
+	return -ENODEV;
+}
+
+
+/*
+ * Send a Get Device ID command through the state machine, polling it
+ * to completion, and record the revision fields from the response in
+ * smi_info.
+ *
+ * Returns 0 on success, -ENOMEM if the response buffer cannot be
+ * allocated, -ENODEV if the state machine reports SI_SM_HOSED, and
+ * -EINVAL if the response is too short, is for a different command, or
+ * carries a non-zero completion code.
+ *
+ * Sleeps between polls, so it must be called from a context that can
+ * schedule.
+ */
+static int try_get_dev_id(struct smi_info *smi_info)
+{
+	unsigned char      msg[2];
+	unsigned char      *resp;
+	unsigned long      resp_len;
+	enum si_sm_result smi_result;
+	int               rv = 0;
+
+	resp = kmalloc(IPMI_MAX_MSG_LENGTH, GFP_KERNEL);
+	if (!resp)
+		return -ENOMEM;
+
+	/* Do a Get Device ID command, since it comes back with some
+	   useful info. */
+	msg[0] = IPMI_NETFN_APP_REQUEST << 2;
+	msg[1] = IPMI_GET_DEVICE_ID_CMD;
+	smi_info->handlers->start_transaction(smi_info->si_sm, msg, 2);
+
+	/* Run the state machine until it stops asking to be called
+	   again, sleeping one tick whenever it asks for a delay. */
+	smi_result = smi_info->handlers->event(smi_info->si_sm, 0);
+	for (;;)
+	{
+		if (smi_result == SI_SM_CALL_WITH_DELAY) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(1);
+			smi_result = smi_info->handlers->event(
+				smi_info->si_sm, 100);
+		}
+		else if (smi_result == SI_SM_CALL_WITHOUT_DELAY)
+		{
+			smi_result = smi_info->handlers->event(
+				smi_info->si_sm, 0);
+		}
+		else
+			break;
+	}
+	if (smi_result == SI_SM_HOSED) {
+		/* We couldn't get the state machine to run, so whatever's at
+		   the port is probably not an IPMI SMI interface. */
+		rv = -ENODEV;
+		goto out;
+	}
+
+	/* Otherwise, we got some data. */
+	resp_len = smi_info->handlers->get_result(smi_info->si_sm,
+						  resp, IPMI_MAX_MSG_LENGTH);
+	if (resp_len < 8) {
+		/* That's odd, it should be longer.  Bytes up to
+		   resp[7] are read below and the buffer is not
+		   zeroed, so anything shorter would mean recording
+		   uninitialized data. */
+		rv = -EINVAL;
+		goto out;
+	}
+
+	if ((resp[1] != IPMI_GET_DEVICE_ID_CMD) || (resp[2] != 0)) {
+		/* That's odd, it shouldn't be able to fail. */
+		rv = -EINVAL;
+		goto out;
+	}
+
+	/* Record info from the get device id, in case we need it. */
+	smi_info->ipmi_si_dev_rev = resp[4] & 0xf;
+	smi_info->ipmi_si_fw_rev_major = resp[5] & 0x7f;
+	smi_info->ipmi_si_fw_rev_minor = resp[6];
+	/* The version byte carries the major number in its low nibble
+	   and the minor number in its high nibble. */
+	smi_info->ipmi_version_major = resp[7] & 0xf;
+	smi_info->ipmi_version_minor = resp[7] >> 4;
+
+ out:
+	kfree(resp);
+	return rv;
+}
+
+static int type_file_read_proc(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	char            *out = (char *) page;
+	struct smi_info *smi = data;
+	/* Write the interface type name into page; return its length. */
+	switch (smi->si_type) {
+	    case SI_KCS:
+		return sprintf(out, "kcs\n");
+	    case SI_SMIC:
+		return sprintf(out, "smic\n");
+	    case SI_BT:
+		return sprintf(out, "bt\n");
+	    default:
+		return 0;	/* unknown type: write nothing */
+	}
+}
+
+static int stat_file_read_proc(char *page, char **start, off_t off,
+			       int count, int *eof, void *data)
+{
+	char            *out = (char *) page;
+	struct smi_info *smi = data;
+	/* Append one line per counter of this interface to the page. */
+	out += sprintf(out, "interrupts_enabled:    %d\n",
+		       smi->irq && !smi->interrupt_disabled);
+	out += sprintf(out, "short_timeouts:        %ld\n",
+		       smi->short_timeouts);
+	out += sprintf(out, "long_timeouts:         %ld\n",
+		       smi->long_timeouts);
+	out += sprintf(out, "timeout_restarts:      %ld\n",
+		       smi->timeout_restarts);
+	out += sprintf(out, "idles:                 %ld\n",
+		       smi->idles);
+	out += sprintf(out, "interrupts:            %ld\n",
+		       smi->interrupts);
+	out += sprintf(out, "attentions:            %ld\n",
+		       smi->attentions);
+	out += sprintf(out, "flag_fetches:          %ld\n",
+		       smi->flag_fetches);
+	out += sprintf(out, "hosed_count:           %ld\n",
+		       smi->hosed_count);
+	out += sprintf(out, "complete_transactions: %ld\n",
+		       smi->complete_transactions);
+	out += sprintf(out, "events:                %ld\n",
+		       smi->events);
+	out += sprintf(out, "watchdog_pretimeouts:  %ld\n",
+		       smi->watchdog_pretimeouts);
+	out += sprintf(out, "incoming_messages:     %ld\n",
+		       smi->incoming_messages);
+
+	return (out - ((char *) page));	/* number of bytes written */
+}
+
+/* Probe and start interface intf_num; returns 0 or a negative errno. */
+static int init_one_smi(int intf_num, struct smi_info **smi)
+{
+	int		rv;
+	struct smi_info *new_smi;
+
+	rv = try_init_mem(intf_num, &new_smi);
+	if (rv)
+		rv = try_init_port(intf_num, &new_smi);
+#ifdef CONFIG_ACPI_INTERPRETER
+	if ((rv) && (si_trydefaults)) {
+		rv = try_init_acpi(intf_num, &new_smi);
+	}
+#endif
+#ifdef CONFIG_X86
+	if ((rv) && (si_trydefaults)) {
+		rv = try_init_smbios(intf_num, &new_smi);
+	}
+#endif
+	if ((rv) && (si_trydefaults)) {
+		rv = try_init_plug_and_play(intf_num, &new_smi);
+	}
+
+
+	if (rv)
+		return rv;
+
+	/* So we know not to free it unless we have allocated one. */
+	new_smi->intf = NULL;
+	new_smi->si_sm = NULL;
+	new_smi->handlers = NULL;
+
+	if (!new_smi->irq_setup) {
+		new_smi->irq = irqs[intf_num];
+		new_smi->irq_setup = std_irq_setup;
+		new_smi->irq_cleanup = std_irq_cleanup;
+	}
+
+	/* Default to KCS if no type is specified. */
+	if (si_type[intf_num] == NULL) {
+		if (si_trydefaults)
+			si_type[intf_num] = "kcs";
+		else {
+			rv = -EINVAL;
+			goto out_err;
+		}
+	}
+
+	/* Set up the state machine to use. */
+	if (strcmp(si_type[intf_num], "kcs") == 0) {
+		new_smi->handlers = &kcs_smi_handlers;
+		new_smi->si_type = SI_KCS;
+	} else if (strcmp(si_type[intf_num], "smic") == 0) {
+		new_smi->handlers = &smic_smi_handlers;
+		new_smi->si_type = SI_SMIC;
+	} else if (strcmp(si_type[intf_num], "bt") == 0) {
+		new_smi->handlers = &bt_smi_handlers;
+		new_smi->si_type = SI_BT;
+	} else {
+		/* No support for anything else yet. */
+		rv = -EIO;
+		goto out_err;
+	}
+
+	/* Allocate the state machine's data and initialize it. */
+	new_smi->si_sm = kmalloc(new_smi->handlers->size(), GFP_KERNEL);
+	if (!new_smi->si_sm) {
+		printk(" Could not allocate state machine memory\n");
+		rv = -ENOMEM;
+		goto out_err;
+	}
+	new_smi->io_size = new_smi->handlers->init_data(new_smi->si_sm,
+							&new_smi->io);
+
+	/* Now that we know the I/O size, we can set up the I/O. */
+	rv = new_smi->io_setup(new_smi);
+	if (rv) {
+		printk(" Could not set up I/O space\n");
+		goto out_err;
+	}
+
+	spin_lock_init(&(new_smi->si_lock));
+	spin_lock_init(&(new_smi->msg_lock));
+	spin_lock_init(&(new_smi->count_lock));
+
+	/* Do low-level detection first. */
+	if (new_smi->handlers->detect(new_smi->si_sm)) {
+		rv = -ENODEV;
+		goto out_err;
+	}
+
+	/* Attempt a get device id command.  If it fails, we probably
+	   don't have a SMI here. */
+	rv = try_get_dev_id(new_smi);
+	if (rv)
+		goto out_err;
+
+	/* Try to claim any interrupts. */
+	new_smi->irq_setup(new_smi);
+
+	INIT_LIST_HEAD(&(new_smi->xmit_msgs));
+	INIT_LIST_HEAD(&(new_smi->hp_xmit_msgs));
+	new_smi->curr_msg = NULL;
+	atomic_set(&new_smi->req_events, 0);
+	new_smi->run_to_completion = 0;
+
+	rv = ipmi_register_smi(&handlers,
+			       new_smi,
+			       new_smi->ipmi_version_major,
+			       new_smi->ipmi_version_minor,
+			       &(new_smi->intf));
+	if (rv) {
+		printk(KERN_ERR
+		       "ipmi_si: Unable to register device: error %d\n",
+		       rv);
+		goto out_err;
+	}
+
+	rv = ipmi_smi_add_proc_entry(new_smi->intf, "type",
+				     type_file_read_proc, NULL,
+				     new_smi, THIS_MODULE);
+	if (rv) {
+		printk(KERN_ERR
+		       "ipmi_si: Unable to create proc entry: %d\n",
+		       rv);
+		goto out_err;
+	}
+
+	rv = ipmi_smi_add_proc_entry(new_smi->intf, "si_stats",
+				     stat_file_read_proc, NULL,
+				     new_smi, THIS_MODULE);
+	if (rv) {
+		printk(KERN_ERR
+		       "ipmi_si: Unable to create proc entry: %d\n",
+		       rv);
+		goto out_err;
+	}
+
+	start_clear_flags(new_smi);
+
+	/* IRQ is defined to be set when non-zero. */
+	if (new_smi->irq)
+		new_smi->si_state = SI_CLEARING_FLAGS_THEN_SET_IRQ;
+
+	new_smi->interrupt_disabled = 0;
+	new_smi->timer_stopped = 0;
+	new_smi->stop_operation = 0;
+
+	init_timer(&(new_smi->si_timer));
+	new_smi->si_timer.data = (long) new_smi;
+	new_smi->si_timer.function = smi_timeout;
+	new_smi->last_timeout_jiffies = jiffies;
+	new_smi->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES;
+	add_timer(&(new_smi->si_timer));
+
+	*smi = new_smi;
+
+	printk(" IPMI %s interface initialized\n", si_type[intf_num]);
+
+	return 0;
+
+ out_err:
+	if (new_smi->intf)
+		ipmi_unregister_smi(new_smi->intf);
+
+	new_smi->irq_cleanup(new_smi);
+	if (new_smi->si_sm) {
+		if (new_smi->handlers)
+			new_smi->handlers->cleanup(new_smi->si_sm);
+		kfree(new_smi->si_sm);
+	}
+	new_smi->io_cleanup(new_smi);
+	kfree(new_smi);	/* probe-allocated and not yet published via *smi */
+	return rv;
+}
+
+static __init int init_ipmi_si(void)
+{
+	int  rv = 0;
+	int  pos = 0;
+	int  i;
+	char *str;
+
+	if (initialized)
+		return 0;
+	initialized = 1;
+
+	/* Split the comma-separated si_type string in place; si_type[i]
+	   then points at the i-th component. */
+	str = si_type_str;
+	if (*str != '\0') {
+		for (i=0; (i<SI_MAX_PARMS) && (*str != '\0'); i++) {
+			si_type[i] = str;
+			str = strchr(str, ',');
+			if (str) {
+				*str = '\0';
+				str++;
+			} else {
+				break;
+			}
+		}
+	}
+
+	printk(KERN_INFO "IPMI System Interface driver version "
+	       IPMI_SI_VERSION);
+	if (kcs_smi_handlers.version)
+		printk(", KCS version %s", kcs_smi_handlers.version);
+	if (smic_smi_handlers.version)
+		printk(", SMIC version %s", smic_smi_handlers.version);
+	if (bt_smi_handlers.version)
+		printk(", BT version %s", bt_smi_handlers.version);
+	printk("\n");
+
+	rv = init_one_smi(0, &(smi_infos[pos]));
+	if (rv && !ports[0] && si_trydefaults) {
+		/* If we are trying defaults and the initial port is
+		   not set, then probe the default KCS, SMIC and BT ports. */
+		si_type[0] = "kcs";
+		ports[0] = DEFAULT_KCS_IO_PORT;
+		rv = init_one_smi(0, &(smi_infos[pos]));
+		if (rv) {
+			/* No KCS - try SMIC */
+			si_type[0] = "smic";
+			ports[0] = DEFAULT_SMIC_IO_PORT;
+			rv = init_one_smi(0, &(smi_infos[pos]));
+		}
+		if (rv) {
+			/* No SMIC - try BT */
+			si_type[0] = "bt";
+			ports[0] = DEFAULT_BT_IO_PORT;
+			rv = init_one_smi(0, &(smi_infos[pos]));
+		}
+	}
+	if (rv == 0)
+		pos++;
+
+	for (i=1; i < SI_MAX_PARMS; i++) {
+		rv = init_one_smi(i, &(smi_infos[pos]));
+		if (rv == 0)
+			pos++;
+	}
+
+	if (smi_infos[0] == NULL) {
+		printk(KERN_ERR
+		       "ipmi_si: Unable to find any System Interface(s)\n");
+		return -ENODEV;
+	}
+	return 0;
+}
+module_init(init_ipmi_si);
+
+void __exit cleanup_one_si(struct smi_info *to_clean)
+{
+	int           rv;
+	unsigned long flags;
+
+	if (! to_clean)
+		return;
+
+	/* Tell the timer and interrupt handlers that we are shutting
+	   down. */
+	spin_lock_irqsave(&(to_clean->si_lock), flags);
+	spin_lock(&(to_clean->msg_lock));
+
+	to_clean->stop_operation = 1;
+
+	to_clean->irq_cleanup(to_clean);
+
+	spin_unlock(&(to_clean->msg_lock));
+	spin_unlock_irqrestore(&(to_clean->si_lock), flags);
+
+	/* Wait until we know that we are out of any interrupt
+	   handlers that might have been running before we freed
+	   the interrupt. */
+	synchronize_kernel();
+
+	/* Wait for the timer to stop.  This avoids problems with race
+	   conditions removing the timer here. */
+	while (!to_clean->timer_stopped) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(1);
+	}
+
+	rv = ipmi_unregister_smi(to_clean->intf);
+	if (rv) {
+		printk(KERN_ERR
+		       "ipmi_si: Unable to unregister device: errno=%d\n",
+		       rv);
+	}
+
+	to_clean->handlers->cleanup(to_clean->si_sm);
+
+	kfree(to_clean->si_sm);
+	/* NOTE(review): to_clean itself is never kfree()d here; confirm. */
+	to_clean->io_cleanup(to_clean);
+}
+
+static __exit void cleanup_ipmi_si(void)
+{
+	int i;
+
+	if (!initialized)
+		return;
+	/* cleanup_one_si() returns at once for NULL (unused) slots. */
+	for (i=0; i<SI_MAX_DRIVERS; i++) {
+		cleanup_one_si(smi_infos[i]);
+	}
+}
+module_exit(cleanup_ipmi_si);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/char/ipmi/ipmi_si_sm.h b/drivers/char/ipmi/ipmi_si_sm.h
new file mode 100644
index 000000000000..f3506552c5a5
--- /dev/null
+++ b/drivers/char/ipmi/ipmi_si_sm.h
@@ -0,0 +1,117 @@
+/*
+ * ipmi_si_sm.h
+ *
+ * State machine interface for low-level IPMI system management
+ * interface state machines.  This code is the interface between
+ * the ipmi_smi code (that handles the policy of a KCS, SMIC, or
+ * BT interface) and the actual low-level state machine.
+ *
+ * Author: MontaVista Software, Inc.
+ *         Corey Minyard <minyard@mvista.com>
+ *         source@mvista.com
+ *
+ * Copyright 2002 MontaVista Software Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* This is defined by the state machines themselves, it is an opaque
+   data type for them to use. */
+struct si_sm_data;
+
+/* The structure for doing I/O in the state machine.  The state
+   machine does not contain the actual I/O routines; all register
+   access goes through the callbacks in this interface. */
+struct si_sm_io
+{
+	unsigned char (*inputb)(struct si_sm_io *io, unsigned int offset);
+	void (*outputb)(struct si_sm_io *io,
+			unsigned int  offset,
+			unsigned char b);
+
+	/* Generic info used by the actual handling routines; the
+	   state machine should not touch these. */
+	void *info;
+	void *addr;
+};
+
+/* Results returned by the event() handler of a state machine. */
+enum si_sm_result
+{
+	SI_SM_CALL_WITHOUT_DELAY, /* Call the driver again immediately */
+	SI_SM_CALL_WITH_DELAY,	/* Wait a while before calling again. */
+	SI_SM_TRANSACTION_COMPLETE, /* A transaction is finished. */
+	SI_SM_IDLE,		/* The SM is in idle state. */
+	SI_SM_HOSED,		/* The hardware violated the state machine. */
+	SI_SM_ATTN		/* The hardware is asserting attn and the
+				   state machine is idle. */
+};
+
+/* Handlers for the SMI state machine. */
+struct si_sm_handlers
+{
+	/* Put the version number of the state machine here so the
+	   upper layer can print it. */
+	char *version;
+
+	/* Initialize the data and return the amount of I/O space to
+	   reserve for the interface. */
+	unsigned int (*init_data)(struct si_sm_data *smi,
+				  struct si_sm_io   *io);
+
+	/* Start a new transaction in the state machine.  This will
+	   return -2 if the state machine is not idle, -1 if the size
+	   is invalid (too large or too small), or 0 if the transaction
+	   is successfully started. */
+	int (*start_transaction)(struct si_sm_data *smi,
+				 unsigned char *data, unsigned int size);
+
+	/* Return the results after the transaction.  This will return
+	   -1 if the buffer is too small, zero if no transaction is
+	   present, or the actual length of the result data. */
+	int (*get_result)(struct si_sm_data *smi,
+			  unsigned char *data, unsigned int length);
+
+	/* Call this periodically (for a polled interface) or upon
+	   receiving an interrupt (for an interrupt-driven interface).
+	   If interrupt driven, you should probably poll this
+	   periodically when not in idle state.  This should be called
+	   with the time that passed since the last call, if it is
+	   significant.  Time is in microseconds. */
+	enum si_sm_result (*event)(struct si_sm_data *smi, long time);
+
+	/* Attempt to detect an SMI.  Returns 0 on success or nonzero
+	   on failure. */
+	int (*detect)(struct si_sm_data *smi);
+
+	/* The interface is shutting down, so clean it up. */
+	void (*cleanup)(struct si_sm_data *smi);
+
+	/* Return the size of the SMI structure in bytes. */
+	int (*size)(void);
+};
+
+/* Current state machines that we can use. */
+extern struct si_sm_handlers kcs_smi_handlers;
+extern struct si_sm_handlers smic_smi_handlers;
+extern struct si_sm_handlers bt_smi_handlers;
+
diff --git a/drivers/char/ipmi/ipmi_smic_sm.c b/drivers/char/ipmi/ipmi_smic_sm.c
new file mode 100644
index 000000000000..7bd7041e3d2f
--- /dev/null
+++ b/drivers/char/ipmi/ipmi_smic_sm.c
@@ -0,0 +1,599 @@
+/*
+ * ipmi_smic_sm.c
+ *
+ * The state-machine driver for an IPMI SMIC driver
+ *
+ * It started as a copy of Corey Minyard's driver for the KCS interface
+ * and the kernel patch "mmcdev-patch-245" by HP
+ *
+ * modified by:	Hannes Schulz <schulz@schwaar.com>
+ *		ipmi@schwaar.com
+ *
+ *
+ * Corey Minyard's driver for the KCS interface has the following
+ * copyright notice:
+ *   Copyright 2002 MontaVista Software Inc.
+ *
+ * the kernel patch "mmcdev-patch-245" by HP has the following
+ * copyright notice:
+ * (c) Copyright 2001 Grant Grundler (c) Copyright
+ * 2001 Hewlett-Packard Company
+ *
+ *
+ *  This program is free software; you can redistribute it and/or modify it
+ *  under the terms of the GNU General Public License as published by the
+ *  Free Software Foundation; either version 2 of the License, or (at your
+ *  option) any later version.
+ *
+ *
+ *  THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ *  WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ *  ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ *  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ *  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *  You should have received a copy of the GNU General Public License along
+ *  with this program; if not, write to the Free Software Foundation, Inc.,
+ *  675 Mass Ave, Cambridge, MA 02139, USA.  */
+
+#include <linux/kernel.h> /* For printk. */
+#include <linux/string.h>
+#include <linux/ipmi_msgdefs.h>		/* for completion codes */
+#include "ipmi_si_sm.h"
+
+#define IPMI_SMIC_VERSION "v31"
+
+/* smic_debug is a bit-field
+ *	SMIC_DEBUG_ENABLE -	turned on for now
+ *	SMIC_DEBUG_MSG -	commands and their responses
+ *	SMIC_DEBUG_STATES -	state machine
+*/
+#define SMIC_DEBUG_STATES	4
+#define SMIC_DEBUG_MSG		2
+#define	SMIC_DEBUG_ENABLE	1
+
+static int smic_debug = 1;
+
+enum smic_states {
+	SMIC_IDLE,		/* no transaction in progress */
+	SMIC_START_OP,		/* transaction queued; GET_STATUS comes next */
+	SMIC_OP_OK,		/* GET_STATUS issued; expect READY status */
+	SMIC_WRITE_START,	/* first byte sent with WR_START */
+	SMIC_WRITE_NEXT,	/* middle byte sent with WR_NEXT */
+	SMIC_WRITE_END,		/* last byte sent with WR_END */
+	SMIC_WRITE2READ,	/* waiting for RX_DATA_READY */
+	SMIC_READ_START,	/* RD_START issued */
+	SMIC_READ_NEXT,		/* RD_NEXT issued */
+	SMIC_READ_END,		/* RD_END issued; expect READY status */
+	SMIC_HOSED		/* error retries exhausted */
+};
+
+#define MAX_SMIC_READ_SIZE 80
+#define MAX_SMIC_WRITE_SIZE 80
+#define SMIC_MAX_ERROR_RETRIES 3
+
+/* Timeouts in microseconds. */
+#define SMIC_RETRY_TIMEOUT 100000
+
+/* SMIC Flags Register Bits */
+#define SMIC_RX_DATA_READY	0x80
+#define SMIC_TX_DATA_READY	0x40
+#define SMIC_SMI		0x10
+#define SMIC_EVM_DATA_AVAIL	0x08
+#define SMIC_SMS_DATA_AVAIL	0x04
+#define SMIC_FLAG_BSY		0x01
+
+/* SMIC Error Codes */
+#define	EC_NO_ERROR		0x00
+#define	EC_ABORTED		0x01
+#define	EC_ILLEGAL_CONTROL	0x02
+#define	EC_NO_RESPONSE		0x03
+#define	EC_ILLEGAL_COMMAND	0x04
+#define	EC_BUFFER_FULL		0x05
+
+struct si_sm_data
+{
+	enum smic_states state;		/* current state */
+	struct si_sm_io *io;		/* register access callbacks */
+        unsigned char	 write_data[MAX_SMIC_WRITE_SIZE]; /* request copy */
+        int		 write_pos;	/* index of the next byte to send */
+        int		 write_count;	/* bytes still to send */
+        int		 orig_write_count; /* request size, kept for retries */
+        unsigned char	 read_data[MAX_SMIC_READ_SIZE];	/* response */
+        int		 read_pos;	/* response bytes stored so far */
+        int		 truncated;	/* response did not fit */
+        unsigned int	 error_retries;	/* recoveries for this transaction */
+        long		 smic_timeout;	/* microseconds left before timeout */
+};
+
+static unsigned int init_smic_data (struct si_sm_data *smic,
+				    struct si_sm_io *io)
+{
+	smic->state = SMIC_IDLE;
+	smic->io = io;
+	smic->write_pos = 0;
+	smic->write_count = 0;
+	smic->orig_write_count = 0;
+	smic->read_pos = 0;
+	smic->error_retries = 0;
+	smic->truncated = 0;
+	smic->smic_timeout = SMIC_RETRY_TIMEOUT;
+
+	/* We use 3 bytes of I/O: data (0), control/status (1), flags (2). */
+	return 3;
+}
+
+static int start_smic_transaction(struct si_sm_data *smic,
+				  unsigned char *data, unsigned int size)
+{
+	unsigned int i;
+	/* Queue a request; smic_event() then performs the transfer. */
+	if ((size < 2) || (size > MAX_SMIC_WRITE_SIZE)) {
+		return -1;	/* invalid size */
+	}
+	if ((smic->state != SMIC_IDLE) && (smic->state != SMIC_HOSED)) {
+		return -2;	/* a transaction is already in progress */
+	}
+	if (smic_debug & SMIC_DEBUG_MSG) {
+		printk(KERN_INFO "start_smic_transaction -");
+		for (i = 0; i < size; i ++) {
+			printk (" %02x", (unsigned char) (data [i]));
+		}
+		printk ("\n");
+	}
+	smic->error_retries = 0;
+	memcpy(smic->write_data, data, size);
+	smic->write_count = size;
+	smic->orig_write_count = size;	/* kept so a retry can resend */
+	smic->write_pos = 0;
+	smic->read_pos = 0;
+	smic->state = SMIC_START_OP;
+	smic->smic_timeout = SMIC_RETRY_TIMEOUT;
+	return 0;
+}
+
+static int smic_get_result(struct si_sm_data *smic,
+			   unsigned char *data, unsigned int length)
+{
+	int i;
+
+	if (smic_debug & SMIC_DEBUG_MSG) {
+		printk (KERN_INFO "smic_get result -");
+		for (i = 0; i < smic->read_pos; i ++) {
+			printk (" %02x", (smic->read_data [i]));
+		}
+		printk ("\n");
+	}
+	if (length < smic->read_pos) {
+		/* Caller buffer is too small: copy only what fits. */
+		smic->read_pos = length;
+		smic->truncated = 1;
+	}
+	memcpy(data, smic->read_data, smic->read_pos);
+	if ((length >= 3) && (smic->read_pos < 3)) {
+		data[2] = IPMI_ERR_UNSPECIFIED;
+		smic->read_pos = 3;
+	}
+	/* Flag truncation only if the caller buffer holds data[2]. */
+	if (smic->truncated && (length >= 3))
+		data[2] = IPMI_ERR_MSG_TRUNCATED;
+	smic->truncated = 0;
+	return smic->read_pos;
+}
+
+static inline unsigned char read_smic_flags(struct si_sm_data *smic)
+{
+	return smic->io->inputb(smic->io, 2);	/* flags register: offset 2 */
+}
+
+static inline unsigned char read_smic_status(struct si_sm_data *smic)
+{
+	return smic->io->inputb(smic->io, 1);	/* status: read of offset 1 */
+}
+
+static inline unsigned char read_smic_data(struct si_sm_data *smic)
+{
+	return smic->io->inputb(smic->io, 0);	/* data register: offset 0 */
+}
+
+static inline void write_smic_flags(struct si_sm_data *smic,
+				    unsigned char   flags)
+{
+	smic->io->outputb(smic->io, 2, flags);	/* flags register: offset 2 */
+}
+
+static inline void write_smic_control(struct si_sm_data *smic,
+				      unsigned char   control)
+{
+	smic->io->outputb(smic->io, 1, control); /* control: write offset 1 */
+}
+
+static inline void write_si_sm_data (struct si_sm_data *smic,
+				   unsigned char   data)
+{
+	smic->io->outputb(smic->io, 0, data);	/* data register: offset 0 */
+}
+
+static inline void start_error_recovery(struct si_sm_data *smic, char *reason)
+{
+	(smic->error_retries)++;	/* count this failed attempt */
+	if (smic->error_retries > SMIC_MAX_ERROR_RETRIES) {
+		if (smic_debug & SMIC_DEBUG_ENABLE) {
+			printk(KERN_WARNING
+			       "ipmi_smic_drv: smic hosed: %s\n", reason);
+		}
+		smic->state = SMIC_HOSED;	/* give up on the transaction */
+	} else {	/* retry: resend the saved request from byte 0 */
+		smic->write_count = smic->orig_write_count;
+		smic->write_pos = 0;
+		smic->read_pos = 0;
+		smic->state = SMIC_START_OP;
+		smic->smic_timeout = SMIC_RETRY_TIMEOUT;
+	}
+}
+
+static inline void write_next_byte(struct si_sm_data *smic)
+{
+	write_si_sm_data(smic, smic->write_data[smic->write_pos]);
+	(smic->write_pos)++;	/* advance to the next request byte */
+	(smic->write_count)--;	/* one byte fewer left to send */
+}
+
+static inline void read_next_byte (struct si_sm_data *smic)
+{
+	if (smic->read_pos >= MAX_SMIC_READ_SIZE) {
+		read_smic_data (smic);	/* buffer full: read and drop the byte */
+		smic->truncated = 1;
+	} else {
+		smic->read_data[smic->read_pos] = read_smic_data(smic);
+		(smic->read_pos)++;
+	}
+}
+
+/*  SMIC Control/Status Code Components */
+#define	SMIC_GET_STATUS		0x00	/* Control form's name */
+#define	SMIC_READY		0x00	/* Status  form's name */
+#define	SMIC_WR_START		0x01	/* Unified Control/Status names... */
+#define	SMIC_WR_NEXT		0x02
+#define	SMIC_WR_END		0x03
+#define	SMIC_RD_START		0x04
+#define	SMIC_RD_NEXT		0x05
+#define	SMIC_RD_END		0x06
+#define	SMIC_CODE_MASK		0x0f
+
+#define	SMIC_CONTROL		0x00
+#define	SMIC_STATUS		0x80
+#define	SMIC_CS_MASK		0x80
+
+#define	SMIC_SMS		0x40
+#define	SMIC_SMM		0x60
+#define	SMIC_STREAM_MASK	0x60
+
+/*  SMIC Control Codes */
+#define	SMIC_CC_SMS_GET_STATUS	(SMIC_CONTROL|SMIC_SMS|SMIC_GET_STATUS)
+#define	SMIC_CC_SMS_WR_START	(SMIC_CONTROL|SMIC_SMS|SMIC_WR_START)
+#define	SMIC_CC_SMS_WR_NEXT	(SMIC_CONTROL|SMIC_SMS|SMIC_WR_NEXT)
+#define	SMIC_CC_SMS_WR_END	(SMIC_CONTROL|SMIC_SMS|SMIC_WR_END)
+#define	SMIC_CC_SMS_RD_START	(SMIC_CONTROL|SMIC_SMS|SMIC_RD_START)
+#define	SMIC_CC_SMS_RD_NEXT	(SMIC_CONTROL|SMIC_SMS|SMIC_RD_NEXT)
+#define	SMIC_CC_SMS_RD_END	(SMIC_CONTROL|SMIC_SMS|SMIC_RD_END)
+
+#define	SMIC_CC_SMM_GET_STATUS	(SMIC_CONTROL|SMIC_SMM|SMIC_GET_STATUS)
+#define	SMIC_CC_SMM_WR_START	(SMIC_CONTROL|SMIC_SMM|SMIC_WR_START)
+#define	SMIC_CC_SMM_WR_NEXT	(SMIC_CONTROL|SMIC_SMM|SMIC_WR_NEXT)
+#define	SMIC_CC_SMM_WR_END	(SMIC_CONTROL|SMIC_SMM|SMIC_WR_END)
+#define	SMIC_CC_SMM_RD_START	(SMIC_CONTROL|SMIC_SMM|SMIC_RD_START)
+#define	SMIC_CC_SMM_RD_NEXT	(SMIC_CONTROL|SMIC_SMM|SMIC_RD_NEXT)
+#define	SMIC_CC_SMM_RD_END	(SMIC_CONTROL|SMIC_SMM|SMIC_RD_END)
+
+/*  SMIC Status Codes */
+#define	SMIC_SC_SMS_READY	(SMIC_STATUS|SMIC_SMS|SMIC_READY)
+#define	SMIC_SC_SMS_WR_START	(SMIC_STATUS|SMIC_SMS|SMIC_WR_START)
+#define	SMIC_SC_SMS_WR_NEXT	(SMIC_STATUS|SMIC_SMS|SMIC_WR_NEXT)
+#define	SMIC_SC_SMS_WR_END	(SMIC_STATUS|SMIC_SMS|SMIC_WR_END)
+#define	SMIC_SC_SMS_RD_START	(SMIC_STATUS|SMIC_SMS|SMIC_RD_START)
+#define	SMIC_SC_SMS_RD_NEXT	(SMIC_STATUS|SMIC_SMS|SMIC_RD_NEXT)
+#define	SMIC_SC_SMS_RD_END	(SMIC_STATUS|SMIC_SMS|SMIC_RD_END)
+
+#define	SMIC_SC_SMM_READY	(SMIC_STATUS|SMIC_SMM|SMIC_READY)
+#define	SMIC_SC_SMM_WR_START	(SMIC_STATUS|SMIC_SMM|SMIC_WR_START)
+#define	SMIC_SC_SMM_WR_NEXT	(SMIC_STATUS|SMIC_SMM|SMIC_WR_NEXT)
+#define	SMIC_SC_SMM_WR_END	(SMIC_STATUS|SMIC_SMM|SMIC_WR_END)
+#define	SMIC_SC_SMM_RD_START	(SMIC_STATUS|SMIC_SMM|SMIC_RD_START)
+#define	SMIC_SC_SMM_RD_NEXT	(SMIC_STATUS|SMIC_SMM|SMIC_RD_NEXT)
+#define	SMIC_SC_SMM_RD_END	(SMIC_STATUS|SMIC_SMM|SMIC_RD_END)
+
+/* these are the control/status codes we actually use
+	SMIC_CC_SMS_GET_STATUS	0x40
+	SMIC_CC_SMS_WR_START	0x41
+	SMIC_CC_SMS_WR_NEXT	0x42
+	SMIC_CC_SMS_WR_END	0x43
+	SMIC_CC_SMS_RD_START	0x44
+	SMIC_CC_SMS_RD_NEXT	0x45
+	SMIC_CC_SMS_RD_END	0x46
+
+	SMIC_SC_SMS_READY	0xC0
+	SMIC_SC_SMS_WR_START	0xC1
+	SMIC_SC_SMS_WR_NEXT	0xC2
+	SMIC_SC_SMS_WR_END	0xC3
+	SMIC_SC_SMS_RD_START	0xC4
+	SMIC_SC_SMS_RD_NEXT	0xC5
+	SMIC_SC_SMS_RD_END	0xC6
+*/
+
+static enum si_sm_result smic_event (struct si_sm_data *smic, long time)
+{
+	unsigned char status;
+	unsigned char flags;
+	unsigned char data;
+	/* One state-machine step; time = microseconds since last call. */
+	if (smic->state == SMIC_HOSED) {
+		init_smic_data(smic, smic->io);
+		return SI_SM_HOSED;
+	}
+	if (smic->state != SMIC_IDLE) {
+		if (smic_debug & SMIC_DEBUG_STATES) {
+			printk(KERN_INFO
+			       "smic_event - smic->smic_timeout = %ld,"
+			       " time = %ld\n",
+			       smic->smic_timeout, time);
+		}
+/* FIXME: smic_event is sometimes called with time > SMIC_RETRY_TIMEOUT */
+		if (time < SMIC_RETRY_TIMEOUT) {
+			smic->smic_timeout -= time;
+			if (smic->smic_timeout < 0) {
+				start_error_recovery(smic, "smic timed out.");
+				return SI_SM_CALL_WITH_DELAY;
+			}
+		}
+	}
+	flags = read_smic_flags(smic);
+	if (flags & SMIC_FLAG_BSY)
+		return SI_SM_CALL_WITH_DELAY;
+
+	status = read_smic_status (smic);
+	if (smic_debug & SMIC_DEBUG_STATES)
+		printk(KERN_INFO
+		       "smic_event - state = %d, flags = 0x%02x,"
+		       " status = 0x%02x\n",
+		       smic->state, flags, status);
+
+	switch (smic->state) {
+	case SMIC_IDLE:
+		/* in IDLE we check for available messages */
+		if (flags & (SMIC_SMI |
+			     SMIC_EVM_DATA_AVAIL | SMIC_SMS_DATA_AVAIL))
+		{
+			return SI_SM_ATTN;
+		}
+		return SI_SM_IDLE;
+
+	case SMIC_START_OP:
+		/* sanity check whether smic is really idle */
+		write_smic_control(smic, SMIC_CC_SMS_GET_STATUS);
+		write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+		smic->state = SMIC_OP_OK;
+		break;
+
+	case SMIC_OP_OK:
+		if (status != SMIC_SC_SMS_READY) {
+				/* this should not happen */
+			start_error_recovery(smic,
+					     "state = SMIC_OP_OK,"
+					     " status != SMIC_SC_SMS_READY");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		/* OK so far; smic is idle let us start ... */
+		write_smic_control(smic, SMIC_CC_SMS_WR_START);
+		write_next_byte(smic);
+		write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+		smic->state = SMIC_WRITE_START;
+		break;
+
+	case SMIC_WRITE_START:
+		if (status != SMIC_SC_SMS_WR_START) {
+			start_error_recovery(smic,
+					     "state = SMIC_WRITE_START, "
+					     "status != SMIC_SC_SMS_WR_START");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		/* we must not issue WR_(NEXT|END) unless
+		   TX_DATA_READY is set */
+		if (flags & SMIC_TX_DATA_READY) {
+			if (smic->write_count == 1) {
+				/* last byte */
+				write_smic_control(smic, SMIC_CC_SMS_WR_END);
+				smic->state = SMIC_WRITE_END;
+			} else {
+				write_smic_control(smic, SMIC_CC_SMS_WR_NEXT);
+				smic->state = SMIC_WRITE_NEXT;
+			}
+			write_next_byte(smic);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+		}
+		else {
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_WRITE_NEXT:
+		if (status != SMIC_SC_SMS_WR_NEXT) {
+			start_error_recovery(smic,
+					     "state = SMIC_WRITE_NEXT, "
+					     "status != SMIC_SC_SMS_WR_NEXT");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		/* this is the same code as in SMIC_WRITE_START */
+		if (flags & SMIC_TX_DATA_READY) {
+			if (smic->write_count == 1) {
+				write_smic_control(smic, SMIC_CC_SMS_WR_END);
+				smic->state = SMIC_WRITE_END;
+			}
+			else {
+				write_smic_control(smic, SMIC_CC_SMS_WR_NEXT);
+				smic->state = SMIC_WRITE_NEXT;
+			}
+			write_next_byte(smic);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+		}
+		else {
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_WRITE_END:
+		if (status != SMIC_SC_SMS_WR_END) {
+			start_error_recovery (smic,
+					      "state = SMIC_WRITE_END, "
+					      "status != SMIC_SC_SMS_WR_END");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		/* data register holds an error code */
+		data = read_smic_data(smic);
+		if (data != 0) {
+			if (smic_debug & SMIC_DEBUG_ENABLE) {
+				printk(KERN_INFO
+				       "SMIC_WRITE_END: data = %02x\n", data);
+			}
+			start_error_recovery(smic,
+					     "state = SMIC_WRITE_END, "
+					     "data != SUCCESS");
+			return SI_SM_CALL_WITH_DELAY;
+		} else {
+			smic->state = SMIC_WRITE2READ;
+		}
+		break;
+
+	case SMIC_WRITE2READ:
+		/* we must wait for RX_DATA_READY to be set before we
+		   can continue */
+		if (flags & SMIC_RX_DATA_READY) {
+			write_smic_control(smic, SMIC_CC_SMS_RD_START);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+			smic->state = SMIC_READ_START;
+		} else {
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_READ_START:
+		if (status != SMIC_SC_SMS_RD_START) {
+			start_error_recovery(smic,
+					     "state = SMIC_READ_START, "
+					     "status != SMIC_SC_SMS_RD_START");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		if (flags & SMIC_RX_DATA_READY) {
+			read_next_byte(smic);
+			write_smic_control(smic, SMIC_CC_SMS_RD_NEXT);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+			smic->state = SMIC_READ_NEXT;
+		} else {
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_READ_NEXT:
+		switch (status) {
+		/* smic tells us that this is the last byte to be read
+		   --> clean up */
+		case SMIC_SC_SMS_RD_END:
+			read_next_byte(smic);
+			write_smic_control(smic, SMIC_CC_SMS_RD_END);
+			write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+			smic->state = SMIC_READ_END;
+			break;
+		case SMIC_SC_SMS_RD_NEXT:
+			if (flags & SMIC_RX_DATA_READY) {
+				read_next_byte(smic);
+				write_smic_control(smic, SMIC_CC_SMS_RD_NEXT);
+				write_smic_flags(smic, flags | SMIC_FLAG_BSY);
+				smic->state = SMIC_READ_NEXT;
+			} else {
+				return SI_SM_CALL_WITH_DELAY;
+			}
+			break;
+		default:
+			start_error_recovery(
+				smic,
+				"state = SMIC_READ_NEXT, "
+				"status != SMIC_SC_SMS_RD_(NEXT|END)");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		break;
+
+	case SMIC_READ_END:
+		if (status != SMIC_SC_SMS_READY) {
+			start_error_recovery(smic,
+					     "state = SMIC_READ_END, "
+					     "status != SMIC_SC_SMS_READY");
+			return SI_SM_CALL_WITH_DELAY;
+		}
+		data = read_smic_data(smic);
+		/* data register holds an error code */
+		if (data != 0) {
+			if (smic_debug & SMIC_DEBUG_ENABLE) {
+				printk(KERN_INFO
+				       "SMIC_READ_END: data = %02x\n", data);
+			}
+			start_error_recovery(smic,
+					     "state = SMIC_READ_END, "
+					     "data != SUCCESS");
+			return SI_SM_CALL_WITH_DELAY;
+		} else {
+			smic->state = SMIC_IDLE;
+			return SI_SM_TRANSACTION_COMPLETE;
+		}
+
+	case SMIC_HOSED:
+		init_smic_data(smic, smic->io);
+		return SI_SM_HOSED;
+
+	default:
+		if (smic_debug & SMIC_DEBUG_ENABLE)
+			printk(KERN_WARNING "smic->state = %d\n", smic->state);
+		/* Recover even when debug output is disabled. */
+		start_error_recovery(smic, "state = UNKNOWN");
+		return SI_SM_CALL_WITH_DELAY;
+	}
+	smic->smic_timeout = SMIC_RETRY_TIMEOUT;
+	return SI_SM_CALL_WITHOUT_DELAY;
+}
+
+static int smic_detect(struct si_sm_data *smic)
+{
+	/* It's impossible for the SMIC flags register to be all 1's,
+	   (assuming a properly functioning, self-initialized BMC)
+	   but that's what you get from reading a bogus address, so we
+	   test that first. */
+	if (read_smic_flags(smic) == 0xff)
+		return 1;
+
+	return 0;
+}
+
+static void smic_cleanup(struct si_sm_data *smic)
+{	/* Intentionally empty: si_sm_data holds nothing to release. */
+}
+
+static int smic_size(void)
+{
+	return sizeof(struct si_sm_data);	/* bytes per SMIC instance */
+}
+
+struct si_sm_handlers smic_smi_handlers =	/* SMIC entry points */
+{
+	.version           = IPMI_SMIC_VERSION,
+	.init_data         = init_smic_data,
+	.start_transaction = start_smic_transaction,
+	.get_result        = smic_get_result,
+	.event             = smic_event,
+	.detect            = smic_detect,
+	.cleanup           = smic_cleanup,
+	.size              = smic_size,
+};
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index eb213e416d23..50aa9590be30 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -33,6 +33,7 @@
 
 #include <linux/config.h>
 #include <linux/module.h>
+#include <linux/moduleparam.h>
 #include <linux/ipmi.h>
 #include <linux/ipmi_smi.h>
 #include <linux/watchdog.h>
@@ -50,6 +51,8 @@
 #include <asm/apic.h>
 #endif
 
+#define IPMI_WATCHDOG_VERSION "v31"
+
 /*
  * The IPMI command/response information for the watchdog timer.
  */
@@ -137,26 +140,41 @@ static int pretimeout = 0;
 /* Default action is to reset the board on a timeout. */
 static unsigned char action_val = WDOG_TIMEOUT_RESET;
 
-static char *action = "reset";
+static char action[16] = "reset";
 
 static unsigned char preaction_val = WDOG_PRETIMEOUT_NONE;
 
-static char *preaction = "pre_none";
+static char preaction[16] = "pre_none";
 
 static unsigned char preop_val = WDOG_PREOP_NONE;
 
-static char *preop = "preop_none";
+static char preop[16] = "preop_none";
 static spinlock_t ipmi_read_lock = SPIN_LOCK_UNLOCKED;
 static char data_to_read = 0;
 static DECLARE_WAIT_QUEUE_HEAD(read_q);
 static struct fasync_struct *fasync_q = NULL;
 static char pretimeout_since_last_heartbeat = 0;
 
-MODULE_PARM(timeout, "i");
-MODULE_PARM(pretimeout, "i");
-MODULE_PARM(action, "s");
-MODULE_PARM(preaction, "s");
-MODULE_PARM(preop, "s");
+/* If true, the driver will start running as soon as it is configured
+   and ready. */
+static int start_now = 0;
+
+module_param(timeout, int, 0);
+MODULE_PARM_DESC(timeout, "Timeout value in seconds.");
+module_param(pretimeout, int, 0);
+MODULE_PARM_DESC(pretimeout, "Pretimeout value in seconds.");
+module_param_string(action, action, sizeof(action), 0);
+MODULE_PARM_DESC(action, "Timeout action. One of: "
+		 "reset, none, power_cycle, power_off.");
+module_param_string(preaction, preaction, sizeof(preaction), 0);
+MODULE_PARM_DESC(preaction, "Pretimeout action.  One of: "
+		 "pre_none, pre_smi, pre_nmi, pre_int.");
+module_param_string(preop, preop, sizeof(preop), 0);
+MODULE_PARM_DESC(preop, "Pretimeout driver operation.  One of: "
+		 "preop_none, preop_panic, preop_give_data.");
+module_param(start_now, int, 0);
+MODULE_PARM_DESC(start_now, "Set to 1 to start the watchdog as "
+		 "soon as the driver is loaded.");
 
 /* Default state of the timer. */
 static unsigned char ipmi_watchdog_state = WDOG_TIMEOUT_NONE;
@@ -167,10 +185,6 @@ static int ipmi_ignore_heartbeat = 0;
 /* Is someone using the watchdog?  Only one user is allowed. */
 static int ipmi_wdog_open = 0;
 
-/* If true, the driver will start running as soon as it is configured
-   and ready. */
-static int start_now = 0;
-
 /* If set to 1, the heartbeat command will set the state to reset and
    start the timer.  The timer doesn't normally run when the driver is
    first opened until the heartbeat is set the first time, this
@@ -260,6 +274,7 @@ static int i_ipmi_set_timeout(struct ipmi_smi_msg  *smi_msg,
 				      (struct ipmi_addr *) &addr,
 				      0,
 				      &msg,
+				      NULL,
 				      smi_msg,
 				      recv_msg,
 				      1);
@@ -435,6 +450,7 @@ static int ipmi_heartbeat(void)
 				      (struct ipmi_addr *) &addr,
 				      0,
 				      &msg,
+				      NULL,
 				      &heartbeat_smi_msg,
 				      &heartbeat_recv_msg,
 				      1);
@@ -483,6 +499,7 @@ static void panic_halt_ipmi_heartbeat(void)
 				 (struct ipmi_addr *) &addr,
 				 0,
 				 &msg,
+				 NULL,
 				 &panic_halt_heartbeat_smi_msg,
 				 &panic_halt_heartbeat_recv_msg,
 				 1);
@@ -903,6 +920,7 @@ static void ipmi_smi_gone(int if_num)
 
 static struct ipmi_smi_watcher smi_watcher =
 {
+	.owner    = THIS_MODULE,
 	.new_smi  = ipmi_new_smi,
 	.smi_gone = ipmi_smi_gone
 };
@@ -911,6 +929,9 @@ static int __init ipmi_wdog_init(void)
 {
 	int rv;
 
+	printk(KERN_INFO "IPMI watchdog driver version "
+	       IPMI_WATCHDOG_VERSION "\n");
+
 	if (strcmp(action, "reset") == 0) {
 		action_val = WDOG_TIMEOUT_RESET;
 	} else if (strcmp(action, "none") == 0) {
@@ -999,14 +1020,10 @@ static int __init ipmi_wdog_init(void)
 	register_reboot_notifier(&wdog_reboot_notifier);
 	notifier_chain_register(&panic_notifier_list, &wdog_panic_notifier);
 
-	printk(KERN_INFO "IPMI watchdog by "
-	       "Corey Minyard (minyard@mvista.com)\n");
-
 	return 0;
 }
 
-#ifdef MODULE
-static void ipmi_unregister_watchdog(void)
+static __exit void ipmi_unregister_watchdog(void)
 {
 	int rv;
 
@@ -1034,6 +1051,7 @@ static void ipmi_unregister_watchdog(void)
 	   pointers to our buffers, we want to make sure they are done before
 	   we release our memory. */
 	while (atomic_read(&set_timeout_tofree)) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
 		schedule_timeout(1);
 	}
 
@@ -1056,76 +1074,6 @@ static void __exit ipmi_wdog_exit(void)
 	ipmi_unregister_watchdog();
 }
 module_exit(ipmi_wdog_exit);
-#else
-static int __init ipmi_wdog_setup(char *str)
-{
-	int  val;
-	int  rv;
-	char *option;
-
-	rv = get_option(&str, &val);
-	if (rv == 0)
-		return 1;
-	if (val > 0)
-		timeout = val;
-	if (rv == 1)
-		return 1;
-
-	rv = get_option(&str, &val);
-	if (rv == 0)
-		return 1;
-	if (val >= 0)
-		pretimeout = val;
-	if (rv == 1)
-		return 1;
-
-	while ((option = strsep(&str, ",")) != NULL) {
-		if (strcmp(option, "reset") == 0) {
-			action = "reset";
-		}
-		else if (strcmp(option, "none") == 0) {
-			action = "none";
-		}
-		else if (strcmp(option, "power_cycle") == 0) {
-			action = "power_cycle";
-		}
-		else if (strcmp(option, "power_off") == 0) {
-			action = "power_off";
-		}
-		else if (strcmp(option, "pre_none") == 0) {
-			preaction = "pre_none";
-		}
-		else if (strcmp(option, "pre_smi") == 0) {
-			preaction = "pre_smi";
-		}
-#ifdef HAVE_NMI_HANDLER
-		else if (strcmp(option, "pre_nmi") == 0) {
-			preaction = "pre_nmi";
-		}
-#endif
-		else if (strcmp(option, "pre_int") == 0) {
-			preaction = "pre_int";
-		}
-		else if (strcmp(option, "start_now") == 0) {
-			start_now = 1;
-		}
-		else if (strcmp(option, "preop_none") == 0) {
-			preop = "preop_none";
-		}
-		else if (strcmp(option, "preop_panic") == 0) {
-			preop = "preop_panic";
-		}
-		else if (strcmp(option, "preop_give_data") == 0) {
-			preop = "preop_give_data";
-		} else {
-		    printk("Unknown IPMI watchdog option: '%s'\n", option);
-		}
-	}
-
-	return 1;
-}
-__setup("ipmi_wdog=", ipmi_wdog_setup);
-#endif
 
 EXPORT_SYMBOL(ipmi_delayed_shutdown);
 
diff --git a/include/linux/ipmi.h b/include/linux/ipmi.h
index 9479a550b924..75311f205806 100644
--- a/include/linux/ipmi.h
+++ b/include/linux/ipmi.h
@@ -109,6 +109,35 @@ struct ipmi_ipmb_addr
 	unsigned char lun;
 };
 
+/*
+ * A LAN Address.  This is an address to/from a LAN interface bridged
+ * by the BMC, not an address actually out on the LAN.
+ *
+ * A conscious decision was made here to deviate slightly from the IPMI
+ * spec.  We do not use rqSWID and rsSWID like it shows in the
+ * message.  Instead, we use remote_SWID and local_SWID.  This means
+ * that any message (a request or response) from another device will
+ * always have exactly the same address.  If you didn't do this,
+ * requests and responses from the same device would have different
+ * addresses, and that's not too cool.
+ *
+ * In this address, the remote_SWID is always the SWID the remote
+ * message came from, or the SWID we are sending the message to.
+ * local_SWID is always our SWID.  Note that having our SWID in the
+ * message is a little weird, but this is required.
+ */
+#define IPMI_LAN_ADDR_TYPE		0x04
+struct ipmi_lan_addr
+{
+	int           addr_type;
+	short         channel;
+	unsigned char privilege;
+	unsigned char session_handle;
+	unsigned char remote_SWID;
+	unsigned char local_SWID;
+	unsigned char lun;
+};
+
 
 /*
  * Channel for talking directly with the BMC.  When using this
@@ -145,10 +174,20 @@ struct ipmi_msg
  * Receive types for messages coming from the receive interface.  This
  * is used for the receive in-kernel interface and in the receive
  * IOCTL.
+ *
+ * The "IPMI_RESPONSE_RESPONSE_TYPE" is a little strange sounding, but
+ * it allows you to get the message results when you send a response
+ * message.
  */
 #define IPMI_RESPONSE_RECV_TYPE		1 /* A response to a command */
 #define IPMI_ASYNC_EVENT_RECV_TYPE	2 /* Something from the event queue */
 #define IPMI_CMD_RECV_TYPE		3 /* A command from somewhere else */
+#define IPMI_RESPONSE_RESPONSE_TYPE	4 /* The response for
+					      a sent response, giving any
+					      error status for sending the
+					      response.  When you send a
+					      response message, this will
+					      be returned. */
 /* Note that async events and received commands do not have a completion
    code as the first byte of the incoming data, unlike a response. */
 
@@ -160,6 +199,7 @@ struct ipmi_msg
  * The in-kernel interface.
  */
 #include <linux/list.h>
+#include <linux/module.h>
 
 /* Opaque type for a IPMI message user.  One of these is needed to
    send and receive messages. */
@@ -185,6 +225,12 @@ struct ipmi_recv_msg
 	long             msgid;
 	struct ipmi_msg  msg;
 
+	/* The user_msg_data is the data supplied when a message was
+	   sent, if this is a response to a sent message.  If this is
+	   not a response to a sent message, then user_msg_data will
+	   be NULL. */
+	void             *user_msg_data;
+
 	/* Call this when done with the message.  It will presumably free
 	   the message and do any other necessary cleanup. */
 	void (*done)(struct ipmi_recv_msg *msg);
@@ -206,9 +252,10 @@ struct ipmi_user_hndl
         /* Routine type to call when a message needs to be routed to
 	   the upper layer.  This will be called with some locks held,
 	   the only IPMI routines that can be called are ipmi_request
-	   and the alloc/free operations. */
+	   and the alloc/free operations.  The handler_data is the
+	   variable supplied when the receive handler was registered. */
 	void (*ipmi_recv_hndl)(struct ipmi_recv_msg *msg,
-			       void                 *handler_data);
+			       void                 *user_msg_data);
 
 	/* Called when the interface detects a watchdog pre-timeout.  If
 	   this is NULL, it will be ignored for the user. */
@@ -221,7 +268,12 @@ int ipmi_create_user(unsigned int          if_num,
 		     void                  *handler_data,
 		     ipmi_user_t           *user);
 
-/* Destroy the given user of the IPMI layer. */
+/* Destroy the given user of the IPMI layer.  Note that after this
+   function returns, the system is guaranteed to not call any
+   callbacks for the user.  Thus as long as you destroy all the users
+   before you unload a module, you will be safe.  And if you destroy
+   the users before you destroy the callback structures, it should be
+   safe, too. */
 int ipmi_destroy_user(ipmi_user_t user);
 
 /* Get the IPMI version of the BMC we are talking to. */
@@ -253,13 +305,43 @@ unsigned char ipmi_get_my_LUN(ipmi_user_t user);
  * in the msgid field of the received command.  If the priority is >
  * 0, the message will go into a high-priority queue and be sent
  * first.  Otherwise, it goes into a normal-priority queue.
+ * The user_msg_data field will be returned in any response to this
+ * message.
+ *
+ * Note that if you send a response (with the netfn lower bit set),
+ * you *will* get back a SEND_MSG response telling you what happened
+ * when the response was sent.  You will not get back a response to
+ * the message itself.
  */
 int ipmi_request(ipmi_user_t      user,
 		 struct ipmi_addr *addr,
 		 long             msgid,
 		 struct ipmi_msg  *msg,
+		 void             *user_msg_data,
 		 int              priority);
 
+/*
+ * Like ipmi_request, but lets you specify the number of retries and
+ * the retry time.  The retries is the number of times the message
+ * will be resent if no reply is received.  If set to -1, the default
+ * value will be used.  The retry time is the time in milliseconds
+ * between retries.  If set to zero, the default value will be
+ * used.
+ *
+ * Don't use this unless you *really* have to.  It's primarily for the
+ * IPMI over LAN converter; since the LAN stuff does its own retries,
+ * it makes no sense to do it here.  However, this can be used if you
+ * have unusual requirements.
+ */
+int ipmi_request_settime(ipmi_user_t      user,
+			 struct ipmi_addr *addr,
+			 long             msgid,
+			 struct ipmi_msg  *msg,
+			 void             *user_msg_data,
+			 int              priority,
+			 int              max_retries,
+			 unsigned int     retry_time_ms);
+
 /*
  * Like ipmi_request, but lets you specify the slave return address.
  */
@@ -267,6 +349,7 @@ int ipmi_request_with_source(ipmi_user_t      user,
 			     struct ipmi_addr *addr,
 			     long             msgid,
 			     struct ipmi_msg  *msg,
+			     void             *user_msg_data,
 			     int              priority,
 			     unsigned char    source_address,
 			     unsigned char    source_lun);
@@ -284,6 +367,7 @@ int ipmi_request_supply_msgs(ipmi_user_t          user,
 			     struct ipmi_addr     *addr,
 			     long                 msgid,
 			     struct ipmi_msg      *msg,
+			     void                 *user_msg_data,
 			     void                 *supplied_smi,
 			     struct ipmi_recv_msg *supplied_recv,
 			     int                  priority);
@@ -331,6 +415,10 @@ struct ipmi_smi_watcher
 {
 	struct list_head link;
 
+	/* You must set the owner to the current module, if you are in
+	   a module (generally just set it to "THIS_MODULE"). */
+	struct module *owner;
+
 	/* These two are called with read locks held for the interface
 	   the watcher list.  So you can add and remove users from the
 	   IPMI interface, send messages, etc., but you cannot add
@@ -422,6 +510,29 @@ struct ipmi_req
 #define IPMICTL_SEND_COMMAND		_IOR(IPMI_IOC_MAGIC, 13,	\
 					     struct ipmi_req)
 
+/* Messages sent to the interface with timing parameters are this
+   format. */
+struct ipmi_req_settime
+{
+	struct ipmi_req req;
+
+	/* See ipmi_request_settime() above for details on these
+           values. */
+	int          retries;
+	unsigned int retry_time_ms;
+};
+/*
+ * Send a message to the interfaces with timing parameters.  error values
+ * are:
+ *   - EFAULT - an address supplied was invalid.
+ *   - EINVAL - The address supplied was not valid, or the command
+ *              was not allowed.
+ *   - EMSGSIZE - The message was too large.
+ *   - ENOMEM - Buffers could not be allocated for the command.
+ */
+#define IPMICTL_SEND_COMMAND_SETTIME	_IOR(IPMI_IOC_MAGIC, 21,	\
+					     struct ipmi_req_settime)
+
 /* Messages received from the interface are this format. */
 struct ipmi_recv
 {
@@ -513,4 +624,18 @@ struct ipmi_cmdspec
 #define IPMICTL_SET_MY_LUN_CMD		_IOR(IPMI_IOC_MAGIC, 19, unsigned int)
 #define IPMICTL_GET_MY_LUN_CMD		_IOR(IPMI_IOC_MAGIC, 20, unsigned int)
 
+/*
+ * Get/set the default timing values for an interface.  You shouldn't
+ * generally mess with these.
+ */
+struct ipmi_timing_parms
+{
+	int          retries;
+	unsigned int retry_time_ms;
+};
+#define IPMICTL_SET_TIMING_PARMS_CMD	_IOR(IPMI_IOC_MAGIC, 22, \
+					     struct ipmi_timing_parms)
+#define IPMICTL_GET_TIMING_PARMS_CMD	_IOR(IPMI_IOC_MAGIC, 23, \
+					     struct ipmi_timing_parms)
+
 #endif /* __LINUX_IPMI_H */
diff --git a/include/linux/ipmi_msgdefs.h b/include/linux/ipmi_msgdefs.h
index ccdb9386faed..40ed591fd84b 100644
--- a/include/linux/ipmi_msgdefs.h
+++ b/include/linux/ipmi_msgdefs.h
@@ -53,6 +53,7 @@
 #define IPMI_SET_BMC_GLOBAL_ENABLES_CMD	0x2e
 #define IPMI_GET_BMC_GLOBAL_ENABLES_CMD	0x2f
 #define IPMI_READ_EVENT_MSG_BUFFER_CMD	0x35
+#define IPMI_GET_CHANNEL_INFO_CMD	0x42
 
 #define IPMI_NETFN_STORAGE_REQUEST		0x0a
 #define IPMI_NETFN_STORAGE_RESPONSE		0x0b
@@ -61,8 +62,39 @@
 /* The default slave address */
 #define IPMI_BMC_SLAVE_ADDR	0x20
 
-#define IPMI_MAX_MSG_LENGTH	80
+/* The BT interface on high-end HP systems supports up to 255 bytes in
+ * one transfer.  Its "virtual" BMC supports some commands that are longer
+ * than 128 bytes.  Use the full 256, plus NetFn/LUN, Cmd, cCode, plus
+ * some overhead.  It would be nice to base this on the "BT Capabilities"
+ * but that's too hard to propagate to the rest of the driver. */
+#define IPMI_MAX_MSG_LENGTH	272	/* multiple of 16 */
 
-#define IPMI_CC_NO_ERROR	0
+#define IPMI_CC_NO_ERROR		0x00
+#define IPMI_NODE_BUSY_ERR		0xc0
+#define IPMI_ERR_MSG_TRUNCATED		0xc6
+#define IPMI_LOST_ARBITRATION_ERR	0x81
+#define IPMI_ERR_UNSPECIFIED		0xff
+
+#define IPMI_CHANNEL_PROTOCOL_IPMB	1
+#define IPMI_CHANNEL_PROTOCOL_ICMB	2
+#define IPMI_CHANNEL_PROTOCOL_SMBUS	4
+#define IPMI_CHANNEL_PROTOCOL_KCS	5
+#define IPMI_CHANNEL_PROTOCOL_SMIC	6
+#define IPMI_CHANNEL_PROTOCOL_BT10	7
+#define IPMI_CHANNEL_PROTOCOL_BT15	8
+#define IPMI_CHANNEL_PROTOCOL_TMODE	9
+
+#define IPMI_CHANNEL_MEDIUM_IPMB	1
+#define IPMI_CHANNEL_MEDIUM_ICMB10	2
+#define IPMI_CHANNEL_MEDIUM_ICMB09	3
+#define IPMI_CHANNEL_MEDIUM_8023LAN	4
+#define IPMI_CHANNEL_MEDIUM_ASYNC	5
+#define IPMI_CHANNEL_MEDIUM_OTHER_LAN	6
+#define IPMI_CHANNEL_MEDIUM_PCI_SMBUS	7
+#define IPMI_CHANNEL_MEDIUM_SMBUS1	8
+#define IPMI_CHANNEL_MEDIUM_SMBUS2	9
+#define IPMI_CHANNEL_MEDIUM_USB1	10
+#define IPMI_CHANNEL_MEDIUM_USB2	11
+#define IPMI_CHANNEL_MEDIUM_SYSINTF	12
 
 #endif /* __LINUX_IPMI_MSGDEFS_H */
diff --git a/include/linux/ipmi_smi.h b/include/linux/ipmi_smi.h
index f18187b00c05..6b42943ac3a3 100644
--- a/include/linux/ipmi_smi.h
+++ b/include/linux/ipmi_smi.h
@@ -35,6 +35,8 @@
 #define __LINUX_IPMI_SMI_H
 
 #include <linux/ipmi_msgdefs.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
 
 /* This files describes the interface for IPMI system management interface
    drivers to bind into the IPMI message handler. */
@@ -48,7 +50,7 @@ typedef struct ipmi_smi *ipmi_smi_t;
  * been received, it will report this same data structure back up to
  * the upper layer.  If an error occurs, it should fill in the
  * response with an error code in the completion code location. When
- * asyncronous data is received, one of these is allocated, the
+ * asynchronous data is received, one of these is allocated, the
  * data_size is set to zero and the response holds the data from the
  * get message or get event command that the interface initiated.
  * Note that it is the interfaces responsibility to detect
@@ -62,9 +64,6 @@ struct ipmi_smi_msg
 	long    msgid;
 	void    *user_data;
 
-	/* If 0, add to the end of the queue.  If 1, add to the beginning. */
-	int     prio;
-
 	int           data_size;
 	unsigned char data[IPMI_MAX_MSG_LENGTH];
 
@@ -134,4 +133,11 @@ static inline void ipmi_free_smi_msg(struct ipmi_smi_msg *msg)
 	msg->done(msg);
 }
 
+/* Allow the lower layer to add things to the proc filesystem
+   directory for this interface.  Note that the entry will
+   automatically be destroyed when the interface is destroyed. */
+int ipmi_smi_add_proc_entry(ipmi_smi_t smi, char *name,
+			    read_proc_t *read_proc, write_proc_t *write_proc,
+			    void *data, struct module *owner);
+
 #endif /* __LINUX_IPMI_SMI_H */
-- 
cgit v1.2.3


From 7860b37198b0650f51bfafebac820386b552a071 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:56:46 -0700
Subject: [PATCH] move job control fields from task_struct to signal_struct

From: Roland McGrath <roland@redhat.com>

This patch moves all the fields relating to job control from task_struct to
signal_struct, so that all this info is properly per-process rather than
being per-thread.
---
 arch/ia64/kernel/unaligned.c    |   2 +-
 arch/sparc64/solaris/misc.c     |   2 +-
 drivers/char/n_tty.c            |   3 +-
 drivers/char/rocket.c           |   2 +-
 drivers/char/sx.c               |   2 +-
 drivers/char/tty_io.c           | 116 +++++++++++++++++-----------------------
 drivers/char/vt.c               |   2 +-
 drivers/char/vt_ioctl.c         |   3 +-
 drivers/net/slip.c              |   2 +-
 drivers/s390/char/keyboard.c    |   2 +-
 fs/binfmt_elf.c                 |   4 +-
 fs/compat_ioctl.c               |   2 +-
 fs/dquot.c                      |  10 ++--
 fs/exec.c                       |   5 ++
 fs/open.c                       |   2 +-
 fs/proc/array.c                 |  22 ++++----
 include/linux/sched.h           |  17 +++---
 kernel/acct.c                   |   2 +-
 kernel/exit.c                   |  22 ++++----
 kernel/fork.c                   |  10 ++--
 kernel/pid.c                    |   8 +--
 kernel/signal.c                 |   5 +-
 kernel/sys.c                    |  18 +++----
 net/bridge/netfilter/ebtables.c |   2 +-
 net/ipv4/netfilter/ipt_owner.c  |   2 +-
 net/ipv6/netfilter/ip6t_owner.c |   2 +-
 26 files changed, 133 insertions(+), 136 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/kernel/unaligned.c b/arch/ia64/kernel/unaligned.c
index 2247254be7ac..b1a68e4367bc 100644
--- a/arch/ia64/kernel/unaligned.c
+++ b/arch/ia64/kernel/unaligned.c
@@ -1337,7 +1337,7 @@ ia64_handle_unaligned (unsigned long ifa, struct pt_regs *regs)
 			 * be holding locks...
 			 */
 			if (user_mode(regs))
-				tty_write_message(current->tty, buf);
+				tty_write_message(current->signal->tty, buf);
 			buf[len-1] = '\0';	/* drop '\r' */
 			printk(KERN_WARNING "%s", buf);	/* watch for command names containing %s */
 		}
diff --git a/arch/sparc64/solaris/misc.c b/arch/sparc64/solaris/misc.c
index ea7b2c439653..cea38c0cbb5c 100644
--- a/arch/sparc64/solaris/misc.c
+++ b/arch/sparc64/solaris/misc.c
@@ -402,7 +402,7 @@ asmlinkage int solaris_procids(int cmd, s32 pid, s32 pgid)
 			   Solaris setpgrp and setsid? */
 			ret = sys_setpgid(0, 0);
 			if (ret) return ret;
-			current->tty = NULL;
+			current->signal->tty = NULL;
 			return process_group(current);
 		}
 	case 2: /* getsid */
diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index 0c02e2debbb1..08f46259e183 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -999,7 +999,8 @@ do_it_again:
 	/* NOTE: not yet done after every sleep pending a thorough
 	   check of the logic of this change. -- jlc */
 	/* don't stop on /dev/console */
-	if (file->f_op->write != redirected_tty_write && current->tty == tty) {
+	if (file->f_op->write != redirected_tty_write &&
+	    current->signal->tty == tty) {
 		if (tty->pgrp <= 0)
 			printk("read_chan: tty->pgrp <= 0!\n");
 		else if (process_group(current) != tty->pgrp) {
diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index 38544de9fbd9..b0da37eab8e7 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -953,7 +953,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Info->count is now 1; so it's safe to sleep now.
 	 */
-	info->session = current->session;
+	info->session = current->signal->session;
 	info->pgrp = process_group(current);
 
 	if ((info->flags & ROCKET_INITIALIZED) == 0) {
diff --git a/drivers/char/sx.c b/drivers/char/sx.c
index 25c95fbc65d3..643163b08a8f 100644
--- a/drivers/char/sx.c
+++ b/drivers/char/sx.c
@@ -1420,7 +1420,7 @@ static int sx_open  (struct tty_struct * tty, struct file * filp)
 
 	line = tty->index;
 	sx_dprintk (SX_DEBUG_OPEN, "%d: opening line %d. tty=%p ctty=%p, np=%d)\n", 
-	            current->pid, line, tty, current->tty, sx_nports);
+	            current->pid, line, tty, current->signal->tty, sx_nports);
 
 	if ((line < 0) || (line >= SX_NPORTS) || (line >= sx_nports))
 		return -ENODEV;
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 0ba52078f637..e4607d86a755 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -321,7 +321,7 @@ struct tty_driver *get_tty_driver(dev_t device, int *index)
  */
 int tty_check_change(struct tty_struct * tty)
 {
-	if (current->tty != tty)
+	if (current->signal->tty != tty)
 		return 0;
 	if (tty->pgrp <= 0) {
 		printk(KERN_WARNING "tty_check_change: tty->pgrp <= 0!\n");
@@ -486,17 +486,14 @@ void do_tty_hangup(void *data)
 	if (tty->session > 0) {
 		struct list_head *l;
 		for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) {
-			task_t *task = p;
-			do {
-				if (task->tty == tty)
-					task->tty = NULL;
-				if (task->leader) {
-					send_group_sig_info(SIGHUP, SEND_SIG_PRIV, task);
-					send_group_sig_info(SIGCONT, SEND_SIG_PRIV, task);
-				}
-			} while_each_thread(p, task);
+			if (p->signal->tty == tty)
+				p->signal->tty = NULL;
+			if (!p->signal->leader)
+				continue;
+			send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p);
+			send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p);
 			if (tty->pgrp > 0)
-				p->tty_old_pgrp = tty->pgrp;
+				p->signal->tty_old_pgrp = tty->pgrp;
 		}
 	}
 	read_unlock(&tasklist_lock);
@@ -575,15 +572,15 @@ void disassociate_ctty(int on_exit)
 
 	lock_kernel();
 
-	tty = current->tty;
+	tty = current->signal->tty;
 	if (tty) {
 		tty_pgrp = tty->pgrp;
 		if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY)
 			tty_vhangup(tty);
 	} else {
-		if (current->tty_old_pgrp) {
-			kill_pg(current->tty_old_pgrp, SIGHUP, on_exit);
-			kill_pg(current->tty_old_pgrp, SIGCONT, on_exit);
+		if (current->signal->tty_old_pgrp) {
+			kill_pg(current->signal->tty_old_pgrp, SIGHUP, on_exit);
+			kill_pg(current->signal->tty_old_pgrp, SIGCONT, on_exit);
 		}
 		unlock_kernel();	
 		return;
@@ -594,17 +591,13 @@ void disassociate_ctty(int on_exit)
 			kill_pg(tty_pgrp, SIGCONT, on_exit);
 	}
 
-	current->tty_old_pgrp = 0;
+	current->signal->tty_old_pgrp = 0;
 	tty->session = 0;
 	tty->pgrp = -1;
 
 	read_lock(&tasklist_lock);
-	for_each_task_pid(current->session, PIDTYPE_SID, p, l, pid) {
-		task_t *task = p;
-		do {
-			task->tty = NULL;
-		} while_each_thread(p, task);
-	}
+	for_each_task_pid(current->signal->session, PIDTYPE_SID, p, l, pid)
+		p->signal->tty = NULL;
 	read_unlock(&tasklist_lock);
 	unlock_kernel();
 }
@@ -1257,20 +1250,11 @@ static void release_dev(struct file * filp)
 		struct pid *pid;
 
 		read_lock(&tasklist_lock);
-		for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) {
-			task_t *task = p;
-			do {
-				task->tty = NULL;
-			} while_each_thread(p, task);
-		}
-		if (o_tty) {
-			for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid) {
-				task_t *task = p;
-				do {
-					task->tty = NULL;
-				} while_each_thread(p, task);
-			}
-		}
+		for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid)
+			p->signal->tty = NULL;
+		if (o_tty)
+			for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid)
+				p->signal->tty = NULL;
 		read_unlock(&tasklist_lock);
 	}
 
@@ -1341,10 +1325,10 @@ static int tty_open(struct inode * inode, struct file * filp)
 retry_open:
 	noctty = filp->f_flags & O_NOCTTY;
 	if (device == MKDEV(TTYAUX_MAJOR,0)) {
-		if (!current->tty)
+		if (!current->signal->tty)
 			return -ENXIO;
-		driver = current->tty->driver;
-		index = current->tty->index;
+		driver = current->signal->tty->driver;
+		index = current->signal->tty->index;
 		filp->f_flags |= O_NONBLOCK; /* Don't let /dev/tty block */
 		/* noctty = 1; */
 		goto got_driver;
@@ -1445,14 +1429,14 @@ got_driver:
 		goto retry_open;
 	}
 	if (!noctty &&
-	    current->leader &&
-	    !current->tty &&
+	    current->signal->leader &&
+	    !current->signal->tty &&
 	    tty->session == 0) {
 	    	task_lock(current);
-		current->tty = tty;
+		current->signal->tty = tty;
 		task_unlock(current);
-		current->tty_old_pgrp = 0;
-		tty->session = current->session;
+		current->signal->tty_old_pgrp = 0;
+		tty->session = current->signal->session;
 		tty->pgrp = process_group(current);
 	}
 	return 0;
@@ -1510,7 +1494,7 @@ static int tiocsti(struct tty_struct *tty, char * arg)
 {
 	char ch, mbz = 0;
 
-	if ((current->tty != tty) && !capable(CAP_SYS_ADMIN))
+	if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (get_user(ch, arg))
 		return -EFAULT;
@@ -1601,14 +1585,14 @@ static int tiocsctty(struct tty_struct *tty, int arg)
 	struct pid *pid;
 	task_t *p;
 
-	if (current->leader &&
-	    (current->session == tty->session))
+	if (current->signal->leader &&
+	    (current->signal->session == tty->session))
 		return 0;
 	/*
 	 * The process must be a session leader and
 	 * not have a controlling tty already.
 	 */
-	if (!current->leader || current->tty)
+	if (!current->signal->leader || current->signal->tty)
 		return -EPERM;
 	if (tty->session > 0) {
 		/*
@@ -1621,21 +1605,17 @@ static int tiocsctty(struct tty_struct *tty, int arg)
 			 */
 
 			read_lock(&tasklist_lock);
-			for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) {
-				task_t *task = p;
-				do {
-					task->tty = NULL;
-				} while_each_thread(p, task);
-			}
+			for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid)
+				p->signal->tty = NULL;
 			read_unlock(&tasklist_lock);
 		} else
 			return -EPERM;
 	}
 	task_lock(current);
-	current->tty = tty;
+	current->signal->tty = tty;
 	task_unlock(current);
-	current->tty_old_pgrp = 0;
-	tty->session = current->session;
+	current->signal->tty_old_pgrp = 0;
+	tty->session = current->signal->session;
 	tty->pgrp = process_group(current);
 	return 0;
 }
@@ -1646,7 +1626,7 @@ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 	 * (tty == real_tty) is a cheap way of
 	 * testing if the tty is NOT a master pty.
 	 */
-	if (tty == real_tty && current->tty != real_tty)
+	if (tty == real_tty && current->signal->tty != real_tty)
 		return -ENOTTY;
 	return put_user(real_tty->pgrp, arg);
 }
@@ -1660,15 +1640,15 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t
 		return -ENOTTY;
 	if (retval)
 		return retval;
-	if (!current->tty ||
-	    (current->tty != real_tty) ||
-	    (real_tty->session != current->session))
+	if (!current->signal->tty ||
+	    (current->signal->tty != real_tty) ||
+	    (real_tty->session != current->signal->session))
 		return -ENOTTY;
 	if (get_user(pgrp, (pid_t *) arg))
 		return -EFAULT;
 	if (pgrp < 0)
 		return -EINVAL;
-	if (session_of_pgrp(pgrp) != current->session)
+	if (session_of_pgrp(pgrp) != current->signal->session)
 		return -EPERM;
 	real_tty->pgrp = pgrp;
 	return 0;
@@ -1680,7 +1660,7 @@ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t *
 	 * (tty == real_tty) is a cheap way of
 	 * testing if the tty is NOT a master pty.
 	*/
-	if (tty == real_tty && current->tty != real_tty)
+	if (tty == real_tty && current->signal->tty != real_tty)
 		return -ENOTTY;
 	if (real_tty->session <= 0)
 		return -ENOTTY;
@@ -1838,12 +1818,12 @@ int tty_ioctl(struct inode * inode, struct file * file,
 			clear_bit(TTY_EXCLUSIVE, &tty->flags);
 			return 0;
 		case TIOCNOTTY:
-			if (current->tty != tty)
+			if (current->signal->tty != tty)
 				return -ENOTTY;
-			if (current->leader)
+			if (current->signal->leader)
 				disassociate_ctty(0);
 			task_lock(current);
-			current->tty = NULL;
+			current->signal->tty = NULL;
 			task_unlock(current);
 			return 0;
 		case TIOCSCTTY:
@@ -1947,9 +1927,9 @@ static void __do_SAK(void *arg)
 		tty->driver->flush_buffer(tty);
 	read_lock(&tasklist_lock);
 	for_each_task_pid(session, PIDTYPE_SID, p, l, pid) {
-		if (p->tty == tty || session > 0) {
+		if (p->signal->tty == tty || session > 0) {
 			printk(KERN_NOTICE "SAK: killed process %d"
-			    " (%s): p->session==tty->session\n",
+			    " (%s): p->signal->session==tty->session\n",
 			    p->pid, p->comm);
 			send_sig(SIGKILL, p, 1);
 			continue;
diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 2febed52e19f..a1a59abc915c 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -2278,7 +2278,7 @@ int tioclinux(struct tty_struct *tty, unsigned long arg)
 
 	if (tty->driver->type != TTY_DRIVER_TYPE_CONSOLE)
 		return -EINVAL;
-	if (current->tty != tty && !capable(CAP_SYS_ADMIN))
+	if (current->signal->tty != tty && !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	if (get_user(type, (char *)arg))
 		return -EFAULT;
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index d8c6acc8e62c..0685fe7be2d1 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -382,7 +382,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
 	 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
 	 */
 	perm = 0;
-	if (current->tty == tty || capable(CAP_SYS_TTY_CONFIG))
+	if (current->signal->tty == tty || capable(CAP_SYS_TTY_CONFIG))
 		perm = 1;
  
 	kbd = kbd_table + console;
@@ -1221,4 +1221,3 @@ void change_console(unsigned int new_console)
 
 	complete_change_console(new_console);
 }
-
diff --git a/drivers/net/slip.c b/drivers/net/slip.c
index 601df52ebb29..e783ac0fa71e 100644
--- a/drivers/net/slip.c
+++ b/drivers/net/slip.c
@@ -1307,7 +1307,7 @@ static int sl_ioctl(struct net_device *dev,struct ifreq *rq,int cmd)
 		/* Resolve race condition, when ioctl'ing hanged up 
 		   and opened by another process device.
 		 */
-		if (sl->tty != current->tty && sl->pid != current->pid) {
+		if (sl->tty != current->signal->tty && sl->pid != current->pid) {
 			spin_unlock_bh(&sl->lock);
 			return -EPERM;
 		}
diff --git a/drivers/s390/char/keyboard.c b/drivers/s390/char/keyboard.c
index 892ebc7739b0..b124ebb7fc9b 100644
--- a/drivers/s390/char/keyboard.c
+++ b/drivers/s390/char/keyboard.c
@@ -471,7 +471,7 @@ kbd_ioctl(struct kbd_data *kbd, struct file *file,
 	 * To have permissions to do most of the vt ioctls, we either have
 	 * to be the owner of the tty, or have CAP_SYS_TTY_CONFIG.
 	 */
-	perm = current->tty == kbd->tty || capable(CAP_SYS_TTY_CONFIG);
+	perm = current->signal->tty == kbd->tty || capable(CAP_SYS_TTY_CONFIG);
 	switch (cmd) {
 	case KDGKBTYPE:
 		return put_user(KB_101, (char*) arg);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 9cc7cc648b42..e5b79a294c80 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1129,7 +1129,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 	prstatus->pr_pid = p->pid;
 	prstatus->pr_ppid = p->parent->pid;
 	prstatus->pr_pgrp = process_group(p);
-	prstatus->pr_sid = p->session;
+	prstatus->pr_sid = p->signal->session;
 	jiffies_to_timeval(p->utime, &prstatus->pr_utime);
 	jiffies_to_timeval(p->stime, &prstatus->pr_stime);
 	jiffies_to_timeval(p->cutime, &prstatus->pr_cutime);
@@ -1157,7 +1157,7 @@ static void fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	psinfo->pr_pid = p->pid;
 	psinfo->pr_ppid = p->parent->pid;
 	psinfo->pr_pgrp = process_group(p);
-	psinfo->pr_sid = p->session;
+	psinfo->pr_sid = p->signal->session;
 
 	i = p->state ? ffz(~p->state) + 1 : 0;
 	psinfo->pr_state = i;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 23baed6180ff..de45d833d0f4 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1604,7 +1604,7 @@ static int vt_check(struct file *file)
 	 * To have permissions to do most of the vt ioctls, we either have
 	 * to be the owner of the tty, or super-user.
 	 */
-	if (current->tty == tty || capable(CAP_SYS_ADMIN))
+	if (current->signal->tty == tty || capable(CAP_SYS_ADMIN))
 		return 1;
 	return 0;                                                    
 }
diff --git a/fs/dquot.c b/fs/dquot.c
index e6b39e66207a..5749044d028e 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -689,12 +689,12 @@ static void print_warning(struct dquot *dquot, const char warntype)
 
 	if (!need_print_warning(dquot) || (flag && test_and_set_bit(flag, &dquot->dq_flags)))
 		return;
-	tty_write_message(current->tty, dquot->dq_sb->s_id);
+	tty_write_message(current->signal->tty, dquot->dq_sb->s_id);
 	if (warntype == ISOFTWARN || warntype == BSOFTWARN)
-		tty_write_message(current->tty, ": warning, ");
+		tty_write_message(current->signal->tty, ": warning, ");
 	else
-		tty_write_message(current->tty, ": write failed, ");
-	tty_write_message(current->tty, quotatypes[dquot->dq_type]);
+		tty_write_message(current->signal->tty, ": write failed, ");
+	tty_write_message(current->signal->tty, quotatypes[dquot->dq_type]);
 	switch (warntype) {
 		case IHARDWARN:
 			msg = " file limit reached.\n";
@@ -715,7 +715,7 @@ static void print_warning(struct dquot *dquot, const char warntype)
 			msg = " block quota exceeded.\n";
 			break;
 	}
-	tty_write_message(current->tty, msg);
+	tty_write_message(current->signal->tty, msg);
 }
 
 static inline void flush_warnings(struct dquot **dquots, char *warntype)
diff --git a/fs/exec.c b/fs/exec.c
index 225afb0d94e5..62bf2c537abd 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -601,6 +601,11 @@ static inline int de_thread(struct task_struct *tsk)
 		newsig->group_stop_count = 0;
 		newsig->curr_target = NULL;
 		init_sigpending(&newsig->shared_pending);
+
+		newsig->pgrp = oldsig->pgrp;
+		newsig->session = oldsig->session;
+		newsig->leader = oldsig->leader;
+		newsig->tty_old_pgrp = oldsig->tty_old_pgrp;
 	}
 
 	if (thread_group_empty(current))
diff --git a/fs/open.c b/fs/open.c
index 9a9ce5be4dbc..ce11096afcad 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1037,7 +1037,7 @@ EXPORT_SYMBOL(sys_close);
 asmlinkage long sys_vhangup(void)
 {
 	if (capable(CAP_SYS_TTY_CONFIG)) {
-		tty_vhangup(current->tty);
+		tty_vhangup(current->signal->tty);
 		return 0;
 	}
 	return -EPERM;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 7af62577287e..ac9ccac5d1ee 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -168,7 +168,7 @@ static inline char * task_state(struct task_struct *p, char *buffer)
 		p->pid && p->ptrace ? p->parent->pid : 0,
 		p->uid, p->euid, p->suid, p->fsuid,
 		p->gid, p->egid, p->sgid, p->fsgid);
-	read_unlock(&tasklist_lock);	
+	read_unlock(&tasklist_lock);
 	task_lock(p);
 	buffer += sprintf(buffer,
 		"FDSize:\t%d\n"
@@ -301,7 +301,7 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 	sigset_t sigign, sigcatch;
 	char state;
 	int res;
-	pid_t ppid;
+ 	pid_t ppid, pgid = -1, sid = -1;
 	int num_threads = 0;
 	struct mm_struct *mm;
 
@@ -311,10 +311,6 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 	mm = task->mm;
 	if(mm)
 		mm = mmgrab(mm);
-	if (task->tty) {
-		tty_pgrp = task->tty->pgrp;
-		tty_nr = new_encode_dev(tty_devnum(task->tty));
-	}
 	task_unlock(task);
 	if (mm) {
 		down_read(&mm->mmap_sem);
@@ -335,7 +331,15 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 		collect_sigign_sigcatch(task, &sigign, &sigcatch);
 		spin_unlock_irq(&task->sighand->siglock);
 	}
-	read_unlock(&tasklist_lock);		
+	if (task->signal) {
+		if (task->signal->tty) {
+			tty_pgrp = task->signal->tty->pgrp;
+			tty_nr = new_encode_dev(tty_devnum(task->signal->tty));
+		}
+		pgid = process_group(task);
+		sid = task->signal->session;
+	}
+	read_unlock(&tasklist_lock);
 
 	/* scale priority and nice values from timeslices to -20..20 */
 	/* to make it look like a "normal" Unix priority/nice value  */
@@ -352,8 +356,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 		task->comm,
 		state,
 		ppid,
-		process_group(task),
-		task->session,
+		pgid,
+		sid,
 		tty_nr,
 		tty_pgrp,
 		task->flags,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 054b3c0d5962..5a1229121123 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -269,6 +269,15 @@ struct signal_struct {
 
 	/* thread group stop support, overloads group_exit_code too */
 	int			group_stop_count;
+
+	/* job control IDs */
+	pid_t pgrp;
+	pid_t tty_old_pgrp;
+	pid_t session;
+	/* boolean value for session group leader */
+	int leader;
+
+	struct tty_struct *tty; /* NULL if no tty */
 };
 
 /*
@@ -398,12 +407,7 @@ struct task_struct {
 	unsigned long personality;
 	int did_exec:1;
 	pid_t pid;
-	pid_t __pgrp;		/* Accessed via process_group() */
-	pid_t tty_old_pgrp;
-	pid_t session;
 	pid_t tgid;
-	/* boolean value for session group leader */
-	int leader;
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, respectively.  (p->father can be replaced with 
@@ -446,7 +450,6 @@ struct task_struct {
 	char comm[16];
 /* file system info */
 	int link_count, total_link_count;
-	struct tty_struct *tty; /* NULL if no tty */
 /* ipc stuff */
 	struct sysv_sem sysvsem;
 /* CPU-specific state of this task */
@@ -499,7 +502,7 @@ struct task_struct {
 
 static inline pid_t process_group(struct task_struct *tsk)
 {
-	return tsk->group_leader->__pgrp;
+	return tsk->signal->pgrp;
 }
 
 extern void __put_task_struct(struct task_struct *tsk);
diff --git a/kernel/acct.c b/kernel/acct.c
index 9dbab88b2d31..b417066778a7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -347,7 +347,7 @@ static void do_acct_process(long exitcode, struct file *file)
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
-	ac.ac_tty = current->tty ? old_encode_dev(tty_devnum(current->tty)) : 0;
+	ac.ac_tty = current->signal->tty ? old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 
 	ac.ac_flag = 0;
 	if (current->flags & PF_FORKNOEXEC)
diff --git a/kernel/exit.c b/kernel/exit.c
index 308f6959add6..810eebd77559 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -136,13 +136,13 @@ int session_of_pgrp(int pgrp)
 
 	read_lock(&tasklist_lock);
 	for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid)
-		if (p->session > 0) {
-			sid = p->session;
+		if (p->signal->session > 0) {
+			sid = p->signal->session;
 			goto out;
 		}
 	p = find_task_by_pid(pgrp);
 	if (p)
-		sid = p->session;
+		sid = p->signal->session;
 out:
 	read_unlock(&tasklist_lock);
 	
@@ -170,7 +170,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
 				|| p->real_parent->pid == 1)
 			continue;
 		if (process_group(p->real_parent) != pgrp
-			    && p->real_parent->session == p->session) {
+			    && p->real_parent->signal->session == p->signal->session) {
 			ret = 0;
 			break;
 		}
@@ -259,14 +259,14 @@ void __set_special_pids(pid_t session, pid_t pgrp)
 {
 	struct task_struct *curr = current;
 
-	if (curr->session != session) {
+	if (curr->signal->session != session) {
 		detach_pid(curr, PIDTYPE_SID);
-		curr->session = session;
+		curr->signal->session = session;
 		attach_pid(curr, PIDTYPE_SID, session);
 	}
 	if (process_group(curr) != pgrp) {
 		detach_pid(curr, PIDTYPE_PGID);
-		curr->group_leader->__pgrp = pgrp;
+		curr->signal->pgrp = pgrp;
 		attach_pid(curr, PIDTYPE_PGID, pgrp);
 	}
 }
@@ -341,7 +341,7 @@ void daemonize(const char *name, ...)
 	exit_mm(current);
 
 	set_special_pids(1, 1);
-	current->tty = NULL;
+	current->signal->tty = NULL;
 
 	/* Block and flush all signals */
 	sigfillset(&blocked);
@@ -564,7 +564,7 @@ static inline void reparent_thread(task_t *p, task_t *father, int traced)
 	 * outside, so the child pgrp is now orphaned.
 	 */
 	if ((process_group(p) != process_group(father)) &&
-	    (p->session == father->session)) {
+	    (p->signal->session == father->signal->session)) {
 		int pgrp = process_group(p);
 
 		if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) {
@@ -675,7 +675,7 @@ static void exit_notify(struct task_struct *tsk)
 	t = tsk->real_parent;
 	
 	if ((process_group(t) != process_group(tsk)) &&
-	    (t->session == tsk->session) &&
+	    (t->signal->session == tsk->signal->session) &&
 	    will_become_orphaned_pgrp(process_group(tsk), tsk) &&
 	    has_stopped_jobs(process_group(tsk))) {
 		__kill_pg_info(SIGHUP, (void *)1, process_group(tsk));
@@ -780,7 +780,7 @@ asmlinkage NORET_TYPE void do_exit(long code)
 	exit_itimers(tsk);
 	exit_thread();
 
-	if (tsk->leader)
+	if (tsk->signal->leader)
 		disassociate_ctty(1);
 
 	module_put(tsk->thread_info->exec_domain->module);
diff --git a/kernel/fork.c b/kernel/fork.c
index a1f20cabbdd3..d2dd97e866bb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -811,6 +811,12 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	sig->curr_target = NULL;
 	init_sigpending(&sig->shared_pending);
 
+	sig->tty = current->signal->tty;
+	sig->pgrp = process_group(current);
+	sig->session = current->signal->session;
+	sig->leader = 0;	/* session leadership doesn't inherit */
+	sig->tty_old_pgrp = 0;
+
 	return 0;
 }
 
@@ -935,8 +941,6 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	init_timer(&p->real_timer);
 	p->real_timer.data = (unsigned long) p;
 
-	p->leader = 0;		/* session leadership doesn't inherit */
-	p->tty_old_pgrp = 0;
 	p->utime = p->stime = 0;
 	p->cutime = p->cstime = 0;
 	p->lock_depth = -1;		/* -1 = no lock */
@@ -1055,7 +1059,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	if (thread_group_leader(p)) {
 		attach_pid(p, PIDTYPE_TGID, p->tgid);
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
-		attach_pid(p, PIDTYPE_SID, p->session);
+		attach_pid(p, PIDTYPE_SID, p->signal->session);
 		if (p->pid)
 			__get_cpu_var(process_counts)++;
 	} else
diff --git a/kernel/pid.c b/kernel/pid.c
index 4c85144759c5..6ed44f56ca45 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -253,14 +253,14 @@ void switch_exec_pids(task_t *leader, task_t *thread)
 
 	attach_pid(thread, PIDTYPE_PID, thread->pid);
 	attach_pid(thread, PIDTYPE_TGID, thread->tgid);
-	attach_pid(thread, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(thread, PIDTYPE_SID, thread->session);
+	attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp);
+	attach_pid(thread, PIDTYPE_SID, thread->signal->session);
 	list_add_tail(&thread->tasks, &init_task.tasks);
 
 	attach_pid(leader, PIDTYPE_PID, leader->pid);
 	attach_pid(leader, PIDTYPE_TGID, leader->tgid);
-	attach_pid(leader, PIDTYPE_PGID, leader->__pgrp);
-	attach_pid(leader, PIDTYPE_SID, leader->session);
+	attach_pid(leader, PIDTYPE_PGID, leader->signal->pgrp);
+	attach_pid(leader, PIDTYPE_SID, leader->signal->session);
 }
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index e6b7904df68f..7a4b479a6f45 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -588,7 +588,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
 	error = -EPERM;
 	if ((!info || ((unsigned long)info != 1 &&
 			(unsigned long)info != 2 && SI_FROMUSER(info)))
-	    && ((sig != SIGCONT) || (current->session != t->session))
+	    && ((sig != SIGCONT) ||
+		(current->signal->session != t->signal->session))
 	    && (current->euid ^ t->suid) && (current->euid ^ t->uid)
 	    && (current->uid ^ t->suid) && (current->uid ^ t->uid)
 	    && !capable(CAP_KILL))
@@ -1103,7 +1104,7 @@ kill_sl_info(int sig, struct siginfo *info, pid_t sid)
 	retval = -ESRCH;
 	read_lock(&tasklist_lock);
 	for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) {
-		if (!p->leader)
+		if (!p->signal->leader)
 			continue;
 		err = group_send_sig_info(sig, info, p);
 		if (retval)
diff --git a/kernel/sys.c b/kernel/sys.c
index 81f9e02f2071..9d57482758f3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -990,7 +990,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 
 	if (p->parent == current || p->real_parent == current) {
 		err = -EPERM;
-		if (p->session != current->session)
+		if (p->signal->session != current->signal->session)
 			goto out;
 		err = -EACCES;
 		if (p->did_exec)
@@ -1002,7 +1002,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 	}
 
 	err = -EPERM;
-	if (p->leader)
+	if (p->signal->leader)
 		goto out;
 
 	if (pgid != pid) {
@@ -1011,7 +1011,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid)
 		struct list_head *l;
 
 		for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid)
-			if (p->session == current->session)
+			if (p->signal->session == current->signal->session)
 				goto ok_pgid;
 		goto out;
 	}
@@ -1023,7 +1023,7 @@ ok_pgid:
 
 	if (process_group(p) != pgid) {
 		detach_pid(p, PIDTYPE_PGID);
-		p->group_leader->__pgrp = pgid;
+		p->signal->pgrp = pgid;
 		attach_pid(p, PIDTYPE_PGID, pgid);
 	}
 
@@ -1065,7 +1065,7 @@ asmlinkage long sys_getpgrp(void)
 asmlinkage long sys_getsid(pid_t pid)
 {
 	if (!pid) {
-		return current->session;
+		return current->signal->session;
 	} else {
 		int retval;
 		struct task_struct *p;
@@ -1077,7 +1077,7 @@ asmlinkage long sys_getsid(pid_t pid)
 		if(p) {
 			retval = security_task_getsid(p);
 			if (!retval)
-				retval = p->session;
+				retval = p->signal->session;
 		}
 		read_unlock(&tasklist_lock);
 		return retval;
@@ -1098,10 +1098,10 @@ asmlinkage long sys_setsid(void)
 	if (pid)
 		goto out;
 
-	current->leader = 1;
+	current->signal->leader = 1;
 	__set_special_pids(current->pid, current->pid);
-	current->tty = NULL;
-	current->tty_old_pgrp = 0;
+	current->signal->tty = NULL;
+	current->signal->tty_old_pgrp = 0;
 	err = process_group(current);
 out:
 	write_unlock_irq(&tasklist_lock);
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 33b687d60efe..f76563312ee4 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -46,7 +46,7 @@ static void print_string(char *str)
 	struct tty_struct *my_tty;
 
 	/* The tty for the current task */
-	my_tty = current->tty;
+	my_tty = current->signal->tty;
 	if (my_tty != NULL) {
 		my_tty->driver->write(my_tty, 0, str, strlen(str));
 		my_tty->driver->write(my_tty, 0, "\015\012", 2);
diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c
index a1529896ec1b..91c3fd3f1f8f 100644
--- a/net/ipv4/netfilter/ipt_owner.c
+++ b/net/ipv4/netfilter/ipt_owner.c
@@ -95,7 +95,7 @@ match_sid(const struct sk_buff *skb, pid_t sid)
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		struct files_struct *files;
-		if (p->session != sid)
+		if (p->signal->session != sid)
 			continue;
 
 		task_lock(p);
diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c
index 02e5ee4e7418..0bb9c661b73c 100644
--- a/net/ipv6/netfilter/ip6t_owner.c
+++ b/net/ipv6/netfilter/ip6t_owner.c
@@ -61,7 +61,7 @@ match_sid(const struct sk_buff *skb, pid_t sid)
 	read_lock(&tasklist_lock);
 	do_each_thread(g, p) {
 		struct files_struct *files;
-		if (p->session != sid)
+		if (p->signal->session != sid)
 			continue;
 
 		task_lock(p);
-- 
cgit v1.2.3


From af70f7673155616ffd004d551e1b612002a58bf0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:56:59 -0700
Subject: [PATCH] Fix page allocator lower zone protection for NUMA

From: Martin Hicks <mort@wildopensource.com>

This changes __alloc_pages() so it uses precalculated values for the "min".
This should prevent the problem of min incrementing from zone to zone across
many nodes on a NUMA machine.  The result of falling back to other nodes with
the old incremental min calculations was that the min value became very
large.
---
 include/linux/mmzone.h |  39 ++++++++++---
 kernel/sysctl.c        |   2 +-
 mm/page_alloc.c        | 150 +++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 159 insertions(+), 32 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b5398fa7be88..51b8f3f67741 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -54,6 +54,15 @@ struct per_cpu_pageset {
 	struct per_cpu_pages pcp[2];	/* 0: hot.  1: cold */
 } ____cacheline_aligned_in_smp;
 
+#define ZONE_DMA		0
+#define ZONE_NORMAL		1
+#define ZONE_HIGHMEM		2
+
+#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
+#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
+
+#define GFP_ZONEMASK	0x03
+
 /*
  * On machines where it is needed (eg PCs) we divide physical memory
  * into multiple physical zones. On a PC we have 3 zones:
@@ -70,6 +79,19 @@ struct zone {
 	spinlock_t		lock;
 	unsigned long		free_pages;
 	unsigned long		pages_min, pages_low, pages_high;
+	/*
+	 * protection[] is a pre-calculated number of extra pages that must be
+	 * available in a zone in order for __alloc_pages() to allocate memory
+	 * from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
+	 * be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
+	 * for us to choose to allocate the page from that zone.
+	 *
+	 * It uses both min_free_kbytes and sysctl_lower_zone_protection.
+	 * The protection values are recalculated if either of these values
+	 * changes.  The array elements are in zonelist order:
+	 *	[0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
+	 */
+	unsigned long		protection[MAX_NR_ZONES];
 
 	ZONE_PADDING(_pad1_)
 
@@ -157,14 +179,6 @@ struct zone {
 	unsigned long		present_pages;	/* amount of memory (excluding holes) */
 } ____cacheline_maxaligned_in_smp;
 
-#define ZONE_DMA		0
-#define ZONE_NORMAL		1
-#define ZONE_HIGHMEM		2
-
-#define MAX_NR_ZONES		3	/* Sync this with ZONES_SHIFT */
-#define ZONES_SHIFT		2	/* ceil(log2(MAX_NR_ZONES)) */
-
-#define GFP_ZONEMASK	0x03
 
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
@@ -228,6 +242,11 @@ void get_zone_counts(unsigned long *active, unsigned long *inactive,
 void build_all_zonelists(void);
 void wakeup_kswapd(struct zone *zone);
 
+/*
+ * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
+ */
+#define zone_idx(zone)		((zone) - (zone)->zone_pgdat->node_zones)
+
 /**
  * for_each_pgdat - helper macro to iterate over all nodes
  * @pgdat - pointer to a pg_data_t variable
@@ -299,7 +318,9 @@ static inline int is_normal(struct zone *zone)
 struct ctl_table;
 struct file;
 int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, 
-					  void __user *, size_t *);
+					void __user *, size_t *);
+int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
+					void __user *, size_t *);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f5f3123b0522..f2c8c8ce4926 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -722,7 +722,7 @@ static ctl_table vm_table[] = {
 		.data		= &sysctl_lower_zone_protection,
 		.maxlen		= sizeof(sysctl_lower_zone_protection),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &lower_zone_protection_sysctl_handler,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
 	},
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9764a4e78e45..c87ca3dd2f11 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -552,6 +552,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	struct task_struct *p = current;
 	int i;
 	int cold;
+	int alloc_type;
 	int do_retry;
 
 	might_sleep_if(wait);
@@ -564,28 +565,27 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	if (zones[0] == NULL)     /* no zones in the zonelist */
 		return NULL;
 
+	alloc_type = zone_idx(zones[0]);
+
 	/* Go through the zonelist once, looking for a zone with enough free */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
-		unsigned long local_low;
+
+		min = (1<<order) + z->protection[alloc_type];
 
 		/*
-		 * This is the fabled 'incremental min'. We let real-time tasks
-		 * dip their real-time paws a little deeper into reserves.
+		 * We let real-time tasks dip their real-time paws a little
+		 * deeper into reserves.
 		 */
-		local_low = z->pages_low;
 		if (rt_task(p))
-			local_low >>= 1;
-		min += local_low;
+			min -= z->pages_low >> 1;
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
-		       		goto got_pg;
+				goto got_pg;
 		}
-		min += z->pages_low * sysctl_lower_zone_protection;
 	}
 
 	/* we're somewhat low on memory, failed to find what we needed */
@@ -593,24 +593,22 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 		wakeup_kswapd(zones[i]);
 
 	/* Go through the zonelist again, taking __GFP_HIGH into account */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
-		unsigned long local_min;
 		struct zone *z = zones[i];
 
-		local_min = z->pages_min;
+		min = (1<<order) + z->protection[alloc_type];
+
 		if (gfp_mask & __GFP_HIGH)
-			local_min >>= 2;
+			min -= z->pages_low >> 2;
 		if (rt_task(p))
-			local_min >>= 1;
-		min += local_min;
+			min -= z->pages_low >> 1;
+
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
 		}
-		min += local_min * sysctl_lower_zone_protection;
 	}
 
 	/* here we're in the low on memory slow path */
@@ -642,18 +640,17 @@ rebalance:
 	p->flags &= ~PF_MEMALLOC;
 
 	/* go through the zonelist yet one more time */
-	min = 1UL << order;
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *z = zones[i];
 
-		min += z->pages_min;
+		min = (1UL << order) + z->protection[alloc_type];
+
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
 			page = buffered_rmqueue(z, order, cold);
 			if (page)
 				goto got_pg;
 		}
-		min += z->pages_low * sysctl_lower_zone_protection;
 	}
 
 	/*
@@ -1056,6 +1053,8 @@ void show_free_areas(void)
 		ps.nr_page_table_pages);
 
 	for_each_zone(zone) {
+		int i;
+
 		show_node(zone);
 		printk("%s"
 			" free:%lukB"
@@ -1075,6 +1074,10 @@ void show_free_areas(void)
 			K(zone->nr_inactive),
 			K(zone->present_pages)
 			);
+		printk("protections[]:");
+		for (i = 0; i < MAX_NR_ZONES; i++)
+			printk(" %lu", zone->protection[i]);
+		printk("\n");
 	}
 
 	for_each_zone(zone) {
@@ -1272,7 +1275,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
  			j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
  
 		zonelist->zones[j++] = NULL;
-	} 
+	}
 }
 
 #endif	/* CONFIG_NUMA */
@@ -1744,6 +1747,93 @@ void __init page_alloc_init(void)
 	hotcpu_notifier(page_alloc_cpu_notify, 0);
 }
 
+static unsigned long higherzone_val(struct zone *z, int max_zone,
+					int alloc_type)
+{
+	int z_idx = zone_idx(z);
+	struct zone *higherzone;
+	unsigned long pages;
+
+	/* there is no higher zone to get a contribution from */
+	if (z_idx == MAX_NR_ZONES-1)
+		return 0;
+
+	higherzone = &z->zone_pgdat->node_zones[z_idx+1];
+
+	/* We always start with the higher zone's protection value */
+	pages = higherzone->protection[alloc_type];
+
+	/*
+	 * We get a lower-zone-protection contribution only if there are
+	 * pages in the higher zone and if we're not the highest zone
+	 * in the current zonelist.  e.g., never happens for GFP_DMA. Happens
+	 * only for ZONE_DMA in a GFP_KERNEL allocation and happens for ZONE_DMA
+	 * and ZONE_NORMAL for a GFP_HIGHMEM allocation.
+	 */
+	if (higherzone->present_pages && z_idx < alloc_type)
+		pages += higherzone->pages_low * sysctl_lower_zone_protection;
+
+	return pages;
+}
+
+/*
+ * setup_per_zone_protection - called whenever min_free_kbytes or
+ *	sysctl_lower_zone_protection changes.  Ensures that each zone
+ *	has correct protection[] values, so an adequate number of
+ *	pages are left in the zone after a successful __alloc_pages().
+ *
+ *	This algorithm is way confusing.  It tries to keep the same behavior
+ *	as we had with the incremental min iterative algorithm.
+ */
+static void setup_per_zone_protection(void)
+{
+	struct pglist_data *pgdat;
+	struct zone *zones, *zone;
+	int max_zone;
+	int i, j;
+
+	for_each_pgdat(pgdat) {
+		zones = pgdat->node_zones;
+
+		for (i = 0, max_zone = 0; i < MAX_NR_ZONES; i++)
+			if (zones[i].present_pages)
+				max_zone = i;
+
+		/*
+		 * For each of the different allocation types:
+		 * GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
+		 */
+		for (i = 0; i < MAX_NR_ZONES; i++) {
+			/*
+			 * For each of the zones:
+			 * ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
+			 */
+			for (j = MAX_NR_ZONES-1; j >= 0; j--) {
+				zone = &zones[j];
+
+				/*
+				 * We never protect zones that don't have memory
+				 * in them (j>max_zone) or zones that aren't in
+				 * the zonelists for a certain type of
+				 * allocation (j>i).  We have to assign these to
+				 * zero because the lower zones take
+				 * contributions from the higher zones.
+				 */
+				if (j > max_zone || j > i) {
+					zone->protection[i] = 0;
+					continue;
+				}
+				/*
+				 * The contribution of the next higher zone
+				 */
+				zone->protection[i] = higherzone_val(zone,
+								max_zone, i);
+				zone->protection[i] += zone->pages_low;
+			}
+		}
+	}
+}
+
 /*
  * setup_per_zone_pages_min - called when min_free_kbytes changes.  Ensures 
  *	that the pages_{min,low,high} values for each zone are set correctly 
@@ -1757,9 +1847,10 @@ static void setup_per_zone_pages_min(void)
 	unsigned long flags;
 
 	/* Calculate total number of !ZONE_HIGHMEM pages */
-	for_each_zone(zone)
+	for_each_zone(zone) {
 		if (!is_highmem(zone))
 			lowmem_pages += zone->present_pages;
+	}
 
 	for_each_zone(zone) {
 		spin_lock_irqsave(&zone->lru_lock, flags);
@@ -1827,13 +1918,14 @@ static int __init init_per_zone_pages_min(void)
 	if (min_free_kbytes > 16384)
 		min_free_kbytes = 16384;
 	setup_per_zone_pages_min();
+	setup_per_zone_protection();
 	return 0;
 }
 module_init(init_per_zone_pages_min)
 
 /*
  * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 
- *	that we can call setup_per_zone_pages_min() whenever min_free_kbytes 
+ *	that we can call two helper functions whenever min_free_kbytes
  *	changes.
  */
 int min_free_kbytes_sysctl_handler(ctl_table *table, int write, 
@@ -1841,5 +1933,19 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
 {
 	proc_dointvec(table, write, file, buffer, length);
 	setup_per_zone_pages_min();
+	setup_per_zone_protection();
+	return 0;
+}
+
+/*
+ * lower_zone_protection_sysctl_handler - just a wrapper around
+ *	proc_dointvec_minmax() so that we can call setup_per_zone_protection()
+ *	whenever sysctl_lower_zone_protection changes.
+ */
+int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
+		 struct file *file, void __user *buffer, size_t *length)
+{
+	proc_dointvec_minmax(table, write, file, buffer, length);
+	setup_per_zone_protection();
 	return 0;
 }
-- 
cgit v1.2.3


From a1ff5989c622e78d2266237396545876359f5edf Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:57:12 -0700
Subject: [PATCH] ext3 fsync() and fdatasync() speedup

ext3's fsync/fdatasync implementation is currently syncing the inode via a
full journal commit even if it was unaltered.

Fix that up by exporting the core VFS's inode sync function to modules and
calling it if the inode is dirty.  We need to do it this way so that the
inode is moved to the appropriate superblock list and so that the i_state
dirty flags are appropriately updated.

This speeds up ext3 fsync() for file overwrites by a factor of four (disk
non-writeback) to forty (disk in writeback mode).
---
 fs/ext3/fsync.c     | 38 ++++++++++++++++++++++++++++----------
 fs/fs-writeback.c   | 42 ++++++++++++++++++++++++++++++++++--------
 include/linux/fs.h  |  1 +
 mm/page-writeback.c |  2 ++
 4 files changed, 65 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index 0888abcd9757..0b6a45929030 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -24,6 +24,8 @@
 
 #include <linux/time.h>
 #include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/writeback.h>
 #include <linux/jbd.h>
 #include <linux/ext3_fs.h>
 #include <linux/ext3_jbd.h>
@@ -38,29 +40,28 @@
  *
  * What we do is just kick off a commit and wait on it.  This will snapshot the
  * inode to disk.
- *
- * Note that there is a serious optimisation we can make here: if the current
- * inode is not part of j_running_transaction or j_committing_transaction
- * then we have nothing to do.  That would require implementation of t_ilist,
- * which isn't too hard.
  */
 
 int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 {
 	struct inode *inode = dentry->d_inode;
+	int ret = 0;
 
 	J_ASSERT(ext3_journal_current_handle() == 0);
 
+	smp_mb();		/* prepare for lockless i_state read */
+	if (!(inode->i_state & I_DIRTY))
+		goto out;
+
 	/*
 	 * data=writeback:
 	 *  The caller's filemap_fdatawrite()/wait will sync the data.
-	 *  ext3_force_commit() will sync the metadata
+	 *  sync_inode() will sync the metadata
 	 *
 	 * data=ordered:
 	 *  The caller's filemap_fdatawrite() will write the data and
-	 *  ext3_force_commit() will wait on the buffers.  Then the caller's
-	 *  filemap_fdatawait() will wait on the pages (but all IO is complete)
-	 *  Not pretty, but it works.
+	 *  sync_inode() will write the inode if it is dirty.  Then the caller's
+	 *  filemap_fdatawait() will wait on the pages.
 	 *
 	 * data=journal:
 	 *  filemap_fdatawrite won't do anything (the buffers are clean).
@@ -70,5 +71,22 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
 	 *  (they were dirtied by commit).  But that's OK - the blocks are
 	 *  safe in-journal, which is all fsync() needs to ensure.
 	 */
-	return ext3_force_commit(inode->i_sb);
+	if (ext3_should_journal_data(inode)) {
+		ret = ext3_force_commit(inode->i_sb);
+		goto out;
+	}
+
+	/*
+	 * The VFS has written the file data.  If the inode is unaltered
+	 * then we need not start a commit.
+	 */
+	if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+			.nr_to_write = 0, /* sys_fsync did this */
+		};
+		ret = sync_inode(inode, &wbc);
+	}
+out:
+	return ret;
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 016891bb2b70..aa5f34b85747 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -137,13 +137,14 @@ static void write_inode(struct inode *inode, int sync)
  *
  * Called under inode_lock.
  */
-static void
+static int
 __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	unsigned dirty;
 	struct address_space *mapping = inode->i_mapping;
 	struct super_block *sb = inode->i_sb;
 	int wait = wbc->sync_mode == WB_SYNC_ALL;
+	int ret;
 
 	BUG_ON(inode->i_state & I_LOCK);
 
@@ -164,14 +165,17 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_unlock(&mapping->page_lock);
 	spin_unlock(&inode_lock);
 
-	do_writepages(mapping, wbc);
+	ret = do_writepages(mapping, wbc);
 
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
 	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC))
 		write_inode(inode, wait);
 
-	if (wait)
-		filemap_fdatawait(mapping);
+	if (wait) {
+		int err = filemap_fdatawait(mapping);
+		if (ret == 0)
+			ret = err;
+	}
 
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_LOCK;
@@ -195,18 +199,19 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 		}
 	}
 	wake_up_inode(inode);
+	return ret;
 }
 
 /*
  * Write out an inode's dirty pages.  Called under inode_lock.
  */
-static void
+static int
 __writeback_single_inode(struct inode *inode,
 			struct writeback_control *wbc)
 {
 	if ((wbc->sync_mode != WB_SYNC_ALL) && (inode->i_state & I_LOCK)) {
 		list_move(&inode->i_list, &inode->i_sb->s_dirty);
-		return;
+		return 0;
 	}
 
 	/*
@@ -219,7 +224,7 @@ __writeback_single_inode(struct inode *inode,
 		iput(inode);
 		spin_lock(&inode_lock);
 	}
-	__sync_single_inode(inode, wbc);
+	return __sync_single_inode(inode, wbc);
 }
 
 /*
@@ -499,9 +504,30 @@ void write_inode_now(struct inode *inode, int sync)
 	if (sync)
 		wait_on_inode(inode);
 }
-
 EXPORT_SYMBOL(write_inode_now);
 
+/**
+ * sync_inode - write an inode and its pages to disk.
+ * @inode: the inode to sync
+ * @wbc: controls the writeback mode
+ *
+ * sync_inode() will write an inode and its pages to disk.  It will also
+ * correctly update the inode on its superblock's dirty inode lists and will
+ * update inode->i_state.
+ *
+ * The caller must have a ref on the inode.
+ */
+int sync_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	int ret;
+
+	spin_lock(&inode_lock);
+	ret = __writeback_single_inode(inode, wbc);
+	spin_unlock(&inode_lock);
+	return ret;
+}
+EXPORT_SYMBOL(sync_inode);
+
 /**
  * generic_osync_inode - flush all dirty data for a given inode to disk
  * @inode: inode to write
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e47f6360f74c..3e71560374c0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -925,6 +925,7 @@ static inline void file_accessed(struct file *file)
 	touch_atime(file->f_vfsmnt, file->f_dentry);
 }
 
+int sync_inode(struct inode *inode, struct writeback_control *wbc);
 
 /**
  * &export_operations - for nfsd to communicate with file systems
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1ad6717ade97..f1ecbd88e846 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -441,6 +441,8 @@ void __init page_writeback_init(void)
 
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
+	if (wbc->nr_to_write <= 0)
+		return 0;
 	if (mapping->a_ops->writepages)
 		return mapping->a_ops->writepages(mapping, wbc);
 	return generic_writepages(mapping, wbc);
-- 
cgit v1.2.3


From 2b38960cbf6a500bb8d2d8afd7e0cd546c72efe6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:57:51 -0700
Subject: [PATCH] JBD: ordered-data commit cleanup

For data=ordered, kjournald at commit time has to write out and wait upon a
long list of buffers.  It does this in a rather awkward way with a single
list.  It causes complexity and long lock hold times, and makes the addition
of rescheduling points quite hard.

So what we do instead (based on Chris Mason's suggestion) is to add a new
buffer list (t_locked_list) to the journal.  It contains buffers which have
been placed under I/O.

So as we walk the t_sync_datalist list we move buffers over to t_locked_list
as they are written out.

When t_sync_datalist is empty we may then walk t_locked_list waiting for the
I/O to complete.

As a side-effect this means that we can remove the nasty synchronous wait in
journal_dirty_data which is there to avoid the kjournald livelock which would
otherwise occur when someone is continuously dirtying a buffer.
---
 fs/jbd/commit.c      | 143 +++++++++++++++++++++++++++------------------------
 fs/jbd/transaction.c |  13 +++--
 include/linux/jbd.h  |   9 +++-
 3 files changed, 95 insertions(+), 70 deletions(-)

(limited to 'include')

diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index de335c04c962..b3cb6bf406d1 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -78,6 +78,21 @@ nope:
 	__brelse(bh);
 }
 
+/*
+ * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
+ * held.  For ranking reasons we must trylock.  If we lose, schedule away and
+ * return 0.  j_list_lock is dropped in this case.
+ */
+static int inverted_lock(journal_t *journal, struct buffer_head *bh)
+{
+	if (!jbd_trylock_bh_state(bh)) {
+		spin_unlock(&journal->j_list_lock);
+		schedule();
+		return 0;
+	}
+	return 1;
+}
+
 /*
  * journal_commit_transaction
  *
@@ -88,7 +103,6 @@ void journal_commit_transaction(journal_t *journal)
 {
 	transaction_t *commit_transaction;
 	struct journal_head *jh, *new_jh, *descriptor;
-	struct journal_head *next_jh, *last_jh;
 	struct buffer_head *wbuf[64];
 	int bufs;
 	int flags;
@@ -222,113 +236,110 @@ void journal_commit_transaction(journal_t *journal)
 	err = 0;
 	/*
 	 * Whenever we unlock the journal and sleep, things can get added
-	 * onto ->t_datalist, so we have to keep looping back to write_out_data
-	 * until we *know* that the list is empty.
+	 * onto ->t_sync_datalist, so we have to keep looping back to
+	 * write_out_data until we *know* that the list is empty.
 	 */
-write_out_data:
-
+	bufs = 0;
 	/*
 	 * Cleanup any flushed data buffers from the data list.  Even in
 	 * abort mode, we want to flush this out as soon as possible.
-	 *
-	 * We take j_list_lock to protect the lists from
-	 * journal_try_to_free_buffers().
 	 */
+write_out_data:
+	cond_resched();
 	spin_lock(&journal->j_list_lock);
 
-write_out_data_locked:
-	bufs = 0;
-	next_jh = commit_transaction->t_sync_datalist;
-	if (next_jh == NULL)
-		goto sync_datalist_empty;
-	last_jh = next_jh->b_tprev;
-
-	do {
+	while (commit_transaction->t_sync_datalist) {
 		struct buffer_head *bh;
 
-		jh = next_jh;
-		next_jh = jh->b_tnext;
+		jh = commit_transaction->t_sync_datalist;
+		commit_transaction->t_sync_datalist = jh->b_tnext;
 		bh = jh2bh(jh);
-		if (!buffer_locked(bh)) {
+		if (buffer_locked(bh)) {
+			BUFFER_TRACE(bh, "locked");
+			if (!inverted_lock(journal, bh))
+				goto write_out_data;
+			__journal_unfile_buffer(jh);
+			__journal_file_buffer(jh, jh->b_transaction, BJ_Locked);
+			jbd_unlock_bh_state(bh);
+			if (need_resched()) {
+				spin_unlock(&journal->j_list_lock);
+				goto write_out_data;
+			}
+		} else {
 			if (buffer_dirty(bh)) {
 				BUFFER_TRACE(bh, "start journal writeout");
-				atomic_inc(&bh->b_count);
+				get_bh(bh);
 				wbuf[bufs++] = bh;
-			} else {
-				BUFFER_TRACE(bh, "writeout complete: unfile");
-				/*
-				 * We have a lock ranking problem..
-				 */
-				if (!jbd_trylock_bh_state(bh)) {
+				if (bufs == ARRAY_SIZE(wbuf)) {
+					jbd_debug(2, "submit %d writes\n",
+							bufs);
 					spin_unlock(&journal->j_list_lock);
-					schedule();
+					ll_rw_block(WRITE, bufs, wbuf);
+					journal_brelse_array(wbuf, bufs);
+					bufs = 0;
 					goto write_out_data;
 				}
+			} else {
+				BUFFER_TRACE(bh, "writeout complete: unfile");
+				if (!inverted_lock(journal, bh))
+					goto write_out_data;
 				__journal_unfile_buffer(jh);
 				jh->b_transaction = NULL;
 				jbd_unlock_bh_state(bh);
 				journal_remove_journal_head(bh);
-				__brelse(bh);
-				if (need_resched() && commit_transaction->
-							t_sync_datalist) {
-					commit_transaction->t_sync_datalist =
-								next_jh;
-					if (bufs)
-						break;
+				put_bh(bh);
+				if (need_resched()) {
 					spin_unlock(&journal->j_list_lock);
-					cond_resched();
 					goto write_out_data;
 				}
 			}
 		}
-		if (bufs == ARRAY_SIZE(wbuf)) {
-			/*
-			 * Major speedup: start here on the next scan
-			 */
-			J_ASSERT(commit_transaction->t_sync_datalist != 0);
-			commit_transaction->t_sync_datalist = jh;
-			break;
-		}
-	} while (jh != last_jh);
+	}
 
-	if (bufs || need_resched()) {
-		jbd_debug(2, "submit %d writes\n", bufs);
+	if (bufs) {
 		spin_unlock(&journal->j_list_lock);
-		if (bufs)
-			ll_rw_block(WRITE, bufs, wbuf);
-		cond_resched();
+		ll_rw_block(WRITE, bufs, wbuf);
 		journal_brelse_array(wbuf, bufs);
 		spin_lock(&journal->j_list_lock);
-		goto write_out_data_locked;
 	}
 
 	/*
-	 * Wait for all previously submitted IO on the data list to complete.
+	 * Wait for all previously submitted IO to complete.
 	 */
-	jh = commit_transaction->t_sync_datalist;
-	if (jh == NULL)
-		goto sync_datalist_empty;
-
-	do {
+	while (commit_transaction->t_locked_list) {
 		struct buffer_head *bh;
-		jh = jh->b_tprev;	/* Wait on the last written */
+
+		jh = commit_transaction->t_locked_list->b_tprev;
 		bh = jh2bh(jh);
+		get_bh(bh);
 		if (buffer_locked(bh)) {
-			get_bh(bh);
 			spin_unlock(&journal->j_list_lock);
 			wait_on_buffer(bh);
 			if (unlikely(!buffer_uptodate(bh)))
 				err = -EIO;
+			spin_lock(&journal->j_list_lock);
+		}
+		if (!inverted_lock(journal, bh)) {
 			put_bh(bh);
-			/* the journal_head may have been removed now */
-			goto write_out_data;
-		} else if (buffer_dirty(bh)) {
-			goto write_out_data_locked;
+			spin_lock(&journal->j_list_lock);
+			continue;
 		}
-	} while (jh != commit_transaction->t_sync_datalist);
-	goto write_out_data_locked;
-
-sync_datalist_empty:
+		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
+			__journal_unfile_buffer(jh);
+			jh->b_transaction = NULL;
+			jbd_unlock_bh_state(bh);
+			journal_remove_journal_head(bh);
+			put_bh(bh);
+		} else {
+			jbd_unlock_bh_state(bh);
+		}
+		put_bh(bh);
+		if (need_resched()) {
+			spin_unlock(&journal->j_list_lock);
+			cond_resched();
+			spin_lock(&journal->j_list_lock);
+		}
+	}
 	spin_unlock(&journal->j_list_lock);
 
 	journal_write_revoke_records(journal, commit_transaction);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 73ef79d97fd0..a052407712a7 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1010,7 +1010,8 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 			 * the write() data.
 			 */
 			if (jh->b_jlist != BJ_None &&
-					jh->b_jlist != BJ_SyncData) {
+					jh->b_jlist != BJ_SyncData &&
+					jh->b_jlist != BJ_Locked) {
 				JBUFFER_TRACE(jh, "Not stealing");
 				goto no_journal;
 			}
@@ -1048,7 +1049,7 @@ int journal_dirty_data(handle_t *handle, struct buffer_head *bh)
 		 * committing transaction, so might still be left on that
 		 * transaction's metadata lists.
 		 */
-		if (jh->b_jlist != BJ_SyncData) {
+		if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
 			JBUFFER_TRACE(jh, "not on correct data list: unfile");
 			J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
 			__journal_unfile_buffer(jh);
@@ -1539,6 +1540,9 @@ void __journal_unfile_buffer(struct journal_head *jh)
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
+	case BJ_Locked:
+		list = &transaction->t_locked_list;
+		break;
 	}
 
 	__blist_del_buffer(list, jh);
@@ -1576,7 +1580,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
 
 	spin_lock(&journal->j_list_lock);
 	if (jh->b_transaction != 0 && jh->b_cp_transaction == 0) {
-		if (jh->b_jlist == BJ_SyncData) {
+		if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
 			/* A written-back ordered data buffer */
 			JBUFFER_TRACE(jh, "release data");
 			__journal_unfile_buffer(jh);
@@ -1985,6 +1989,9 @@ void __journal_file_buffer(struct journal_head *jh,
 	case BJ_Reserved:
 		list = &transaction->t_reserved_list;
 		break;
+	case BJ_Locked:
+		list =  &transaction->t_locked_list;
+		break;
 	}
 
 	__blist_add_buffer(list, jh);
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index eb154bafe1e4..241387b13764 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -486,6 +486,12 @@ struct transaction_s
 	 */
 	struct journal_head	*t_reserved_list;
 
+	/*
+	 * Doubly-linked circular list of all buffers under writeout during
+	 * commit [j_list_lock]
+	 */
+	struct journal_head	*t_locked_list;
+
 	/*
 	 * Doubly-linked circular list of all metadata buffers owned by this
 	 * transaction [j_list_lock]
@@ -1079,7 +1085,8 @@ static inline int jbd_space_needed(journal_t *journal)
 #define BJ_Shadow	5	/* Buffer contents being shadowed to the log */
 #define BJ_LogCtl	6	/* Buffer contains log descriptors */
 #define BJ_Reserved	7	/* Buffer is reserved for access by journal */
-#define BJ_Types	8
+#define BJ_Locked	8	/* Locked for I/O during commit */
+#define BJ_Types	9
  
 extern int jbd_blocks_per_page(struct inode *inode);
 
-- 
cgit v1.2.3


From b9e55f3d300af426885d7b0a13e45cd2841118a2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:58:40 -0700
Subject: [PATCH] slab: updates for per-arch alignments

From: Manfred Spraul <manfred@colorfullife.com>

Description:

Right now kmem_cache_create automatically decides about the alignment of
allocated objects. The automatic decisions are sometimes wrong:

- for some objects, it's better to keep them as small as possible to
  reduce the memory usage.  Ingo already added a parameter to
  kmem_cache_create for the sigqueue cache, but it wasn't implemented.

- for s390, normal kmalloc must be 8-byte aligned.  With debugging
  enabled, the default allocation was 4-bytes.  This means that s390 cannot
  enable slab debugging.

- arm26 needs 1 kB aligned objects.  Previously this was impossible to
  generate, therefore arm has its own allocator in
  arm26/machine/small_page.c

- most objects should be cache line aligned, to avoid false sharing.  But
  the cache line size was set at compile time, often to 128 bytes for
  generic kernels.  This wastes memory.  The new code uses the runtime
  determined cache line size instead.

- some caches want an explicit alignment.  One example is the pte_chain
  objects: they must find the start of the object with addr&mask.  Right
  now pte_chain objects are scaled to the cache line size, because that was
  the only alignment that could be generated reliably.

The implementation reuses the "offset" parameter of kmem_cache_create and
now uses it to pass in the requested alignment.  offset was ignored by the
current implementation, and the only user I found is sigqueue, which
intended to set the alignment.

In the long run, it might be interesting for the main tree: due to the 128
byte alignment, only 7 inodes fit into one page, with 64-byte alignment, 9
inodes - 20% memory recovered for Athlon systems.


For generic kernels running on P6 cpus (i.e. 32 byte cachelines), it means

Number of objects per page:

 ext2_inode_cache: 8 instead of 7
 ext3_inode_cache: 8 instead of 7
 fat_inode_cache: 9 instead of 7
 rpc_tasks: 24 instead of 15
 tcp_tw_bucket: 40 instead of 30
 arp_cache: 40 instead of 30
 nfs_write_data: 9 instead of 7
---
 arch/i386/mm/init.c          |   4 +-
 include/asm-i386/processor.h |   2 +
 kernel/fork.c                |   7 ++-
 mm/rmap.c                    |   2 +-
 mm/slab.c                    | 135 +++++++++++++++++++++++++------------------
 5 files changed, 89 insertions(+), 61 deletions(-)

(limited to 'include')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index a9923661f317..040862e6c6a0 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -530,18 +530,18 @@ void __init pgtable_cache_init(void)
 {
 	if (PTRS_PER_PMD > 1) {
 		pmd_cache = kmem_cache_create("pmd",
+					PTRS_PER_PMD*sizeof(pmd_t),
 					PTRS_PER_PMD*sizeof(pmd_t),
 					0,
-					SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 					pmd_ctor,
 					NULL);
 		if (!pmd_cache)
 			panic("pgtable_cache_init(): cannot create pmd cache");
 	}
 	pgd_cache = kmem_cache_create("pgd",
+				PTRS_PER_PGD*sizeof(pgd_t),
 				PTRS_PER_PGD*sizeof(pgd_t),
 				0,
-				SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
 				pgd_ctor,
 				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
 	if (!pgd_cache)
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index 3a5e0ff2a20c..0ebe1aa1afb0 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -403,6 +403,8 @@ struct tss_struct {
 	unsigned long stack[64];
 } __attribute__((packed));
 
+#define ARCH_MIN_TASKALIGN	16
+
 struct thread_struct {
 /* cached TLS descriptors. */
 	struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
diff --git a/kernel/fork.c b/kernel/fork.c
index d2dd97e866bb..315a06125e65 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -207,11 +207,14 @@ EXPORT_SYMBOL(autoremove_wake_function);
 void __init fork_init(unsigned long mempages)
 {
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN	0
+#endif
 	/* create a slab on which task_structs can be allocated */
 	task_struct_cachep =
 		kmem_cache_create("task_struct",
-				  sizeof(struct task_struct),0,
-				  SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+				  sizeof(struct task_struct),ARCH_MIN_TASKALIGN,
+				  0, NULL, NULL);
 	if (!task_struct_cachep)
 		panic("fork_init(): cannot create task_struct SLAB cache");
 #endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 7af41a9b9a4e..c1c7325996a3 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -522,9 +522,9 @@ struct pte_chain *pte_chain_alloc(int gfp_flags)
 void __init pte_chain_init(void)
 {
 	pte_chain_cache = kmem_cache_create(	"pte_chain",
+						sizeof(struct pte_chain),
 						sizeof(struct pte_chain),
 						0,
-						SLAB_MUST_HWCACHE_ALIGN,
 						pte_chain_ctor,
 						NULL);
 
diff --git a/mm/slab.c b/mm/slab.c
index d54728b6af32..b1c015cb0a02 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -121,6 +121,14 @@
 /* Shouldn't this be in a header file somewhere? */
 #define	BYTES_PER_WORD		sizeof(void *)
 
+#ifndef cache_line_size
+#define cache_line_size()	L1_CACHE_BYTES
+#endif
+
+#ifndef ARCH_KMALLOC_MINALIGN
+#define ARCH_KMALLOC_MINALIGN 0
+#endif
+
 /* Legal flag mask for kmem_cache_create(). */
 #if DEBUG
 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
@@ -268,6 +276,7 @@ struct kmem_cache_s {
 	unsigned int		colour_off;	/* colour offset */
 	unsigned int		colour_next;	/* cache colouring */
 	kmem_cache_t		*slabp_cache;
+	unsigned int		slab_size;
 	unsigned int		dflags;		/* dynamic flags */
 
 	/* constructor func */
@@ -490,8 +499,10 @@ static kmem_cache_t cache_cache = {
 	.objsize	= sizeof(kmem_cache_t),
 	.flags		= SLAB_NO_REAP,
 	.spinlock	= SPIN_LOCK_UNLOCKED,
-	.colour_off	= L1_CACHE_BYTES,
 	.name		= "kmem_cache",
+#if DEBUG
+	.reallen	= sizeof(kmem_cache_t),
+#endif
 };
 
 /* Guard access to the cache-chain. */
@@ -535,7 +546,7 @@ static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 }
 
 /* Cal the num objs, wastage, and bytes left over for a given slab size. */
-static void cache_estimate (unsigned long gfporder, size_t size,
+static void cache_estimate (unsigned long gfporder, size_t size, size_t align,
 		 int flags, size_t *left_over, unsigned int *num)
 {
 	int i;
@@ -548,7 +559,7 @@ static void cache_estimate (unsigned long gfporder, size_t size,
 		extra = sizeof(kmem_bufctl_t);
 	}
 	i = 0;
-	while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage)
+	while (i*size + ALIGN(base+i*extra, align) <= wastage)
 		i++;
 	if (i > 0)
 		i--;
@@ -558,7 +569,7 @@ static void cache_estimate (unsigned long gfporder, size_t size,
 
 	*num = i;
 	wastage -= i*size;
-	wastage -= L1_CACHE_ALIGN(base+i*extra);
+	wastage -= ALIGN(base+i*extra, align);
 	*left_over = wastage;
 }
 
@@ -705,16 +716,20 @@ void __init kmem_cache_init(void)
 	init_MUTEX(&cache_chain_sem);
 	INIT_LIST_HEAD(&cache_chain);
 	list_add(&cache_cache.next, &cache_chain);
+	cache_cache.colour_off = cache_line_size();
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
 
-	cache_estimate(0, cache_cache.objsize, 0,
-			&left_over, &cache_cache.num);
+	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
+
+	cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
+				&left_over, &cache_cache.num);
 	if (!cache_cache.num)
 		BUG();
 
 	cache_cache.colour = left_over/cache_cache.colour_off;
 	cache_cache.colour_next = 0;
-
+	cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
+				sizeof(struct slab), cache_line_size());
 
 	/* 2+3) create the kmalloc caches */
 	sizes = malloc_sizes;
@@ -728,7 +743,7 @@ void __init kmem_cache_init(void)
 		 * allow tighter packing of the smaller caches. */
 		sizes->cs_cachep = kmem_cache_create(
 			names->name, sizes->cs_size,
-			0, SLAB_HWCACHE_ALIGN, NULL, NULL);
+			ARCH_KMALLOC_MINALIGN, 0, NULL, NULL);
 		if (!sizes->cs_cachep)
 			BUG();
 
@@ -740,7 +755,7 @@ void __init kmem_cache_init(void)
 
 		sizes->cs_dmacachep = kmem_cache_create(
 			names->name_dma, sizes->cs_size,
-			0, SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL);
+			ARCH_KMALLOC_MINALIGN, SLAB_CACHE_DMA, NULL, NULL);
 		if (!sizes->cs_dmacachep)
 			BUG();
 
@@ -1056,7 +1071,7 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
  * kmem_cache_create - Create a cache.
  * @name: A string which is used in /proc/slabinfo to identify this cache.
  * @size: The size of objects to be created in this cache.
- * @offset: The offset to use within the page.
+ * @align: The required alignment for the objects.
  * @flags: SLAB flags
  * @ctor: A constructor for the objects.
  * @dtor: A destructor for the objects.
@@ -1081,16 +1096,15 @@ static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
  * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
  * memory pressure.
  *
- * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
- * cacheline.  This can be beneficial if you're counting cycles as closely
- * as davem.
+ * %SLAB_HWCACHE_ALIGN - This flag has no effect and will be removed soon.
+ *
  */
 kmem_cache_t *
-kmem_cache_create (const char *name, size_t size, size_t offset,
+kmem_cache_create (const char *name, size_t size, size_t align,
 	unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
 	void (*dtor)(void*, kmem_cache_t *, unsigned long))
 {
-	size_t left_over, align, slab_size;
+	size_t left_over, slab_size;
 	kmem_cache_t *cachep = NULL;
 
 	/*
@@ -1101,7 +1115,7 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 		(size < BYTES_PER_WORD) ||
 		(size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
 		(dtor && !ctor) ||
-		(offset < 0 || offset > size)) {
+		(align < 0)) {
 			printk(KERN_ERR "%s: Early error in slab %s\n",
 					__FUNCTION__, name);
 			BUG();
@@ -1118,22 +1132,16 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 
 #if FORCED_DEBUG
 	/*
-	 * Enable redzoning and last user accounting, except
-	 * - for caches with forced alignment: redzoning would violate the
-	 *   alignment
-	 * - for caches with large objects, if the increased size would
-	 *   increase the object size above the next power of two: caches
-	 *   with object sizes just above a power of two have a significant
-	 *   amount of internal fragmentation
+	 * Enable redzoning and last user accounting, except for caches with
+	 * large objects, if the increased size would increase the object size
+	 * above the next power of two: caches with object sizes just above a
+	 * power of two have a significant amount of internal fragmentation.
 	 */
-	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))
-			&& !(flags & SLAB_MUST_HWCACHE_ALIGN)) {
+	if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
 		flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
-	}
 	flags |= SLAB_POISON;
 #endif
 #endif
-
 	/*
 	 * Always checks flags, a caller might be expecting debug
 	 * support which isn't available.
@@ -1141,15 +1149,23 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 	if (flags & ~CREATE_MASK)
 		BUG();
 
+	if (align) {
+		/* minimum supported alignment: */
+		if (align < BYTES_PER_WORD)
+			align = BYTES_PER_WORD;
+
+		/* combinations of forced alignment and advanced debugging is
+		 * not yet implemented.
+		 */
+		flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
+	}
+
 	/* Get cache's description obj. */
 	cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
 	if (!cachep)
 		goto opps;
 	memset(cachep, 0, sizeof(kmem_cache_t));
 
-#if DEBUG
-	cachep->reallen = size;
-#endif
 	/* Check that size is in terms of words.  This is needed to avoid
 	 * unaligned accesses for some archs when redzoning is used, and makes
 	 * sure any on-slab bufctl's are also correctly aligned.
@@ -1160,30 +1176,31 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 	}
 	
 #if DEBUG
+	cachep->reallen = size;
+
 	if (flags & SLAB_RED_ZONE) {
-		/*
-		 * There is no point trying to honour cache alignment
-		 * when redzoning.
-		 */
-		flags &= ~SLAB_HWCACHE_ALIGN;
+		/* redzoning only works with word aligned caches */
+		align = BYTES_PER_WORD;
+
 		/* add space for red zone words */
 		cachep->dbghead += BYTES_PER_WORD;
 		size += 2*BYTES_PER_WORD;
 	}
 	if (flags & SLAB_STORE_USER) {
-		flags &= ~SLAB_HWCACHE_ALIGN;
-		size += BYTES_PER_WORD; /* add space */
+		/* user store requires word alignment and
+		 * one word storage behind the end of the real
+		 * object.
+		 */
+		align = BYTES_PER_WORD;
+		size += BYTES_PER_WORD;
 	}
 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
-	if (size > 128 && cachep->reallen > L1_CACHE_BYTES && size < PAGE_SIZE) {
+	if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
 		cachep->dbghead += PAGE_SIZE - size;
 		size = PAGE_SIZE;
 	}
 #endif
 #endif
-	align = BYTES_PER_WORD;
-	if (flags & SLAB_HWCACHE_ALIGN)
-		align = L1_CACHE_BYTES;
 
 	/* Determine if the slab management is 'on' or 'off' slab. */
 	if (size >= (PAGE_SIZE>>3))
@@ -1193,13 +1210,16 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 		 */
 		flags |= CFLGS_OFF_SLAB;
 
-	if (flags & SLAB_HWCACHE_ALIGN) {
-		/* Need to adjust size so that objs are cache aligned. */
-		/* Small obj size, can get at least two per cache line. */
+	if (!align) {
+		/* Default alignment: compile time specified l1 cache size.
+		 * Except if an object is really small, then squeeze multiple
+		 * into one cacheline.
+		 */
+		align = cache_line_size();
 		while (size <= align/2)
 			align /= 2;
-		size = (size+align-1)&(~(align-1));
 	}
+	size = ALIGN(size, align);
 
 	/* Cal size (in pages) of slabs, and the num of objs per slab.
 	 * This could be made much more intelligent.  For now, try to avoid
@@ -1209,7 +1229,7 @@ kmem_cache_create (const char *name, size_t size, size_t offset,
 	do {
 		unsigned int break_flag = 0;
 cal_wastage:
-		cache_estimate(cachep->gfporder, size, flags,
+		cache_estimate(cachep->gfporder, size, align, flags,
 						&left_over, &cachep->num);
 		if (break_flag)
 			break;
@@ -1243,7 +1263,8 @@ next:
 		cachep = NULL;
 		goto opps;
 	}
-	slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab));
+	slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
+				+ sizeof(struct slab), align);
 
 	/*
 	 * If the slab has been placed off-slab, and we have enough space then
@@ -1254,14 +1275,17 @@ next:
 		left_over -= slab_size;
 	}
 
-	/* Offset must be a multiple of the alignment. */
-	offset += (align-1);
-	offset &= ~(align-1);
-	if (!offset)
-		offset = L1_CACHE_BYTES;
-	cachep->colour_off = offset;
-	cachep->colour = left_over/offset;
+	if (flags & CFLGS_OFF_SLAB) {
+		/* really off slab. No need for manual alignment */
+		slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
+	}
 
+	cachep->colour_off = cache_line_size();
+	/* Offset must be a multiple of the alignment. */
+	if (cachep->colour_off < align)
+		cachep->colour_off = align;
+	cachep->colour = left_over/cachep->colour_off;
+	cachep->slab_size = slab_size;
 	cachep->flags = flags;
 	cachep->gfpflags = 0;
 	if (flags & SLAB_CACHE_DMA)
@@ -1543,8 +1567,7 @@ static inline struct slab* alloc_slabmgmt (kmem_cache_t *cachep,
 			return NULL;
 	} else {
 		slabp = objp+colour_off;
-		colour_off += L1_CACHE_ALIGN(cachep->num *
-				sizeof(kmem_bufctl_t) + sizeof(struct slab));
+		colour_off += cachep->slab_size;
 	}
 	slabp->inuse = 0;
 	slabp->colouroff = colour_off;
-- 
cgit v1.2.3


From cea39746deca7ce8b10f21e4a4b3e96c33381e2e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 22:59:45 -0700
Subject: [PATCH] add file_operations.fcntl

From: Chuck Lever <cel@citi.umich.edu>

O_DIRECT|O_APPEND cannot possibly work on NFS, so NFS needs some way of
preventing the user from setting this combination.  We felt that the best
way of implementing this restriction is to allow the filesystem to implement
its own fcntl() handler.

This patch does that, and provides the appropriate handler for NFS.

Additional details from Chuck:

Forgetting O_DIRECT for a moment, O_APPEND writes on NFS don't work in any
case when multiple clients are writing to a file, since an NFS client can
never guarantee it knows where the true end of file is 100% of the time.
It works as expected iff only one client writes to an O_APPEND file at a
time.

Multi-client O_APPEND writing doesn't seem to be a problem for any
application I'm aware of.  Since it can be made to behave in the
multi-client case with careful application logic or by using file locking,
I don't think we should disallow it.

I want to drop the inode semaphore when doing NFS direct I/O because it is
synchronous; holding the i_sem means we reduce direct I/O concurrency to
one I/O per file at a time.  The important thing sct was worried about was
the case where a single client is writing with O_APPEND and O_DIRECT, and
we don't hold the i_sem during the write.

We must at least hold the i_sem when determining where the end of file is
to do the O_APPEND write.  In 2.6, I believe that is handled correctly in
the VFS layer, so this is not an issue for 2.6, right?
---
 fs/fcntl.c         | 136 ++++++++++++++++++++++++++++-------------------------
 fs/nfs/file.c      |  28 +++++++++++
 include/linux/fs.h |   5 ++
 3 files changed, 105 insertions(+), 64 deletions(-)

(limited to 'include')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index 3486b799e9e4..abad0aa00d13 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -282,80 +282,88 @@ void f_delown(struct file *filp)
 
 EXPORT_SYMBOL(f_delown);
 
-static long do_fcntl(unsigned int fd, unsigned int cmd,
-		     unsigned long arg, struct file * filp)
+long generic_file_fcntl(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp)
 {
 	long err = -EINVAL;
 
 	switch (cmd) {
-		case F_DUPFD:
-			get_file(filp);
-			err = dupfd(filp, arg);
-			break;
-		case F_GETFD:
-			err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
-			break;
-		case F_SETFD:
-			err = 0;
-			set_close_on_exec(fd, arg & FD_CLOEXEC);
-			break;
-		case F_GETFL:
-			err = filp->f_flags;
-			break;
-		case F_SETFL:
-			err = setfl(fd, filp, arg);
-			break;
-		case F_GETLK:
-			err = fcntl_getlk(filp, (struct flock __user *) arg);
-			break;
-		case F_SETLK:
-		case F_SETLKW:
-			err = fcntl_setlk(filp, cmd, (struct flock __user *) arg);
-			break;
-		case F_GETOWN:
-			/*
-			 * XXX If f_owner is a process group, the
-			 * negative return value will get converted
-			 * into an error.  Oops.  If we keep the
-			 * current syscall conventions, the only way
-			 * to fix this will be in libc.
-			 */
-			err = filp->f_owner.pid;
-			force_successful_syscall_return();
-			break;
-		case F_SETOWN:
-			err = f_setown(filp, arg, 1);
-			break;
-		case F_GETSIG:
-			err = filp->f_owner.signum;
-			break;
-		case F_SETSIG:
-			/* arg == 0 restores default behaviour. */
-			if (arg < 0 || arg > _NSIG) {
-				break;
-			}
-			err = 0;
-			filp->f_owner.signum = arg;
-			break;
-		case F_GETLEASE:
-			err = fcntl_getlease(filp);
-			break;
-		case F_SETLEASE:
-			err = fcntl_setlease(fd, filp, arg);
-			break;
-		case F_NOTIFY:
-			err = fcntl_dirnotify(fd, filp, arg);
-			break;
-		default:
+	case F_DUPFD:
+		get_file(filp);
+		err = dupfd(filp, arg);
+		break;
+	case F_GETFD:
+		err = get_close_on_exec(fd) ? FD_CLOEXEC : 0;
+		break;
+	case F_SETFD:
+		err = 0;
+		set_close_on_exec(fd, arg & FD_CLOEXEC);
+		break;
+	case F_GETFL:
+		err = filp->f_flags;
+		break;
+	case F_SETFL:
+		err = setfl(fd, filp, arg);
+		break;
+	case F_GETLK:
+		err = fcntl_getlk(filp, (struct flock __user *) arg);
+		break;
+	case F_SETLK:
+	case F_SETLKW:
+		err = fcntl_setlk(filp, cmd, (struct flock __user *) arg);
+		break;
+	case F_GETOWN:
+		/*
+		 * XXX If f_owner is a process group, the
+		 * negative return value will get converted
+		 * into an error.  Oops.  If we keep the
+		 * current syscall conventions, the only way
+		 * to fix this will be in libc.
+		 */
+		err = filp->f_owner.pid;
+		force_successful_syscall_return();
+		break;
+	case F_SETOWN:
+		err = f_setown(filp, arg, 1);
+		break;
+	case F_GETSIG:
+		err = filp->f_owner.signum;
+		break;
+	case F_SETSIG:
+		/* arg == 0 restores default behaviour. */
+		if (arg < 0 || arg > _NSIG) {
 			break;
+		}
+		err = 0;
+		filp->f_owner.signum = arg;
+		break;
+	case F_GETLEASE:
+		err = fcntl_getlease(filp);
+		break;
+	case F_SETLEASE:
+		err = fcntl_setlease(fd, filp, arg);
+		break;
+	case F_NOTIFY:
+		err = fcntl_dirnotify(fd, filp, arg);
+		break;
+	default:
+		break;
 	}
-
 	return err;
 }
+EXPORT_SYMBOL(generic_file_fcntl);
 
-asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg)
+static long do_fcntl(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp)
+{
+	if (filp->f_op && filp->f_op->fcntl)
+		return filp->f_op->fcntl(fd, cmd, arg, filp);
+	return generic_file_fcntl(fd, cmd, arg, filp);
+}
+
+asmlinkage long sys_fcntl(int fd, unsigned int cmd, unsigned long arg)
 {	
-	struct file * filp;
+	struct file *filp;
 	long err = -EBADF;
 
 	filp = fget(fd);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index e1203ef2275e..df23d4de5b89 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -33,6 +33,8 @@
 
 #define NFSDBG_FACILITY		NFSDBG_FILE
 
+static long nfs_file_fcntl(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp);
 static int nfs_file_open(struct inode *, struct file *);
 static int nfs_file_release(struct inode *, struct file *);
 static int  nfs_file_mmap(struct file *, struct vm_area_struct *);
@@ -55,6 +57,7 @@ struct file_operations nfs_file_operations = {
 	.fsync		= nfs_fsync,
 	.lock		= nfs_lock,
 	.sendfile	= nfs_file_sendfile,
+	.fcntl		= nfs_file_fcntl,
 };
 
 struct inode_operations nfs_file_inode_operations = {
@@ -68,6 +71,28 @@ struct inode_operations nfs_file_inode_operations = {
 # define IS_SWAPFILE(inode)	(0)
 #endif
 
+#define nfs_invalid_flags	(O_APPEND | O_DIRECT)
+
+/*
+ * Check for special cases that NFS doesn't support, and
+ * pass the rest to the generic fcntl function.
+ */
+static long
+nfs_file_fcntl(int fd, unsigned int cmd,
+		unsigned long arg, struct file *filp)
+{
+	switch (cmd) {
+	case F_SETFL:
+		if ((filp->f_flags & nfs_invalid_flags) == nfs_invalid_flags)
+			return -EINVAL;
+		break;
+	default:
+		break;
+	}
+
+	return generic_file_fcntl(fd, cmd, arg, filp);
+}
+
 /*
  * Open file
  */
@@ -78,6 +103,9 @@ nfs_file_open(struct inode *inode, struct file *filp)
 	int (*open)(struct inode *, struct file *);
 	int res = 0;
 
+	if ((filp->f_flags & nfs_invalid_flags) == nfs_invalid_flags)
+		return -EINVAL;
+
 	lock_kernel();
 	/* Do NFSv4 open() call */
 	if ((open = server->rpc_ops->file_open) != NULL)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3e71560374c0..02976f7c9f47 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -621,6 +621,9 @@ extern struct list_head file_lock_list;
 
 #include <linux/fcntl.h>
 
+extern long generic_file_fcntl(int fd, unsigned int cmd,
+				unsigned long arg, struct file *filp);
+
 extern int fcntl_getlk(struct file *, struct flock __user *);
 extern int fcntl_setlk(struct file *, unsigned int, struct flock __user *);
 
@@ -830,6 +833,8 @@ struct file_operations {
 	ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t, void __user *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+	long (*fcntl)(int fd, unsigned int cmd,
+			unsigned long arg, struct file *filp);
 };
 
 struct inode_operations {
-- 
cgit v1.2.3


From 2b5f94088fd373eba0dba4077077965440c4457f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:00:48 -0700
Subject: [PATCH] document unused pte bits on i386

From: Rusty Russell <rusty@rustcorp.com.au>

From:  Ed L Cashin <ecashin@uga.edu>

This small patch documents that bits 9, 10, and 11 are unused by the Linux
kernel.  The IA-32 Intel Architecture Software Developer's Manual says that
these bits are available for programmer use.
---
 include/asm-i386/pgtable.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 8fd1e78fc14c..cc7e18b5e92c 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -107,6 +107,9 @@ void paging_init(void);
 #define _PAGE_BIT_DIRTY		6
 #define _PAGE_BIT_PSE		7	/* 4 MB (or 2MB) page, Pentium+, if present.. */
 #define _PAGE_BIT_GLOBAL	8	/* Global TLB entry PPro+ */
+#define _PAGE_BIT_UNUSED1	9	/* available for programmer */
+#define _PAGE_BIT_UNUSED2	10
+#define _PAGE_BIT_UNUSED3	11
 
 #define _PAGE_PRESENT	0x001
 #define _PAGE_RW	0x002
@@ -117,6 +120,9 @@ void paging_init(void);
 #define _PAGE_DIRTY	0x040
 #define _PAGE_PSE	0x080	/* 4 MB (or 2MB) page, Pentium+, if present.. */
 #define _PAGE_GLOBAL	0x100	/* Global TLB entry PPro+ */
+#define _PAGE_UNUSED1	0x200	/* available for programmer */
+#define _PAGE_UNUSED2	0x400
+#define _PAGE_UNUSED3	0x800
 
 #define _PAGE_FILE	0x040	/* set:pagecache unset:swap */
 #define _PAGE_PROTNONE	0x080	/* If not present */
-- 
cgit v1.2.3


From 3a2d85eabe55ed976c74a4fc2c7dd079a0ba8bcc Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:02:53 -0700
Subject: [PATCH] Kill duplicate #include <linux/ioport.h>

From: Rusty Russell <rusty@rustcorp.com.au>

include/linux/device.h includes include/linux/ioport.h twice.
---
 include/linux/device.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/device.h b/include/linux/device.h
index 28e101a77070..9bc07b556eea 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -17,7 +17,6 @@
 #include <linux/list.h>
 #include <linux/spinlock.h>
 #include <linux/types.h>
-#include <linux/ioport.h>
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <asm/semaphore.h>
-- 
cgit v1.2.3


From 47b54fbff358a1d5ee4738cec8a53a08bead72e4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:04:22 -0700
Subject: [PATCH] /dev/urandom scalability improvement

From: David Mosberger <davidm@napali.hpl.hp.com>

Somebody recently pointed out a performance-anomaly to me where an unusual
amount of time was being spent reading from /dev/urandom.  The problem
isn't really surprising as it happened only on >= 4-way machines and the
random driver isn't terribly scalable the way it is written today.  If
scalability _really_ mattered, I suppose per-CPU data structures would be
the way to go.  However, I found that at least for 4-way machines,
performance can be improved considerably with the attached patch.  In
particular, I saw the following performance on a 4-way ia64 machine:

Test: 3 tasks running "dd if=/dev/urandom of=/dev/null bs=1024":

			throughput:
---
 drivers/char/random.c    | 51 +++++++++++++++++++++++++++++++++---------------
 include/linux/prefetch.h | 12 ++++++++++++
 2 files changed, 47 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 117f195029a1..6941fdeb6a4b 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -490,12 +490,15 @@ static inline __u32 int_ln_12bits(__u32 word)
  **********************************************************************/
 
 struct entropy_store {
+	/* mostly-read data: */
+	struct poolinfo poolinfo;
+	__u32		*pool;
+
+	/* read-write data: */
+	spinlock_t lock ____cacheline_aligned_in_smp;
 	unsigned	add_ptr;
 	int		entropy_count;
 	int		input_rotate;
-	struct poolinfo poolinfo;
-	__u32		*pool;
-	spinlock_t lock;
 };
 
 /*
@@ -571,38 +574,54 @@ static void add_entropy_words(struct entropy_store *r, const __u32 *in,
 	static __u32 const twist_table[8] = {
 		         0, 0x3b6e20c8, 0x76dc4190, 0x4db26158,
 		0xedb88320, 0xd6d6a3e8, 0x9b64c2b0, 0xa00ae278 };
-	unsigned i;
-	int new_rotate;
+	unsigned long i, add_ptr, tap1, tap2, tap3, tap4, tap5;
+	int new_rotate, input_rotate;
 	int wordmask = r->poolinfo.poolwords - 1;
-	__u32 w;
+	__u32 w, next_w;
 	unsigned long flags;
 
+	/* Taps are constant, so we can load them without holding r->lock.  */
+	tap1 = r->poolinfo.tap1;
+	tap2 = r->poolinfo.tap2;
+	tap3 = r->poolinfo.tap3;
+	tap4 = r->poolinfo.tap4;
+	tap5 = r->poolinfo.tap5;
+	next_w = *in++;
+
 	spin_lock_irqsave(&r->lock, flags);
+	prefetch_range(r->pool, wordmask);
+	input_rotate = r->input_rotate;
+	add_ptr = r->add_ptr;
 
 	while (nwords--) {
-		w = rotate_left(r->input_rotate, *in++);
-		i = r->add_ptr = (r->add_ptr - 1) & wordmask;
+		w = rotate_left(input_rotate, next_w);
+		if (nwords > 0)
+			next_w = *in++;
+		i = add_ptr = (add_ptr - 1) & wordmask;
 		/*
 		 * Normally, we add 7 bits of rotation to the pool.
 		 * At the beginning of the pool, add an extra 7 bits
 		 * rotation, so that successive passes spread the
 		 * input bits across the pool evenly.
 		 */
-		new_rotate = r->input_rotate + 14;
+		new_rotate = input_rotate + 14;
 		if (i)
-			new_rotate = r->input_rotate + 7;
-		r->input_rotate = new_rotate & 31;
+			new_rotate = input_rotate + 7;
+		input_rotate = new_rotate & 31;
 
 		/* XOR in the various taps */
-		w ^= r->pool[(i + r->poolinfo.tap1) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap2) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap3) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap4) & wordmask];
-		w ^= r->pool[(i + r->poolinfo.tap5) & wordmask];
+		w ^= r->pool[(i + tap1) & wordmask];
+		w ^= r->pool[(i + tap2) & wordmask];
+		w ^= r->pool[(i + tap3) & wordmask];
+		w ^= r->pool[(i + tap4) & wordmask];
+		w ^= r->pool[(i + tap5) & wordmask];
 		w ^= r->pool[i];
 		r->pool[i] = (w >> 3) ^ twist_table[w & 7];
 	}
 
+	r->input_rotate = input_rotate;
+	r->add_ptr = add_ptr;
+
 	spin_unlock_irqrestore(&r->lock, flags);
 }
 
diff --git a/include/linux/prefetch.h b/include/linux/prefetch.h
index 73c4a344156b..fc86f274147f 100644
--- a/include/linux/prefetch.h
+++ b/include/linux/prefetch.h
@@ -10,6 +10,7 @@
 #ifndef _LINUX_PREFETCH_H
 #define _LINUX_PREFETCH_H
 
+#include <linux/types.h>
 #include <asm/processor.h>
 #include <asm/cache.h>
 
@@ -54,4 +55,15 @@ static inline void prefetchw(const void *x) {;}
 #define PREFETCH_STRIDE (4*L1_CACHE_BYTES)
 #endif
 
+static inline void prefetch_range(void *addr, size_t len)
+{
+#ifdef ARCH_HAS_PREFETCH
+	char *cp;
+	char *end = addr + len;
+
+	for (cp = addr; cp < end; cp += PREFETCH_STRIDE)
+		prefetch(cp);
+#endif
+}
+
 #endif
-- 
cgit v1.2.3


From 7ee168c0b7a988210cc8024d105dfd1cb3e956e6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:05:02 -0700
Subject: [PATCH] Move __this_module to modpost

From: Brian Gerst <bgerst@didntduck.org>

Move the __this_module structure to the modpost code where it really
belongs.
---
 include/linux/module.h   | 16 +---------------
 scripts/Makefile.modpost |  2 ++
 scripts/modpost.c        | 12 ++++++++++++
 3 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'include')

diff --git a/include/linux/module.h b/include/linux/module.h
index 4c9b53d5d51c..0a86652fb1cb 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -70,6 +70,7 @@ static const char __module_cat(name,__LINE__)[]				  \
 extern const struct gtype##_id __mod_##gtype##_table		\
   __attribute__ ((unused, alias(__stringify(name))))
 
+extern struct module __this_module;
 #define THIS_MODULE (&__this_module)
 
 #else  /* !MODULE */
@@ -481,21 +482,6 @@ static inline int unregister_module_notifier(struct notifier_block * nb)
 
 #endif /* CONFIG_MODULES */
 
-#ifdef MODULE
-extern struct module __this_module;
-#ifdef KBUILD_MODNAME
-/* We make the linker do some of the work. */
-struct module __this_module
-__attribute__((section(".gnu.linkonce.this_module"))) = {
-	.name = __stringify(KBUILD_MODNAME),
-	.init = init_module,
-#ifdef CONFIG_MODULE_UNLOAD
-	.exit = cleanup_module,
-#endif
-};
-#endif /* KBUILD_MODNAME */
-#endif /* MODULE */
-
 #define symbol_request(x) try_then_request_module(symbol_get(x), "symbol:" #x)
 
 /* BELOW HERE ALL THESE ARE OBSOLETE AND WILL VANISH */
diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost
index cd716ece8976..d349dda5edf7 100644
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -35,6 +35,8 @@ targets += $(modules)
 
 # Compile version info for unresolved symbols
 
+modname = $(*F)
+
 quiet_cmd_cc_o_c = CC      $@
       cmd_cc_o_c = $(CC) $(c_flags) $(CFLAGS_MODULE)	\
 		   -c -o $@ $<
diff --git a/scripts/modpost.c b/scripts/modpost.c
index a0976fcb9982..69168e18d5bf 100644
--- a/scripts/modpost.c
+++ b/scripts/modpost.c
@@ -343,6 +343,9 @@ handle_modversions(struct module *mod, struct elf_info *info,
 		/* ignore global offset table */
 		if (strcmp(symname, "_GLOBAL_OFFSET_TABLE_") == 0)
 			break;
+		/* ignore __this_module, it will be resolved shortly */
+		if (strcmp(symname, MODULE_SYMBOL_PREFIX "__this_module") == 0)
+			break;
 #ifdef STT_REGISTER
 		if (info->hdr->e_machine == EM_SPARC ||
 		    info->hdr->e_machine == EM_SPARCV9) {
@@ -468,6 +471,15 @@ add_header(struct buffer *b)
 	buf_printf(b, "#include <linux/compiler.h>\n");
 	buf_printf(b, "\n");
 	buf_printf(b, "MODULE_INFO(vermagic, VERMAGIC_STRING);\n");
+	buf_printf(b, "\n");
+	buf_printf(b, "struct module __this_module\n");
+	buf_printf(b, "__attribute__((section(\".gnu.linkonce.this_module\"))) = {\n");
+	buf_printf(b, " .name = __stringify(KBUILD_MODNAME),\n");
+	buf_printf(b, " .init = init_module,\n");
+	buf_printf(b, "#ifdef CONFIG_MODULE_UNLOAD\n");
+	buf_printf(b, " .exit = cleanup_module,\n");
+	buf_printf(b, "#endif\n");
+	buf_printf(b, "};\n");
 }
 
 /* Record CRCs for unresolved symbols */
-- 
cgit v1.2.3


From 387f7c83eb26b4f45e6d843f2ef703aafbe6c80f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:05:28 -0700
Subject: [PATCH] Support for floppies whose sectors are numbered from zero
 instead of one

From: Marcelo Tosatti <marcelo.tosatti@cyclades.com>

From: Alain Knaff <alain.knaff@lll.lu>

This patch adds support for floppy disks whose sectors are numbered
starting at 0 rather than 1 as usual disks would be.  This format is used
for some CP/M disks, and also for certain music samplers (such as the
Ensoniq EPS 16plus).

In order to use it, you need an fdutils with the current patch from
http://fdutils.linux.lu as well, and then do setfdprm /dev/fd0 dd zerobased
sect=10 or setfdprm /dev/fd0 hd zerobased sect.

In addition, the patch also fixes my email addresses.  I no longer use
pobox.com.
---
 drivers/block/floppy.c | 19 +++++++++++++++----
 include/linux/fd.h     |  1 +
 2 files changed, 16 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 8e05b64feeb2..494acc9fcfcc 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2258,6 +2258,10 @@ static void setup_format_params(int track)
 			}
 		}
 	}
+	if(_floppy->stretch & FD_ZEROBASED) {
+	    for(count = 0; count < F_SECT_PER_TRACK; count++)
+		here[count].sect--;
+	}
 }
 
 static void redo_format(void)
@@ -2679,7 +2683,8 @@ static int make_raw_rw_request(void)
 	}
 	HEAD = fsector_t / _floppy->sect;
 
-	if (((_floppy->stretch & FD_SWAPSIDES) || TESTF(FD_NEED_TWADDLE)) &&
+	if (((_floppy->stretch & (FD_SWAPSIDES | FD_ZEROBASED)) ||
+	     TESTF(FD_NEED_TWADDLE)) &&
 	    fsector_t < _floppy->sect)
 		max_sector = _floppy->sect;
 
@@ -2709,7 +2714,8 @@ static int make_raw_rw_request(void)
 	GAP = _floppy->gap;
 	CODE2SIZE;
 	SECT_PER_TRACK = _floppy->sect << 2 >> SIZECODE;
-	SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) + 1;
+	SECTOR = ((fsector_t % _floppy->sect) << 2 >> SIZECODE) +
+	    ((_floppy->stretch & FD_ZEROBASED) ? 0 : 1);
 
 	/* tracksize describes the size which can be filled up with sectors
 	 * of size ssize.
@@ -3346,7 +3352,7 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 	    g->track <= 0 ||
 	    g->track > UDP->tracks>>STRETCH(g) ||
 	    /* check if reserved bits are set */
-	    (g->stretch&~(FD_STRETCH|FD_SWAPSIDES)) != 0)
+	    (g->stretch&~(FD_STRETCH|FD_SWAPSIDES|FD_ZEROBASED)) != 0)
 		return -EINVAL;
 	if (type){
 		if (!capable(CAP_SYS_ADMIN))
@@ -3367,11 +3373,13 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 		}
 		up(&open_lock);
 	} else {
+		int oldStretch;
 		LOCK_FDC(drive,1);
 		if (cmd != FDDEFPRM)
 			/* notice a disk change immediately, else
 			 * we lose our settings immediately*/
 			CALL(poll_drive(1, FD_RAW_NEED_DISK));
+		oldStretch = g->stretch;
 		user_params[drive] = *g;
 		if (buffer_drive == drive)
 			SUPBOUND(buffer_max, user_params[drive].sect);
@@ -3386,7 +3394,10 @@ static inline int set_geometry(unsigned int cmd, struct floppy_struct *g,
 		 * whose number will change. This is useful, because
 		 * mtools often changes the geometry of the disk after
 		 * looking at the boot block */
-		if (DRS->maxblock > user_params[drive].sect || DRS->maxtrack)
+		if (DRS->maxblock > user_params[drive].sect ||
+		    DRS->maxtrack ||
+		    ((user_params[drive].sect ^ oldStretch) &
+		     (FD_SWAPSIDES | FD_ZEROBASED)))
 			invalidate_drive(bdev);
 		else
 			process_fd_request();
diff --git a/include/linux/fd.h b/include/linux/fd.h
index 187785b83958..cbbd0f876585 100644
--- a/include/linux/fd.h
+++ b/include/linux/fd.h
@@ -17,6 +17,7 @@ struct floppy_struct {
 			stretch;	/* !=0 means double track steps */
 #define FD_STRETCH 1
 #define FD_SWAPSIDES 2
+#define FD_ZEROBASED 4
 
 	unsigned char	gap,		/* gap1 size */
 
-- 
cgit v1.2.3


From 15e98d5d7a15045e2c0d3dc0f27b9100f50b5fe5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:06:21 -0700
Subject: [PATCH] summit: per-subarch NR_IRQ_VECTORS

From: James Cleverdon <jamesclv@us.ibm.com>

Break out the definition of NR_IRQ_VECTORS, etc from irq_vectors.h into
irq_vectors_limits.h, so we can change it per subarch without having code
duplication for the rest of the file.  Stick the same values back for
mach-default, and override them for mach-summit/generic which needs bigger
limits.
---
 include/asm-i386/mach-default/irq_vectors.h        | 17 +----------------
 include/asm-i386/mach-default/irq_vectors_limits.h | 21 +++++++++++++++++++++
 include/asm-i386/mach-generic/irq_vectors_limits.h | 14 ++++++++++++++
 include/asm-i386/mach-summit/irq_vectors_limits.h  | 14 ++++++++++++++
 4 files changed, 50 insertions(+), 16 deletions(-)
 create mode 100644 include/asm-i386/mach-default/irq_vectors_limits.h
 create mode 100644 include/asm-i386/mach-generic/irq_vectors_limits.h
 create mode 100644 include/asm-i386/mach-summit/irq_vectors_limits.h

(limited to 'include')

diff --git a/include/asm-i386/mach-default/irq_vectors.h b/include/asm-i386/mach-default/irq_vectors.h
index 8381e7e6197b..881c63ca61ad 100644
--- a/include/asm-i386/mach-default/irq_vectors.h
+++ b/include/asm-i386/mach-default/irq_vectors.h
@@ -84,22 +84,7 @@
  */
 #define NR_VECTORS 256
 
-#ifdef CONFIG_PCI_USE_VECTOR
-#define NR_IRQS FIRST_SYSTEM_VECTOR
-#define NR_IRQ_VECTORS NR_IRQS
-#else
-#ifdef CONFIG_X86_IO_APIC
-#define NR_IRQS 224
-# if (224 >= 32 * NR_CPUS)
-# define NR_IRQ_VECTORS NR_IRQS
-# else
-# define NR_IRQ_VECTORS (32 * NR_CPUS)
-# endif
-#else
-#define NR_IRQS 16
-#define NR_IRQ_VECTORS NR_IRQS
-#endif
-#endif
+#include "irq_vectors_limits.h"
 
 #define FPU_IRQ			13
 
diff --git a/include/asm-i386/mach-default/irq_vectors_limits.h b/include/asm-i386/mach-default/irq_vectors_limits.h
new file mode 100644
index 000000000000..78c30cbcede4
--- /dev/null
+++ b/include/asm-i386/mach-default/irq_vectors_limits.h
@@ -0,0 +1,21 @@
+#ifndef _ASM_IRQ_VECTORS_LIMITS_H
+#define _ASM_IRQ_VECTORS_LIMITS_H
+
+#ifdef CONFIG_PCI_USE_VECTOR
+#define NR_IRQS FIRST_SYSTEM_VECTOR
+#define NR_IRQ_VECTORS NR_IRQS
+#else
+#ifdef CONFIG_X86_IO_APIC
+#define NR_IRQS 224
+# if (224 >= 32 * NR_CPUS)
+# define NR_IRQ_VECTORS NR_IRQS
+# else
+# define NR_IRQ_VECTORS (32 * NR_CPUS)
+# endif
+#else
+#define NR_IRQS 16
+#define NR_IRQ_VECTORS NR_IRQS
+#endif
+#endif
+
+#endif /* _ASM_IRQ_VECTORS_LIMITS_H */
diff --git a/include/asm-i386/mach-generic/irq_vectors_limits.h b/include/asm-i386/mach-generic/irq_vectors_limits.h
new file mode 100644
index 000000000000..890ce3f5e09a
--- /dev/null
+++ b/include/asm-i386/mach-generic/irq_vectors_limits.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_IRQ_VECTORS_LIMITS_H
+#define _ASM_IRQ_VECTORS_LIMITS_H
+
+/*
+ * For Summit or generic (i.e. installer) kernels, we have lots of I/O APICs,
+ * even with uni-proc kernels, so use a big array.
+ *
+ * This value should be the same in both the generic and summit subarches.
+ * Change one, change 'em both.
+ */
+#define NR_IRQS	224
+#define NR_IRQ_VECTORS	1024
+
+#endif /* _ASM_IRQ_VECTORS_LIMITS_H */
diff --git a/include/asm-i386/mach-summit/irq_vectors_limits.h b/include/asm-i386/mach-summit/irq_vectors_limits.h
new file mode 100644
index 000000000000..890ce3f5e09a
--- /dev/null
+++ b/include/asm-i386/mach-summit/irq_vectors_limits.h
@@ -0,0 +1,14 @@
+#ifndef _ASM_IRQ_VECTORS_LIMITS_H
+#define _ASM_IRQ_VECTORS_LIMITS_H
+
+/*
+ * For Summit or generic (i.e. installer) kernels, we have lots of I/O APICs,
+ * even with uni-proc kernels, so use a big array.
+ *
+ * This value should be the same in both the generic and summit subarches.
+ * Change one, change 'em both.
+ */
+#define NR_IRQS	224
+#define NR_IRQ_VECTORS	1024
+
+#endif /* _ASM_IRQ_VECTORS_LIMITS_H */
-- 
cgit v1.2.3


From 27b5c750bb8e5f2e01e1e7a605da7ae3383305a0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:06:32 -0700
Subject: [PATCH] summit: increase MAX_MP_BUSSES

From: James Cleverdon <jamesclv@us.ibm.com>

Bump up MAX_MP_BUSSES for summit/generic subarch to cope with big IBM x440
systems.
---
 include/asm-i386/mach-generic/mach_mpspec.h | 4 +++-
 include/asm-i386/mach-summit/mach_mpspec.h  | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-i386/mach-generic/mach_mpspec.h b/include/asm-i386/mach-generic/mach_mpspec.h
index ef10cd205575..fbb6a40ffd91 100644
--- a/include/asm-i386/mach-generic/mach_mpspec.h
+++ b/include/asm-i386/mach-generic/mach_mpspec.h
@@ -8,6 +8,8 @@
 
 #define MAX_IRQ_SOURCES 256
 
-#define MAX_MP_BUSSES 32
+/* Summit or generic (i.e. installer) kernels need lots of bus entries. */
+/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
+#define MAX_MP_BUSSES 260
 
 #endif /* __ASM_MACH_MPSPEC_H */
diff --git a/include/asm-i386/mach-summit/mach_mpspec.h b/include/asm-i386/mach-summit/mach_mpspec.h
index ef10cd205575..bc8f7177dc10 100644
--- a/include/asm-i386/mach-summit/mach_mpspec.h
+++ b/include/asm-i386/mach-summit/mach_mpspec.h
@@ -8,6 +8,7 @@
 
 #define MAX_IRQ_SOURCES 256
 
-#define MAX_MP_BUSSES 32
+/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
+#define MAX_MP_BUSSES 260
 
 #endif /* __ASM_MACH_MPSPEC_H */
-- 
cgit v1.2.3


From 9938e2c218f2c86f344bf8e9c707666a4cf0736b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:06:47 -0700
Subject: [PATCH] ia64 MSI support

From: "Nguyen, Tom L" <tom.l.nguyen@intel.com>

Adds MSI support for ia64.

- Modified existing code in drivers/pci/msi.c and drivers/pci/msi.h to
  include MSI support on IA64 platform.

- Based on the comments received from Zwane Mwaikambo and David Mosberger,
  this patch consolidates the vector allocators as
  assign_irq_vector(AUTO_ASSIGN) has the same semantics as
  ia64_alloc_vector() by converting the existing uses of ia64_alloc_vector()
  to assign_irq_vector(AUTO_ASSIGN).

- Based on the comments received from Zwane Mwaikambo, this patch
  consolidates the semantics of vector allocator assign_irq_vector() in
  drivers/pci/msi.c into the relevant architecture's vector allocator
  assign_irq_vector() in arch/i386/kernel/io_apic.c.

- Regarding vector allocation, this patch modifies the existing function
  assign_irq_vector() to maximize the number of allocated vectors to 188
  before going -ENOSPC.

- Based on your comments, this patch creates <asm-i386/msi.h>,
  <asm-ia64/msi.h> and <asm-x86_64/msi.h>, includes <asm/msi.h> from within
  drivers/pci/msi.h and then places all the code which is currently under
  ifdef in msi.h into the relevant architecture's <asm/msi.h> file.

- Based on your comments, this patch places pci_vector_resources() in
  existing drivers/pci/msi.c in the relevant architecture implementations
  such as into arch/.../pci/irq.c.
---
 arch/i386/kernel/io_apic.c   | 19 ++++++---
 arch/i386/pci/irq.c          | 31 +++++++++++++++
 arch/ia64/Kconfig            | 10 +++++
 arch/ia64/hp/sim/simeth.c    |  2 +-
 arch/ia64/hp/sim/simserial.c |  2 +-
 arch/ia64/kernel/iosapic.c   |  8 ++--
 arch/ia64/kernel/irq_ia64.c  |  4 +-
 arch/ia64/pci/pci.c          | 10 +++++
 drivers/pci/msi.c            | 94 ++++++++------------------------------------
 drivers/pci/msi.h            | 25 +++---------
 include/asm-i386/hw_irq.h    |  1 +
 include/asm-i386/msi.h       | 22 +++++++++++
 include/asm-ia64/hw_irq.h    |  4 +-
 include/asm-ia64/irq.h       |  1 +
 include/asm-ia64/msi.h       | 20 ++++++++++
 include/asm-x86_64/msi.h     | 21 ++++++++++
 16 files changed, 162 insertions(+), 112 deletions(-)
 create mode 100644 include/asm-i386/msi.h
 create mode 100644 include/asm-ia64/msi.h
 create mode 100644 include/asm-x86_64/msi.h

(limited to 'include')

diff --git a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
index 66a941542c81..6d8020eb4417 100644
--- a/arch/i386/kernel/io_apic.c
+++ b/arch/i386/kernel/io_apic.c
@@ -76,8 +76,8 @@ static struct irq_pin_list {
 	int apic, pin, next;
 } irq_2_pin[PIN_MAP_SIZE];
 
+int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
 #ifdef CONFIG_PCI_USE_VECTOR
-int vector_irq[NR_IRQS] = { [0 ... NR_IRQS -1] = -1};
 #define vector_to_irq(vector) 	\
 	(platform_legacy_irq(vector) ? vector : vector_irq[vector])
 #else
@@ -1149,12 +1149,16 @@ static inline int IO_APIC_irq_trigger(int irq)
 /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
 u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 
-#ifndef CONFIG_PCI_USE_VECTOR
+#ifdef CONFIG_PCI_USE_VECTOR
+int assign_irq_vector(int irq)
+#else
 int __init assign_irq_vector(int irq)
+#endif
 {
 	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
+
 	BUG_ON(irq >= NR_IRQ_VECTORS);
-	if (IO_APIC_VECTOR(irq) > 0)
+	if (irq != AUTO_ASSIGN && IO_APIC_VECTOR(irq) > 0)
 		return IO_APIC_VECTOR(irq);
 next:
 	current_vector += 8;
@@ -1162,15 +1166,18 @@ next:
 		goto next;
 
 	if (current_vector >= FIRST_SYSTEM_VECTOR) {
-		offset = (offset + 1) & 7;
+		offset++;
+		if (!(offset%8))
+			return -ENOSPC;
 		current_vector = FIRST_DEVICE_VECTOR + offset;
 	}
 
-	IO_APIC_VECTOR(irq) = current_vector;
+	vector_irq[current_vector] = irq;
+	if (irq != AUTO_ASSIGN)
+		IO_APIC_VECTOR(irq) = current_vector;
 
 	return current_vector;
 }
-#endif
 
 static struct hw_interrupt_type ioapic_level_type;
 static struct hw_interrupt_type ioapic_edge_type;
diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c
index 55c8ee19df4f..752e1b3fe175 100644
--- a/arch/i386/pci/irq.c
+++ b/arch/i386/pci/irq.c
@@ -15,6 +15,7 @@
 #include <asm/io.h>
 #include <asm/smp.h>
 #include <asm/io_apic.h>
+#include <asm/hw_irq.h>
 
 #include "pci.h"
 
@@ -1005,3 +1006,33 @@ int pirq_enable_irq(struct pci_dev *dev)
 		pci_write_config_byte(dev, PCI_INTERRUPT_LINE, dev->irq);
 	return 0;
 }
+
+int pci_vector_resources(int last, int nr_released)
+{
+	int count = nr_released;
+
+	int next = last;
+	int offset = (last % 8);
+
+	while (next < FIRST_SYSTEM_VECTOR) {
+		next += 8;
+#ifdef CONFIG_X86_64
+		if (next == IA32_SYSCALL_VECTOR)
+			continue;
+#else
+		if (next == SYSCALL_VECTOR)
+			continue;
+#endif
+		count++;
+		if (next >= FIRST_SYSTEM_VECTOR) {
+			if (offset%8) {
+				next = FIRST_DEVICE_VECTOR + offset;
+				offset++;
+				continue;
+			}
+			count--;
+		}
+	}
+
+	return count;
+}
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index efe518da2590..34a319de7ca8 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -361,6 +361,16 @@ config PCI
 	  information about which PCI hardware does work under Linux and which
 	  doesn't.
 
+config PCI_USE_VECTOR
+	bool
+	default y if IA64
+	help
+	   This enables MSI, Message Signaled Interrupt, on specific
+	   MSI capable device functions detected upon requests from the
+	   device drivers. Message Signal Interrupt enables an MSI-capable
+	   hardware device to send an inbound Memory Write on its PCI bus
+	   instead of asserting IRQ signal on device IRQ pin.
+
 config PCI_DOMAINS
 	bool
 	default PCI
diff --git a/arch/ia64/hp/sim/simeth.c b/arch/ia64/hp/sim/simeth.c
index f51225465fee..93f4d190c4b6 100644
--- a/arch/ia64/hp/sim/simeth.c
+++ b/arch/ia64/hp/sim/simeth.c
@@ -228,7 +228,7 @@ simeth_probe1(void)
 		return err;
 	}
 
-	dev->irq = ia64_alloc_vector();
+	dev->irq = assign_irq_vector(AUTO_ASSIGN);
 
 	/*
 	 * attach the interrupt in the simulator, this does enable interrupts
diff --git a/arch/ia64/hp/sim/simserial.c b/arch/ia64/hp/sim/simserial.c
index 2a16d807514f..381277884e6c 100644
--- a/arch/ia64/hp/sim/simserial.c
+++ b/arch/ia64/hp/sim/simserial.c
@@ -1051,7 +1051,7 @@ simrs_init (void)
 		if (state->type == PORT_UNKNOWN) continue;
 
 		if (!state->irq) {
-			state->irq = ia64_alloc_vector();
+			state->irq = assign_irq_vector(AUTO_ASSIGN);
 			ia64_ssc_connect_irq(KEYBOARD_INTR, state->irq);
 		}
 
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 3fa76bf1f83f..8418d76158eb 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -435,7 +435,7 @@ iosapic_reassign_vector (int vector)
 	    || iosapic_intr_info[vector].gsi_base || iosapic_intr_info[vector].dmode
 	    || iosapic_intr_info[vector].polarity || iosapic_intr_info[vector].trigger)
 	{
-		new_vector = ia64_alloc_vector();
+		new_vector = assign_irq_vector(AUTO_ASSIGN);
 		printk(KERN_INFO "Reassigning vector %d to %d\n", vector, new_vector);
 		memcpy(&iosapic_intr_info[new_vector], &iosapic_intr_info[vector],
 		       sizeof(struct iosapic_intr_info));
@@ -500,7 +500,7 @@ iosapic_register_intr (unsigned int gsi,
 
 	vector = gsi_to_vector(gsi);
 	if (vector < 0)
-		vector = ia64_alloc_vector();
+		vector = assign_irq_vector(AUTO_ASSIGN);
 
 	register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY,
 		      polarity, trigger);
@@ -538,7 +538,7 @@ iosapic_register_platform_intr (u32 int_type, unsigned int gsi,
 		delivery = IOSAPIC_PMI;
 		break;
 	      case ACPI_INTERRUPT_INIT:
-		vector = ia64_alloc_vector();
+		vector = assign_irq_vector(AUTO_ASSIGN);
 		delivery = IOSAPIC_INIT;
 		break;
 	      case ACPI_INTERRUPT_CPEI:
@@ -708,7 +708,7 @@ iosapic_parse_prt (void)
 				vector = isa_irq_to_vector(gsi);
 			else
 				/* new GSI; allocate a vector for it */
-				vector = ia64_alloc_vector();
+				vector = assign_irq_vector(AUTO_ASSIGN);
 
 			register_intr(gsi, vector, IOSAPIC_LOWEST_PRIORITY, IOSAPIC_POL_LOW,
 				      IOSAPIC_LEVEL);
diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index 89411e1b5e55..4c89328dd9c0 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -73,13 +73,13 @@ irq_exit (void)
 }
 
 int
-ia64_alloc_vector (void)
+assign_irq_vector (int irq)
 {
 	static int next_vector = IA64_FIRST_DEVICE_VECTOR;
 
 	if (next_vector > IA64_LAST_DEVICE_VECTOR)
 		/* XXX could look for sharable vectors instead of panic'ing... */
-		panic("ia64_alloc_vector: out of interrupt vectors!");
+		panic("assign_irq_vector: out of interrupt vectors!");
 	return next_vector++;
 }
 
diff --git a/arch/ia64/pci/pci.c b/arch/ia64/pci/pci.c
index 391d2b476c47..466129b12e46 100644
--- a/arch/ia64/pci/pci.c
+++ b/arch/ia64/pci/pci.c
@@ -34,6 +34,7 @@
 # include <asm/smp.h>
 #endif
 #include <asm/irq.h>
+#include <asm/hw_irq.h>
 
 
 #undef DEBUG
@@ -567,3 +568,12 @@ pcibios_prep_mwi (struct pci_dev *dev)
 	}
 	return rc;
 }
+
+int pci_vector_resources(int last, int nr_released)
+{
+	int count = nr_released;
+
+ 	count += (IA64_LAST_DEVICE_VECTOR - last);
+
+	return count;
+}
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index f7bc8f144a0e..132af770d38f 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -19,26 +19,22 @@
 #include <asm/errno.h>
 #include <asm/io.h>
 #include <asm/smp.h>
-#include <asm/desc.h>
-#include <asm/io_apic.h>
-#include <mach_apic.h>
 
 #include "msi.h"
 
-
 static spinlock_t msi_lock = SPIN_LOCK_UNLOCKED;
 static struct msi_desc* msi_desc[NR_IRQS] = { [0 ... NR_IRQS-1] = NULL };
 static kmem_cache_t* msi_cachep;
 
 static int pci_msi_enable = 1;
-static int nr_alloc_vectors = 0;
+static int last_alloc_vector = 0;
 static int nr_released_vectors = 0;
 static int nr_reserved_vectors = NR_HP_RESERVED_VECTORS;
 static int nr_msix_devices = 0;
 
 #ifndef CONFIG_X86_IO_APIC
-int vector_irq[NR_IRQS] = { [0 ... NR_IRQS -1] = -1};
-u8 irq_vector[NR_IRQS] = { FIRST_DEVICE_VECTOR , 0 };
+int vector_irq[NR_VECTORS] = { [0 ... NR_VECTORS - 1] = -1};
+u8 irq_vector[NR_IRQ_VECTORS] = { FIRST_DEVICE_VECTOR , 0 };
 #endif
 
 static void msi_cache_ctor(void *p, kmem_cache_t *cache, unsigned long flags)
@@ -96,7 +92,6 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 {
 	struct msi_desc *entry;
 	struct msg_address address;
-	unsigned int dest_id;
 
 	entry = (struct msi_desc *)msi_desc[vector];
 	if (!entry || !entry->dev)
@@ -113,10 +108,9 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 	        entry->dev->bus->ops->read(entry->dev->bus, entry->dev->devfn,
 			msi_lower_address_reg(pos), 4,
 			&address.lo_address.value);
-		dest_id = (address.lo_address.u.dest_id &
-			MSI_ADDRESS_HEADER_MASK) |
-			(cpu_mask_to_apicid(cpu_mask) << MSI_TARGET_CPU_SHIFT);
-		address.lo_address.u.dest_id = dest_id;
+		address.lo_address.value &= MSI_ADDRESS_DEST_ID_MASK;
+		address.lo_address.value |= (cpu_mask_to_apicid(cpu_mask) <<
+			MSI_TARGET_CPU_SHIFT);
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		entry->dev->bus->ops->write(entry->dev->bus, entry->dev->devfn,
 			msi_lower_address_reg(pos), 4,
@@ -129,10 +123,9 @@ static void set_msi_affinity(unsigned int vector, cpumask_t cpu_mask)
 			PCI_MSIX_ENTRY_LOWER_ADDR_OFFSET;
 
 		address.lo_address.value = readl(entry->mask_base + offset);
-		dest_id = (address.lo_address.u.dest_id &
-			MSI_ADDRESS_HEADER_MASK) |
-			(cpu_mask_to_apicid(cpu_mask) << MSI_TARGET_CPU_SHIFT);
-		address.lo_address.u.dest_id = dest_id;
+		address.lo_address.value &= MSI_ADDRESS_DEST_ID_MASK;
+		address.lo_address.value |= (cpu_mask_to_apicid(cpu_mask) <<
+			MSI_TARGET_CPU_SHIFT);
 		entry->msi_attrib.current_cpu = cpu_mask_to_apicid(cpu_mask);
 		writel(address.lo_address.value, entry->mask_base + offset);
 		break;
@@ -265,61 +258,11 @@ static void msi_address_init(struct msg_address *msi_address)
 
 	memset(msi_address, 0, sizeof(struct msg_address));
 	msi_address->hi_address = (u32)0;
-	dest_id = (MSI_ADDRESS_HEADER << MSI_ADDRESS_HEADER_SHIFT) |
-		 (MSI_TARGET_CPU << MSI_TARGET_CPU_SHIFT);
-	msi_address->lo_address.u.dest_mode = MSI_LOGICAL_MODE;
+	dest_id = (MSI_ADDRESS_HEADER << MSI_ADDRESS_HEADER_SHIFT);
+	msi_address->lo_address.u.dest_mode = MSI_DEST_MODE;
 	msi_address->lo_address.u.redirection_hint = MSI_REDIRECTION_HINT_MODE;
 	msi_address->lo_address.u.dest_id = dest_id;
-}
-
-static int pci_vector_resources(void)
-{
-	static int res = -EINVAL;
-	int nr_free_vectors;
-
-	if (res == -EINVAL) {
-		int i, repeat;
-		for (i = NR_REPEATS; i > 0; i--) {
-			if ((FIRST_DEVICE_VECTOR + i * 8) > FIRST_SYSTEM_VECTOR)
-				continue;
-			break;
-		}
-		i++;
-		repeat = (FIRST_SYSTEM_VECTOR - FIRST_DEVICE_VECTOR)/i;
-		res = i * repeat - NR_RESERVED_VECTORS + 1;
-	}
-
-	nr_free_vectors = res + nr_released_vectors - nr_alloc_vectors;
-
-	return nr_free_vectors;
-}
-
-int assign_irq_vector(int irq)
-{
-	static int current_vector = FIRST_DEVICE_VECTOR, offset = 0;
-
-	if (irq != MSI_AUTO && IO_APIC_VECTOR(irq) > 0)
-		return IO_APIC_VECTOR(irq);
-next:
-	current_vector += 8;
-	if (current_vector == SYSCALL_VECTOR)
-		goto next;
-
-	if (current_vector > FIRST_SYSTEM_VECTOR) {
-		offset++;
-		current_vector = FIRST_DEVICE_VECTOR + offset;
-	}
-
-	if (current_vector == FIRST_SYSTEM_VECTOR)
-		return -ENOSPC;
-
-	vector_irq[current_vector] = irq;
-	if (irq != MSI_AUTO)
-		IO_APIC_VECTOR(irq) = current_vector;
-
-	nr_alloc_vectors++;
-
-	return current_vector;
+	msi_address->lo_address.value |= (MSI_TARGET_CPU << MSI_TARGET_CPU_SHIFT);
 }
 
 static int assign_msi_vector(void)
@@ -333,10 +276,6 @@ static int assign_msi_vector(void)
 	 * vector is assigned unique among drivers.
 	 */
 	spin_lock_irqsave(&msi_lock, flags);
-	if (!(pci_vector_resources() > 0)) {
-		spin_unlock_irqrestore(&msi_lock, flags);
-		return -EBUSY;
-	}
 
 	if (!new_vector_avail) {
 		/*
@@ -363,9 +302,9 @@ static int assign_msi_vector(void)
 		spin_unlock_irqrestore(&msi_lock, flags);
 		return -EBUSY;
 	}
-
-	vector = assign_irq_vector(MSI_AUTO);
-	if (vector  == (FIRST_SYSTEM_VECTOR - 8))
+	vector = assign_irq_vector(AUTO_ASSIGN);
+	last_alloc_vector = vector;
+	if (vector  == LAST_DEVICE_VECTOR)
 		new_vector_avail = 0;
 
 	spin_unlock_irqrestore(&msi_lock, flags);
@@ -924,7 +863,8 @@ int msi_alloc_vectors(struct pci_dev* dev, int *vector, int nvec)
 	 * msi_lock is provided to ensure that enough vectors resources are
 	 * available before granting.
 	 */
-	free_vectors = pci_vector_resources();
+	free_vectors = pci_vector_resources(last_alloc_vector,
+				nr_released_vectors);
 	/* Ensure that each MSI/MSI-X device has one vector reserved by
 	   default to avoid any MSI-X driver to take all available
  	   resources */
diff --git a/drivers/pci/msi.h b/drivers/pci/msi.h
index 0e91abad40f7..524504fdd4f0 100644
--- a/drivers/pci/msi.h
+++ b/drivers/pci/msi.h
@@ -1,6 +1,4 @@
 /*
- * File:	msi.h
- *
  * Copyright (C) 2003-2004 Intel
  * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
  */
@@ -8,9 +6,7 @@
 #ifndef MSI_H
 #define MSI_H
 
-#define MSI_AUTO -1
-#define NR_REPEATS	23
-#define NR_RESERVED_VECTORS 3 /*FIRST_DEVICE_VECTOR,FIRST_SYSTEM_VECTOR,0x80 */
+#include <asm/msi.h>
 
 /*
  * Assume the maximum number of hot plug slots supported by the system is about
@@ -22,9 +18,10 @@
  */
 #define NR_HP_RESERVED_VECTORS 	20
 
-extern int vector_irq[NR_IRQS];
+extern int vector_irq[NR_VECTORS];
 extern cpumask_t pending_irq_balance_cpumask[NR_IRQS];
 extern void (*interrupt[NR_IRQS])(void);
+extern int pci_vector_resources(int last, int nr_released);
 
 #ifdef CONFIG_SMP
 #define set_msi_irq_affinity	set_msi_affinity
@@ -36,13 +33,6 @@ extern void (*interrupt[NR_IRQS])(void);
 static inline void move_msi(int vector) {}
 #endif
 
-#ifndef CONFIG_X86_IO_APIC
-static inline int get_ioapic_vector(struct pci_dev *dev) { return -1;}
-static inline void restore_ioapic_irq_handler(int irq) {}
-#else
-extern void restore_ioapic_irq_handler(int irq);
-#endif
-
 /*
  * MSI-X Address Register
  */
@@ -85,25 +75,20 @@ extern void restore_ioapic_irq_handler(int irq);
 #define msix_mask(address)		(address | PCI_MSIX_FLAGS_BITMASK)
 #define msix_is_pending(address) 	(address & PCI_MSIX_FLAGS_PENDMASK)
 
-
 /*
  * MSI Defined Data Structures
  */
 #define MSI_ADDRESS_HEADER		0xfee
 #define MSI_ADDRESS_HEADER_SHIFT	12
 #define MSI_ADDRESS_HEADER_MASK		0xfff000
-#define MSI_TARGET_CPU_SHIFT		4
+#define MSI_ADDRESS_DEST_ID_MASK	0xfff0000f
 #define MSI_TARGET_CPU_MASK		0xff
 #define MSI_DELIVERY_MODE		0
 #define MSI_LEVEL_MODE			1	/* Edge always assert */
 #define MSI_TRIGGER_MODE		0	/* MSI is edge sensitive */
+#define MSI_PHYSICAL_MODE		0
 #define MSI_LOGICAL_MODE		1
 #define MSI_REDIRECTION_HINT_MODE	0
-#ifdef CONFIG_SMP
-#define MSI_TARGET_CPU			logical_smp_processor_id()
-#else
-#define MSI_TARGET_CPU			TARGET_CPUS
-#endif
 
 struct msg_data {
 #if defined(__LITTLE_ENDIAN_BITFIELD)
diff --git a/include/asm-i386/hw_irq.h b/include/asm-i386/hw_irq.h
index b8166a30fbe1..2875fe7bcedd 100644
--- a/include/asm-i386/hw_irq.h
+++ b/include/asm-i386/hw_irq.h
@@ -27,6 +27,7 @@
 
 extern u8 irq_vector[NR_IRQ_VECTORS];
 #define IO_APIC_VECTOR(irq)	(irq_vector[irq])
+#define AUTO_ASSIGN		-1
 
 extern void (*interrupt[NR_IRQS])(void);
 
diff --git a/include/asm-i386/msi.h b/include/asm-i386/msi.h
new file mode 100644
index 000000000000..e7047ef3a8fd
--- /dev/null
+++ b/include/asm-i386/msi.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 2003-2004 Intel
+ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
+ */
+
+#ifndef ASM_MSI_H
+#define ASM_MSI_H
+
+#include <asm/desc.h>
+#include <mach_apic.h>
+
+#define LAST_DEVICE_VECTOR		232
+#define MSI_DEST_MODE			MSI_LOGICAL_MODE
+#define MSI_TARGET_CPU_SHIFT		12
+
+#ifdef CONFIG_SMP
+#define MSI_TARGET_CPU		logical_smp_processor_id()
+#else
+#define MSI_TARGET_CPU		TARGET_CPUS
+#endif
+
+#endif /* ASM_MSI_H */
diff --git a/include/asm-ia64/hw_irq.h b/include/asm-ia64/hw_irq.h
index be653c915f21..a72134370811 100644
--- a/include/asm-ia64/hw_irq.h
+++ b/include/asm-ia64/hw_irq.h
@@ -34,6 +34,8 @@ typedef u8 ia64_vector;
 #define IA64_MAX_VECTORED_IRQ		255
 #define IA64_NUM_VECTORS		256
 
+#define AUTO_ASSIGN			-1
+
 #define IA64_SPURIOUS_INT_VECTOR	0x0f
 
 /*
@@ -80,7 +82,7 @@ extern unsigned long ipi_base_addr;
 
 extern struct hw_interrupt_type irq_type_ia64_lsapic;	/* CPU-internal interrupt controller */
 
-extern int ia64_alloc_vector (void);	/* allocate a free vector */
+extern int assign_irq_vector (int irq);	/* allocate a free vector */
 extern void ia64_send_ipi (int cpu, int vector, int delivery_mode, int redirect);
 extern void register_percpu_irq (ia64_vector vec, struct irqaction *action);
 
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index 8e664a0d80f7..79479e2c6966 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -12,6 +12,7 @@
  */
 
 #define NR_IRQS		256
+#define NR_IRQ_VECTORS	NR_IRQS
 
 static __inline__ int
 irq_canonicalize (int irq)
diff --git a/include/asm-ia64/msi.h b/include/asm-ia64/msi.h
new file mode 100644
index 000000000000..c6145776b87f
--- /dev/null
+++ b/include/asm-ia64/msi.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2003-2004 Intel
+ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
+ */
+
+#ifndef ASM_MSI_H
+#define ASM_MSI_H
+
+#define FIRST_DEVICE_VECTOR 	IA64_FIRST_DEVICE_VECTOR
+#define LAST_DEVICE_VECTOR	IA64_LAST_DEVICE_VECTOR
+static inline void set_intr_gate (int nr, void *func) {}
+#define IO_APIC_VECTOR(irq)	(irq)
+#define ack_APIC_irq		ia64_eoi
+#define irq_desc		_irq_desc
+#define cpu_mask_to_apicid(mask) cpu_physical_id(first_cpu(mask))
+#define MSI_DEST_MODE		MSI_PHYSICAL_MODE
+#define MSI_TARGET_CPU	((ia64_getreg(_IA64_REG_CR_LID) >> 16) & 0xffff)
+#define MSI_TARGET_CPU_SHIFT	4
+
+#endif /* ASM_MSI_H */
diff --git a/include/asm-x86_64/msi.h b/include/asm-x86_64/msi.h
new file mode 100644
index 000000000000..727b911f29ca
--- /dev/null
+++ b/include/asm-x86_64/msi.h
@@ -0,0 +1,21 @@
+/*
+ * Copyright (C) 2003-2004 Intel
+ * Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
+ */
+
+#ifndef ASM_MSI_H
+#define ASM_MSI_H
+
+#include <asm/desc.h>
+
+#define LAST_DEVICE_VECTOR		232
+#define MSI_DEST_MODE			MSI_LOGICAL_MODE
+#define MSI_TARGET_CPU_SHIFT		12
+
+#ifdef CONFIG_SMP
+#define MSI_TARGET_CPU		logical_smp_processor_id()
+#else
+#define MSI_TARGET_CPU		TARGET_CPUS
+#endif
+
+#endif /* ASM_MSI_H */
-- 
cgit v1.2.3


From 7653e3ac4901d2f7f3c98980aeb1a19d2d6f19e7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:07:26 -0700
Subject: [PATCH] missing NULL pointer check in pte_alloc_one.

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

Just found a small bug in pgalloc for s390*.  Comparing notes with other
architectures I found that pte_alloc_one is sick for alpha and sparc64 as
well.
---
 include/asm-alpha/pgalloc.h   |  5 ++++-
 include/asm-s390/pgalloc.h    |  5 ++++-
 include/asm-sparc64/pgalloc.h | 10 +++++++++-
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-alpha/pgalloc.h b/include/asm-alpha/pgalloc.h
index b34194c3d96c..2ef3066df665 100644
--- a/include/asm-alpha/pgalloc.h
+++ b/include/asm-alpha/pgalloc.h
@@ -63,7 +63,10 @@ pte_free_kernel(pte_t *pte)
 static inline struct page *
 pte_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return virt_to_page(pte_alloc_one_kernel(mm, addr));
+	pte_t *pte = pte_alloc_one_kernel(mm, addr);
+	if (pte)
+		return virt_to_page(pte);
+	return 0;
 }
 
 static inline void
diff --git a/include/asm-s390/pgalloc.h b/include/asm-s390/pgalloc.h
index ec34d58cf463..90308137c8e6 100644
--- a/include/asm-s390/pgalloc.h
+++ b/include/asm-s390/pgalloc.h
@@ -139,7 +139,10 @@ pte_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr)
 static inline struct page *
 pte_alloc_one(struct mm_struct *mm, unsigned long vmaddr)
 {
-	return virt_to_page(pte_alloc_one_kernel(mm, vmaddr));
+	pte_t *pte = pte_alloc_one_kernel(mm, vmaddr);
+	if (pte)
+		return virt_to_page(pte);
+	return 0;
 }
 
 static inline void pte_free_kernel(pte_t *pte)
diff --git a/include/asm-sparc64/pgalloc.h b/include/asm-sparc64/pgalloc.h
index 0081d1b7408e..8354a057ba88 100644
--- a/include/asm-sparc64/pgalloc.h
+++ b/include/asm-sparc64/pgalloc.h
@@ -189,7 +189,15 @@ static __inline__ void free_pmd_slow(pmd_t *pmd)
 	pmd_populate_kernel(MM,PMD,page_address(PTE_PAGE))
 
 extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address);
-#define pte_alloc_one(MM,ADDR)	virt_to_page(pte_alloc_one_kernel(MM,ADDR))
+
+static inline struct page *
+pte_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	pte_t *pte = pte_alloc_one_kernel(mm, addr);
+	if (pte)
+		return virt_to_page(pte);
+	return 0;
+}
 
 static __inline__ pte_t *pte_alloc_one_fast(struct mm_struct *mm, unsigned long address)
 {
-- 
cgit v1.2.3


From 8398bcc6b3eb950a1242f6dc4cfb151b6b9238c3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:08:32 -0700
Subject: [PATCH] eliminate nswap and cnswap

From: Matt Mackall <mpm@selenic.com>

The nswap and cnswap counters have never been incremented as
Linux doesn't do task swapping.
---
 arch/alpha/kernel/osf_sys.c | 3 ---
 fs/proc/array.c             | 4 ++--
 include/linux/sched.h       | 2 +-
 kernel/acct.c               | 2 +-
 kernel/exit.c               | 1 -
 kernel/fork.c               | 1 -
 kernel/sys.c                | 3 ---
 7 files changed, 4 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 34adfc76dd92..f725059fe47f 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -1095,14 +1095,12 @@ osf_getrusage(int who, struct rusage32 *ru)
 		jiffies_to_timeval32(current->stime, &r.ru_stime);
 		r.ru_minflt = current->min_flt;
 		r.ru_majflt = current->maj_flt;
-		r.ru_nswap = current->nswap;
 		break;
 	case RUSAGE_CHILDREN:
 		jiffies_to_timeval32(current->cutime, &r.ru_utime);
 		jiffies_to_timeval32(current->cstime, &r.ru_stime);
 		r.ru_minflt = current->cmin_flt;
 		r.ru_majflt = current->cmaj_flt;
-		r.ru_nswap = current->cnswap;
 		break;
 	default:
 		jiffies_to_timeval32(current->utime + current->cutime,
@@ -1111,7 +1109,6 @@ osf_getrusage(int who, struct rusage32 *ru)
 				   &r.ru_stime);
 		r.ru_minflt = current->min_flt + current->cmin_flt;
 		r.ru_majflt = current->maj_flt + current->cmaj_flt;
-		r.ru_nswap = current->nswap + current->cnswap;
 		break;
 	}
 
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ac9ccac5d1ee..ae90151e45ae 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -392,8 +392,8 @@ int proc_pid_stat(struct task_struct *task, char * buffer)
 		sigign      .sig[0] & 0x7fffffffUL,
 		sigcatch    .sig[0] & 0x7fffffffUL,
 		wchan,
-		task->nswap,
-		task->cnswap,
+		0UL,
+		0UL,
 		task->exit_signal,
 		task_cpu(task),
 		task->rt_priority,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5a1229121123..22080f919266 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -436,7 +436,7 @@ struct task_struct {
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
 	u64 start_time;
 /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
-	unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap;
+	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
 /* process credentials */
 	uid_t uid,euid,suid,fsuid;
 	gid_t gid,egid,sgid,fsgid;
diff --git a/kernel/acct.c b/kernel/acct.c
index b417066778a7..8e32413c41f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -376,7 +376,7 @@ static void do_acct_process(long exitcode, struct file *file)
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 	ac.ac_minflt = encode_comp_t(current->min_flt);
 	ac.ac_majflt = encode_comp_t(current->maj_flt);
-	ac.ac_swaps = encode_comp_t(current->nswap);
+	ac.ac_swaps = encode_comp_t(0);
 	ac.ac_exitcode = exitcode;
 
 	/*
diff --git a/kernel/exit.c b/kernel/exit.c
index 810eebd77559..8157dbc037d6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -92,7 +92,6 @@ repeat:
 	p->parent->cstime += p->stime + p->cstime;
 	p->parent->cmin_flt += p->min_flt + p->cmin_flt;
 	p->parent->cmaj_flt += p->maj_flt + p->cmaj_flt;
-	p->parent->cnswap += p->nswap + p->cnswap;
 	p->parent->cnvcsw += p->nvcsw + p->cnvcsw;
 	p->parent->cnivcsw += p->nivcsw + p->cnivcsw;
 	sched_exit(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index 315a06125e65..da5213611496 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -513,7 +513,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 
 	tsk->min_flt = tsk->maj_flt = 0;
 	tsk->cmin_flt = tsk->cmaj_flt = 0;
-	tsk->nswap = tsk->cnswap = 0;
 	tsk->nvcsw = tsk->nivcsw = tsk->cnvcsw = tsk->cnivcsw = 0;
 
 	tsk->mm = NULL;
diff --git a/kernel/sys.c b/kernel/sys.c
index 9d57482758f3..4d414d925889 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1532,7 +1532,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw;
 			r.ru_minflt = p->min_flt;
 			r.ru_majflt = p->maj_flt;
-			r.ru_nswap = p->nswap;
 			break;
 		case RUSAGE_CHILDREN:
 			jiffies_to_timeval(p->cutime, &r.ru_utime);
@@ -1541,7 +1540,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->cnivcsw;
 			r.ru_minflt = p->cmin_flt;
 			r.ru_majflt = p->cmaj_flt;
-			r.ru_nswap = p->cnswap;
 			break;
 		default:
 			jiffies_to_timeval(p->utime + p->cutime, &r.ru_utime);
@@ -1550,7 +1548,6 @@ int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
 			r.ru_nivcsw = p->nivcsw + p->cnivcsw;
 			r.ru_minflt = p->min_flt + p->cmin_flt;
 			r.ru_majflt = p->maj_flt + p->cmaj_flt;
-			r.ru_nswap = p->nswap + p->cnswap;
 			break;
 	}
 	return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
-- 
cgit v1.2.3


From 87217f471c7d293548938c4d396fbafde664dde4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:08:45 -0700
Subject: [PATCH] shrink inode when quota is disabled

From: Matt Mackall <mpm@selenic.com>

drop quota array in inode struct if no quota support
---
 fs/inode.c         | 2 ++
 include/linux/fs.h | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'include')

diff --git a/fs/inode.c b/fs/inode.c
index e1b51a3f3ebc..d192c238c5a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -126,7 +126,9 @@ static struct inode *alloc_inode(struct super_block *sb)
 		inode->i_blocks = 0;
 		inode->i_bytes = 0;
 		inode->i_generation = 0;
+#ifdef CONFIG_QUOTA
 		memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+#endif
 		inode->i_pipe = NULL;
 		inode->i_bdev = NULL;
 		inode->i_cdev = NULL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 02976f7c9f47..4abf3ff1fe1c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -403,7 +403,9 @@ struct inode {
 	struct file_lock	*i_flock;
 	struct address_space	*i_mapping;
 	struct address_space	i_data;
+#ifdef CONFIG_QUOTA
 	struct dquot		*i_dquot[MAXQUOTAS];
+#endif
 	/* These three should probably be a union */
 	struct list_head	i_devices;
 	struct pipe_inode_info	*i_pipe;
-- 
cgit v1.2.3


From bc0e2bbf8f0c8e77501677116798e8d7c6a8f49f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:09:10 -0700
Subject: [PATCH] O_DIRECT data exposure fixes

From: Badari Pulavarty, Suparna Bhattacharya, Andrew Morton

Forward port of Stephen Tweedie's DIO fixes from 2.4, to fix various DIO vs
buffered IO exposures involving races causing:

(a) stale data from uninstantiated blocks to be read, e.g.

    - O_DIRECT reads against buffered writes to a sparse region

    - O_DIRECT writes to a sparse region against buffered reads

(b) potential data corruption with

    - O_DIRECT IOs against truncate

    due to writes to truncated blocks (which may have been reallocated to
    another file).

Summary of fixes:

1) All the changes affect only regular files.  RAW/O_DIRECT on block
   devices are unaffected.

2) The DIO code will not fill in sparse regions on a write.  Instead
   -ENOTBLK is returned and the generic file write code falls through to
   buffered IO in this case, followed by writing through the pages to disk
   using filemap_fdatawrite/wait.

3) i_sem is held during both DIO reads and writes.  For reads, and writes
   to already allocated blocks, it is released right after IO is issued,
   while for writes to newly allocated blocks (e.g. file-extending writes and
   hole overwrites) it is held all the way through until IO completes (and
   data is committed to disk).

4) filemap_fdatawrite/wait are called under i_sem to synchronize buffered
   pages to disk blocks before issuing DIO.

5) A new rwsem (i_alloc_sem) is held in shared mode all the while a DIO
   (read or write) is in progress, and in exclusive mode by truncate to guard
   against deallocation of data blocks during DIO.

6) All this new locking has been pushed down into blockdev_direct_IO to
   avoid interfering with NFS direct IO.  The locks are taken in the order
   i_sem followed by i_alloc_sem.  While i_sem may be released after IO
   submission in some cases, i_alloc_sem is held through until dio_complete
   (in the case of AIO-DIO this happens through the IO completion callback).

7) i_sem and i_alloc_sem are not held for the _nolock versions of write
   routines, as used by blockdev and XFS.  Filesystems can specify the
   needs_special_locking parameter to __blockdev_direct_IO from their direct
   IO address space op accordingly.

Note from Badari:
Here is the locking (when needs_special_locking is true):

(1) generic_file_*_write() holds i_sem (as before) and calls
    ->direct_IO().  blockdev_direct_IO() gets i_alloc_sem and calls
    direct_io_worker().

(2) generic_file_*_read() does not hold any locks.  blockdev_direct_IO()
    gets i_sem and then i_alloc_sem and calls direct_io_worker() to do the
    work.

(3) direct_io_worker() does the work and drops i_sem after submitting IOs
    if appropriate and drops i_alloc_sem after completing IOs.
---
 fs/direct-io.c          | 93 ++++++++++++++++++++++++++++++++++++++++++-------
 fs/inode.c              |  1 +
 fs/open.c               |  2 ++
 fs/xfs/linux/xfs_aops.c |  3 +-
 include/linux/fs.h      | 31 +++++++++++++++--
 mm/filemap.c            | 53 +++++++++++++++++++++-------
 6 files changed, 154 insertions(+), 29 deletions(-)

(limited to 'include')

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 4711d134cfd9..72309514e112 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -52,6 +52,10 @@
  *
  * If blkfactor is zero then the user's request was aligned to the filesystem's
  * blocksize.
+ *
+ * needs_locking is set for regular files on direct-IO-naive filesystems.  It
+ * determines whether we need to do the fancy locking which prevents direct-IO
+ * from being able to read uninitialised disk blocks.
  */
 
 struct dio {
@@ -59,6 +63,7 @@ struct dio {
 	struct bio *bio;		/* bio under assembly */
 	struct inode *inode;
 	int rw;
+	int needs_locking;		/* doesn't change */
 	unsigned blkbits;		/* doesn't change */
 	unsigned blkfactor;		/* When we're using an alignment which
 					   is finer than the filesystem's soft
@@ -206,6 +211,8 @@ static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
 {
 	if (dio->end_io)
 		dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private);
+	if (dio->needs_locking)
+		up_read(&dio->inode->i_alloc_sem);
 }
 
 /*
@@ -449,6 +456,7 @@ static int get_more_blocks(struct dio *dio)
 	unsigned long fs_count;	/* Number of filesystem-sized blocks */
 	unsigned long dio_count;/* Number of dio_block-sized blocks */
 	unsigned long blkmask;
+	int beyond_eof = 0;
 
 	/*
 	 * If there was a memory error and we've overwritten all the
@@ -466,8 +474,19 @@ static int get_more_blocks(struct dio *dio)
 		if (dio_count & blkmask)	
 			fs_count++;
 
+		if (dio->needs_locking) {
+			if (dio->block_in_file >= (i_size_read(dio->inode) >>
+							dio->blkbits))
+				beyond_eof = 1;
+		}
+		/*
+		 * For writes inside i_size we forbid block creations: only
+		 * overwrites are permitted.  We fall back to buffered writes
+		 * at a higher level for inside-i_size block-instantiating
+		 * writes.
+		 */
 		ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count,
-				map_bh, dio->rw == WRITE);
+				map_bh, (dio->rw == WRITE) && beyond_eof);
 	}
 	return ret;
 }
@@ -774,6 +793,10 @@ do_holes:
 			if (!buffer_mapped(map_bh)) {
 				char *kaddr;
 
+				/* AKPM: eargh, -ENOTBLK is a hack */
+				if (dio->rw == WRITE)
+					return -ENOTBLK;
+
 				if (dio->block_in_file >=
 					i_size_read(dio->inode)>>blkbits) {
 					/* We hit eof */
@@ -839,21 +862,21 @@ out:
 	return ret;
 }
 
+/*
+ * Releases both i_sem and i_alloc_sem
+ */
 static int
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
-	unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io)
+	unsigned blkbits, get_blocks_t get_blocks, dio_iodone_t end_io,
+	struct dio *dio)
 {
 	unsigned long user_addr; 
 	int seg;
 	int ret = 0;
 	int ret2;
-	struct dio *dio;
 	size_t bytes;
 
-	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
-	if (!dio)
-		return -ENOMEM;
 	dio->is_async = !is_sync_kiocb(iocb);
 
 	dio->bio = NULL;
@@ -864,7 +887,6 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->start_zero_done = 0;
 	dio->block_in_file = offset >> blkbits;
 	dio->blocks_available = 0;
-
 	dio->cur_page = NULL;
 
 	dio->boundary = 0;
@@ -952,6 +974,13 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	dio_cleanup(dio);
 
+	/*
+	 * All new block allocations have been performed.  We can let i_sem
+	 * go now.
+	 */
+	if (dio->needs_locking)
+		up(&dio->inode->i_sem);
+
 	/*
 	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
 	 * reflect the number of to-be-processed BIOs.
@@ -987,11 +1016,17 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 
 /*
  * This is a library function for use by filesystem drivers.
+ *
+ * For writes to S_ISREG files, we are called under i_sem and return with i_sem
+ * held, even though it is internally dropped.
+ *
+ * For writes to S_ISBLK files, i_sem is not held on entry; it is never taken.
  */
 int
-blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
-	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io)
+	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
+	int needs_special_locking)
 {
 	int seg;
 	size_t size;
@@ -1000,6 +1035,8 @@ blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	unsigned bdev_blkbits = 0;
 	unsigned blocksize_mask = (1 << blkbits) - 1;
 	ssize_t retval = -EINVAL;
+	struct dio *dio;
+	int needs_locking;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
@@ -1025,10 +1062,40 @@ blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		}
 	}
 
-	retval = direct_io_worker(rw, iocb, inode, iov, offset, 
-				nr_segs, blkbits, get_blocks, end_io);
+	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
+	retval = -ENOMEM;
+	if (!dio)
+		goto out;
+
+	/*
+	 * For regular files,
+	 *	readers need to grab i_sem and i_alloc_sem
+	 *	writers need to grab i_alloc_sem only (i_sem is already held)
+	 */
+	needs_locking = 0;
+	if (S_ISREG(inode->i_mode) && needs_special_locking) {
+		needs_locking = 1;
+		if (rw == READ) {
+			struct address_space *mapping;
+
+			mapping = iocb->ki_filp->f_mapping;
+			down(&inode->i_sem);
+			retval = filemap_write_and_wait(mapping);
+			if (retval) {
+				up(&inode->i_sem);
+				kfree(dio);
+				goto out;
+			}
+		}
+		down_read(&inode->i_alloc_sem);
+	}
+	dio->needs_locking = needs_locking;
+
+	retval = direct_io_worker(rw, iocb, inode, iov, offset,
+				nr_segs, blkbits, get_blocks, end_io, dio);
+	if (needs_locking && rw == WRITE)
+		down(&inode->i_sem);
 out:
 	return retval;
 }
-
-EXPORT_SYMBOL(blockdev_direct_IO);
+EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/inode.c b/fs/inode.c
index d192c238c5a9..b7f80405c076 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -185,6 +185,7 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
+	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
diff --git a/fs/open.c b/fs/open.c
index ce11096afcad..e0d546e01561 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -192,7 +192,9 @@ int do_truncate(struct dentry *dentry, loff_t length)
 	newattrs.ia_size = length;
 	newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME;
 	down(&dentry->d_inode->i_sem);
+	down_write(&dentry->d_inode->i_alloc_sem);
 	err = notify_change(dentry, &newattrs);
+	up_write(&dentry->d_inode->i_alloc_sem);
 	up(&dentry->d_inode->i_sem);
 	return err;
 }
diff --git a/fs/xfs/linux/xfs_aops.c b/fs/xfs/linux/xfs_aops.c
index 75ab8d29cd2f..dd446266d33f 100644
--- a/fs/xfs/linux/xfs_aops.c
+++ b/fs/xfs/linux/xfs_aops.c
@@ -1032,7 +1032,8 @@ linvfs_direct_IO(
 	if (error)
 		return -error;
 
-	return blockdev_direct_IO(rw, iocb, inode, iomap.iomap_target->pbr_bdev,
+	return blockdev_direct_IO_no_locking(rw, iocb, inode,
+		iomap.iomap_target->pbr_bdev,
 		iov, offset, nr_segs,
 		linvfs_get_blocks_direct,
 		linvfs_unwritten_convert_direct);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4abf3ff1fe1c..91ff9225ba86 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -397,6 +397,7 @@ struct inode {
 	unsigned short          i_bytes;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
 	struct semaphore	i_sem;
+	struct rw_semaphore	i_alloc_sem;
 	struct inode_operations	*i_op;
 	struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
 	struct super_block	*i_sb;
@@ -1235,6 +1236,7 @@ extern void write_inode_now(struct inode *, int);
 extern int filemap_fdatawrite(struct address_space *);
 extern int filemap_flush(struct address_space *);
 extern int filemap_fdatawait(struct address_space *);
+extern int filemap_write_and_wait(struct address_space *mapping);
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
@@ -1347,9 +1349,6 @@ extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern ssize_t generic_file_direct_IO(int rw, struct kiocb *iocb,
 	const struct iovec *iov, loff_t offset, unsigned long nr_segs);
-extern int blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 
-	struct block_device *bdev, const struct iovec *iov, loff_t offset, 
-	unsigned long nr_segs, get_blocks_t *get_blocks, dio_iodone_t *end_io);
 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, 
 	unsigned long nr_segs, loff_t *ppos);
 ssize_t generic_file_writev(struct file *filp, const struct iovec *iov, 
@@ -1371,6 +1370,32 @@ static inline void do_generic_file_read(struct file * filp, loff_t *ppos,
 				actor);
 }
 
+int __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+	struct block_device *bdev, const struct iovec *iov, loff_t offset,
+	unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io,
+	int needs_special_locking);
+
+/*
+ * For filesystems which need locking between buffered and direct access
+ */
+static inline int blockdev_direct_IO(int rw, struct kiocb *iocb,
+	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
+	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
+	dio_iodone_t end_io)
+{
+	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				nr_segs, get_blocks, end_io, 1);
+}
+
+static inline int blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb,
+	struct inode *inode, struct block_device *bdev, const struct iovec *iov,
+	loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks,
+	dio_iodone_t end_io)
+{
+	return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset,
+				nr_segs, get_blocks, end_io, 0);
+}
+
 extern struct file_operations generic_ro_fops;
 
 #define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m))
diff --git a/mm/filemap.c b/mm/filemap.c
index 6fbd980c25e5..ad234dc52cbf 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -73,6 +73,9 @@
  *  ->mmap_sem
  *    ->i_sem			(msync)
  *
+ *  ->i_sem
+ *    ->i_alloc_sem             (various)
+ *
  *  ->inode_lock
  *    ->sb_lock			(fs/fs-writeback.c)
  *    ->mapping->page_lock	(__sync_single_inode)
@@ -228,6 +231,18 @@ restart:
 
 EXPORT_SYMBOL(filemap_fdatawait);
 
+int filemap_write_and_wait(struct address_space *mapping)
+{
+	int retval = 0;
+
+	if (mapping->nrpages) {
+		retval = filemap_fdatawrite(mapping);
+		if (retval == 0)
+			retval = filemap_fdatawait(mapping);
+	}
+	return retval;
+}
+
 /*
  * This adds a page to the page cache, starting out as locked, unreferenced,
  * not uptodate and with no errors.
@@ -1716,6 +1731,7 @@ EXPORT_SYMBOL(generic_write_checks);
 
 /*
  * Write to a file through the page cache. 
+ * Called under i_sem for S_ISREG files.
  *
  * We put everything into the page cache prior to writing it. This is not a
  * problem when writing full pages. With partial pages, however, we first have
@@ -1806,12 +1822,19 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 		/*
 		 * Sync the fs metadata but not the minor inode changes and
 		 * of course not the data as we did direct DMA for the IO.
+		 * i_sem is held, which protects generic_osync_inode() from
+		 * livelocking.
 		 */
 		if (written >= 0 && file->f_flags & O_SYNC)
 			status = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		if (written >= 0 && !is_sync_kiocb(iocb))
 			written = -EIOCBQUEUED;
-		goto out_status;
+		if (written != -ENOTBLK)
+			goto out_status;
+		/*
+		 * direct-io write to a hole: fall through to buffered I/O
+		 */
+		written = 0;
 	}
 
 	buf = iov->iov_base;
@@ -1900,6 +1923,14 @@ generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
 					OSYNC_METADATA|OSYNC_DATA);
 	}
 	
+	/*
+	 * If we get here for O_DIRECT writes then we must have fallen through
+	 * to buffered writes (block instantiation inside i_size).  So we sync
+	 * the file data here, to try to honour O_DIRECT expectations.
+	 */
+	if (unlikely(file->f_flags & O_DIRECT) && written)
+		status = filemap_write_and_wait(mapping);
+
 out_status:	
 	err = written ? written : status;
 out:
@@ -1991,6 +2022,9 @@ ssize_t generic_file_writev(struct file *file, const struct iovec *iov,
 
 EXPORT_SYMBOL(generic_file_writev);
 
+/*
+ * Called under i_sem for writes to S_ISREG files
+ */
 ssize_t
 generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	loff_t offset, unsigned long nr_segs)
@@ -1999,18 +2033,13 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct address_space *mapping = file->f_mapping;
 	ssize_t retval;
 
-	if (mapping->nrpages) {
-		retval = filemap_fdatawrite(mapping);
-		if (retval == 0)
-			retval = filemap_fdatawait(mapping);
-		if (retval)
-			goto out;
+	retval = filemap_write_and_wait(mapping);
+	if (retval == 0) {
+		retval = mapping->a_ops->direct_IO(rw, iocb, iov,
+						offset, nr_segs);
+		if (rw == WRITE && mapping->nrpages)
+			invalidate_inode_pages2(mapping);
 	}
-
-	retval = mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs);
-	if (rw == WRITE && mapping->nrpages)
-		invalidate_inode_pages2(mapping);
-out:
 	return retval;
 }
 
-- 
cgit v1.2.3


From 8691fb836b268c622c61281238219fc166f0eee5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:10:27 -0700
Subject: [PATCH] radix-tree tags for selective lookup

Add radix-tree tagging so we can look up dirty or writeback pages in
O(log64(n)) time.

Each radix-tree node gains two bits for each slot: one for page dirtiness and
one for page writebackness.

If a tag bit is set on a leaf node, it indicates that item at the
corresponding slot is tagged (say, a dirty page).

If a tag bit is set in a non-leaf node it indicates that the same tag bit is
set in the subtree which lies under the corresponding slot.  ie: "there is a
dirty page under here somewhere, but you need to search down further to find
it".

A gang lookup function is provided which can walk the radix tree in
logarithmic time looking for items which are tagged, starting from a
specified offset.  We use this for in-order searches for dirty or writeback
pages.

There is a userspace test harness for this code at

http://www.zip.com.au/~akpm/linux/patches/stuff/rtth.tar.gz
---
 include/linux/radix-tree.h |  38 ++--
 lib/radix-tree.c           | 444 ++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 426 insertions(+), 56 deletions(-)

(limited to 'include')

diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index c32a45fd1f0d..8081a281fa5e 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -20,8 +20,7 @@
 #define _LINUX_RADIX_TREE_H
 
 #include <linux/preempt.h>
-
-struct radix_tree_node;
+#include <linux/types.h>
 
 struct radix_tree_root {
 	unsigned int		height;
@@ -29,25 +28,40 @@ struct radix_tree_root {
 	struct radix_tree_node	*rnode;
 };
 
-#define RADIX_TREE_INIT(mask)	{0, (mask), NULL}
+#define RADIX_TREE_INIT(mask)	{					\
+	.height = 0,							\
+	.gfp_mask = (mask),						\
+	.rnode = NULL,							\
+}
 
 #define RADIX_TREE(name, mask) \
 	struct radix_tree_root name = RADIX_TREE_INIT(mask)
 
-#define INIT_RADIX_TREE(root, mask)	\
-do {					\
-	(root)->height = 0;		\
-	(root)->gfp_mask = (mask);	\
-	(root)->rnode = NULL;		\
+#define INIT_RADIX_TREE(root, mask)					\
+do {									\
+	(root)->height = 0;						\
+	(root)->gfp_mask = (mask);					\
+	(root)->rnode = NULL;						\
 } while (0)
 
-extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
-extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
-extern void *radix_tree_delete(struct radix_tree_root *, unsigned long);
-extern unsigned int
+int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
 int radix_tree_preload(int gfp_mask);
+void radix_tree_init(void);
+void *radix_tree_tag_set(struct radix_tree_root *root,
+			unsigned long index, int tag);
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+			unsigned long index, int tag);
+int radix_tree_tag_get(struct radix_tree_root *root,
+			unsigned long index, int tag);
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items, int tag);
+int radix_tree_tagged(struct radix_tree_root *root, int tag);
 
 static inline void radix_tree_preload_end(void)
 {
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 70ad32ff37ca..5fb59f715eab 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -6,12 +6,12 @@
  * modify it under the terms of the GNU General Public License as
  * published by the Free Software Foundation; either version 2, or (at
  * your option) any later version.
- * 
+ *
  * This program is distributed in the hope that it will be useful, but
  * WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
- * 
+ *
  * You should have received a copy of the GNU General Public License
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
@@ -28,21 +28,36 @@
 #include <linux/cpu.h>
 #include <linux/gfp.h>
 #include <linux/string.h>
+#include <linux/bitops.h>
 
 /*
  * Radix tree node definition.
+ *
+ * RADIX_TREE_MAP_SHIFT must be >= log2(BITS_PER_LONG).  Otherwise the tags
+ * array will have zero size and the set_tag() arithmetic will go wrong.
  */
-#define RADIX_TREE_MAP_SHIFT  6
-#define RADIX_TREE_MAP_SIZE  (1UL << RADIX_TREE_MAP_SHIFT)
-#define RADIX_TREE_MAP_MASK  (RADIX_TREE_MAP_SIZE-1)
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT	6
+#else
+#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
+#endif
+#define RADIX_TREE_TAGS		2
+
+#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
+
+#define RADIX_TREE_TAG_LONGS	\
+	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
 struct radix_tree_node {
 	unsigned int	count;
 	void		*slots[RADIX_TREE_MAP_SIZE];
+	unsigned long	tags[RADIX_TREE_TAGS][RADIX_TREE_TAG_LONGS];
 };
 
 struct radix_tree_path {
 	struct radix_tree_node *node, **slot;
+	int offset;
 };
 
 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
@@ -124,6 +139,22 @@ out:
 	return ret;
 }
 
+static inline void tag_set(struct radix_tree_node *node, int tag, int offset)
+{
+	if (!test_bit(offset, &node->tags[tag][0]))
+		__set_bit(offset, &node->tags[tag][0]);
+}
+
+static inline void tag_clear(struct radix_tree_node *node, int tag, int offset)
+{
+	__clear_bit(offset, &node->tags[tag][0]);
+}
+
+static inline int tag_get(struct radix_tree_node *node, int tag, int offset)
+{
+	return test_bit(offset, &node->tags[tag][0]);
+}
+
 /*
  *	Return the maximum key which can be store into a
  *	radix tree with height HEIGHT.
@@ -140,26 +171,53 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
 {
 	struct radix_tree_node *node;
 	unsigned int height;
+	char tags[RADIX_TREE_TAGS];
+	int tag;
 
 	/* Figure out what the height should be.  */
 	height = root->height + 1;
 	while (index > radix_tree_maxindex(height))
 		height++;
 
-	if (root->rnode) {
-		do {
-			if (!(node = radix_tree_node_alloc(root)))
-				return -ENOMEM;
-
-			/* Increase the height.  */
-			node->slots[0] = root->rnode;
-			node->count = 1;
-			root->rnode = node;
-			root->height++;
-		} while (height > root->height);
-	} else 
+	if (root->rnode == NULL) {
 		root->height = height;
+		goto out;
+	}
+
+	/*
+	 * Prepare the tag status of the top-level node for propagation
+	 * into the newly-pushed top-level node(s)
+	 */
+	for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+		int idx;
+
+		tags[tag] = 0;
+		for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+			if (root->rnode->tags[tag][idx]) {
+				tags[tag] = 1;
+				break;
+			}
+		}
+	}
+
+	do {
+		if (!(node = radix_tree_node_alloc(root)))
+			return -ENOMEM;
+
+		/* Increase the height.  */
+		node->slots[0] = root->rnode;
 
+		/* Propagate the aggregated tag info into the new root */
+		for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+			if (tags[tag])
+				tag_set(node, tag, 0);
+		}
+
+		node->count = 1;
+		root->rnode = node;
+		root->height++;
+	} while (height > root->height);
+out:
 	return 0;
 }
 
@@ -171,23 +229,27 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
  *
  *	Insert an item into the radix tree at position @index.
  */
-int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item)
+int radix_tree_insert(struct radix_tree_root *root,
+			unsigned long index, void *item)
 {
 	struct radix_tree_node *node = NULL, *tmp, **slot;
 	unsigned int height, shift;
+	int offset;
 	int error;
 
 	/* Make sure the tree is high enough.  */
-	if (index > radix_tree_maxindex(root->height)) {
+	if ((!index && !root->rnode) ||
+			index > radix_tree_maxindex(root->height)) {
 		error = radix_tree_extend(root, index);
 		if (error)
 			return error;
 	}
-    
+
 	slot = &root->rnode;
 	height = root->height;
 	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
 
+	offset = 0;			/* uninitialised var warning */
 	while (height > 0) {
 		if (*slot == NULL) {
 			/* Have to add a child node.  */
@@ -198,18 +260,21 @@ int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *i
 				node->count++;
 		}
 
-		/* Go a level down.  */
+		/* Go a level down */
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
 		node = *slot;
-		slot = (struct radix_tree_node **)
-			(node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));
+		slot = (struct radix_tree_node **)(node->slots + offset);
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
 	}
 
 	if (*slot != NULL)
 		return -EEXIST;
-	if (node)
+	if (node) {
 		node->count++;
+		BUG_ON(tag_get(node, 0, offset));
+		BUG_ON(tag_get(node, 1, offset));
+	}
 
 	*slot = item;
 	return 0;
@@ -221,7 +286,7 @@ EXPORT_SYMBOL(radix_tree_insert);
  *	@root:		radix tree root
  *	@index:		index key
  *
- *	Lookup them item at the position @index in the radix tree @root.
+ *	Lookup the item at the position @index in the radix tree @root.
  */
 void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 {
@@ -240,16 +305,174 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
 			return NULL;
 
 		slot = (struct radix_tree_node **)
-			((*slot)->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));
+			((*slot)->slots +
+				((index >> shift) & RADIX_TREE_MAP_MASK));
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
 	}
 
-	return (void *) *slot;
+	return *slot;
 }
 EXPORT_SYMBOL(radix_tree_lookup);
 
-static /* inline */ unsigned int
+/**
+ *	radix_tree_tag_set - set a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Set the search tag corresponding to @index in the radix tree, from
+ *	the root all the way down to the leaf node.
+ *
+ *	Returns the address of the tagged item.   Setting a tag on a not-present
+ *	item is a bug.
+ */
+void *radix_tree_tag_set(struct radix_tree_root *root,
+			unsigned long index, int tag)
+{
+	unsigned int height, shift;
+	struct radix_tree_node **slot;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		return NULL;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = &root->rnode;
+
+	while (height > 0) {
+		int offset;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		tag_set(*slot, tag, offset);
+		slot = (struct radix_tree_node **)((*slot)->slots + offset);
+		BUG_ON(*slot == NULL);
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	return *slot;
+}
+EXPORT_SYMBOL(radix_tree_tag_set);
+
+/**
+ *	radix_tree_tag_clear - clear a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Clear the search tag corresponding to @index in the radix tree.  If
+ *	this causes the leaf node to have no tags set then clear the tag in the
+ *	next-to-leaf node, etc.
+ *
+ *	Returns the address of the tagged item on success, else NULL.  ie:
+ *	has the same return value and semantics as radix_tree_lookup().
+ */
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+			unsigned long index, int tag)
+{
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	unsigned int height, shift;
+	void *ret = NULL;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		goto out;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	pathp->node = NULL;
+	pathp->slot = &root->rnode;
+
+	while (height > 0) {
+		int offset;
+
+		if (*pathp->slot == NULL)
+			goto out;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp[1].offset = offset;
+		pathp[1].node = *pathp[0].slot;
+		pathp[1].slot = (struct radix_tree_node **)
+				(pathp[1].node->slots + offset);
+		pathp++;
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	ret = *pathp[0].slot;
+	if (ret == NULL)
+		goto out;
+
+	do {
+		int idx;
+
+		tag_clear(pathp[0].node, tag, pathp[0].offset);
+		for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+			if (pathp[0].node->tags[tag][idx])
+				goto out;
+		}
+		pathp--;
+	} while (pathp[0].node);
+out:
+	return ret;
+}
+EXPORT_SYMBOL(radix_tree_tag_clear);
+
+#ifndef __KERNEL__	/* Only the test harness uses this at present */
+/**
+ *	radix_tree_tag_get - get a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Return the search tag corresponding to @index in the radix tree.
+ *
+ *	Returns zero if the tag is unset, or if there is no corresponding item
+ *	in the tree.
+ */
+int radix_tree_tag_get(struct radix_tree_root *root,
+			unsigned long index, int tag)
+{
+	unsigned int height, shift;
+	struct radix_tree_node **slot;
+	int saw_unset_tag = 0;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		return 0;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = &root->rnode;
+
+	for ( ; ; ) {
+		int offset;
+
+		if (*slot == NULL)
+			return 0;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+		/*
+		 * This is just a debug check.  Later, we can bale as soon as
+		 * we see an unset tag.
+		 */
+		if (!tag_get(*slot, tag, offset))
+			saw_unset_tag = 1;
+		if (height == 1) {
+			int ret = tag_get(*slot, tag, offset);
+
+			BUG_ON(ret && saw_unset_tag);
+			return ret;
+		}
+		slot = (struct radix_tree_node **)((*slot)->slots + offset);
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+}
+EXPORT_SYMBOL(radix_tree_tag_get);
+#endif
+
+static unsigned int
 __lookup(struct radix_tree_root *root, void **results, unsigned long index,
 	unsigned int max_items, unsigned long *next_index)
 {
@@ -316,17 +539,6 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 	unsigned long cur_index = first_index;
 	unsigned int ret = 0;
 
-	if (root->rnode == NULL)
-		goto out;
-	if (max_index == 0) {			/* Bah.  Special case */
-		if (first_index == 0) {
-			if (max_items > 0) {
-				*results = root->rnode;
-				ret = 1;
-			}
-		}
-		goto out;
-	}
 	while (ret < max_items) {
 		unsigned int nr_found;
 		unsigned long next_index;	/* Index of next search */
@@ -340,11 +552,101 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			break;
 		cur_index = next_index;
 	}
-out:
 	return ret;
 }
 EXPORT_SYMBOL(radix_tree_gang_lookup);
 
+/*
+ * Gather up to @max_items @tag-tagged items from @index on; *@next_index gets
+ * the resume index.  FIXME: use find_next_bit(), not open-coded tag_get()s.
+ */
+static unsigned int
+__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
+	unsigned int max_items, unsigned long *next_index, int tag)
+{
+	unsigned int nr_found = 0;
+	unsigned int shift;
+	unsigned int height = root->height;
+	struct radix_tree_node *slot;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = root->rnode;
+
+	while (height > 0) {
+		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
+			if (tag_get(slot, tag, i)) {
+				BUG_ON(slot->slots[i] == NULL);
+				break;
+			}
+			index &= ~((1UL << shift) - 1);
+			index += 1UL << shift;	/* UL: shift may be >= 32 */
+			if (index == 0)
+				goto out;	/* index wrapped around */
+		}
+		if (i == RADIX_TREE_MAP_SIZE)
+			goto out;
+		height--;
+		if (height == 0) {	/* Bottom level: grab some items */
+			unsigned long j = index & RADIX_TREE_MAP_MASK;
+
+			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
+				index++;
+				if (tag_get(slot, tag, j)) {
+					BUG_ON(slot->slots[j] == NULL);
+					results[nr_found++] = slot->slots[j];
+					if (nr_found == max_items)
+						goto out;
+				}
+			}
+		}
+		shift -= RADIX_TREE_MAP_SHIFT;
+		slot = slot->slots[i];
+	}
+out:
+	*next_index = index;
+	return nr_found;
+}
+
+/**
+ *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
+ *	                             based on a tag
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *	@tag:		the tag index
+ *
+ *	Performs an index-ascending scan of the tree for present items which
+ *	have the tag indexed by @tag set.  Places the items at *@results and
+ *	returns the number of items which were placed at *@results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items, int tag)
+{
+	const unsigned long max_index = radix_tree_maxindex(root->height);
+	unsigned long cur_index = first_index;
+	unsigned int ret = 0;
+
+	while (ret < max_items) {
+		unsigned int nr_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		nr_found = __lookup_tag(root, results + ret, cur_index,
+					max_items - ret, &next_index, tag);
+		ret += nr_found;
+		if (next_index == 0)
+			break;
+		cur_index = next_index;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(radix_tree_gang_lookup_tag);
+
 /**
  *	radix_tree_delete    -    delete an item from a radix tree
  *	@root:		radix tree root
@@ -357,24 +659,31 @@ EXPORT_SYMBOL(radix_tree_gang_lookup);
 void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 {
 	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	struct radix_tree_path *orig_pathp;
 	unsigned int height, shift;
 	void *ret = NULL;
+	char tags[RADIX_TREE_TAGS];
+	int nr_cleared_tags;
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
 		goto out;
 
-	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 	pathp->node = NULL;
 	pathp->slot = &root->rnode;
 
 	while (height > 0) {
+		int offset;
+
 		if (*pathp->slot == NULL)
 			goto out;
 
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp[1].offset = offset;
 		pathp[1].node = *pathp[0].slot;
 		pathp[1].slot = (struct radix_tree_node **)
-		    (pathp[1].node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK));
+				(pathp[1].node->slots + offset);
 		pathp++;
 		shift -= RADIX_TREE_MAP_SHIFT;
 		height--;
@@ -384,20 +693,67 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
 	if (ret == NULL)
 		goto out;
 
+	orig_pathp = pathp;
+
+	/*
+	 * Clear all tags associated with the just-deleted item
+	 */
+	memset(tags, 0, sizeof(tags));
+	do {
+		int tag;
+
+		nr_cleared_tags = RADIX_TREE_TAGS;
+		for (tag = 0; tag < RADIX_TREE_TAGS; tag++) {
+			int idx;
+
+			if (!tags[tag])
+				tag_clear(pathp[0].node, tag, pathp[0].offset);
+
+			for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+				if (pathp[0].node->tags[tag][idx]) {
+					tags[tag] = 1;
+					nr_cleared_tags--;
+					break;
+				}
+			}
+		}
+		pathp--;
+	} while (pathp[0].node && nr_cleared_tags);
+
+	pathp = orig_pathp;
 	*pathp[0].slot = NULL;
 	while (pathp[0].node && --pathp[0].node->count == 0) {
 		pathp--;
+		BUG_ON(*pathp[0].slot == NULL);
 		*pathp[0].slot = NULL;
 		radix_tree_node_free(pathp[1].node);
 	}
-
 	if (root->rnode == NULL)
-		root->height = 0;  /* Empty tree, we can reset the height */
+		root->height = 0;
 out:
 	return ret;
 }
 EXPORT_SYMBOL(radix_tree_delete);
 
+/**
+ *	radix_tree_tagged - test whether any items in the tree are tagged
+ *	@root:		radix tree root
+ *	@tag:		tag to test
+ */
+int radix_tree_tagged(struct radix_tree_root *root, int tag)
+{
+	int idx;
+
+	if (!root->rnode)
+		return 0;
+	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+		if (root->rnode->tags[tag][idx])
+			return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(radix_tree_tagged);
+
 static void
 radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags)
 {
-- 
cgit v1.2.3


From 89261aab0c7064ca9766bc79e7867b6104274f56 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:10:41 -0700
Subject: [PATCH] make the pagecache lock irq-safe.

Intro to these patches:

- Major surgery against the pagecache, radix-tree and writeback code.  This
  work is to address the O_DIRECT-vs-buffered data exposure horrors which
  we've been struggling with for months.

  As a side-effect, 32 bytes are saved from struct inode and eight bytes
  are removed from struct page.  At a cost of approximately 2.5 bits per page
  in the radix tree nodes on 4k pagesize, assuming the pagecache is densely
  populated.  Not all pages are pagecache; other pages gain the full 8 byte
  saving.

  This change will break any arch code which is using page->list and will
  also break any arch code which is using page->lru of memory which was
  obtained from slab.

  The basic problem which we (mainly Daniel McNeil) have been struggling
  with is in getting a really reliable fsync() across the page lists while
  other processes are performing writeback against the same file.  It's like
  juggling four bars of wet soap with your eyes shut while someone is
  whacking you with a baseball bat.  Daniel pretty much has the problem
  plugged but I suspect that's just because we don't have testcases to
  trigger the remaining problems.  The complexity and additional locking
  which those patches add is worrisome.

  So the approach taken here is to remove the page lists altogether and
  replace the list-based writeback and wait operations with in-order
  radix-tree walks.

  The radix-tree code has been enhanced to support "tagging" of pages, for
  later searches for pages which have a particular tag set.  This means that
  we can ask the radix tree code "find me the next 16 dirty pages starting at
  pagecache index N" and it will do that in O(log64(N)) time.

  This affects I/O scheduling potentially quite significantly.  It is no
  longer the case that the kernel will submit pages for I/O in the order in
  which the application dirtied them.  We instead submit them in file-offset
  order all the time.

  This is likely to be advantageous when applications are seeking all over
  a large file randomly writing small amounts of data.  I haven't performed
  much benchmarking, but tiobench random write throughput seems to be
  increased by 30%.  Other tests appear to be unaltered.  dbench may have got
  10-20% quicker, but it's variable.

  There is one large file which everyone seeks all over randomly writing
  small amounts of data: the blockdev mapping which caches filesystem
  metadata.  The kernel's IO submission patterns for this are now ideal.


  Because writeback and wait-for-writeback use a tree walk instead of a
  list walk they are no longer livelockable.  This probably means that we no
  longer need to hold i_sem across O_SYNC writes and perhaps fsync() and
  fdatasync().  This may be beneficial for databases: multiple processes
  writing and syncing different parts of the same file at the same time can
  now all submit and wait upon writes to just their own little bit of the
  file, so we can get a lot more data into the queues.

  It is trivial to implement a part-file-fdatasync() as well, so
  applications can say "sync the file from byte N to byte M", and multiple
  applications can do this concurrently.  This is easy for ext2 filesystems,
  but probably needs lots of work for data-journalled filesystems and XFS and
  it probably doesn't offer much benefit over an i_semless O_SYNC write.


  These patches can end up making ext3 (even) slower:

	for i in 1 2 3 4
	do
		dd if=/dev/zero of=$i bs=1M count=2000 &
	done

  runs awfully slow on SMP.  This is, yet again, because all the file
  blocks are jumbled up and the per-file linear writeout causes tons of
  seeking.  The above test runs sweetly on UP because on UP we don't
  allocate blocks to different files in parallel.

  Mingming and Badari are working on getting block reservation working for
  ext3 (preallocation on steroids).  That should fix ext3 up.


This patch:

- Later, we'll need to access the radix trees from inside disk I/O
  completion handlers.  So make mapping->page_lock irq-safe.  And rename it
  to tree_lock to reliably break any missed conversions.
---
 fs/buffer.c         |  8 ++++----
 fs/cifs/file.c      | 10 +---------
 fs/fs-writeback.c   |  4 ++--
 fs/inode.c          |  2 +-
 fs/mpage.c          | 10 +++++-----
 include/linux/fs.h  |  2 +-
 ipc/shm.c           |  2 --
 mm/filemap.c        | 50 +++++++++++++++++++++++++-------------------------
 mm/page-writeback.c | 10 +++++-----
 mm/readahead.c      |  8 ++++----
 mm/swap_state.c     | 22 +++++++++++-----------
 mm/swapfile.c       |  8 ++++----
 mm/truncate.c       |  8 ++++----
 mm/vmscan.c         | 13 ++++---------
 14 files changed, 71 insertions(+), 86 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 2cbe21bccb0b..81d0bb842ec9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -396,7 +396,7 @@ out:
  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
  * may be quite high.  This code could TryLock the page, and if that
  * succeeds, there is no need to take private_lock. (But if
- * private_lock is contended then so is mapping->page_lock).
+ * private_lock is contended then so is mapping->tree_lock).
  */
 static struct buffer_head *
 __find_get_block_slow(struct block_device *bdev, sector_t block, int unused)
@@ -867,14 +867,14 @@ int __set_page_dirty_buffers(struct page *page)
 	spin_unlock(&mapping->private_lock);
 
 	if (!TestSetPageDirty(page)) {
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		if (page->mapping) {	/* Race with truncate? */
 			if (!mapping->backing_dev_info->memory_backed)
 				inc_page_state(nr_dirty);
 			list_del(&page->list);
 			list_add(&page->list, &mapping->dirty_pages);
 		}
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 	
@@ -1254,7 +1254,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * inode to its superblock's dirty inode list.
  *
  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
- * mapping->page_lock and the global inode_lock.
+ * mapping->tree_lock and the global inode_lock.
  */
 void fastcall mark_buffer_dirty(struct buffer_head *bh)
 {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d991eef801ac..f120f126eab5 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -898,11 +898,9 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
 		if(list_empty(pages))
 			break;
 
-		spin_lock(&mapping->page_lock);
 		page = list_entry(pages->prev, struct page, list);
 
 		list_del(&page->list);
-		spin_unlock(&mapping->page_lock);
 
 		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
 			page_cache_release(page);
@@ -962,14 +960,10 @@ cifs_readpages(struct file *file, struct address_space *mapping,
 	pagevec_init(&lru_pvec, 0);
 
 	for(i = 0;i<num_pages;) {
-		spin_lock(&mapping->page_lock);
-		if(list_empty(page_list)) {
-			spin_unlock(&mapping->page_lock);
+		if(list_empty(page_list))
 			break;
-		}
 		page = list_entry(page_list->prev, struct page, list);
 		offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
-	        spin_unlock(&mapping->page_lock);
 
 		/* for reads over a certain size could initiate async read ahead */
 
@@ -989,12 +983,10 @@ cifs_readpages(struct file *file, struct address_space *mapping,
 			cFYI(1,("Read error in readpages: %d",rc));
 			/* clean up remaing pages off list */
             
-			spin_lock(&mapping->page_lock);
 			while (!list_empty(page_list) && (i < num_pages)) {
 				page = list_entry(page_list->prev, struct page, list);
 				list_del(&page->list);
 			}
-			spin_unlock(&mapping->page_lock);
 			break;
 		} else if (bytes_read > 0) {
 			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index aa5f34b85747..f8b6182cb152 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -159,10 +159,10 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * read speculatively by this cpu before &= ~I_DIRTY  -- mikulas
 	 */
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages))
 		list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
diff --git a/fs/inode.c b/fs/inode.c
index b7f80405c076..b5d43d858e0b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -187,7 +187,7 @@ void inode_init_once(struct inode *inode)
 	sema_init(&inode->i_sem, 1);
 	init_rwsem(&inode->i_alloc_sem);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	spin_lock_init(&inode->i_data.page_lock);
+	spin_lock_init(&inode->i_data.tree_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
 	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
diff --git a/fs/mpage.c b/fs/mpage.c
index 630d6a0f7e7b..c3e781cb4906 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -635,7 +635,7 @@ mpage_writepages(struct address_space *mapping,
 	if (get_block == NULL)
 		writepage = mapping->a_ops->writepage;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	while (!list_empty(&mapping->io_pages) && !done) {
 		struct page *page = list_entry(mapping->io_pages.prev,
 					struct page, list);
@@ -655,10 +655,10 @@ mpage_writepages(struct address_space *mapping,
 		list_add(&page->list, &mapping->locked_pages);
 
 		page_cache_get(page);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 
 		/*
-		 * At this point we hold neither mapping->page_lock nor
+		 * At this point we hold neither mapping->tree_lock nor
 		 * lock on the page itself: the page may be truncated or
 		 * invalidated (changing page->mapping to NULL), or even
 		 * swizzled back from swapper_space to tmpfs file mapping.
@@ -695,12 +695,12 @@ mpage_writepages(struct address_space *mapping,
 			unlock_page(page);
 		}
 		page_cache_release(page);
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 	}
 	/*
 	 * Leave any remaining dirty pages on ->io_pages
 	 */
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 91ff9225ba86..f64f8fb2f819 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -322,7 +322,7 @@ struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
-	spinlock_t		page_lock;	/* and spinlock protecting it */
+	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
 	struct list_head	dirty_pages;	/* list of dirty pages */
 	struct list_head	locked_pages;	/* list of locked pages */
diff --git a/ipc/shm.c b/ipc/shm.c
index 4897cfe16f27..714933b144fa 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -380,9 +380,7 @@ static void shm_get_stat(unsigned long *rss, unsigned long *swp)
 
 		if (is_file_hugepages(shp->shm_file)) {
 			struct address_space *mapping = inode->i_mapping;
-			spin_lock(&mapping->page_lock);
 			*rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages;
-			spin_unlock(&mapping->page_lock);
 		} else {
 			struct shmem_inode_info *info = SHMEM_I(inode);
 			spin_lock(&info->lock);
diff --git a/mm/filemap.c b/mm/filemap.c
index f992d76831e4..360c5feec975 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -59,7 +59,7 @@
  *    ->private_lock		(__free_pte->__set_page_dirty_buffers)
  *      ->swap_list_lock
  *        ->swap_device_lock	(exclusive_swap_page, others)
- *          ->mapping->page_lock
+ *          ->mapping->tree_lock
  *
  *  ->i_sem
  *    ->i_shared_sem		(truncate->invalidate_mmap_range)
@@ -78,12 +78,12 @@
  *
  *  ->inode_lock
  *    ->sb_lock			(fs/fs-writeback.c)
- *    ->mapping->page_lock	(__sync_single_inode)
+ *    ->mapping->tree_lock	(__sync_single_inode)
  *
  *  ->page_table_lock
  *    ->swap_device_lock	(try_to_unmap_one)
  *    ->private_lock		(try_to_unmap_one)
- *    ->page_lock		(try_to_unmap_one)
+ *    ->tree_lock		(try_to_unmap_one)
  *    ->zone.lru_lock		(follow_page->mark_page_accessed)
  *
  *  ->task->proc_lock
@@ -93,7 +93,7 @@
 /*
  * Remove a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold a write_lock on the mapping's page_lock.
+ * is safe.  The caller must hold a write_lock on the mapping's tree_lock.
  */
 void __remove_from_page_cache(struct page *page)
 {
@@ -114,9 +114,9 @@ void remove_from_page_cache(struct page *page)
 	if (unlikely(!PageLocked(page)))
 		PAGE_BUG(page);
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	__remove_from_page_cache(page);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 }
 
 static inline int sync_page(struct page *page)
@@ -148,9 +148,9 @@ static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode)
 	if (mapping->backing_dev_info->memory_backed)
 		return 0;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	ret = do_writepages(mapping, &wbc);
 	return ret;
 }
@@ -185,7 +185,7 @@ int filemap_fdatawait(struct address_space * mapping)
 
 restart:
 	progress = 0;
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
         while (!list_empty(&mapping->locked_pages)) {
 		struct page *page;
 
@@ -199,7 +199,7 @@ restart:
 		if (!PageWriteback(page)) {
 			if (++progress > 32) {
 				if (need_resched()) {
-					spin_unlock(&mapping->page_lock);
+					spin_unlock_irq(&mapping->tree_lock);
 					__cond_resched();
 					goto restart;
 				}
@@ -209,16 +209,16 @@ restart:
 
 		progress = 0;
 		page_cache_get(page);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 
 		wait_on_page_writeback(page);
 		if (PageError(page))
 			ret = -EIO;
 
 		page_cache_release(page);
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 	}
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 
 	/* Check for outstanding write errors */
 	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
@@ -267,7 +267,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 
 	if (error == 0) {
 		page_cache_get(page);
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
 			SetPageLocked(page);
@@ -275,7 +275,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 		} else {
 			page_cache_release(page);
 		}
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	}
 	return error;
@@ -411,11 +411,11 @@ struct page * find_get_page(struct address_space *mapping, unsigned long offset)
 	 * We scan the hash list read-only. Addition to and removal from
 	 * the hash-list needs a held write-lock.
 	 */
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page)
 		page_cache_get(page);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 
@@ -428,11 +428,11 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs
 {
 	struct page *page;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page && TestSetPageLocked(page))
 		page = NULL;
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 
@@ -454,15 +454,15 @@ struct page *find_lock_page(struct address_space *mapping,
 {
 	struct page *page;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 repeat:
 	page = radix_tree_lookup(&mapping->page_tree, offset);
 	if (page) {
 		page_cache_get(page);
 		if (TestSetPageLocked(page)) {
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			lock_page(page);
-			spin_lock(&mapping->page_lock);
+			spin_lock_irq(&mapping->tree_lock);
 
 			/* Has the page been truncated while we slept? */
 			if (page->mapping != mapping || page->index != offset) {
@@ -472,7 +472,7 @@ repeat:
 			}
 		}
 	}
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return page;
 }
 
@@ -546,12 +546,12 @@ unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
 	unsigned int i;
 	unsigned int ret;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	ret = radix_tree_gang_lookup(&mapping->page_tree,
 				(void **)pages, start, nr_pages);
 	for (i = 0; i < ret; i++)
 		page_cache_get(pages[i]);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	return ret;
 }
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f1ecbd88e846..044becdff304 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -472,12 +472,12 @@ int write_one_page(struct page *page, int wait)
 	if (wait)
 		wait_on_page_writeback(page);
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	list_del(&page->list);
 	if (test_clear_page_dirty(page)) {
 		list_add(&page->list, &mapping->locked_pages);
 		page_cache_get(page);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
 			wait_on_page_writeback(page);
@@ -487,7 +487,7 @@ int write_one_page(struct page *page, int wait)
 		page_cache_release(page);
 	} else {
 		list_add(&page->list, &mapping->clean_pages);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		unlock_page(page);
 	}
 	return ret;
@@ -515,7 +515,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 		struct address_space *mapping = page->mapping;
 
 		if (mapping) {
-			spin_lock(&mapping->page_lock);
+			spin_lock_irq(&mapping->tree_lock);
 			if (page->mapping) {	/* Race with truncate? */
 				BUG_ON(page->mapping != mapping);
 				if (!mapping->backing_dev_info->memory_backed)
@@ -523,7 +523,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 				list_del(&page->list);
 				list_add(&page->list, &mapping->dirty_pages);
 			}
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			if (!PageSwapCache(page))
 				__mark_inode_dirty(mapping->host,
 							I_DIRTY_PAGES);
diff --git a/mm/readahead.c b/mm/readahead.c
index e1d25a8b528c..6135e1484ffc 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -230,7 +230,7 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 	/*
 	 * Preallocate as many pages as we will need.
 	 */
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
 		unsigned long page_offset = offset + page_idx;
 		
@@ -241,16 +241,16 @@ __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
 		if (page)
 			continue;
 
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		page = page_cache_alloc_cold(mapping);
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 		if (!page)
 			break;
 		page->index = page_offset;
 		list_add(&page->list, &page_pool);
 		ret++;
 	}
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 
 	/*
 	 * Now start the IO.  We ignore I/O errors - if the page is not
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 88cfd4403a4c..d670c5846b45 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -25,7 +25,7 @@ extern struct address_space_operations swap_aops;
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
-	.page_lock	= SPIN_LOCK_UNLOCKED,
+	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
 	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
 	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
@@ -182,9 +182,9 @@ void delete_from_swap_cache(struct page *page)
   
 	entry.val = page->index;
 
-	spin_lock(&swapper_space.page_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
-	spin_unlock(&swapper_space.page_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	swap_free(entry);
 	page_cache_release(page);
@@ -195,8 +195,8 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
 	struct address_space *mapping = page->mapping;
 	int err;
 
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
+	spin_lock(&mapping->tree_lock);
 
 	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
 	if (!err) {
@@ -204,8 +204,8 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
 		___add_to_page_cache(page, &swapper_space, entry.val);
 	}
 
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
+	spin_unlock(&mapping->tree_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	if (!err) {
 		if (!swap_duplicate(entry))
@@ -231,8 +231,8 @@ int move_from_swap_cache(struct page *page, unsigned long index,
 
 	entry.val = page->index;
 
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&swapper_space.tree_lock);
+	spin_lock(&mapping->tree_lock);
 
 	err = radix_tree_insert(&mapping->page_tree, index, page);
 	if (!err) {
@@ -240,8 +240,8 @@ int move_from_swap_cache(struct page *page, unsigned long index,
 		___add_to_page_cache(page, mapping, index);
 	}
 
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
+	spin_unlock(&mapping->tree_lock);
+	spin_unlock_irq(&swapper_space.tree_lock);
 
 	if (!err) {
 		swap_free(entry);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 58bf083a96b5..e5cebb1800b9 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -253,10 +253,10 @@ static int exclusive_swap_page(struct page *page)
 		/* Is the only swap cache user the cache itself? */
 		if (p->swap_map[swp_offset(entry)] == 1) {
 			/* Recheck the page count with the pagecache lock held.. */
-			spin_lock(&swapper_space.page_lock);
+			spin_lock_irq(&swapper_space.tree_lock);
 			if (page_count(page) - !!PagePrivate(page) == 2)
 				retval = 1;
-			spin_unlock(&swapper_space.page_lock);
+			spin_unlock_irq(&swapper_space.tree_lock);
 		}
 		swap_info_put(p);
 	}
@@ -324,13 +324,13 @@ int remove_exclusive_swap_page(struct page *page)
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the pagecache lock held.. */
-		spin_lock(&swapper_space.page_lock);
+		spin_lock_irq(&swapper_space.tree_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		spin_unlock(&swapper_space.page_lock);
+		spin_unlock_irq(&swapper_space.tree_lock);
 	}
 	swap_info_put(p);
 
diff --git a/mm/truncate.c b/mm/truncate.c
index d94d6bf56d80..1dd32a204dfc 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -62,7 +62,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
  * This is for invalidate_inode_pages().  That function can be called at
  * any time, and is not supposed to throw away dirty pages.  But pages can
  * be marked dirty at any time too.  So we re-check the dirtiness inside
- * ->page_lock.  That provides exclusion against the __set_page_dirty
+ * ->tree_lock.  That provides exclusion against the __set_page_dirty
  * functions.
  */
 static int
@@ -74,13 +74,13 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	spin_lock(&mapping->page_lock);
+	spin_lock_irq(&mapping->tree_lock);
 	if (PageDirty(page)) {
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		return 0;
 	}
 	__remove_from_page_cache(page);
-	spin_unlock(&mapping->page_lock);
+	spin_unlock_irq(&mapping->tree_lock);
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0b290c82c1f4..df658dd6c743 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -354,7 +354,6 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
-			spin_lock(&mapping->page_lock);
 			if (test_clear_page_dirty(page)) {
 				int res;
 				struct writeback_control wbc = {
@@ -364,9 +363,6 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 					.for_reclaim = 1,
 				};
 
-				list_move(&page->list, &mapping->locked_pages);
-				spin_unlock(&mapping->page_lock);
-
 				SetPageReclaim(page);
 				res = mapping->a_ops->writepage(page, &wbc);
 				if (res < 0)
@@ -381,7 +377,6 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				}
 				goto keep;
 			}
-			spin_unlock(&mapping->page_lock);
 		}
 
 		/*
@@ -415,7 +410,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 		if (!mapping)
 			goto keep_locked;	/* truncate got there first */
 
-		spin_lock(&mapping->page_lock);
+		spin_lock_irq(&mapping->tree_lock);
 
 		/*
 		 * The non-racy check for busy page.  It is critical to check
@@ -423,7 +418,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 		 * not in use by anybody. 	(pagecache + us == 2)
 		 */
 		if (page_count(page) != 2 || PageDirty(page)) {
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			goto keep_locked;
 		}
 
@@ -431,7 +426,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 		if (PageSwapCache(page)) {
 			swp_entry_t swap = { .val = page->index };
 			__delete_from_swap_cache(page);
-			spin_unlock(&mapping->page_lock);
+			spin_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
 			__put_page(page);	/* The pagecache ref */
 			goto free_it;
@@ -439,7 +434,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 #endif /* CONFIG_SWAP */
 
 		__remove_from_page_cache(page);
-		spin_unlock(&mapping->page_lock);
+		spin_unlock_irq(&mapping->tree_lock);
 		__put_page(page);
 
 free_it:
-- 
cgit v1.2.3


From 8ece6262c5fef1b935a944f5d16965ff7dd5d1cc Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:10:54 -0700
Subject: [PATCH] tag dirty pages as such in the radix tree

Arrange for all dirty pagecache pages to be tagged as dirty within their
radix tree.
---
 fs/buffer.c                |  2 ++
 include/linux/fs.h         |  7 +++++++
 include/linux/page-flags.h |  2 ++
 mm/page-writeback.c        | 48 +++++++++++++++++++++++++++++++++++++++-------
 mm/swap_state.c            |  4 ++--
 5 files changed, 54 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 81d0bb842ec9..59f4508a472f 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -873,6 +873,8 @@ int __set_page_dirty_buffers(struct page *page)
 				inc_page_state(nr_dirty);
 			list_del(&page->list);
 			list_add(&page->list, &mapping->dirty_pages);
+			radix_tree_tag_set(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
 		}
 		spin_unlock_irq(&mapping->tree_lock);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f64f8fb2f819..857e797b0ad2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -364,6 +364,13 @@ struct block_device {
 	unsigned long		bd_private;
 };
 
+/*
+ * Radix-tre tags, for tagging dirty and writeback pages within the pagecache
+ * radix trees
+ */
+#define PAGECACHE_TAG_DIRTY	0
+#define PAGECACHE_TAG_WRITEBACK	1
+
 /*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index f58c9e68d3d8..9f4fb3da00d9 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -318,4 +318,6 @@ static inline void clear_page_dirty(struct page *page)
 	test_clear_page_dirty(page);
 }
 
+int __clear_page_dirty(struct page *page);
+
 #endif	/* PAGE_FLAGS_H */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 044becdff304..23da9ce262ca 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -522,6 +522,8 @@ int __set_page_dirty_nobuffers(struct page *page)
 					inc_page_state(nr_dirty);
 				list_del(&page->list);
 				list_add(&page->list, &mapping->dirty_pages);
+				radix_tree_tag_set(&mapping->page_tree,
+					page->index, PAGECACHE_TAG_DIRTY);
 			}
 			spin_unlock_irq(&mapping->tree_lock);
 			if (!PageSwapCache(page))
@@ -560,13 +562,45 @@ EXPORT_SYMBOL(set_page_dirty_lock);
  */
 int test_clear_page_dirty(struct page *page)
 {
-	if (TestClearPageDirty(page)) {
-		struct address_space *mapping = page->mapping;
-
-		if (mapping && !mapping->backing_dev_info->memory_backed)
-			dec_page_state(nr_dirty);
-		return 1;
+	struct address_space *mapping = page->mapping;
+	unsigned long flags;
+
+	if (mapping) {
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (TestClearPageDirty(page)) {
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
+			spin_unlock_irqrestore(&mapping->tree_lock, flags);
+			if (!mapping->backing_dev_info->memory_backed)
+				dec_page_state(nr_dirty);
+			return 1;
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		return 0;
 	}
-	return 0;
+	return TestClearPageDirty(page);
 }
 EXPORT_SYMBOL(test_clear_page_dirty);
+
+/*
+ * Clear a page's dirty flag while ignoring dirty memory accounting
+ */
+int __clear_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (mapping) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		if (TestClearPageDirty(page)) {
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
+			spin_unlock_irqrestore(&mapping->tree_lock, flags);
+			return 1;
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+		return 0;
+	}
+	return TestClearPageDirty(page);
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index d670c5846b45..736fd2b82300 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -149,7 +149,7 @@ int add_to_swap(struct page * page)
 		switch (err) {
 		case 0:				/* Success */
 			SetPageUptodate(page);
-			ClearPageDirty(page);
+			__clear_page_dirty(page);
 			set_page_dirty(page);
 			INC_CACHE_INFO(add_total);
 			return 1;
@@ -246,7 +246,7 @@ int move_from_swap_cache(struct page *page, unsigned long index,
 	if (!err) {
 		swap_free(entry);
 		/* shift page from clean_pages to dirty_pages list */
-		ClearPageDirty(page);
+		__clear_page_dirty(page);
 		set_page_dirty(page);
 	}
 	return err;
-- 
cgit v1.2.3


From 40c8348ec03fa2c525e13ca6ee54279735563ee4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:11:08 -0700
Subject: [PATCH] tag writeback pages as such in their radix tree

Arrange for under-writeback pages to be marked thus in their pagecache radix
tree.
---
 fs/buffer.c                |  4 ++--
 fs/mpage.c                 |  2 +-
 fs/nfs/write.c             |  2 +-
 fs/ntfs/aops.c             |  4 ++--
 fs/reiserfs/inode.c        |  4 ++--
 fs/xfs/linux/xfs_aops.c    |  2 +-
 include/linux/page-flags.h |  8 +++++++-
 mm/filemap.c               |  3 +--
 mm/page-writeback.c        | 42 ++++++++++++++++++++++++++++++++++++++++++
 mm/page_io.c               |  2 +-
 mm/swap.c                  |  2 +-
 11 files changed, 61 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 59f4508a472f..56b0df6bf752 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1829,7 +1829,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	} while ((bh = bh->b_this_page) != head);
 
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);		/* Keeps try_to_free_buffers() away */
+	set_page_writeback(page);	/* Keeps try_to_free_buffers() away */
 	unlock_page(page);
 
 	/*
@@ -1892,7 +1892,7 @@ recover:
 	} while ((bh = bh->b_this_page) != head);
 	SetPageError(page);
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	unlock_page(page);
 	do {
 		struct buffer_head *next = bh->b_this_page;
diff --git a/fs/mpage.c b/fs/mpage.c
index c3e781cb4906..5f5f5e63fca2 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -546,7 +546,7 @@ alloc_new:
 	}
 
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	unlock_page(page);
 	if (boundary || (first_unmapped != blocks_per_page)) {
 		bio = mpage_bio_submit(WRITE, bio);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 83bc0b498a01..53bff1a2a731 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -768,7 +768,7 @@ nfs_write_rpcsetup(struct list_head *head, struct nfs_write_data *data, int how)
 		req = nfs_list_entry(head->next);
 		nfs_list_remove_request(req);
 		nfs_list_add_request(req, &data->pages);
-		SetPageWriteback(req->wb_page);
+		set_page_writeback(req->wb_page);
 		*pages++ = req->wb_page;
 		count += req->wb_bytes;
 	}
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index e3b1c227cb7b..bb048a75318d 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -743,7 +743,7 @@ lock_retry_remap:
 	}
 
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);		/* Keeps try_to_free_buffers() away. */
+	set_page_writeback(page);	/* Keeps try_to_free_buffers() away. */
 	unlock_page(page);
 
 	/*
@@ -885,7 +885,7 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
 	// FIXME: Make sure it is ok to SetPageError() on unlocked page under
 	// writeback before doing the change!
 #if 0
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	unlock_page(page);
 #endif
 
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 31016572683e..c01847228d2c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2134,7 +2134,7 @@ static int reiserfs_write_full_page(struct page *page, struct writeback_control
     } while(bh != head) ;
 
     BUG_ON(PageWriteback(page));
-    SetPageWriteback(page);
+    set_page_writeback(page);
     unlock_page(page);
 
     /*
@@ -2198,7 +2198,7 @@ fail:
     } while(bh != head);
     SetPageError(page);
     BUG_ON(PageWriteback(page));
-    SetPageWriteback(page);
+    set_page_writeback(page);
     unlock_page(page);
     do {
         struct buffer_head *next = bh->b_this_page;
diff --git a/fs/xfs/linux/xfs_aops.c b/fs/xfs/linux/xfs_aops.c
index dd446266d33f..52a8c40d7f71 100644
--- a/fs/xfs/linux/xfs_aops.c
+++ b/fs/xfs/linux/xfs_aops.c
@@ -566,7 +566,7 @@ xfs_submit_page(
 	int			i;
 
 	BUG_ON(PageWriteback(page));
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	clear_page_dirty(page);
 	unlock_page(page);
 
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 9f4fb3da00d9..bd6ddb279c55 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -312,12 +312,18 @@ extern struct address_space swapper_space;
 struct page;	/* forward declaration */
 
 int test_clear_page_dirty(struct page *page);
+int __clear_page_dirty(struct page *page);
+int test_clear_page_writeback(struct page *page);
+int test_set_page_writeback(struct page *page);
 
 static inline void clear_page_dirty(struct page *page)
 {
 	test_clear_page_dirty(page);
 }
 
-int __clear_page_dirty(struct page *page);
+static inline void set_page_writeback(struct page *page)
+{
+	test_set_page_writeback(page);
+}
 
 #endif	/* PAGE_FLAGS_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 360c5feec975..4d5e76ceaf29 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -363,8 +363,7 @@ void end_page_writeback(struct page *page)
 	wait_queue_head_t *waitqueue = page_waitqueue(page);
 
 	if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
-		smp_mb__before_clear_bit();
-		if (!TestClearPageWriteback(page))
+		if (!test_clear_page_writeback(page))
 			BUG();
 		smp_mb__after_clear_bit();
 	}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 23da9ce262ca..bc4f3258daf2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -604,3 +604,45 @@ int __clear_page_dirty(struct page *page)
 	}
 	return TestClearPageDirty(page);
 }
+
+int test_clear_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	int ret;
+
+	if (mapping) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		ret = TestClearPageWriteback(page);
+		if (ret)
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_WRITEBACK);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	} else {
+		ret = TestClearPageWriteback(page);
+	}
+	return ret;
+}
+
+int test_set_page_writeback(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	int ret;
+
+	if (mapping) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		ret = TestSetPageWriteback(page);
+		if (!ret)
+			radix_tree_tag_set(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_WRITEBACK);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	} else {
+		ret = TestSetPageWriteback(page);
+	}
+	return ret;
+
+}
+EXPORT_SYMBOL(test_set_page_writeback);
diff --git a/mm/page_io.c b/mm/page_io.c
index 421f77d2c39c..dde9d23f99bd 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -104,7 +104,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		goto out;
 	}
 	inc_page_state(pswpout);
-	SetPageWriteback(page);
+	set_page_writeback(page);
 	unlock_page(page);
 	submit_bio(WRITE, bio);
 out:
diff --git a/mm/swap.c b/mm/swap.c
index a5352c98751a..90a9ac490a3c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -70,7 +70,7 @@ int rotate_reclaimable_page(struct page *page)
 		list_add_tail(&page->lru, &zone->inactive_list);
 		inc_page_state(pgrotated);
 	}
-	if (!TestClearPageWriteback(page))
+	if (!test_clear_page_writeback(page))
 		BUG();
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 	return 0;
-- 
cgit v1.2.3


From 1d7d3304e9845f61cab6b6091e8952f6fb05009a Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:11:21 -0700
Subject: [PATCH] stop using the address_space dirty_pages list

Move everything over to walking the radix tree via the PAGECACHE_TAG_DIRTY
tag.  Remove address_space.dirty_pages.
---
 fs/buffer.c              |  18 ++-----
 fs/fs-writeback.c        |  15 +-----
 fs/inode.c               |   1 -
 fs/mpage.c               | 127 +++++++++++++++++++----------------------------
 fs/xfs/linux/xfs_vnode.h |   3 +-
 include/linux/fs.h       |   3 +-
 include/linux/pagemap.h  |   7 +--
 include/linux/pagevec.h  |   7 ++-
 mm/filemap.c             |  35 ++++++++-----
 mm/page-writeback.c      |  29 +++++++----
 mm/page_alloc.c          |   2 +
 mm/swap.c                |  12 ++++-
 mm/swap_state.c          |   3 --
 13 files changed, 122 insertions(+), 140 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 56b0df6bf752..baae58828510 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -825,12 +825,6 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
  * page on the dirty page list.
  *
- * There is also a small window where the page is dirty, and not on dirty_pages.
- * Also a possibility that by the time the page is added to dirty_pages, it has
- * been set clean.  The page lists are somewhat approximate in this regard.
- * It's better to have clean pages accidentally attached to dirty_pages than to
- * leave dirty pages attached to clean_pages.
- *
  * We use private_lock to lock against try_to_free_buffers while using the
  * page's buffer list.  Also use this to protect against clean buffers being
  * added to the page after it was set dirty.
@@ -871,8 +865,6 @@ int __set_page_dirty_buffers(struct page *page)
 		if (page->mapping) {	/* Race with truncate? */
 			if (!mapping->backing_dev_info->memory_backed)
 				inc_page_state(nr_dirty);
-			list_del(&page->list);
-			list_add(&page->list, &mapping->dirty_pages);
 			radix_tree_tag_set(&mapping->page_tree, page->index,
 						PAGECACHE_TAG_DIRTY);
 		}
@@ -1228,7 +1220,7 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
  * The relationship between dirty buffers and dirty pages:
  *
  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
- * the page appears on its address_space.dirty_pages list.
+ * the page is tagged dirty in its radix tree.
  *
  * At all times, the dirtiness of the buffers represents the dirtiness of
  * subsections of the page.  If the page has buffers, the page dirty bit is
@@ -1250,10 +1242,10 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size)
 /**
  * mark_buffer_dirty - mark a buffer_head as needing writeout
  *
- * mark_buffer_dirty() will set the dirty bit against the buffer,
- * then set its backing page dirty, then attach the page to its
- * address_space's dirty_pages list and then attach the address_space's
- * inode to its superblock's dirty inode list.
+ * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
+ * backing page dirty, then tag the page as dirty in its address_space's radix
+ * tree and then attach the address_space's inode to its superblock's dirty
+ * inode list.
  *
  * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
  * mapping->tree_lock and the global inode_lock.
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f8b6182cb152..0a75c690f142 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -129,12 +129,6 @@ static void write_inode(struct inode *inode, int sync)
  * starvation of particular inodes when others are being redirtied, prevent
  * livelocks, etc.
  *
- * So what we do is to move all pages which are to be written from dirty_pages
- * onto io_pages.  And keep on writing io_pages until it's empty.  Refusing to
- * move more pages onto io_pages until io_pages is empty.  Once that point has
- * been reached, we are ready to take another pass across the inode's dirty
- * pages.
- *
  * Called under inode_lock.
  */
 static int
@@ -159,10 +153,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	 * read speculatively by this cpu before &= ~I_DIRTY  -- mikulas
 	 */
 
-	spin_lock_irq(&mapping->tree_lock);
-	if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages))
-		list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
-	spin_unlock_irq(&mapping->tree_lock);
 	spin_unlock(&inode_lock);
 
 	ret = do_writepages(mapping, wbc);
@@ -180,10 +170,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_lock(&inode_lock);
 	inode->i_state &= ~I_LOCK;
 	if (!(inode->i_state & I_FREEING)) {
-		if (!list_empty(&mapping->io_pages)) {
-		 	/* Needs more writeback */
-			inode->i_state |= I_DIRTY_PAGES;
-		} else if (!list_empty(&mapping->dirty_pages)) {
+		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/* Redirtied */
 			inode->i_state |= I_DIRTY_PAGES;
 			inode->dirtied_when = jiffies;
diff --git a/fs/inode.c b/fs/inode.c
index b5d43d858e0b..3ffd4e4fc522 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -179,7 +179,6 @@ void inode_init_once(struct inode *inode)
 	memset(inode, 0, sizeof(*inode));
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_data.clean_pages);
-	INIT_LIST_HEAD(&inode->i_data.dirty_pages);
 	INIT_LIST_HEAD(&inode->i_data.locked_pages);
 	INIT_LIST_HEAD(&inode->i_data.io_pages);
 	INIT_LIST_HEAD(&inode->i_dentry);
diff --git a/fs/mpage.c b/fs/mpage.c
index 5f5f5e63fca2..964a06035da8 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -592,28 +592,12 @@ out:
  * (The next two paragraphs refer to code which isn't here yet, but they
  *  explain the presence of address_space.io_pages)
  *
- * Pages can be moved from clean_pages or locked_pages onto dirty_pages
- * at any time - it's not possible to lock against that.  So pages which
- * have already been added to a BIO may magically reappear on the dirty_pages
- * list.  And mpage_writepages() will again try to lock those pages.
- * But I/O has not yet been started against the page.  Thus deadlock.
- *
- * To avoid this, mpage_writepages() will only write pages from io_pages. The
- * caller must place them there.  We walk io_pages, locking the pages and
- * submitting them for I/O, moving them to locked_pages.
- *
- * This has the added benefit of preventing a livelock which would otherwise
- * occur if pages are being dirtied faster than we can write them out.
- *
  * If a page is already under I/O, generic_writepages() skips it, even
  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
  * and msync() need to guarantee that all the data which was dirty at the time
  * the call was made get new I/O started against them.  So if called_for_sync()
  * is true, we must wait for existing IO to complete.
- *
- * It's fairly rare for PageWriteback pages to be on ->dirty_pages.  It
- * means that someone redirtied the page while it was under I/O.
  */
 int
 mpage_writepages(struct address_space *mapping,
@@ -625,6 +609,9 @@ mpage_writepages(struct address_space *mapping,
 	int ret = 0;
 	int done = 0;
 	int (*writepage)(struct page *page, struct writeback_control *wbc);
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -635,72 +622,58 @@ mpage_writepages(struct address_space *mapping,
 	if (get_block == NULL)
 		writepage = mapping->a_ops->writepage;
 
-	spin_lock_irq(&mapping->tree_lock);
-	while (!list_empty(&mapping->io_pages) && !done) {
-		struct page *page = list_entry(mapping->io_pages.prev,
-					struct page, list);
-		list_del(&page->list);
-		if (PageWriteback(page) && wbc->sync_mode == WB_SYNC_NONE) {
-			if (PageDirty(page)) {
-				list_add(&page->list, &mapping->dirty_pages);
-				continue;
-			}
-			list_add(&page->list, &mapping->locked_pages);
-			continue;
-		}
-		if (!PageDirty(page)) {
-			list_add(&page->list, &mapping->clean_pages);
-			continue;
-		}
-		list_add(&page->list, &mapping->locked_pages);
-
-		page_cache_get(page);
-		spin_unlock_irq(&mapping->tree_lock);
-
-		/*
-		 * At this point we hold neither mapping->tree_lock nor
-		 * lock on the page itself: the page may be truncated or
-		 * invalidated (changing page->mapping to NULL), or even
-		 * swizzled back from swapper_space to tmpfs file mapping.
-		 */
-
-		lock_page(page);
-
-		if (wbc->sync_mode != WB_SYNC_NONE)
-			wait_on_page_writeback(page);
-
-		if (page->mapping == mapping && !PageWriteback(page) &&
-					test_clear_page_dirty(page)) {
-			if (writepage) {
-				ret = (*writepage)(page, wbc);
-				if (ret) {
-					if (ret == -ENOSPC)
-						set_bit(AS_ENOSPC,
-							&mapping->flags);
-					else
-						set_bit(AS_EIO,
-							&mapping->flags);
+	pagevec_init(&pvec, 0);
+	index = 0;
+	while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+					PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
+		unsigned i;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			/*
+			 * At this point we hold neither mapping->tree_lock nor
+			 * lock on the page itself: the page may be truncated or
+			 * invalidated (changing page->mapping to NULL), or even
+			 * swizzled back from swapper_space to tmpfs file
+			 * mapping
+			 */
+
+			lock_page(page);
+
+			if (wbc->sync_mode != WB_SYNC_NONE)
+				wait_on_page_writeback(page);
+
+			if (page->mapping == mapping && !PageWriteback(page) &&
+						test_clear_page_dirty(page)) {
+				if (writepage) {
+					ret = (*writepage)(page, wbc);
+					if (ret) {
+						if (ret == -ENOSPC)
+							set_bit(AS_ENOSPC,
+							  &mapping->flags);
+						else
+							set_bit(AS_EIO,
+							  &mapping->flags);
+					}
+				} else {
+					bio = mpage_writepage(bio, page,
+						get_block, &last_block_in_bio,
+						&ret, wbc);
+				}
+				if (ret || (--(wbc->nr_to_write) <= 0))
+					done = 1;
+				if (wbc->nonblocking &&
+						bdi_write_congested(bdi)) {
+					wbc->encountered_congestion = 1;
+					done = 1;
 				}
 			} else {
-				bio = mpage_writepage(bio, page, get_block,
-					&last_block_in_bio, &ret, wbc);
+				unlock_page(page);
 			}
-			if (ret || (--(wbc->nr_to_write) <= 0))
-				done = 1;
-			if (wbc->nonblocking && bdi_write_congested(bdi)) {
-				wbc->encountered_congestion = 1;
-				done = 1;
-			}
-		} else {
-			unlock_page(page);
 		}
-		page_cache_release(page);
-		spin_lock_irq(&mapping->tree_lock);
+		pagevec_release(&pvec);
 	}
-	/*
-	 * Leave any remaining dirty pages on ->io_pages
-	 */
-	spin_unlock_irq(&mapping->tree_lock);
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
 	return ret;
diff --git a/fs/xfs/linux/xfs_vnode.h b/fs/xfs/linux/xfs_vnode.h
index 514bc9cde057..6736f7aa2b97 100644
--- a/fs/xfs/linux/xfs_vnode.h
+++ b/fs/xfs/linux/xfs_vnode.h
@@ -600,7 +600,8 @@ static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
 	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
 	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
 #define VN_CACHED(vp)	(LINVFS_GET_IP(vp)->i_mapping->nrpages)
-#define VN_DIRTY(vp)	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->dirty_pages)))
+#define VN_DIRTY(vp)	mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
+					PAGECACHE_TAG_DIRTY)
 #define VMODIFY(vp)	VN_FLAGSET(vp, VMODIFIED)
 #define VUNMODIFY(vp)	VN_FLAGCLR(vp, VMODIFIED)
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 857e797b0ad2..f8954889d336 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -324,7 +324,6 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
-	struct list_head	dirty_pages;	/* list of dirty pages */
 	struct list_head	locked_pages;	/* list of locked pages */
 	struct list_head	io_pages;	/* being prepared for I/O */
 	unsigned long		nrpages;	/* number of total pages */
@@ -371,6 +370,8 @@ struct block_device {
 #define PAGECACHE_TAG_DIRTY	0
 #define PAGECACHE_TAG_WRITEBACK	1
 
+int mapping_tagged(struct address_space *mapping, int tag);
+
 /*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index e552cb04a0ed..70d07dbfcd02 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -69,9 +69,10 @@ extern struct page * find_trylock_page(struct address_space *mapping,
 				unsigned long index);
 extern struct page * find_or_create_page(struct address_space *mapping,
 				unsigned long index, unsigned int gfp_mask);
-extern unsigned int find_get_pages(struct address_space *mapping,
-				pgoff_t start, unsigned int nr_pages,
-				struct page **pages);
+unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
+			unsigned int nr_pages, struct page **pages);
+unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
+			int tag, unsigned int nr_pages, struct page **pages);
 
 /*
  * Returns locked page at given index in given cache, creating it if needed.
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 2a332eed3d82..e6e43ce82b55 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -22,8 +22,11 @@ void __pagevec_free(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
 void __pagevec_lru_add_active(struct pagevec *pvec);
 void pagevec_strip(struct pagevec *pvec);
-unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t start, unsigned int nr_pages);
+unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned nr_pages);
+unsigned pagevec_lookup_tag(struct pagevec *pvec,
+		struct address_space *mapping, pgoff_t *index, int tag,
+		unsigned nr_pages);
 
 static inline void pagevec_init(struct pagevec *pvec, int cold)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index 4d5e76ceaf29..cac8da0dd773 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -100,9 +100,7 @@ void __remove_from_page_cache(struct page *page)
 	struct address_space *mapping = page->mapping;
 
 	radix_tree_delete(&mapping->page_tree, page->index);
-	list_del(&page->list);
 	page->mapping = NULL;
-
 	mapping->nrpages--;
 	pagecache_acct(-1);
 }
@@ -148,9 +146,6 @@ static int __filemap_fdatawrite(struct address_space *mapping, int sync_mode)
 	if (mapping->backing_dev_info->memory_backed)
 		return 0;
 
-	spin_lock_irq(&mapping->tree_lock);
-	list_splice_init(&mapping->dirty_pages, &mapping->io_pages);
-	spin_unlock_irq(&mapping->tree_lock);
 	ret = do_writepages(mapping, &wbc);
 	return ret;
 }
@@ -190,11 +185,7 @@ restart:
 		struct page *page;
 
 		page = list_entry(mapping->locked_pages.next,struct page,list);
-		list_del(&page->list);
-		if (PageDirty(page))
-			list_add(&page->list, &mapping->dirty_pages);
-		else
-			list_add(&page->list, &mapping->clean_pages);
+		list_del_init(&page->list);
 
 		if (!PageWriteback(page)) {
 			if (++progress > 32) {
@@ -228,7 +219,6 @@ restart:
 
 	return ret;
 }
-
 EXPORT_SYMBOL(filemap_fdatawait);
 
 int filemap_write_and_wait(struct address_space *mapping)
@@ -539,7 +529,7 @@ EXPORT_SYMBOL(find_or_create_page);
  *
  * find_get_pages() returns the number of pages which were found.
  */
-unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
+unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
 			    unsigned int nr_pages, struct page **pages)
 {
 	unsigned int i;
@@ -554,6 +544,27 @@ unsigned int find_get_pages(struct address_space *mapping, pgoff_t start,
 	return ret;
 }
 
+/*
+ * Like find_get_pages, except we only return pages which are tagged with
+ * `tag'.   We update *index to index the next page for the traversal.
+ */
+unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
+			int tag, unsigned int nr_pages, struct page **pages)
+{
+	unsigned int i;
+	unsigned int ret;
+
+	spin_lock_irq(&mapping->tree_lock);
+	ret = radix_tree_gang_lookup_tag(&mapping->page_tree,
+				(void **)pages, *index, nr_pages, tag);
+	for (i = 0; i < ret; i++)
+		page_cache_get(pages[i]);
+	if (ret)
+		*index = pages[ret - 1]->index + 1;
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
+
 /*
  * Same as grab_cache_page, but do not wait if the page is unavailable.
  * This is intended for speculative data generators, where the data can
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index bc4f3258daf2..fa5eeca766cf 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -472,12 +472,8 @@ int write_one_page(struct page *page, int wait)
 	if (wait)
 		wait_on_page_writeback(page);
 
-	spin_lock_irq(&mapping->tree_lock);
-	list_del(&page->list);
 	if (test_clear_page_dirty(page)) {
-		list_add(&page->list, &mapping->locked_pages);
 		page_cache_get(page);
-		spin_unlock_irq(&mapping->tree_lock);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
 			wait_on_page_writeback(page);
@@ -486,8 +482,6 @@ int write_one_page(struct page *page, int wait)
 		}
 		page_cache_release(page);
 	} else {
-		list_add(&page->list, &mapping->clean_pages);
-		spin_unlock_irq(&mapping->tree_lock);
 		unlock_page(page);
 	}
 	return ret;
@@ -495,9 +489,8 @@ int write_one_page(struct page *page, int wait)
 EXPORT_SYMBOL(write_one_page);
 
 /*
- * For address_spaces which do not use buffers.  Just set the page's dirty bit
- * and move it to the dirty_pages list.  Also perform space reservation if
- * required.
+ * For address_spaces which do not use buffers.  Just tag the page as dirty in
+ * its radix tree.
  *
  * __set_page_dirty_nobuffers() may return -ENOSPC.  But if it does, the page
  * is still safe, as long as it actually manages to find some blocks at
@@ -520,8 +513,6 @@ int __set_page_dirty_nobuffers(struct page *page)
 				BUG_ON(page->mapping != mapping);
 				if (!mapping->backing_dev_info->memory_backed)
 					inc_page_state(nr_dirty);
-				list_del(&page->list);
-				list_add(&page->list, &mapping->dirty_pages);
 				radix_tree_tag_set(&mapping->page_tree,
 					page->index, PAGECACHE_TAG_DIRTY);
 			}
@@ -646,3 +637,19 @@ int test_set_page_writeback(struct page *page)
 
 }
 EXPORT_SYMBOL(test_set_page_writeback);
+
+/*
+ * Return true if any of the pages in the mapping are marked with the
+ * passed tag.
+ */
+int mapping_tagged(struct address_space *mapping, int tag)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
+	ret = radix_tree_tagged(&mapping->page_tree, tag);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(mapping_tagged);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c87ca3dd2f11..ae1636c3a422 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -682,6 +682,8 @@ nopage:
 	return NULL;
 got_pg:
 	kernel_map_pages(page, 1 << order, 1);
+	INIT_LIST_HEAD(&page->list);
+	INIT_LIST_HEAD(&page->lru);
 	return page;
 }
 
diff --git a/mm/swap.c b/mm/swap.c
index 90a9ac490a3c..c20d079a0729 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -353,13 +353,21 @@ void pagevec_strip(struct pagevec *pvec)
  *
  * pagevec_lookup() returns the number of pages which were found.
  */
-unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
-		pgoff_t start, unsigned int nr_pages)
+unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t start, unsigned nr_pages)
 {
 	pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
 	return pagevec_count(pvec);
 }
 
+unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
+		pgoff_t *index, int tag, unsigned nr_pages)
+{
+	pvec->nr = find_get_pages_tag(mapping, index, tag,
+					nr_pages, pvec->pages);
+	return pagevec_count(pvec);
+}
+
 
 #ifdef CONFIG_SMP
 /*
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 736fd2b82300..77424e877e62 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,7 +27,6 @@ struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
-	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
 	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
 	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
 	.a_ops		= &swap_aops,
@@ -210,7 +209,6 @@ int move_to_swap_cache(struct page *page, swp_entry_t entry)
 	if (!err) {
 		if (!swap_duplicate(entry))
 			BUG();
-		/* shift page from clean_pages to dirty_pages list */
 		BUG_ON(PageDirty(page));
 		set_page_dirty(page);
 		INC_CACHE_INFO(add_total);
@@ -245,7 +243,6 @@ int move_from_swap_cache(struct page *page, unsigned long index,
 
 	if (!err) {
 		swap_free(entry);
-		/* shift page from clean_pages to dirty_pages list */
 		__clear_page_dirty(page);
 		set_page_dirty(page);
 	}
-- 
cgit v1.2.3


From 3c1ed9b2ce95145ba1c0434a7a7b63261fd7c15d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:11:47 -0700
Subject: [PATCH] remove address_space.io_pages

Now remove address_space.io_pages.
---
 fs/inode.c         | 1 -
 fs/mpage.c         | 8 +++-----
 include/linux/fs.h | 1 -
 mm/swap_state.c    | 1 -
 4 files changed, 3 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/fs/inode.c b/fs/inode.c
index 3ffd4e4fc522..ac8d22413404 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -180,7 +180,6 @@ void inode_init_once(struct inode *inode)
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_data.clean_pages);
 	INIT_LIST_HEAD(&inode->i_data.locked_pages);
-	INIT_LIST_HEAD(&inode->i_data.io_pages);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
diff --git a/fs/mpage.c b/fs/mpage.c
index 964a06035da8..6226bfe0a254 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -589,15 +589,13 @@ out:
  * This is a library function, which implements the writepages()
  * address_space_operation.
  *
- * (The next two paragraphs refer to code which isn't here yet, but they
- *  explain the presence of address_space.io_pages)
- *
  * If a page is already under I/O, generic_writepages() skips it, even
  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
  * and msync() need to guarantee that all the data which was dirty at the time
- * the call was made get new I/O started against them.  So if called_for_sync()
- * is true, we must wait for existing IO to complete.
+ * the call was made get new I/O started against them.  If wbc->sync_mode is
+ * WB_SYNC_ALL then we were called for data integrity and we must wait for
+ * existing IO to complete.
  */
 int
 mpage_writepages(struct address_space *mapping,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f8954889d336..7270490162a9 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -325,7 +325,6 @@ struct address_space {
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
 	struct list_head	locked_pages;	/* list of locked pages */
-	struct list_head	io_pages;	/* being prepared for I/O */
 	unsigned long		nrpages;	/* number of total pages */
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 77424e877e62..e0396e7ada38 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,7 +27,6 @@ struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
-	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
 	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
-- 
cgit v1.2.3


From a15133091ee83b0a97913cd48d6131188af093e1 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:12:01 -0700
Subject: [PATCH] Stop using address_space.locked_pages

Instead, use a radix-tree walk of the pages which are tagged as being under
writeback.

The new function wait_on_page_writeback_range() was generalised out of
filemap_fdatawait().  We can later use this to provide concurrent fsync of
just a section of a file.
---
 fs/inode.c         |  1 -
 include/linux/fs.h |  1 -
 include/linux/mm.h |  2 +-
 mm/filemap.c       | 72 +++++++++++++++++++++++++++---------------------------
 mm/swap_state.c    |  1 -
 5 files changed, 37 insertions(+), 40 deletions(-)

(limited to 'include')

diff --git a/fs/inode.c b/fs/inode.c
index ac8d22413404..0c122d4a6529 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -179,7 +179,6 @@ void inode_init_once(struct inode *inode)
 	memset(inode, 0, sizeof(*inode));
 	INIT_HLIST_NODE(&inode->i_hash);
 	INIT_LIST_HEAD(&inode->i_data.clean_pages);
-	INIT_LIST_HEAD(&inode->i_data.locked_pages);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7270490162a9..5194a645baf2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -324,7 +324,6 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
-	struct list_head	locked_pages;	/* list of locked pages */
 	unsigned long		nrpages;	/* number of total pages */
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fbd569b35b4f..af18e1da3bd5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -182,7 +182,7 @@ struct page {
 	atomic_t count;			/* Usage count, see below. */
 	struct list_head list;		/* ->mapping has some page lists. */
 	struct address_space *mapping;	/* The inode (or ...) we belong to. */
-	unsigned long index;		/* Our offset within mapping. */
+	pgoff_t index;			/* Our offset within mapping. */
 	struct list_head lru;		/* Pageout list, eg. active_list;
 					   protected by zone->lru_lock !! */
 	union {
diff --git a/mm/filemap.c b/mm/filemap.c
index cac8da0dd773..692c9a837e61 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -154,7 +154,6 @@ int filemap_fdatawrite(struct address_space *mapping)
 {
 	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
 }
-
 EXPORT_SYMBOL(filemap_fdatawrite);
 
 /*
@@ -165,51 +164,40 @@ int filemap_flush(struct address_space *mapping)
 {
 	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
 }
-
 EXPORT_SYMBOL(filemap_flush);
 
-/**
- * filemap_fdatawait - walk the list of locked pages of the given address
- *                     space and wait for all of them.
- * @mapping: address space structure to wait for
+/*
+ * Wait for writeback to complete against pages indexed by start->end
+ * inclusive
  */
-int filemap_fdatawait(struct address_space * mapping)
+static int wait_on_page_writeback_range(struct address_space *mapping,
+				pgoff_t start, pgoff_t end)
 {
+	struct pagevec pvec;
+	int nr_pages;
 	int ret = 0;
-	int progress;
-
-restart:
-	progress = 0;
-	spin_lock_irq(&mapping->tree_lock);
-        while (!list_empty(&mapping->locked_pages)) {
-		struct page *page;
-
-		page = list_entry(mapping->locked_pages.next,struct page,list);
-		list_del_init(&page->list);
+	pgoff_t index;
 
-		if (!PageWriteback(page)) {
-			if (++progress > 32) {
-				if (need_resched()) {
-					spin_unlock_irq(&mapping->tree_lock);
-					__cond_resched();
-					goto restart;
-				}
-			}
-			continue;
-		}
+	if (end < start)
+		return 0;
 
-		progress = 0;
-		page_cache_get(page);
-		spin_unlock_irq(&mapping->tree_lock);
+	pagevec_init(&pvec, 0);
+	index = start;
+	while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+			PAGECACHE_TAG_WRITEBACK,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
 
-		wait_on_page_writeback(page);
-		if (PageError(page))
-			ret = -EIO;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
 
-		page_cache_release(page);
-		spin_lock_irq(&mapping->tree_lock);
+			wait_on_page_writeback(page);
+			if (PageError(page))
+				ret = -EIO;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
 	}
-	spin_unlock_irq(&mapping->tree_lock);
 
 	/* Check for outstanding write errors */
 	if (test_and_clear_bit(AS_ENOSPC, &mapping->flags))
@@ -219,6 +207,18 @@ restart:
 
 	return ret;
 }
+
+/**
+ * filemap_fdatawait - walk the list of under-writeback pages of the given
+ *     address space and wait for all of them.
+ *
+ * @mapping: address space structure to wait for
+ */
+int filemap_fdatawait(struct address_space *mapping)
+{
+	return wait_on_page_writeback_range(mapping, 0, -1);
+}
+
 EXPORT_SYMBOL(filemap_fdatawait);
 
 int filemap_write_and_wait(struct address_space *mapping)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e0396e7ada38..8e3c3ca4ae4f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -27,7 +27,6 @@ struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
-	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
-- 
cgit v1.2.3


From d672c382411ffafbf2b8ed608dfdb8bd8e67307d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:12:13 -0700
Subject: [PATCH] stop using address_space.clean_pages

Remove remaining references to address_space.clean_pages.
---
 fs/inode.c              | 1 -
 include/linux/fs.h      | 1 -
 include/linux/pagemap.h | 1 -
 mm/swap_state.c         | 1 -
 4 files changed, 4 deletions(-)

(limited to 'include')

diff --git a/fs/inode.c b/fs/inode.c
index 0c122d4a6529..282d86aed622 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -178,7 +178,6 @@ void inode_init_once(struct inode *inode)
 {
 	memset(inode, 0, sizeof(*inode));
 	INIT_HLIST_NODE(&inode->i_hash);
-	INIT_LIST_HEAD(&inode->i_data.clean_pages);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5194a645baf2..dc8c46fb4b69 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -323,7 +323,6 @@ struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
-	struct list_head	clean_pages;	/* list of clean pages */
 	unsigned long		nrpages;	/* number of total pages */
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 70d07dbfcd02..5585675ab842 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -142,7 +142,6 @@ static inline unsigned long get_page_cache_size(void)
 static inline void ___add_to_page_cache(struct page *page,
 		struct address_space *mapping, unsigned long index)
 {
-	list_add(&page->list, &mapping->clean_pages);
 	page->mapping = mapping;
 	page->index = index;
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8e3c3ca4ae4f..22946f0d9ecf 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -26,7 +26,6 @@ extern struct address_space_operations swap_aops;
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
-	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
-- 
cgit v1.2.3


From c33c9e78434fbf8c3ffb4f72bb2a57d12c4b70af Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:12:30 -0700
Subject: [PATCH] revert the slabification of i386 pgd's and pmd's

This code is playing with page->lru from pages which came from slab.  But to
remove page->list we need to convert slab over to using page->lru.  So we
cannot allow the i386 pagetable code to go scribbling on the ->lru field of
active slab pages.

This optimisation was pretty thin, and it is more important to shrink the
pageframe (on all architectures).
---
 arch/i386/mm/init.c               |  30 ++++-------
 arch/i386/mm/pageattr.c           |  25 ++++-----
 arch/i386/mm/pgtable.c            | 104 ++++++++++++++------------------------
 include/asm-i386/pgtable-3level.h |   2 +
 include/asm-i386/pgtable.h        |  30 +++++------
 5 files changed, 76 insertions(+), 115 deletions(-)

(limited to 'include')

diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 040862e6c6a0..f91e63489463 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -523,30 +523,20 @@ void __init mem_init(void)
 #endif
 }
 
-kmem_cache_t *pgd_cache;
-kmem_cache_t *pmd_cache;
+#ifdef CONFIG_X86_PAE
+struct kmem_cache_s *pae_pgd_cachep;
 
 void __init pgtable_cache_init(void)
 {
-	if (PTRS_PER_PMD > 1) {
-		pmd_cache = kmem_cache_create("pmd",
-					PTRS_PER_PMD*sizeof(pmd_t),
-					PTRS_PER_PMD*sizeof(pmd_t),
-					0,
-					pmd_ctor,
-					NULL);
-		if (!pmd_cache)
-			panic("pgtable_cache_init(): cannot create pmd cache");
-	}
-	pgd_cache = kmem_cache_create("pgd",
-				PTRS_PER_PGD*sizeof(pgd_t),
-				PTRS_PER_PGD*sizeof(pgd_t),
-				0,
-				pgd_ctor,
-				PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
-	if (!pgd_cache)
-		panic("pgtable_cache_init(): Cannot create pgd cache");
+	/*
+	 * PAE pgds must be 16-byte aligned:
+	 */
+	pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0,
+		SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
+	if (!pae_pgd_cachep)
+		panic("init_pae(): Cannot alloc pae_pgd SLAB cache");
 }
+#endif
 
 /*
  * This function cannot be __init, since exceptions don't work in that
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
index 9d58473d4a68..460fde9a1a9e 100644
--- a/arch/i386/mm/pageattr.c
+++ b/arch/i386/mm/pageattr.c
@@ -67,22 +67,19 @@ static void flush_kernel_map(void *dummy)
 
 static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) 
 { 
-	struct page *page;
-	unsigned long flags;
-
 	set_pte_atomic(kpte, pte); 	/* change init_mm */
-	if (PTRS_PER_PMD > 1)
-		return;
-
-	spin_lock_irqsave(&pgd_lock, flags);
-	list_for_each_entry(page, &pgd_list, lru) {
-		pgd_t *pgd;
-		pmd_t *pmd;
-		pgd = (pgd_t *)page_address(page) + pgd_index(address);
-		pmd = pmd_offset(pgd, address);
-		set_pte_atomic((pte_t *)pmd, pte);
+#ifndef CONFIG_X86_PAE
+	{
+		struct list_head *l;
+		spin_lock(&mmlist_lock);
+		list_for_each(l, &init_mm.mmlist) {
+			struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist);
+			pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address);
+			set_pte_atomic((pte_t *)pmd, pte);
+		}
+		spin_unlock(&mmlist_lock);
 	}
-	spin_unlock_irqrestore(&pgd_lock, flags);
+#endif
 }
 
 /* 
diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 8c82d4b9a7a8..941c2aa5236c 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -12,7 +12,6 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/spinlock.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -152,88 +151,61 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	return pte;
 }
 
-void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
-{
-	memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
-}
-
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * If the locking proves to be non-performant, a ticketing scheme with
- * checks at dup_mmap(), exec(), and other mmlist addition points
- * could be used. The locking scheme was chosen on the basis of
- * manfred's recommendations and having no core impact whatsoever.
- * -- wli
- */
-spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
-LIST_HEAD(pgd_list);
+#ifdef CONFIG_X86_PAE
 
-void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	unsigned long flags;
-
-	if (PTRS_PER_PMD == 1)
-		spin_lock_irqsave(&pgd_lock, flags);
-
-	memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
+	int i;
+	pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);
+
+	if (pgd) {
+		for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+			unsigned long pmd = __get_free_page(GFP_KERNEL);
+			if (!pmd)
+				goto out_oom;
+			clear_page(pmd);
+			set_pgd(pgd + i, __pgd(1 + __pa(pmd)));
+		}
+		memcpy(pgd + USER_PTRS_PER_PGD,
 			swapper_pg_dir + USER_PTRS_PER_PGD,
 			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-
-	if (PTRS_PER_PMD > 1)
-		return;
-
-	list_add(&virt_to_page(pgd)->lru, &pgd_list);
-	spin_unlock_irqrestore(&pgd_lock, flags);
-	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+	}
+	return pgd;
+out_oom:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
+	kmem_cache_free(pae_pgd_cachep, pgd);
+	return NULL;
 }
 
-/* never called when PTRS_PER_PMD > 1 */
-void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+void pgd_free(pgd_t *pgd)
 {
-	unsigned long flags; /* can be called from interrupt context */
+	int i;
 
-	spin_lock_irqsave(&pgd_lock, flags);
-	list_del(&virt_to_page(pgd)->lru);
-	spin_unlock_irqrestore(&pgd_lock, flags);
+	for (i = 0; i < USER_PTRS_PER_PGD; i++)
+		free_page((unsigned long)__va(pgd_val(pgd[i])-1));
+	kmem_cache_free(pae_pgd_cachep, pgd);
 }
 
+#else
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	int i;
-	pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
-
-	if (PTRS_PER_PMD == 1 || !pgd)
-		return pgd;
+	pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
 
-	for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
-		pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
-		if (!pmd)
-			goto out_oom;
-		set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd))));
+	if (pgd) {
+		memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
+		memcpy(pgd + USER_PTRS_PER_PGD,
+			swapper_pg_dir + USER_PTRS_PER_PGD,
+			(PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
 	}
 	return pgd;
-
-out_oom:
-	for (i--; i >= 0; i--)
-		kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-	kmem_cache_free(pgd_cache, pgd);
-	return NULL;
 }
 
 void pgd_free(pgd_t *pgd)
 {
-	int i;
-
-	/* in the PAE case user pgd entries are overwritten before usage */
-	if (PTRS_PER_PMD > 1)
-		for (i = 0; i < USER_PTRS_PER_PGD; ++i)
-			kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
-	/* in the non-PAE case, clear_page_tables() clears user pgd entries */
-	kmem_cache_free(pgd_cache, pgd);
+	free_page((unsigned long)pgd);
 }
+
+#endif /* CONFIG_X86_PAE */
+
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index 147acd8530e2..0ca6393cbe4c 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -123,4 +123,6 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 #define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
 #define PTE_FILE_MAX_BITS       32
 
+extern struct kmem_cache_s *pae_pgd_cachep;
+
 #endif /* _I386_PGTABLE_3LEVEL_H */
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index cc7e18b5e92c..1c0c38375349 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -21,27 +21,15 @@
 #include <asm/bitops.h>
 #endif
 
-#include <linux/slab.h>
-#include <linux/list.h>
-#include <linux/spinlock.h>
+extern pgd_t swapper_pg_dir[1024];
+extern void paging_init(void);
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
  */
-#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 extern unsigned long empty_zero_page[1024];
-extern pgd_t swapper_pg_dir[1024];
-extern kmem_cache_t *pgd_cache;
-extern kmem_cache_t *pmd_cache;
-extern spinlock_t pgd_lock;
-extern struct list_head pgd_list;
-
-void pmd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_ctor(void *, kmem_cache_t *, unsigned long);
-void pgd_dtor(void *, kmem_cache_t *, unsigned long);
-void pgtable_cache_init(void);
-void paging_init(void);
+#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
 
 #endif /* !__ASSEMBLY__ */
 
@@ -53,8 +41,20 @@ void paging_init(void);
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
+
+/*
+ * Need to initialise the X86 PAE caches
+ */
+extern void pgtable_cache_init(void);
+
 #else
 # include <asm/pgtable-2level.h>
+
+/*
+ * No page table caches to initialise
+ */
+#define pgtable_cache_init()	do { } while (0)
+
 #endif
 #endif
 
-- 
cgit v1.2.3


From 0fcb51fd7ee151a03aab2d07493bbadf176a1457 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:13:47 -0700
Subject: [PATCH] stop using page->lru in compound pages

The compound page logic is using page->lru, and these will get scribbled on
in various places, so switch the compound page logic over to using ->mapping
and ->private.
---
 arch/i386/mm/hugetlbpage.c    |  1 -
 arch/ia64/mm/hugetlbpage.c    |  1 -
 arch/ppc64/mm/hugetlbpage.c   |  1 -
 arch/sparc64/mm/hugetlbpage.c |  1 -
 include/linux/mm.h            | 10 +++++-----
 mm/page_alloc.c               | 31 ++++++++++++++++---------------
 6 files changed, 21 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 1777966f0186..0c73f414b5b1 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -278,7 +278,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 static void free_huge_page(struct page *page)
 {
 	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
 
 	INIT_LIST_HEAD(&page->lru);
 
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index d75ec2bfdb41..aa2a1945d2c2 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -246,7 +246,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri
 void free_huge_page(struct page *page)
 {
 	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
 
 	INIT_LIST_HEAD(&page->lru);
 
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index 3b67759defe1..032a1c9c5766 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -450,7 +450,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 static void free_huge_page(struct page *page)
 {
 	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
 
 	INIT_LIST_HEAD(&page->lru);
 
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 867d8b788e6b..dd2a7549caef 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -248,7 +248,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 static void free_huge_page(struct page *page)
 {
 	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
 
 	INIT_LIST_HEAD(&page->lru);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index af18e1da3bd5..fa7beaefd038 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -242,24 +242,24 @@ extern void FASTCALL(__page_cache_release(struct page *));
 static inline int page_count(struct page *p)
 {
 	if (PageCompound(p))
-		p = (struct page *)p->lru.next;
+		p = (struct page *)p->private;
 	return atomic_read(&(p)->count);
 }
 
 static inline void get_page(struct page *page)
 {
 	if (PageCompound(page))
-		page = (struct page *)page->lru.next;
+		page = (struct page *)page->private;
 	atomic_inc(&page->count);
 }
 
 static inline void put_page(struct page *page)
 {
 	if (PageCompound(page)) {
-		page = (struct page *)page->lru.next;
+		page = (struct page *)page->private;
 		if (put_page_testzero(page)) {
-			if (page->lru.prev) {	/* destructor? */
-				(*(void (*)(struct page *))page->lru.prev)(page);
+			if (page[1].mapping) {	/* destructor? */
+				(*(void (*)(struct page *))page[1].mapping)(page);
 			} else {
 				__page_cache_release(page);
 			}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b79b7907e734..6cb630fec60e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -71,13 +71,14 @@ static int bad_range(struct zone *zone, struct page *page)
 
 static void bad_page(const char *function, struct page *page)
 {
-	printk("Bad page state at %s (in process '%s', page %p)\n", function, current->comm, page);
-	printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n",
+	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
+		function, current->comm, page);
+	printk(KERN_EMERG "flags:0x%08lx mapping:%p mapped:%d count:%d\n",
 		(unsigned long)page->flags, page->mapping,
 		page_mapped(page), page_count(page));
-	printk("Backtrace:\n");
+	printk(KERN_EMERG "Backtrace:\n");
 	dump_stack();
-	printk("Trying to fix it up, but a reboot is needed\n");
+	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
 	page->flags &= ~(1 << PG_private	|
 			1 << PG_locked	|
 			1 << PG_lru	|
@@ -99,13 +100,13 @@ static void bad_page(const char *function, struct page *page)
  *
  * The remaining PAGE_SIZE pages are called "tail pages".
  *
- * All pages have PG_compound set.  All pages have their lru.next pointing at
+ * All pages have PG_compound set.  All pages have their ->private pointing at
  * the head page (even the head page has this).
  *
- * The head page's lru.prev, if non-zero, holds the address of the compound
- * page's put_page() function.
+ * The first tail page's ->mapping, if non-zero, holds the address of the
+ * compound page's put_page() function.
  *
- * The order of the allocation is stored in the first tail page's lru.prev.
+ * The order of the allocation is stored in the first tail page's ->index
  * This is only for debug at present.  This usage means that zero-order pages
  * may not be compound.
  */
@@ -114,13 +115,13 @@ static void prep_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	page->lru.prev = NULL;
-	page[1].lru.prev = (void *)order;
+	page[1].mapping = 0;
+	page[1].index = order;
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
 		SetPageCompound(p);
-		p->lru.next = (void *)page;
+		p->private = (unsigned long)page;
 	}
 }
 
@@ -129,7 +130,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	if (page[1].lru.prev != (void *)order)
+	if (page[1].index != order)
 		bad_page(__FUNCTION__, page);
 
 	for (i = 0; i < nr_pages; i++) {
@@ -137,7 +138,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 
 		if (!PageCompound(p))
 			bad_page(__FUNCTION__, page);
-		if (p->lru.next != (void *)page)
+		if (p->private != (unsigned long)page)
 			bad_page(__FUNCTION__, page);
 		ClearPageCompound(p);
 	}
@@ -512,14 +513,14 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
 		spin_unlock_irqrestore(&zone->lock, flags);
-		if (order && page)
-			prep_compound_page(page, order);
 	}
 
 	if (page != NULL) {
 		BUG_ON(bad_range(zone, page));
 		mod_page_state_zone(zone, pgalloc, 1 << order);
 		prep_new_page(page, order);
+		if (order)
+			prep_compound_page(page, order);
 	}
 	return page;
 }
-- 
cgit v1.2.3


From be5ceb401d4853c6b31f3f0c79d6b14ef5847288 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:14:26 -0700
Subject: [PATCH] remove page.list

Remove the now-unneeded page.list field.
---
 include/linux/mm.h | 1 -
 mm/page_alloc.c    | 2 --
 2 files changed, 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa7beaefd038..94b0326d120f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -180,7 +180,6 @@ struct page {
 	page_flags_t flags;		/* atomic flags, some possibly
 					   updated asynchronously */
 	atomic_t count;			/* Usage count, see below. */
-	struct list_head list;		/* ->mapping has some page lists. */
 	struct address_space *mapping;	/* The inode (or ...) we belong to. */
 	pgoff_t index;			/* Our offset within mapping. */
 	struct list_head lru;		/* Pageout list, eg. active_list;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6cb630fec60e..96fb97866a28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -683,8 +683,6 @@ nopage:
 	return NULL;
 got_pg:
 	kernel_map_pages(page, 1 << order, 1);
-	INIT_LIST_HEAD(&page->list);
-	INIT_LIST_HEAD(&page->lru);
 	return page;
 }
 
-- 
cgit v1.2.3


From d3eb546e203ab717237566e5762d97796e58f41f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:14:39 -0700
Subject: [PATCH] fdatasync integrity fix

fdatasync can fail to wait on some pages due to a race.

If some task (eg pdflush) is flushing the same mapping it can remove a page's
dirty tag but not then mark that page as being under writeback, because
pdflush hit a locked buffer in __block_write_full_page().  This will happen
because kjournald is writing the buffer.  In this situation
__block_write_full_page() will redirty the page so that fsync notices it, but
there is a window where the page eludes the radix tree dirty page walk.

Consequently a concurrent fsync will fail to notice the page when walking the
radix tree's dirty pages.

The approach taken by this patch is to leave the page marked as dirty in the
radix tree while ->writepage is working out what to do with it.  This ensures
that a concurrent write-for-sync will successfully locate the page and will
then block in lock_page() until the non-write-for-sync code has finished
altering the page state.
---
 fs/mpage.c          |  2 +-
 include/linux/mm.h  |  1 +
 mm/page-writeback.c | 35 ++++++++++++++++++++++++++++++++++-
 mm/vmscan.c         |  2 +-
 4 files changed, 37 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/mpage.c b/fs/mpage.c
index 9edb2d6042b2..fecfe9307a7e 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -643,7 +643,7 @@ mpage_writepages(struct address_space *mapping,
 				wait_on_page_writeback(page);
 
 			if (page->mapping == mapping && !PageWriteback(page) &&
-						test_clear_page_dirty(page)) {
+						clear_page_dirty_for_io(page)) {
 				if (writepage) {
 					ret = (*writepage)(page, wbc);
 					if (ret) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 94b0326d120f..2ba5ab34cbdd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -472,6 +472,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
 int set_page_dirty_lock(struct page *page);
+int clear_page_dirty_for_io(struct page *page);
 
 /*
  * Prototype to add a shrinker callback for ageable caches.
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fa5eeca766cf..113c4f67bb02 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -472,7 +472,7 @@ int write_one_page(struct page *page, int wait)
 	if (wait)
 		wait_on_page_writeback(page);
 
-	if (test_clear_page_dirty(page)) {
+	if (clear_page_dirty_for_io(page)) {
 		page_cache_get(page);
 		ret = mapping->a_ops->writepage(page, &wbc);
 		if (ret == 0 && wait) {
@@ -573,6 +573,36 @@ int test_clear_page_dirty(struct page *page)
 }
 EXPORT_SYMBOL(test_clear_page_dirty);
 
+/*
+ * Clear a page's dirty flag, while caring for dirty memory accounting.
+ * Returns true if the page was previously dirty.
+ *
+ * This is for preparing to put the page under writeout.  We leave the page
+ * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * can discover it via a PAGECACHE_TAG_DIRTY walk.  The ->writepage
+ * implementation will run either set_page_writeback() or set_page_dirty(),
+ * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * back into sync.
+ *
+ * This incoherency between the page's dirty flag and radix-tree tag is
+ * unfortunate, but it only exists while the page is locked.
+ */
+int clear_page_dirty_for_io(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+
+	if (mapping) {
+		if (TestClearPageDirty(page)) {
+			if (!mapping->backing_dev_info->memory_backed)
+				dec_page_state(nr_dirty);
+			return 1;
+		}
+		return 0;
+	}
+	return TestClearPageDirty(page);
+}
+EXPORT_SYMBOL(clear_page_dirty_for_io);
+
 /*
  * Clear a page's dirty flag while ignoring dirty memory accounting
  */
@@ -629,6 +659,9 @@ int test_set_page_writeback(struct page *page)
 		if (!ret)
 			radix_tree_tag_set(&mapping->page_tree, page->index,
 						PAGECACHE_TAG_WRITEBACK);
+		if (!PageDirty(page))
+			radix_tree_tag_clear(&mapping->page_tree, page->index,
+						PAGECACHE_TAG_DIRTY);
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	} else {
 		ret = TestSetPageWriteback(page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index df658dd6c743..372ef182c478 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -354,7 +354,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
-			if (test_clear_page_dirty(page)) {
+			if (clear_page_dirty_for_io(page)) {
 				int res;
 				struct writeback_control wbc = {
 					.sync_mode = WB_SYNC_NONE,
-- 
cgit v1.2.3


From bd134f2720aa6fe1544a76360999d8e18e5f3e02 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:14:52 -0700
Subject: [PATCH] don't allow background writes to hide dirty buffers

If pdflush hits a locked-and-clean buffer in __block_write_full_page() it
will just pass over the buffer.  Typically the buffer is an ext3 data=ordered
buffer which is being written by kjournald, but a similar thing can happen
with blockdev buffers and ll_rw_block().

This is bad because the buffer is still under I/O and a subsequent fsync's
fdatawait() needs to know about it.

It is not practical to tag the page for writeback - only the submitter of the
I/O can do that, because the submitter has control of the end_io handler.

So instead, redirty the page so a subsequent fsync's fdatawrite() will wait on
the underway I/O.

There is a risk that pdflush::background_writeout() will lock up, repeatedly
trying and failing to write the same page.  This is prevented by ensuring
that background_writeout() always throttles when it made no progress.
---
 fs/buffer.c               | 19 ++++++++++++-------
 fs/fs-writeback.c         |  9 +++++++++
 include/linux/writeback.h |  1 +
 mm/page-writeback.c       |  8 ++++----
 4 files changed, 26 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index baae58828510..42b61de10bf3 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1802,14 +1802,18 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		get_bh(bh);
 		if (!buffer_mapped(bh))
 			continue;
-		if (wbc->sync_mode != WB_SYNC_NONE) {
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from pdflush and kswapd
+		 * activity, but those code paths have their own higher-level
+		 * throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
 			lock_buffer(bh);
-		} else {
-			if (test_set_buffer_locked(bh)) {
-				if (buffer_dirty(bh))
-					__set_page_dirty_nobuffers(page);
-				continue;
-			}
+		} else if (test_set_buffer_locked(bh)) {
+			__set_page_dirty_nobuffers(page);
+			continue;
 		}
 		if (test_clear_buffer_dirty(bh)) {
 			if (!buffer_uptodate(bh))
@@ -1857,6 +1861,7 @@ done:
 		if (uptodate)
 			SetPageUptodate(page);
 		end_page_writeback(page);
+		wbc->pages_skipped++;	/* We didn't write this page */
 	}
 	return err;
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index bd6e0588066e..591c5eb79ba3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -279,6 +279,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 						struct inode, i_list);
 		struct address_space *mapping = inode->i_mapping;
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
+		long pages_skipped;
 
 		if (bdi->memory_backed) {
 			if (sb == blockdev_superblock) {
@@ -326,6 +327,7 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 
 		BUG_ON(inode->i_state & I_FREEING);
 		__iget(inode);
+		pages_skipped = wbc->pages_skipped;
 		__writeback_single_inode(inode, wbc);
 		if (wbc->sync_mode == WB_SYNC_HOLD) {
 			inode->dirtied_when = jiffies;
@@ -333,6 +335,13 @@ sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc)
 		}
 		if (current_is_pdflush())
 			writeback_release(bdi);
+		if (wbc->pages_skipped != pages_skipped) {
+			/*
+			 * writeback is not making progress due to locked
+			 * buffers.  Skip this inode for now.
+			 */
+			list_move(&inode->i_list, &sb->s_dirty);
+		}
 		spin_unlock(&inode_lock);
 		iput(inode);
 		spin_lock(&inode_lock);
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 1424811e1eab..7380d2cefb16 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -39,6 +39,7 @@ struct writeback_control {
 					   older than this */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
+	long pages_skipped;		/* Pages which were not written */
 	int nonblocking;		/* Don't get stuck on request queues */
 	int encountered_congestion;	/* An output: a queue is full */
 	int for_kupdate;		/* A kupdate writeback */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 113c4f67bb02..1981309fa9c5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -261,13 +261,13 @@ static void background_writeout(unsigned long _min_pages)
 			break;
 		wbc.encountered_congestion = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
+		wbc.pages_skipped = 0;
 		writeback_inodes(&wbc);
 		min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-		if (wbc.nr_to_write > 0) {
+		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
 			/* Wrote less than expected */
-			if (wbc.encountered_congestion)
-				blk_congestion_wait(WRITE, HZ/10);
-			else
+			blk_congestion_wait(WRITE, HZ/10);
+			if (!wbc.encountered_congestion)
 				break;
 		}
 	}
-- 
cgit v1.2.3


From 9672a337305358ecc81dc17700e58ce3f42c11f6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:15:07 -0700
Subject: [PATCH] writeback efficiency and QoS improvements

The radix-tree walk for writeback has a couple of problems:

a) It always scans a file from its first dirty page, so if someone
   is repeatedly dirtying the front part of a file, pages near the end
   may be starved of writeout.  (Well, not completely: the `kupdate'
   function will write an entire file once the file's dirty timestamp
   has expired).

b) When the disk queues are huge (10000 requests), there can be a
   very large number of locked pages.  Scanning past these in writeback
   consumes quite some CPU time.

So in each address_space we record the index at which the last batch of
writeout terminated and start the next batch of writeback from that
point.
---
 fs/mpage.c         | 20 +++++++++++++++++++-
 include/linux/fs.h |  1 +
 2 files changed, 20 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/mpage.c b/fs/mpage.c
index fecfe9307a7e..25fd33b53444 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -610,6 +610,7 @@ mpage_writepages(struct address_space *mapping,
 	struct pagevec pvec;
 	int nr_pages;
 	pgoff_t index;
+	int scanned = 0;
 
 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
 		wbc->encountered_congestion = 1;
@@ -621,11 +622,18 @@ mpage_writepages(struct address_space *mapping,
 		writepage = mapping->a_ops->writepage;
 
 	pagevec_init(&pvec, 0);
-	index = 0;
+	if (wbc->sync_mode == WB_SYNC_NONE) {
+		index = mapping->writeback_index; /* Start from prev offset */
+	} else {
+		index = 0;			  /* whole-file sweep */
+		scanned = 1;
+	}
+retry:
 	while (!done && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
 					PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) {
 		unsigned i;
 
+		scanned = 1;
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
@@ -672,6 +680,16 @@ mpage_writepages(struct address_space *mapping,
 		}
 		pagevec_release(&pvec);
 	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	mapping->writeback_index = index;
 	if (bio)
 		mpage_bio_submit(WRITE, bio);
 	return ret;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index dc8c46fb4b69..bacf6bcbc7b7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -324,6 +324,7 @@ struct address_space {
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		tree_lock;	/* and spinlock protecting it */
 	unsigned long		nrpages;	/* number of total pages */
+	pgoff_t			writeback_index;/* writeback starts here */
 	struct address_space_operations *a_ops;	/* methods */
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
-- 
cgit v1.2.3


From 3c7011b3e90508f2f3adb895d712d36b1cfdcfd2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:15:46 -0700
Subject: [PATCH] use compound pages for hugetlb pages only

The compound page logic is a little fragile - it relies on additional
metadata in the pageframes which some other kernel code likes to stomp on
(xfs was doing this).

Also, because we're treating all higher-order pages as compound pages it is
no longer possible to free individual lower-order pages from the middle of
higher-order pages.  At least one ARM driver insists on doing this.

We only really need the compound page logic for higher-order pages which can
be mapped into user pagetables and placed under direct-io.  This covers
hugetlb pages and, conceivably, soundcard DMA buffers which were allocated
with a higher-order allocation but which weren't marked PageReserved.

The patch arranges for the hugetlb implementations to allocate their pages with
compound page metadata, and all other higher-order allocations go back to the
old way.

(Andrea supplied the GFP_LEVEL_MASK fix)
---
 arch/i386/mm/hugetlbpage.c    |  3 ++-
 arch/ia64/mm/hugetlbpage.c    |  3 ++-
 arch/ppc64/mm/hugetlbpage.c   |  3 ++-
 arch/sh/mm/hugetlbpage.c      |  3 ++-
 arch/sparc64/mm/hugetlbpage.c |  3 ++-
 include/linux/gfp.h           |  6 ++++++
 include/linux/mm.h            |  4 ++--
 include/linux/slab.h          |  4 +---
 mm/page_alloc.c               | 22 +++++++++++-----------
 9 files changed, 30 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 0c73f414b5b1..7224ddcb6a11 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -54,7 +54,8 @@ static struct page *alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+				HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % numnodes;
 	return page;
 }
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index aa2a1945d2c2..3dec8e2f4056 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -58,7 +58,8 @@ static struct page *alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % numnodes;
 	return page;
 }
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index 032a1c9c5766..a7b2c63c700f 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -78,7 +78,8 @@ static struct page *alloc_fresh_huge_page(void)
 	static int nid = 0;
 	struct page *page;
 
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
 	if (!page)
 		return NULL;
 
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index f458eb2d0e6e..6f72d865e8d2 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -60,7 +60,8 @@ static struct page *alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % numnodes;
 	return page;
 }
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index dd2a7549caef..5a674bbd5796 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -56,7 +56,8 @@ static struct page *alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER, HUGETLB_PAGE_ORDER);
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % numnodes;
 	return page;
 }
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index c9695427a435..679fc963f842 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -32,10 +32,16 @@
 #define __GFP_NOFAIL	0x800	/* Retry for ever.  Cannot fail */
 #define __GFP_NORETRY	0x1000	/* Do not retry.  Might fail */
 #define __GFP_NO_GROW	0x2000	/* Slab internal usage */
+#define __GFP_COMP	0x4000	/* Add compound page metadata */
 
 #define __GFP_BITS_SHIFT 16	/* Room for 16 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
 
+/* if you forget to add the bitmask here kernel will crash, period */
+#define GFP_LEVEL_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS| \
+			__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT| \
+			__GFP_NOFAIL|__GFP_NORETRY|__GFP_NO_GROW|__GFP_COMP)
+
 #define GFP_ATOMIC	(__GFP_HIGH)
 #define GFP_NOIO	(__GFP_WAIT)
 #define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2ba5ab34cbdd..f827be900157 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -247,14 +247,14 @@ static inline int page_count(struct page *p)
 
 static inline void get_page(struct page *page)
 {
-	if (PageCompound(page))
+	if (unlikely(PageCompound(page)))
 		page = (struct page *)page->private;
 	atomic_inc(&page->count);
 }
 
 static inline void put_page(struct page *page)
 {
-	if (PageCompound(page)) {
+	if (unlikely(PageCompound(page))) {
 		page = (struct page *)page->private;
 		if (put_page_testzero(page)) {
 			if (page[1].mapping) {	/* destructor? */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 69be5b308a11..806cc52abd3a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -25,9 +25,7 @@ typedef struct kmem_cache_s kmem_cache_t;
 #define	SLAB_KERNEL		GFP_KERNEL
 #define	SLAB_DMA		GFP_DMA
 
-#define SLAB_LEVEL_MASK		(__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\
-				__GFP_COLD|__GFP_NOWARN|__GFP_REPEAT|\
-				__GFP_NOFAIL|__GFP_NORETRY)
+#define SLAB_LEVEL_MASK		GFP_LEVEL_MASK
 
 #define	SLAB_NO_GROW		__GFP_NO_GROW	/* don't grow a cache */
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 96fb97866a28..4148e94eee13 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -130,6 +130,9 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
+	if (!PageCompound(page))
+		return;
+
 	if (page[1].index != order)
 		bad_page(__FUNCTION__, page);
 
@@ -487,10 +490,12 @@ void fastcall free_cold_page(struct page *page)
  * or two.
  */
 
-static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
+static struct page *
+buffered_rmqueue(struct zone *zone, int order, int gfp_flags)
 {
 	unsigned long flags;
 	struct page *page = NULL;
+	int cold = !!(gfp_flags & __GFP_COLD);
 
 	if (order == 0) {
 		struct per_cpu_pages *pcp;
@@ -519,7 +524,7 @@ static struct page *buffered_rmqueue(struct zone *zone, int order, int cold)
 		BUG_ON(bad_range(zone, page));
 		mod_page_state_zone(zone, pgalloc, 1 << order);
 		prep_new_page(page, order);
-		if (order)
+		if (order && (gfp_flags & __GFP_COMP))
 			prep_compound_page(page, order);
 	}
 	return page;
@@ -552,16 +557,11 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 	struct reclaim_state reclaim_state;
 	struct task_struct *p = current;
 	int i;
-	int cold;
 	int alloc_type;
 	int do_retry;
 
 	might_sleep_if(wait);
 
-	cold = 0;
-	if (gfp_mask & __GFP_COLD)
-		cold = 1;
-
 	zones = zonelist->zones;  /* the list of zones suitable for gfp_mask */
 	if (zones[0] == NULL)     /* no zones in the zonelist */
 		return NULL;
@@ -583,7 +583,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, cold);
+			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
 				goto got_pg;
 		}
@@ -606,7 +606,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, cold);
+			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
 				goto got_pg;
 		}
@@ -620,7 +620,7 @@ rebalance:
 		for (i = 0; zones[i] != NULL; i++) {
 			struct zone *z = zones[i];
 
-			page = buffered_rmqueue(z, order, cold);
+			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
 				goto got_pg;
 		}
@@ -648,7 +648,7 @@ rebalance:
 
 		if (z->free_pages >= min ||
 				(!wait && z->free_pages >= z->pages_high)) {
-			page = buffered_rmqueue(z, order, cold);
+			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
 				goto got_pg;
 		}
-- 
cgit v1.2.3


From e2ea83742133d581a0422f1b2d276e690a81f043 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:16:32 -0700
Subject: [PATCH] mremap: move_vma fixes and cleanup

From: Hugh Dickins <hugh@veritas.com>

Partial rewrite of mremap's move_vma.  Rajesh Venkatasubramanian has pointed
out that vmtruncate could miss ptes, leaving orphaned pages, because move_vma
only made the new vma visible after filling it.  We see no good reason for
that, and it is time to make move_vma more robust.

Removed all its vma merging decisions, leave them to mmap.c's vma_merge, with
copy_vma added.  Removed duplicated is_mergeable_vma test from vma_merge, and
duplicated validate_mm from insert_vm_struct.

move_vma moves from old to new then unmaps old; but on error moves back from
new to old and unmaps new.  Don't unwind within move_page_tables, let move_vma
call it explicitly to unwind, with the right source vma.  Get the
VM_ACCOUNTing right even when the final do_munmap fails.
---
 include/linux/mm.h |   2 +
 mm/mmap.c          |  49 ++++++++++++---
 mm/mremap.c        | 172 +++++++++++++++++------------------------------------
 3 files changed, 97 insertions(+), 126 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f827be900157..43335c61e0da 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -541,6 +541,8 @@ extern void si_meminfo_node(struct sysinfo *val, int nid);
 extern void insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
 extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
+extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
+	unsigned long addr, unsigned long len, unsigned long pgoff);
 extern void exit_mmap(struct mm_struct *);
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/mm/mmap.c b/mm/mmap.c
index 000e377d4888..08e65e8ba699 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -385,7 +385,8 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
  * whether that can be merged with its predecessor or its successor.  Or
  * both (it neatly fills a hole).
  */
-static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
+static struct vm_area_struct *vma_merge(struct mm_struct *mm,
+			struct vm_area_struct *prev,
 			struct rb_node *rb_parent, unsigned long addr, 
 			unsigned long end, unsigned long vm_flags,
 			struct file *file, unsigned long pgoff)
@@ -399,7 +400,7 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 	 * vma->vm_flags & VM_SPECIAL, too.
 	 */
 	if (vm_flags & VM_SPECIAL)
-		return 0;
+		return NULL;
 
 	i_shared_sem = file ? &file->f_mapping->i_shared_sem : NULL;
 
@@ -412,7 +413,6 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 	 * Can it merge with the predecessor?
 	 */
 	if (prev->vm_end == addr &&
-			is_mergeable_vma(prev, file, vm_flags) &&
 			can_vma_merge_after(prev, vm_flags, file, pgoff)) {
 		struct vm_area_struct *next;
 		int need_up = 0;
@@ -443,12 +443,12 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 
 			mm->map_count--;
 			kmem_cache_free(vm_area_cachep, next);
-			return 1;
+			return prev;
 		}
 		spin_unlock(lock);
 		if (need_up)
 			up(i_shared_sem);
-		return 1;
+		return prev;
 	}
 
 	/*
@@ -459,7 +459,7 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
  merge_next:
 		if (!can_vma_merge_before(prev, vm_flags, file,
 				pgoff, (end - addr) >> PAGE_SHIFT))
-			return 0;
+			return NULL;
 		if (end == prev->vm_start) {
 			if (file)
 				down(i_shared_sem);
@@ -469,11 +469,11 @@ static int vma_merge(struct mm_struct *mm, struct vm_area_struct *prev,
 			spin_unlock(lock);
 			if (file)
 				up(i_shared_sem);
-			return 1;
+			return prev;
 		}
 	}
 
-	return 0;
+	return NULL;
 }
 
 /*
@@ -1492,5 +1492,36 @@ void insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
 	if (__vma && __vma->vm_start < vma->vm_end)
 		BUG();
 	vma_link(mm, vma, prev, rb_link, rb_parent);
-	validate_mm(mm);
+}
+
+/*
+ * Copy the vma structure to a new location in the same mm,
+ * prior to moving page table entries, to effect an mremap move.
+ */
+struct vm_area_struct *copy_vma(struct vm_area_struct *vma,
+	unsigned long addr, unsigned long len, unsigned long pgoff)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct vm_area_struct *new_vma, *prev;
+	struct rb_node **rb_link, *rb_parent;
+
+	find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent);
+	new_vma = vma_merge(mm, prev, rb_parent, addr, addr + len,
+			vma->vm_flags, vma->vm_file, pgoff);
+	if (!new_vma) {
+		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+		if (new_vma) {
+			*new_vma = *vma;
+			INIT_LIST_HEAD(&new_vma->shared);
+			new_vma->vm_start = addr;
+			new_vma->vm_end = addr + len;
+			new_vma->vm_pgoff = pgoff;
+			if (new_vma->vm_file)
+				get_file(new_vma->vm_file);
+			if (new_vma->vm_ops && new_vma->vm_ops->open)
+				new_vma->vm_ops->open(new_vma);
+			vma_link(mm, new_vma, prev, rb_link, rb_parent);
+		}
+	}
+	return new_vma;
 }
diff --git a/mm/mremap.c b/mm/mremap.c
index b685f21c2d21..57c52111ea0b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -148,7 +148,7 @@ out:
 static int move_page_tables(struct vm_area_struct *vma,
 	unsigned long new_addr, unsigned long old_addr, unsigned long len)
 {
-	unsigned long offset = len;
+	unsigned long offset;
 
 	flush_cache_range(vma, old_addr, old_addr + len);
 
@@ -157,137 +157,75 @@ static int move_page_tables(struct vm_area_struct *vma,
 	 * easy way out on the assumption that most remappings will be
 	 * only a few pages.. This also makes error recovery easier.
 	 */
-	while (offset) {
-		offset -= PAGE_SIZE;
-		if (move_one_page(vma, old_addr + offset, new_addr + offset))
-			goto oops_we_failed;
+	for (offset = 0; offset < len; offset += PAGE_SIZE) {
+		if (move_one_page(vma, old_addr+offset, new_addr+offset) < 0)
+			break;
 	}
-	return 0;
-
-	/*
-	 * Ok, the move failed because we didn't have enough pages for
-	 * the new page table tree. This is unlikely, but we have to
-	 * take the possibility into account. In that case we just move
-	 * all the pages back (this will work, because we still have
-	 * the old page tables)
-	 */
-oops_we_failed:
-	flush_cache_range(vma, new_addr, new_addr + len);
-	while ((offset += PAGE_SIZE) < len)
-		move_one_page(vma, new_addr + offset, old_addr + offset);
-	zap_page_range(vma, new_addr, len);
-	return -1;
+	return offset;
 }
 
 static unsigned long move_vma(struct vm_area_struct *vma,
-	unsigned long addr, unsigned long old_len, unsigned long new_len,
-	unsigned long new_addr)
+		unsigned long old_addr, unsigned long old_len,
+		unsigned long new_len, unsigned long new_addr)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	struct vm_area_struct *new_vma, *next, *prev;
-	int allocated_vma;
+	struct vm_area_struct *new_vma;
+	unsigned long vm_flags = vma->vm_flags;
+	unsigned long new_pgoff;
+	unsigned long moved_len;
+	unsigned long excess = 0;
 	int split = 0;
 
-	new_vma = NULL;
-	next = find_vma_prev(mm, new_addr, &prev);
-	if (next) {
-		if (prev && prev->vm_end == new_addr &&
-		    can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
-					!(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			prev->vm_end = new_addr + new_len;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = prev;
-			if (next != prev->vm_next)
-				BUG();
-			if (prev->vm_end == next->vm_start &&
-					can_vma_merge(next, prev->vm_flags)) {
-				spin_lock(&mm->page_table_lock);
-				prev->vm_end = next->vm_end;
-				__vma_unlink(mm, next, prev);
-				spin_unlock(&mm->page_table_lock);
-				if (vma == next)
-					vma = prev;
-				mm->map_count--;
-				kmem_cache_free(vm_area_cachep, next);
-			}
-		} else if (next->vm_start == new_addr + new_len &&
-			  	can_vma_merge(next, vma->vm_flags) &&
-				!vma->vm_file && !(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			next->vm_start = new_addr;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = next;
-		}
-	} else {
-		prev = find_vma(mm, new_addr-1);
-		if (prev && prev->vm_end == new_addr &&
-		    can_vma_merge(prev, vma->vm_flags) && !vma->vm_file &&
-				!(vma->vm_flags & VM_SHARED)) {
-			spin_lock(&mm->page_table_lock);
-			prev->vm_end = new_addr + new_len;
-			spin_unlock(&mm->page_table_lock);
-			new_vma = prev;
-		}
+	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
+	new_vma = copy_vma(vma, new_addr, new_len, new_pgoff);
+	if (!new_vma)
+		return -ENOMEM;
+
+	moved_len = move_page_tables(vma, new_addr, old_addr, old_len);
+	if (moved_len < old_len) {
+		/*
+		 * On error, move entries back from new area to old,
+		 * which will succeed since page tables still there,
+		 * and then proceed to unmap new area instead of old.
+		 */
+		move_page_tables(new_vma, old_addr, new_addr, moved_len);
+		vma = new_vma;
+		old_len = new_len;
+		old_addr = new_addr;
+		new_addr = -ENOMEM;
 	}
 
-	allocated_vma = 0;
-	if (!new_vma) {
-		new_vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
-		if (!new_vma)
-			goto out;
-		allocated_vma = 1;
+	/* Conceal VM_ACCOUNT so old reservation is not undone */
+	if (vm_flags & VM_ACCOUNT) {
+		vma->vm_flags &= ~VM_ACCOUNT;
+		excess = vma->vm_end - vma->vm_start - old_len;
+		if (old_addr > vma->vm_start &&
+		    old_addr + old_len < vma->vm_end)
+			split = 1;
 	}
 
-	if (!move_page_tables(vma, new_addr, addr, old_len)) {
-		unsigned long vm_locked = vma->vm_flags & VM_LOCKED;
-
-		if (allocated_vma) {
-			*new_vma = *vma;
-			INIT_LIST_HEAD(&new_vma->shared);
-			new_vma->vm_start = new_addr;
-			new_vma->vm_end = new_addr+new_len;
-			new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT;
-			if (new_vma->vm_file)
-				get_file(new_vma->vm_file);
-			if (new_vma->vm_ops && new_vma->vm_ops->open)
-				new_vma->vm_ops->open(new_vma);
-			insert_vm_struct(current->mm, new_vma);
-		}
+	if (do_munmap(mm, old_addr, old_len) < 0) {
+		/* OOM: unable to split vma, just get accounts right */
+		vm_unacct_memory(excess >> PAGE_SHIFT);
+		excess = 0;
+	}
 
-		/* Conceal VM_ACCOUNT so old reservation is not undone */
-		if (vma->vm_flags & VM_ACCOUNT) {
-			vma->vm_flags &= ~VM_ACCOUNT;
-			if (addr > vma->vm_start) {
-				if (addr + old_len < vma->vm_end)
-					split = 1;
-			} else if (addr + old_len == vma->vm_end)
-				vma = NULL;	/* it will be removed */
-		} else
-			vma = NULL;		/* nothing more to do */
-
-		do_munmap(current->mm, addr, old_len);
-
-		/* Restore VM_ACCOUNT if one or two pieces of vma left */
-		if (vma) {
-			vma->vm_flags |= VM_ACCOUNT;
-			if (split)
-				vma->vm_next->vm_flags |= VM_ACCOUNT;
-		}
+	/* Restore VM_ACCOUNT if one or two pieces of vma left */
+	if (excess) {
+		vma->vm_flags |= VM_ACCOUNT;
+		if (split)
+			vma->vm_next->vm_flags |= VM_ACCOUNT;
+	}
 
-		current->mm->total_vm += new_len >> PAGE_SHIFT;
-		if (vm_locked) {
-			current->mm->locked_vm += new_len >> PAGE_SHIFT;
-			if (new_len > old_len)
-				make_pages_present(new_addr + old_len,
-						   new_addr + new_len);
-		}
-		return new_addr;
+	mm->total_vm += new_len >> PAGE_SHIFT;
+	if (vm_flags & VM_LOCKED) {
+		mm->locked_vm += new_len >> PAGE_SHIFT;
+		if (new_len > old_len)
+			make_pages_present(new_addr + old_len,
+					   new_addr + new_len);
 	}
-	if (allocated_vma)
-		kmem_cache_free(vm_area_cachep, new_vma);
- out:
-	return -ENOMEM;
+
+	return new_addr;
 }
 
 /*
-- 
cgit v1.2.3


From 2039e7b519e24f743a708a5f10bdc95273d1e077 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:16:45 -0700
Subject: [PATCH] mremap: vma_relink_file race fix

From: Hugh Dickins <hugh@veritas.com>

Subtle point from Rajesh Venkatasubramanian: when mremap's move_vma fails and
so rewinds, before moving the file-based ptes back, we must move new_vma
before old vma in the i_mmap or i_mmap_shared list, so that when racing
against vmtruncate we cannot propagate pages to be truncated back from
new_vma into the just cleaned old_vma.
---
 include/linux/mm.h |  1 +
 mm/mmap.c          | 21 +++++++++++++++++++++
 mm/mremap.c        |  7 +++++++
 3 files changed, 29 insertions(+)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43335c61e0da..6d6abe8c656e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -543,6 +543,7 @@ extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
 	struct rb_node **, struct rb_node *);
 extern struct vm_area_struct *copy_vma(struct vm_area_struct *,
 	unsigned long addr, unsigned long len, unsigned long pgoff);
+extern void vma_relink_file(struct vm_area_struct *, struct vm_area_struct *);
 extern void exit_mmap(struct mm_struct *);
 
 extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
diff --git a/mm/mmap.c b/mm/mmap.c
index 08e65e8ba699..eed4e083bca1 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1525,3 +1525,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct *vma,
 	}
 	return new_vma;
 }
+
+/*
+ * Position vma after prev in shared file list:
+ * for mremap move error recovery racing against vmtruncate.
+ */
+void vma_relink_file(struct vm_area_struct *vma, struct vm_area_struct *prev)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	struct address_space *mapping;
+
+	if (vma->vm_file) {
+		mapping = vma->vm_file->f_mapping;
+		if (mapping) {
+			down(&mapping->i_shared_sem);
+			spin_lock(&mm->page_table_lock);
+			list_move(&vma->shared, &prev->shared);
+			spin_unlock(&mm->page_table_lock);
+			up(&mapping->i_shared_sem);
+		}
+	}
+}
diff --git a/mm/mremap.c b/mm/mremap.c
index 57c52111ea0b..c3502aa241f2 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -187,7 +187,14 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		 * On error, move entries back from new area to old,
 		 * which will succeed since page tables still there,
 		 * and then proceed to unmap new area instead of old.
+		 *
+		 * Subtle point from Rajesh Venkatasubramanian: before
+		 * moving file-based ptes, move new_vma before old vma
+		 * in the i_mmap or i_mmap_shared list, so when racing
+		 * against vmtruncate we cannot propagate pages to be
+		 * truncated back from new_vma into just cleaned old.
 		 */
+		vma_relink_file(vma, new_vma);
 		move_page_tables(new_vma, old_addr, new_addr, moved_len);
 		vma = new_vma;
 		old_len = new_len;
-- 
cgit v1.2.3


From 93d33a4885a483c708ccb7d24b56e0d5fef7bcab Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:17:38 -0700
Subject: [PATCH] laptop mode

From: Bart Samwel <bart@samwel.tk>

Adds /proc/sys/vm/laptop-mode: a special knob which says "this is a laptop".
In this mode the kernel will attempt to avoid spinning disks up.

Algorithm: the idea is to hold dirty data in memory for a long time, but to
flush everything which has been accumulated if the disk happens to spin up
for other reasons.

- Whenever a disk request completes (read or write), schedule a timer a few
  seconds hence.  If the timer was already pending, reset it to a few seconds
  hence.

- When the timer expires, write back the whole world.  We use
  sync_filesystems() for this because it will force ext3 journal commits as
  well.

- In balance_dirty_pages(), kick off background writeback when we hit the
  high threshold (dirty_ratio), not when we hit the low threshold.  This has
  the effect of causing "lumpy" writeback which is something I spent a year
  fixing, but in laptop mode, it is desirable.

- In try_to_free_pages(), only kick pdflush if the VM is getting into
  distress: we want to keep scanning for clean pages, deferring writeback.

- In page reclaim, avoid writing back the odd random dirty page off the
  LRU: only start I/O if the scanning is working harder.

The effect is to perform a sync() a few seconds after all I/O has ceased.

The value which was written into /proc/sys/vm/laptop-mode determines, in
seconds, the delay between the final I/O and the flush.

Additionally, the patch adds tools which help answer the question "why the
heck does my disk spin up all the time?".  The user may set
/proc/sys/vm/block_dump to a non-zero value and the kernel will print out
information which will identify the process which is performing disk reads or
which is dirtying pagecache.

The user should probably disable syslogd before setting block_dump.
---
 Documentation/laptop-mode.txt | 665 ++++++++++++++++++++++++++++++++++++++++++
 drivers/block/ll_rw_blk.c     |  14 +
 fs/buffer.c                   |   2 +
 fs/fs-writeback.c             |   3 +
 include/linux/sysctl.h        |   2 +
 include/linux/writeback.h     |   6 +-
 kernel/sysctl.c               |  20 ++
 mm/page-writeback.c           |  69 ++++-
 mm/vmscan.c                   |  61 ++--
 9 files changed, 815 insertions(+), 27 deletions(-)
 create mode 100644 Documentation/laptop-mode.txt

(limited to 'include')

diff --git a/Documentation/laptop-mode.txt b/Documentation/laptop-mode.txt
new file mode 100644
index 000000000000..9df8d2677bef
--- /dev/null
+++ b/Documentation/laptop-mode.txt
@@ -0,0 +1,665 @@
+How to conserve battery power using laptop-mode
+-----------------------------------------------
+
+Document Author: Bart Samwel (bart@samwel.tk)
+Date created: January 2, 2004
+Last modified: April 3, 2004
+
+Introduction
+------------
+
+Laptop mode is used to minimize the time that the hard disk needs to be spun up,
+to conserve battery power on laptops. It has been reported to cause significant
+power savings.
+
+Contents
+--------
+
+* Introduction
+* The short story
+* Caveats
+* The details
+* Tips & Tricks
+* Control script
+* ACPI integration
+* Monitoring tool
+
+
+The short story
+---------------
+
+If you just want to use it, run the laptop_mode control script (which is included
+at the end of this document) as follows:
+
+# laptop_mode start
+
+Then set your harddisk spindown time to a relatively low value with hdparm:
+
+hdparm -S 4 /dev/hda
+
+The value -S 4 means 20 seconds idle time before spindown. Your harddisk will
+now only spin up when a disk cache miss occurs, or at least once every 10
+minutes to write back any pending changes.
+
+To stop laptop_mode, run "laptop_mode stop".
+
+
+Caveats
+-------
+
+* The downside of laptop mode is that you have a chance of losing up
+  to 10 minutes of work. If you cannot afford this, don't use it! It's
+  wise to turn OFF laptop mode when you're almost out of battery --
+  although this will make the battery run out faster, at least you'll
+  lose less work when it actually runs out. I'm still looking for someone
+  to submit instructions on how to turn off laptop mode when battery is low,
+  e.g., using ACPI events. I don't have a laptop myself, so if you do and
+  you care to contribute such instructions, please do.
+
+* Most desktop hard drives have a very limited lifetime measured in spindown
+  cycles, typically about 50,000 times (it's usually listed on the spec sheet).
+  Check your drive's rating, and don't wear down your drive's lifetime if you
+  don't need to.
+
+* If you mount some of your ext3/reiserfs filesystems with the -n option, then
+  the control script will not be able to remount them correctly. You must set
+  DO_REMOUNTS=0 in the control script, otherwise it will remount them with the
+  wrong options -- or it will fail because it cannot write to /etc/mtab.
+
+* If you have your filesystems listed as type "auto" in fstab, like I did, then
+  the control script will not recognize them as filesystems that need remounting.
+
+* If you have XFS, make SURE that you set the XFS_HZ value in the control script
+  correctly, to the value of HZ of your running kernel. Laptop mode will not
+  work correctly if it is set too low, and you may lose data if it is set too
+  high. The reason for this problem is that XFS does not export its sysctl
+  variables in centisecs (like most other subsystems do) but in "jiffies",
+  which is an internal kernel measure. Once this is fixed things will get better.
+
+
+The details
+-----------
+
+Laptop-mode is controlled by the flag /proc/sys/vm/laptop_mode. When this
+flag is set, any physical disk read operation (that might have caused the
+hard disk to spin up) causes Linux to flush all dirty blocks. The result
+of this is that after a disk has spun down, it will not be spun up anymore
+to write dirty blocks, because those blocks had already been written
+immediately after the most recent read operation.
+
+To increase the effectiveness of the laptop_mode strategy, the laptop_mode
+control script increases dirty_expire_centisecs and dirty_writeback_centisecs in
+/proc/sys/vm to about 10 minutes (by default), which means that pages that are
+dirtied are not forced to be written to disk as often. The control script also
+changes the dirty background ratio, so that background writeback of dirty pages
+is not done anymore. Combined with a higher commit value (also 10 minutes) for
+ext3 or ReiserFS filesystems (also done automatically by the control script),
+this results in concentration of disk activity in a small time interval which
+occurs only once every 10 minutes, or whenever the disk is forced to spin up by
+a cache miss. The disk can then be spun down in the periods of inactivity.
+
+If you want to find out which process caused the disk to spin up, you can
+gather information by setting the flag /proc/sys/vm/block_dump. When this flag
+is set, Linux reports all disk read and write operations that take place, and
+all block dirtyings done to files. This makes it possible to debug why a disk
+needs to spin up, and to increase battery life even more. The output of
+block_dump is written to the kernel output, and it can be retrieved using
+"dmesg". When you use block_dump, you may want to turn off klogd, otherwise
+the output of block_dump will be logged, causing disk activity that is not
+normally there.
+
+If 10 minutes is too much or too little downtime for you, you can configure
+this downtime as follows. In the control script, set the MAX_AGE value to the
+maximum number of seconds of disk downtime that you would like. You should
+then set your filesystem's commit interval to the same value. The dirty ratio
+is also configurable from the control script.
+
+If you don't like the idea of the control script remounting your filesystems
+for you, you can change DO_REMOUNTS to 0 in the script.
+
+Thanks to Kiko Piris, the control script can be used to enable laptop mode on
+both the Linux 2.4 and 2.6 series.
+
+
+Tips & Tricks
+-------------
+
+* Bartek Kania reports getting up to 50 minutes of extra battery life (on top
+  of his regular 3 to 3.5 hours) using very aggressive power management (hdparm
+  -B1) and a spindown time of 5 seconds (hdparm -S1).
+
+* You can spin down the disk while playing MP3, by setting the disk readahead
+  to 8MB (hdparm -a 16384). Effectively, the disk will read a complete MP3 at
+  once, and will then spin down while the MP3 is playing. (Thanks to Bartek
+  Kania.)
+
+* Drew Scott Daniels observed: "I don't know why, but when I decrease the number
+  of colours that my display uses it consumes less battery power. I've seen
+  this on powerbooks too. I hope that this is a piece of information that
+  might be useful to the Laptop Mode patch or it's users."
+
+* One thing which will cause disks to spin up is not-present application
+  and dynamic library text pages.  The kernel will load program text off disk
+  on-demand, so each time you invoke an application feature for the first
+  time, the kernel needs to spin the disk up to go and fetch that part of the
+  application.
+
+  So it is useful to increase the disk readahead parameter greatly, so that
+  the kernel will pull all of the executable's pages into memory on the first
+  pagefault.
+
+  The supplied script does this.
+
+* In syslog.conf, you can prefix entries with a dash ``-'' to omit syncing the
+  file after every logging. When you're using laptop-mode and your disk doesn't
+  spin down, this is a likely culprit.
+
+* Richard Atterer observed that laptop mode does not work well with noflushd
+  (http://noflushd.sourceforge.net/); it seems that noflushd prevents laptop-mode
+  from doing its thing.
+
+
+Control script
+--------------
+
+Please note that this control script works for the Linux 2.4 and 2.6 series.
+
+--------------------CONTROL SCRIPT BEGIN------------------------------------------
+#! /bin/sh
+
+# start or stop laptop_mode, best run by a power management daemon when
+# ac gets connected/disconnected from a laptop
+#
+# install as /sbin/laptop_mode
+#
+# Contributors to this script:   Kiko Piris
+#				 Bart Samwel
+#				 Micha Feigin
+#				 Andrew Morton
+#				 Dax Kelson
+#
+# Original Linux 2.4 version by: Jens Axboe
+
+# Remove an option (the first parameter) of the form option=<number> from
+# a mount options string (the rest of the parameters).
+parse_mount_opts () {
+	OPT="$1"
+	shift
+	echo "$*"			| \
+	sed 's/.*/,&,/'			| \
+	sed 's/,'"$OPT"'=[0-9]*,/,/g'	| \
+	sed 's/,,*/,/g'			| \
+	sed 's/^,//'			| \
+	sed 's/,$//'			| \
+	cat -
+}
+
+# Remove an option (the first parameter) without any arguments from
+# a mount option string (the rest of the parameters).
+parse_nonumber_mount_opts () {
+	OPT="$1"
+	shift
+	echo "$*" 			| \
+	sed 's/.*/,&,/'			| \
+	sed 's/,'"$OPT"',/,/g'		| \
+	sed 's/,,*/,/g'			| \
+	sed 's/^,//'			| \
+	sed 's/,$//'			| \
+	cat -
+}
+
+# Find out the state of a yes/no option (e.g. "atime"/"noatime") in
+# fstab for a given filesystem, and use this state to replace the
+# value of the option in another mount options string. The device
+# is the first argument, the option name the second, and the default
+# value the third. The remainder is the mount options string.
+#
+# Example:
+# parse_yesno_opts_wfstab /dev/hda1 atime atime defaults,noatime
+#
+# If fstab contains, say, "rw" for this filesystem, then the result
+# will be "defaults,atime".
+parse_yesno_opts_wfstab () {
+	L_DEV=$1
+	shift
+	OPT=$1
+	shift
+	DEF_OPT=$1
+	shift
+	L_OPTS="$*"
+	PARSEDOPTS1="$(parse_nonumber_mount_opts $OPT $L_OPTS)"
+	PARSEDOPTS1="$(parse_nonumber_mount_opts no$OPT $PARSEDOPTS1)"
+	# Look up the fstab options; tabs become spaces so "$L_DEV " can match.
+	FSTAB_OPTS="$(cat /etc/fstab | sed 's/[[:space:]]/ /g' | grep ^\ *"$L_DEV " | awk '{ print $4 }')"
+	if [ -z "$(echo "$FSTAB_OPTS" | grep "$OPT")" ] ; then
+		# option not specified in fstab -- choose the default.
+		echo "$PARSEDOPTS1,$DEF_OPT"
+	else
+		# option specified in fstab: extract the value and use it
+		if [ -z "$(echo "$FSTAB_OPTS" | grep "no$OPT")" ] ; then
+			# no$OPT not found -- so we must have $OPT.
+			echo "$PARSEDOPTS1,$OPT"
+		else
+			echo "$PARSEDOPTS1,no$OPT"
+		fi
+	fi
+}
+
+# Find out the state of a numbered option (e.g. "commit=NNN") in
+# fstab for a given filesystem, and use this state to replace the
+# value of the option in another mount options string. The device
+# is the first argument, and the option name the second. The
+# remainder is the mount options string in which the replacement
+# must be done.
+#
+# Example:
+# parse_mount_opts_wfstab /dev/hda1 commit defaults,commit=7
+#
+# If fstab contains, say, "commit=3,rw" for this filesystem, then the
+# result will be "defaults,commit=3".
+parse_mount_opts_wfstab () {
+	L_DEV=$1
+	shift
+	OPT=$1
+	shift
+	L_OPTS="$*"
+
+	PARSEDOPTS1="$(parse_mount_opts $OPT $L_OPTS)"
+	# Watch for a default commit in fstab
+	FSTAB_OPTS="$(cat /etc/fstab | sed 's/	/ /g' | grep ^\ *"$L_DEV " | awk '{ print $4 }')"
+	if [ -z "$(echo "$FSTAB_OPTS" | grep "$OPT=")" ] ; then
+		# option not specified in fstab: set it to 0
+		echo "$PARSEDOPTS1,$OPT=0"
+	else
+		# option specified in fstab: extract the value, and use it
+		echo -n "$PARSEDOPTS1,$OPT="
+		echo "$FSTAB_OPTS"	| \
+		sed 's/.*/,&,/'		| \
+		sed 's/.*,'"$OPT"'=//'	| \
+		sed 's/,.*//'		| \
+		cat -
+	fi
+}
+
+KLEVEL="$(uname -r | cut -c1-3)"
+case "$KLEVEL" in
+	"2.4"|"2.6")
+		true
+		;;
+	*)
+		echo "Unhandled kernel version: $KLEVEL ('uname -r' = '$(uname -r)')"
+		exit 1
+		;;
+esac
+
+# Shall we remount journaled fs. with appropriate commit interval? (1=yes)
+DO_REMOUNTS=1
+
+# age time, in seconds. should be put into a sysconfig file
+MAX_AGE=600
+
+# Dirty synchronous ratio.  At this percentage of dirty pages the process which
+# calls write() does its own writeback
+DIRTY_RATIO=40
+
+#
+# Allowed dirty background ratio, in percent.  Once DIRTY_RATIO has been
+# exceeded, the kernel will wake pdflush which will then reduce the amount
+# of dirty memory to dirty_background_ratio.  Set this nice and low, so once
+# some writeout has commenced, we do a lot of it.
+#
+DIRTY_BACKGROUND_RATIO=5
+
+READAHEAD=4096		# kilobytes
+
+# kernel default dirty buffer age
+DEF_AGE=30
+DEF_UPDATE=5
+DEF_DIRTY_BACKGROUND_RATIO=10
+DEF_DIRTY_RATIO=40
+DEF_XFS_AGE_BUFFER=15
+DEF_XFS_SYNC_INTERVAL=30
+
+# This must be adjusted manually to the value of HZ in the running kernel,
+# until the XFS people change their external interfaces to work in centisecs
+# like the rest of the external world. Unfortunately this cannot be automated. :(
+XFS_HZ=1000
+
+if [ ! -e /proc/sys/vm/laptop_mode ]; then
+	echo "Kernel is not patched with laptop_mode patch."
+	exit 1
+fi
+
+if [ ! -w /proc/sys/vm/laptop_mode ]; then
+	echo "You do not have enough privileges to enable laptop_mode."
+	exit 1
+fi
+
+case "$1" in
+	start)
+		AGE=$((100*$MAX_AGE))
+		XFS_AGE=$(($XFS_HZ*$MAX_AGE))
+		echo -n "Starting laptop_mode"
+
+		if [ -d /proc/sys/vm/pagebuf ] ; then
+			# This only needs to be set, not reset -- it is only used when
+			# laptop mode is enabled.
+			echo $XFS_AGE > /proc/sys/vm/pagebuf/lm_flush_age
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+		elif [ -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+			# The same goes for these.
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_age_buffer
+			echo $XFS_AGE > /proc/sys/fs/xfs/lm_sync_interval
+		elif [ -f /proc/sys/fs/xfs/age_buffer ] ; then
+			# But not for these -- they are also used in normal
+			# operation.
+			echo $XFS_AGE > /proc/sys/fs/xfs/age_buffer
+			echo $XFS_AGE > /proc/sys/fs/xfs/sync_interval
+		fi
+
+		case "$KLEVEL" in
+			"2.4")
+				echo "1"				> /proc/sys/vm/laptop_mode
+				echo "30 500 0 0 $AGE $AGE 60 20 0"	> /proc/sys/vm/bdflush
+				;;
+			"2.6")
+				echo "5"				> /proc/sys/vm/laptop_mode
+				echo "$AGE"				> /proc/sys/vm/dirty_writeback_centisecs
+				echo "$AGE"				> /proc/sys/vm/dirty_expire_centisecs
+				echo "$DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
+				echo "$DIRTY_BACKGROUND_RATIO"		> /proc/sys/vm/dirty_background_ratio
+				;;
+		esac
+		if [ $DO_REMOUNTS -eq 1 ]; then
+			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+				PARSEDOPTS="$(parse_mount_opts "$OPTS")"
+				case "$FST" in
+					"ext3"|"reiserfs")
+						PARSEDOPTS="$(parse_mount_opts commit "$OPTS")"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS,commit=$MAX_AGE,noatime
+						;;
+					"xfs")
+						mount $DEV -t $FST $MP -o remount,$OPTS,noatime
+						;;
+				esac
+				if [ -b $DEV ] ; then
+					blockdev --setra $(($READAHEAD * 2)) $DEV
+				fi
+			done
+		fi
+		echo "."
+		;;
+	stop)
+		U_AGE=$((100*$DEF_UPDATE))
+		B_AGE=$((100*$DEF_AGE))
+		echo -n "Stopping laptop_mode"
+		echo "0" > /proc/sys/vm/laptop_mode
+		if [ -f /proc/sys/fs/xfs/age_buffer ] && [ ! -f /proc/sys/fs/xfs/lm_age_buffer ] ; then
+			# These need to be restored though, if there are no lm_*.
+			echo "$(($XFS_HZ*$DEF_XFS_AGE_BUFFER))" 	> /proc/sys/fs/xfs/age_buffer
+			echo "$(($XFS_HZ*$DEF_XFS_SYNC_INTERVAL))" 	> /proc/sys/fs/xfs/sync_interval
+		fi
+		case "$KLEVEL" in
+			"2.4")
+				echo "30 500 0 0 $U_AGE $B_AGE 60 20 0"	> /proc/sys/vm/bdflush
+				;;
+			"2.6")
+				echo "$U_AGE"				> /proc/sys/vm/dirty_writeback_centisecs
+				echo "$B_AGE"				> /proc/sys/vm/dirty_expire_centisecs
+				echo "$DEF_DIRTY_RATIO"			> /proc/sys/vm/dirty_ratio
+				echo "$DEF_DIRTY_BACKGROUND_RATIO"	> /proc/sys/vm/dirty_background_ratio
+				;;
+		esac
+		if [ $DO_REMOUNTS -eq 1 ]; then
+			cat /etc/mtab | while read DEV MP FST OPTS DUMP PASS ; do
+				# Reset commit and atime options to defaults.
+				case "$FST" in
+					"ext3"|"reiserfs")
+						PARSEDOPTS="$(parse_mount_opts_wfstab $DEV commit $OPTS)"
+						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $PARSEDOPTS)"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+						;;
+					"xfs")
+						PARSEDOPTS="$(parse_yesno_opts_wfstab $DEV atime atime $OPTS)"
+						mount $DEV -t $FST $MP -o remount,$PARSEDOPTS
+						;;
+				esac
+				if [ -b $DEV ] ; then
+					blockdev --setra 256 $DEV
+				fi
+			done
+		fi
+		echo "."
+		;;
+	*)
+		echo "Usage: $0 {start|stop}"
+		;;
+
+esac
+
+exit 0
+
+--------------------CONTROL SCRIPT END--------------------------------------------
+
+
+ACPI integration
+----------------
+
+Dax Kelson submitted this so that the ACPI acpid daemon will
+kick off the laptop_mode script and run hdparm.
+
+---------------------------/etc/acpi/events/ac_adapter BEGIN-------------------------------------------
+event=ac_adapter
+action=/etc/acpi/actions/battery.sh
+---------------------------/etc/acpi/events/ac_adapter END-------------------------------------------
+
+---------------------------/etc/acpi/actions/battery.sh BEGIN-------------------------------------------
+#!/bin/sh
+
+# cpu throttling
+# cat /proc/acpi/processor/CPU0/throttling for more info
+ACAD_THR=0
+BATT_THR=2
+
+# spindown time for HD (man hdparm for valid values)
+# I prefer 2 hours for acad and 20 seconds for batt
+ACAD_HD=244
+BATT_HD=4
+
+# ac/battery event handler: on AC stop laptop mode, on battery start it.
+# The control script is installed as /sbin/laptop_mode (underscore).
+status=`awk '/^state: / { print $2 }' /proc/acpi/ac_adapter/AC/state`
+
+case $status in
+        "on-line")
+                echo "Setting HD spindown to 2 hours"
+                /sbin/laptop_mode stop
+                /sbin/hdparm -S $ACAD_HD /dev/hda > /dev/null 2>&1
+                /sbin/hdparm -B 255 /dev/hda > /dev/null 2>&1
+                #echo -n $ACAD_CPU:$ACAD_THR > /proc/acpi/processor/CPU0/limit
+                exit 0
+        ;;
+        "off-line")
+                echo "Setting HD spindown to 20 seconds"
+                /sbin/laptop_mode start
+                /sbin/hdparm -S $BATT_HD /dev/hda > /dev/null 2>&1
+                /sbin/hdparm -B 1 /dev/hda > /dev/null 2>&1
+                #echo -n $BATT_CPU:$BATT_THR > /proc/acpi/processor/CPU0/limit
+                exit 0
+        ;;
+esac
+---------------------------/etc/acpi/actions/battery.sh END-------------------------------------------
+
+Monitoring tool
+---------------
+
+Bartek Kania submitted this, it can be used to measure how much time your disk
+spends spun up/down.
+
+---------------------------dslm.c BEGIN-------------------------------------------
+/*
+ * Simple Disk Sleep Monitor
+ *  by Bartek Kania
+ * Licenced under the GPL
+ */
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <time.h>
+#include <string.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <linux/hdreg.h>
+
+#ifdef DEBUG
+#define D(x) x
+#else
+#define D(x)
+#endif
+
+int endit = 0;
+
+/* Check if the disk is in powersave-mode
+ * Most of the code is stolen from hdparm.
+ * 1 = active, 0 = standby/sleep, -1 = unknown */
+int check_powermode(int fd)
+{
+    unsigned char args[4] = {WIN_CHECKPOWERMODE1,0,0,0};
+    int state;
+
+    if (ioctl(fd, HDIO_DRIVE_CMD, &args)
+	&& (args[0] = WIN_CHECKPOWERMODE2) /* try again with 0x98 */
+	&& ioctl(fd, HDIO_DRIVE_CMD, &args)) {
+	if (errno != EIO || args[0] != 0 || args[1] != 0) {
+	    state = -1; /* "unknown"; */
+	} else
+	    state = 0; /* "sleeping"; */
+    } else {
+	state = (args[2] == 255) ? 1 : 0;
+    }
+    D(printf(" drive state is:  %d\n", state));
+
+    return state;
+}
+
+char *state_name(int i)
+{
+    if (i == -1) return "unknown";
+    if (i == 0) return "sleeping";
+    if (i == 1) return "active";
+
+    return "internal error";
+}
+
+char *myctime(time_t time)
+{
+    char *ts = ctime(&time);
+    ts[strlen(ts) - 1] = 0;
+
+    return ts;
+}
+
+void measure(int fd)
+{
+    /* Poll the drive state once a second until SIGINT, then print totals. */
+    time_t start_time;
+    int last_state;
+    time_t last_time;
+    int curr_state;
+    time_t curr_time = 0;
+    time_t time_diff;
+    time_t active_time = 0;
+    time_t sleep_time = 0;
+    time_t unknown_time = 0;
+    time_t total_time = 0;
+    int changes = 0;
+    float tmp;
+
+    printf("Starting measurements\n");
+    last_state = check_powermode(fd);
+    start_time = last_time = time(0);
+    printf("  System is in state %s\n\n", state_name(last_state));
+
+    while(!endit) {
+	sleep(1);
+	curr_state = check_powermode(fd);
+
+	if (curr_state != last_state || endit) {
+	    changes++;
+	    curr_time = time(0);
+	    time_diff = curr_time - last_time;
+
+	    if (last_state == 1) active_time += time_diff;
+	    else if (last_state == 0) sleep_time += time_diff;
+	    else unknown_time += time_diff;
+
+	    last_state = curr_state;
+	    last_time = curr_time;
+
+	    printf("%s: State-change to %s\n", myctime(curr_time),
+		   state_name(curr_state));
+	}
+    }
+    changes--; /* Compensate for SIGINT */
+    /* Report the whole run; curr_time is only the last state change (or 0). */
+    total_time = time(0) - start_time;
+    printf("\nTotal running time:  %lus\n", total_time);
+    printf(" State changed %d times\n", changes);
+
+    tmp = (float)sleep_time / (float)total_time * 100;
+    printf(" Time in sleep state:   %lus (%.2f%%)\n", sleep_time, tmp);
+    tmp = (float)active_time / (float)total_time * 100;
+    printf(" Time in active state:  %lus (%.2f%%)\n", active_time, tmp);
+    tmp = (float)unknown_time / (float)total_time * 100;
+    printf(" Time in unknown state: %lus (%.2f%%)\n", unknown_time, tmp);
+}
+
+void ender(int s)
+{
+    endit = 1;
+}
+
+void usage()
+{
+    puts("usage: dslm [-w <time>] <disk>");
+    exit(0);
+}
+
+int main(int ac, char **av)
+{
+    int fd;
+    char *disk = 0;
+    int settle_time = 60;
+
+    /* Parse the simple command-line */
+    if (ac == 2)
+	disk = av[1];
+    else if (ac == 4) {
+	settle_time = atoi(av[2]);
+	disk = av[3];
+    } else
+	usage();
+
+    /* open() signals failure with -1; 0 is a valid descriptor. */
+    if ((fd = open(disk, O_RDONLY|O_NONBLOCK)) < 0) {
+	printf("Can't open %s, because: %s\n", disk, strerror(errno));
+	exit(-1);
+    }
+
+    if (settle_time) {
+	printf("Waiting %d seconds for the system to settle down to "
+	       "'normal'\n", settle_time);
+	sleep(settle_time);
+    } else
+	puts("Not waiting for system to settle down");
+
+    signal(SIGINT, ender);
+
+    measure(fd);
+    close(fd);
+
+    return 0;
+}
+---------------------------dslm.c END---------------------------------------------
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 7b9f0c75bffd..fc4b6c698fcf 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -27,6 +27,7 @@
 #include <linux/completion.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
+#include <linux/writeback.h>
 
 /*
  * for max sense size
@@ -2471,6 +2472,16 @@ int submit_bio(int rw, struct bio *bio)
 		mod_page_state(pgpgout, count);
 	else
 		mod_page_state(pgpgin, count);
+
+	if (unlikely(block_dump)) {
+		char b[BDEVNAME_SIZE];
+		printk("%s(%d): %s block %Lu on %s\n",
+			current->comm, current->pid,
+			(rw & WRITE) ? "WRITE" : "READ",
+			(unsigned long long)bio->bi_sector,
+			bdevname(bio->bi_bdev,b));
+	}
+
 	generic_make_request(bio);
 	return 1;
 }
@@ -2754,6 +2765,9 @@ void end_that_request_last(struct request *req)
 	struct gendisk *disk = req->rq_disk;
 	struct completion *waiting = req->waiting;
 
+	if (unlikely(laptop_mode))
+		laptop_io_completion();
+
 	if (disk && blk_fs_request(req)) {
 		unsigned long duration = jiffies - req->start_time;
 		switch (rq_data_dir(req)) {
diff --git a/fs/buffer.c b/fs/buffer.c
index 42b61de10bf3..605ce2099aa5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -274,6 +274,8 @@ static void do_sync(unsigned long wait)
 	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
 	if (!wait)
 		printk("Emergency Sync complete\n");
+	if (unlikely(laptop_mode))
+		laptop_sync_completion();
 }
 
 asmlinkage long sys_sync(void)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 591c5eb79ba3..23e367ed22f7 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -75,6 +75,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) == flags)
 		return;
 
+	if (unlikely(block_dump))
+		printk("%s(%d): dirtied file\n", current->comm, current->pid);
+
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 3767428df94d..d2224f6617f9 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -159,6 +159,8 @@ enum
 	VM_LOWER_ZONE_PROTECTION=20,/* Amount of protection of lower zones */
 	VM_MIN_FREE_KBYTES=21,	/* Minimum free kilobytes to maintain */
 	VM_MAX_MAP_COUNT=22,	/* int: Maximum number of mmaps/address-space */
+	VM_LAPTOP_MODE=23,	/* vm laptop mode */
+	VM_BLOCK_DUMP=24,	/* block dump mode */
 };
 
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 7380d2cefb16..f557b55e8b0a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -72,12 +72,16 @@ static inline void wait_on_inode(struct inode *inode)
  * mm/page-writeback.c
  */
 int wakeup_bdflush(long nr_pages);
+void laptop_io_completion(void);
+void laptop_sync_completion(void);
 
-/* These 5 are exported to sysctl. */
+/* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
 extern int dirty_writeback_centisecs;
 extern int dirty_expire_centisecs;
+extern int block_dump;
+extern int laptop_mode;
 
 struct ctl_table;
 struct file;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index f2c8c8ce4926..05ea59ae4276 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -744,6 +744,26 @@ static ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
+	{
+		.ctl_name	= VM_LAPTOP_MODE,
+		.procname	= "laptop_mode",
+		.data		= &laptop_mode,
+		.maxlen		= sizeof(laptop_mode),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
+	{
+		.ctl_name	= VM_BLOCK_DUMP,
+		.procname	= "block_dump",
+		.data		= &block_dump,
+		.maxlen		= sizeof(block_dump),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1981309fa9c5..9cf47af10ccc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -28,6 +28,7 @@
 #include <linux/smp.h>
 #include <linux/sysctl.h>
 #include <linux/cpu.h>
+#include <linux/syscalls.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -81,6 +82,16 @@ int dirty_writeback_centisecs = 5 * 100;
  */
 int dirty_expire_centisecs = 30 * 100;
 
+/*
+ * Flag that makes the machine dump writes/reads and block dirtyings.
+ */
+int block_dump;
+
+/*
+ * Flag that puts the machine in "laptop mode".
+ */
+int laptop_mode;
+
 /* End of sysctl-exported parameters */
 
 
@@ -195,7 +206,19 @@ static void balance_dirty_pages(struct address_space *mapping)
 	if (nr_reclaimable + ps.nr_writeback <= dirty_thresh)
 		dirty_exceeded = 0;
 
-	if (!writeback_in_progress(bdi) && nr_reclaimable > background_thresh)
+	if (writeback_in_progress(bdi))
+		return;		/* pdflush is already working this queue */
+
+	/*
+	 * In laptop mode, we wait until hitting the higher threshold before
+	 * starting background writeout, and then write out all the way down
+	 * to the lower threshold.  So slow writers cause minimal disk activity.
+	 *
+	 * In normal mode, we start background writeout at the lower
+	 * background_thresh, to keep the amount of dirty memory low.
+	 */
+	if ((laptop_mode && pages_written) ||
+	     (!laptop_mode && (nr_reclaimable > background_thresh)))
 		pdflush_operation(background_writeout, 0);
 }
 
@@ -289,7 +312,13 @@ int wakeup_bdflush(long nr_pages)
 	return pdflush_operation(background_writeout, nr_pages);
 }
 
-static struct timer_list wb_timer;
+static void wb_timer_fn(unsigned long unused);
+static void laptop_timer_fn(unsigned long unused);
+
+static struct timer_list wb_timer =
+			TIMER_INITIALIZER(wb_timer_fn, 0, 0);
+static struct timer_list laptop_mode_wb_timer =
+			TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
 
 /*
  * Periodic writeback of "old" data.
@@ -368,7 +397,36 @@ static void wb_timer_fn(unsigned long unused)
 {
 	if (pdflush_operation(wb_kupdate, 0) < 0)
 		mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
+}
+
+static void laptop_flush(unsigned long unused)
+{
+	sys_sync();
+}
+
+static void laptop_timer_fn(unsigned long unused)
+{
+	pdflush_operation(laptop_flush, 0);
+}
 
+/*
+ * We've spun up the disk and we're in laptop mode: schedule writeback
+ * of all dirty data a few seconds from now.  If the flush is already scheduled
+ * then push it back - the user is still using the disk.
+ */
+void laptop_io_completion(void)
+{
+	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+}
+
+/*
+ * We're in laptop mode and we've just synced. The sync's writes will have
+ * caused another writeback to be scheduled by laptop_io_completion.
+ * Nothing needs to be written back anymore, so we unschedule the writeback.
+ */
+void laptop_sync_completion(void)
+{
+	del_timer(&laptop_mode_wb_timer);
 }
 
 /*
@@ -429,12 +487,7 @@ void __init page_writeback_init(void)
 		vm_dirty_ratio *= correction;
 		vm_dirty_ratio /= 100;
 	}
-
-	init_timer(&wb_timer);
-	wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
-	wb_timer.data = 0;
-	wb_timer.function = wb_timer_fn;
-	add_timer(&wb_timer);
+	mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
 	set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0728eadc0eb7..39e8ed0fcdd6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -246,7 +246,8 @@ static void handle_write_error(struct address_space *mapping,
  * shrink_list returns the number of reclaimed pages
  */
 static int
-shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
+shrink_list(struct list_head *page_list, unsigned int gfp_mask,
+		int *nr_scanned, int do_writepage)
 {
 	struct address_space *mapping;
 	LIST_HEAD(ret_pages);
@@ -354,6 +355,8 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int *nr_scanned)
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
+			if (laptop_mode && !do_writepage)
+				goto keep_locked;
 			if (clear_page_dirty_for_io(page)) {
 				int res;
 				struct writeback_control wbc = {
@@ -473,7 +476,7 @@ keep:
  */
 static int
 shrink_cache(struct zone *zone, unsigned int gfp_mask,
-		int max_scan, int *total_scanned)
+		int max_scan, int *total_scanned, int do_writepage)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
@@ -521,7 +524,8 @@ shrink_cache(struct zone *zone, unsigned int gfp_mask,
 			mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
 		else
 			mod_page_state_zone(zone, pgscan_direct, nr_scan);
-		nr_freed = shrink_list(&page_list, gfp_mask, total_scanned);
+		nr_freed = shrink_list(&page_list, gfp_mask,
+					total_scanned, do_writepage);
 		*total_scanned += nr_taken;
 		if (current_is_kswapd())
 			mod_page_state(kswapd_steal, nr_freed);
@@ -735,7 +739,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
  */
 static int
 shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
-		int *total_scanned, struct page_state *ps)
+		int *total_scanned, struct page_state *ps, int do_writepage)
 {
 	unsigned long ratio;
 	int count;
@@ -764,7 +768,8 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
 	count = atomic_read(&zone->nr_scan_inactive);
 	if (count >= SWAP_CLUSTER_MAX) {
 		atomic_set(&zone->nr_scan_inactive, 0);
-		return shrink_cache(zone, gfp_mask, count, total_scanned);
+		return shrink_cache(zone, gfp_mask, count,
+					total_scanned, do_writepage);
 	}
 	return 0;
 }
@@ -787,7 +792,7 @@ shrink_zone(struct zone *zone, int max_scan, unsigned int gfp_mask,
  */
 static int
 shrink_caches(struct zone **zones, int priority, int *total_scanned,
-		int gfp_mask, struct page_state *ps)
+		int gfp_mask, struct page_state *ps, int do_writepage)
 {
 	int ret = 0;
 	int i;
@@ -803,7 +808,8 @@ shrink_caches(struct zone **zones, int priority, int *total_scanned,
 			continue;	/* Let kswapd poll it */
 
 		max_scan = zone->nr_inactive >> priority;
-		ret += shrink_zone(zone, max_scan, gfp_mask, total_scanned, ps);
+		ret += shrink_zone(zone, max_scan, gfp_mask,
+					total_scanned, ps, do_writepage);
 	}
 	return ret;
 }
@@ -833,6 +839,8 @@ int try_to_free_pages(struct zone **zones,
 	int nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	int i;
+	unsigned long total_scanned = 0;
+	int do_writepage = 0;
 
 	inc_page_state(allocstall);
 
@@ -840,13 +848,13 @@ int try_to_free_pages(struct zone **zones,
 		zones[i]->temp_priority = DEF_PRIORITY;
 
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-		int total_scanned = 0;
+		int scanned = 0;
 		struct page_state ps;
 
 		get_page_state(&ps);
-		nr_reclaimed += shrink_caches(zones, priority, &total_scanned,
-						gfp_mask, &ps);
-		shrink_slab(total_scanned, gfp_mask);
+		nr_reclaimed += shrink_caches(zones, priority, &scanned,
+						gfp_mask, &ps, do_writepage);
+		shrink_slab(scanned, gfp_mask);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
@@ -858,14 +866,20 @@ int try_to_free_pages(struct zone **zones,
 		if (!(gfp_mask & __GFP_FS))
 			break;		/* Let the caller handle it */
 		/*
-		 * Try to write back as many pages as we just scanned.  Not
-		 * sure if that makes sense, but it's an attempt to avoid
-		 * creating IO storms unnecessarily
+		 * Try to write back as many pages as we just scanned.  This
+		 * tends to cause slow streaming writers to write data to the
+		 * disk smoothly, at the dirtying rate, which is nice.   But
+		 * that's undesirable in laptop mode, where we *want* lumpy
+		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		wakeup_bdflush(total_scanned);
+		total_scanned += scanned;
+		if (total_scanned > SWAP_CLUSTER_MAX + SWAP_CLUSTER_MAX/2) {
+			wakeup_bdflush(laptop_mode ? 0 : total_scanned);
+			do_writepage = 1;
+		}
 
 		/* Take a nap, wait for some writeback to complete */
-		if (total_scanned && priority < DEF_PRIORITY - 2)
+		if (scanned && priority < DEF_PRIORITY - 2)
 			blk_congestion_wait(WRITE, HZ/10);
 	}
 	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY))
@@ -908,6 +922,8 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, struct page_state *ps)
 	int i;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long total_scanned = 0;
+	unsigned long total_reclaimed = 0;
+	int do_writepage = 0;
 
 	inc_page_state(pageoutrun);
 
@@ -969,16 +985,25 @@ scan:
 			zone->temp_priority = priority;
 			max_scan = zone->nr_inactive >> priority;
 			reclaimed = shrink_zone(zone, max_scan, GFP_KERNEL,
-					&scanned, ps);
+					&scanned, ps, do_writepage);
 			total_scanned += scanned;
 			reclaim_state->reclaimed_slab = 0;
 			shrink_slab(scanned, GFP_KERNEL);
 			reclaimed += reclaim_state->reclaimed_slab;
+			total_reclaimed += reclaimed;
 			to_free -= reclaimed;
 			if (zone->all_unreclaimable)
 				continue;
 			if (zone->pages_scanned > zone->present_pages * 2)
 				zone->all_unreclaimable = 1;
+			/*
+			 * If we've done a decent amount of scanning and
+			 * the reclaim ratio is low, start doing writepage
+			 * even in laptop mode
+			 */
+			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
+			    total_scanned > total_reclaimed+total_reclaimed/2)
+				do_writepage = 1;
 		}
 		if (nr_pages && to_free > 0)
 			continue;	/* swsusp: need to do more work */
@@ -997,7 +1022,7 @@ out:
 
 		zone->prev_priority = zone->temp_priority;
 	}
-	return nr_pages - to_free;
+	return total_reclaimed;
 }
 
 /*
-- 
cgit v1.2.3


From 26f14a5727b9447f87ba4cdd8e83fb9e8af79631 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:17:51 -0700
Subject: [PATCH] Add commit=0 to ext3, meaning "set commit to default".

From: Bart Samwel <bart@samwel.tk>

Add support for the value "0" to ext3's "commit" option.  When this value
is given, ext3 replaces it with the default commit interval.  Introduce a
constant JBD_DEFAULT_MAX_COMMIT_AGE for this.
---
 fs/ext3/super.c     | 4 ++++
 fs/jbd/journal.c    | 2 +-
 include/linux/jbd.h | 5 +++++
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e6ae6c9e0f46..14d9f3c72055 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -735,6 +735,10 @@ static int parse_options (char * options, struct ext3_sb_info *sbi,
 		case Opt_commit:
 			if (match_int(&args[0], &option))
 				return 0;
+			if (option < 0)
+				return 0;
+			if (option == 0)
+				option = JBD_DEFAULT_MAX_COMMIT_AGE;
 			sbi->s_commit_interval = HZ * option;
 			break;
 		case Opt_data_journal:
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 7550f31d0249..897a8c886259 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -636,7 +636,7 @@ static journal_t * journal_init_common (void)
 	spin_lock_init(&journal->j_list_lock);
 	spin_lock_init(&journal->j_state_lock);
 
-	journal->j_commit_interval = (HZ * 5);
+	journal->j_commit_interval = (HZ * JBD_DEFAULT_MAX_COMMIT_AGE);
 
 	/* The journal is marked for error until we succeed with recovery! */
 	journal->j_flags = JFS_ABORT;
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 241387b13764..62c7f363ae74 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -42,6 +42,11 @@
  */
 #undef JBD_PARANOID_IOFAIL
 
+/*
+ * The default maximum commit age, in seconds.
+ */
+#define JBD_DEFAULT_MAX_COMMIT_AGE 5
+
 #ifdef CONFIG_JBD_DEBUG
 /*
  * Define JBD_EXPENSIVE_CHECKING to enable more expensive internal
-- 
cgit v1.2.3


From 95f238eac82907c4ccbc301cd5788e67db0715ce Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:18:43 -0700
Subject: [PATCH] ia32: 4Kb stacks (and irqstacks) patch

From: Arjan van de Ven <arjanv@redhat.com>

Below is a patch to enable 4Kb stacks for x86. The goal of this is to

1) Reduce footprint per thread so that systems can run many more threads
   (for the java people)

2) Reduce the pressure on the VM for order > 0 allocations. We see real life
   workloads (granted with 2.4 but the fundamental fragmentation issue isn't
   solved in 2.6 and isn't solvable in theory) where this can be a problem.
   In addition order > 0 allocations can make the VM "stutter" and give more
   latency due to having to do much much more work trying to defragment

The first 2 bits of the patch actually affect compiler options in a generic
way: I propose to disable the -funit-at-a-time feature from gcc.  With this
enabled (and it's the default with -O2), gcc will very aggressively inline
functions, which is nice and all for userspace, but for the kernel this makes
us suffer a gcc deficiency more: gcc is extremely bad at sharing stackslots,
for example a situation like this:

if (some_condition)
	function_A();
else
	function_B();

with -funit-at-a-time, both function_A() and _B() might get inlined; however,
the stack usage of the parent function then grows by the stack usage of
both functions COMBINED instead of the maximum of the two.  Even
with the normal 8Kb stacks this is a danger since we see some functions grow
3Kb to 4Kb of stack use this way.  With 4Kb stacks, 4Kb of stack usage growth
obviously is deadly ;-( but even with 8Kb stacks it's pure lottery.
Disabling -funit-at-a-time also exposes another thing in the -mm tree; the
attribute always_inline is considered harmful by gcc folks in that when gcc
makes a decision to NOT inline a function marked this way, it throws an
error.  Disabling -funit-at-a-time disables some of the aggressive inlining
(eg of large functions that come later in the .c file) so this would make
your tree not compile.

The 4k stackness of the kernel is included in the module vermagic, so people don't
load 4k-stack modules into 8k-stack kernels.

At present 4k stacks are selectable in config.  When the feature has settled
in we should remove the 8k option.  This will break the nvidia modules.  But
Fedora uses 4k stacks so a new nvidia driver is expected soon.
---
 arch/i386/Kconfig              |   9 +++
 arch/i386/Makefile             |   6 +-
 arch/i386/kernel/i8259.c       |   3 +
 arch/i386/kernel/irq.c         | 146 ++++++++++++++++++++++++++++++++++++++++-
 arch/i386/kernel/smpboot.c     |   2 +
 arch/i386/kernel/traps.c       |  18 +++--
 include/asm-alpha/irq.h        |   3 +
 include/asm-arm/irq.h          |   4 ++
 include/asm-arm26/irq.h        |   2 +
 include/asm-cris/irq.h         |   4 ++
 include/asm-h8300/irq.h        |   4 ++
 include/asm-i386/irq.h         |  25 +++++++
 include/asm-i386/module.h      |   8 ++-
 include/asm-i386/thread_info.h |  24 ++++++-
 include/asm-ia64/irq.h         |   4 ++
 include/asm-m68k/irq.h         |   4 ++
 include/asm-m68knommu/irq.h    |   4 ++
 include/asm-mips/irq.h         |   3 +
 include/asm-parisc/irq.h       |   3 +
 include/asm-ppc/irq.h          |   4 ++
 include/asm-ppc64/irq.h        |   4 ++
 include/asm-s390/irq.h         |   4 ++
 include/asm-sh/irq.h           |   4 ++
 include/asm-sparc/irq.h        |   4 ++
 include/asm-sparc64/irq.h      |   4 ++
 include/asm-um/irq.h           |   5 ++
 include/asm-v850/irq.h         |   4 ++
 include/asm-x86_64/irq.h       |   4 ++
 include/linux/compiler-gcc3.h  |   2 +-
 include/linux/irq.h            |   1 -
 kernel/softirq.c               |  70 ++++++++++++--------
 31 files changed, 342 insertions(+), 44 deletions(-)

(limited to 'include')

diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 03620021bd6b..c6439f846171 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1294,6 +1294,15 @@ config FRAME_POINTER
 	  If you don't debug the kernel, you can say N, but we may not be able
 	  to solve problems without frame pointers.
 
+config 4KSTACKS
+	bool "Use 4Kb for kernel stacks instead of 8Kb"
+	help
+	  If you say Y here the kernel will use a 4Kb stacksize for the
+	  kernel stack attached to each process/thread. This facilitates
+	  running more threads on a system and also reduces the pressure
+	  on the VM subsystem for higher order allocations. This option
+	  will also use IRQ stacks to compensate for the reduced stackspace.
+
 config X86_FIND_SMP_CONFIG
 	bool
 	depends on X86_LOCAL_APIC || X86_VOYAGER
diff --git a/arch/i386/Makefile b/arch/i386/Makefile
index 4c8f1c06f572..019544e08f1e 100644
--- a/arch/i386/Makefile
+++ b/arch/i386/Makefile
@@ -56,9 +56,9 @@ cflags-$(CONFIG_X86_ELAN)	+= -march=i486
 GCC_VERSION			:= $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-version.sh $(CC))
 cflags-$(CONFIG_REGPARM) 	+= $(shell if [ $(GCC_VERSION) -ge 0300 ] ; then echo "-mregparm=3"; fi ;)
 
-# Enable unit-at-a-time mode when possible. It shrinks the
-# kernel considerably.
-CFLAGS += $(call check_gcc,-funit-at-a-time,)
+# Disable unit-at-a-time mode, it makes gcc use a lot more stack
+# due to the lack of sharing of stack slots.
+CFLAGS += $(call check_gcc,-fno-unit-at-a-time,)
 
 CFLAGS += $(cflags-y)
 
diff --git a/arch/i386/kernel/i8259.c b/arch/i386/kernel/i8259.c
index f093e29b69a2..48fbaf5cee34 100644
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -444,4 +444,7 @@ void __init init_IRQ(void)
 	 */
 	if (boot_cpu_data.hard_math && !cpu_has_fpu)
 		setup_irq(FPU_IRQ, &fpu_irq);
+
+	current_thread_info()->cpu = 0;
+	irq_ctx_init(0);
 }
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index c1385d668bf4..ea69f21994f6 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -74,6 +74,14 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 
 static void register_irq_proc (unsigned int irq);
 
+/*
+ * per-CPU IRQ handling stacks
+ */
+#ifdef CONFIG_4KSTACKS
+union irq_ctx *hardirq_ctx[NR_CPUS];
+union irq_ctx *softirq_ctx[NR_CPUS];
+#endif
+
 /*
  * Special irq handlers.
  */
@@ -209,7 +217,7 @@ inline void synchronize_irq(unsigned int irq)
  * waste of time and is not what some drivers would
  * prefer.
  */
-int handle_IRQ_event(unsigned int irq,
+asmlinkage int handle_IRQ_event(unsigned int irq,
 		struct pt_regs *regs, struct irqaction *action)
 {
 	int status = 1;	/* Force the "do bottom halves" bit */
@@ -432,7 +440,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 
 		__asm__ __volatile__("andl %%esp,%0" :
 					"=r" (esp) : "0" (THREAD_SIZE - 1));
-		if (unlikely(esp < (sizeof(struct thread_info) + 1024))) {
+		if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
 			printk("do_IRQ: stack overflow: %ld\n",
 				esp - sizeof(struct thread_info));
 			dump_stack();
@@ -480,11 +488,68 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 	 * useful for irq hardware that does not mask cleanly in an
 	 * SMP environment.
 	 */
+#ifdef CONFIG_4KSTACKS
+
 	for (;;) {
 		irqreturn_t action_ret;
+		u32 *isp;
+		union irq_ctx * curctx;
+		union irq_ctx * irqctx;
+
+		curctx = (union irq_ctx *) current_thread_info();
+		irqctx = hardirq_ctx[smp_processor_id()];
 
 		spin_unlock(&desc->lock);
+
+		/*
+		 * this is where we switch to the IRQ stack. However, if we are already using
+		 * the IRQ stack (because we interrupted a hardirq handler) we can't do that
+		 * and just have to keep using the current stack (which is the irq stack already
+		 * after all)
+		 */
+
+		if (curctx == irqctx)
+			action_ret = handle_IRQ_event(irq, &regs, action);
+		else {
+			/* build the stack frame on the IRQ stack */
+			isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+			irqctx->tinfo.task = curctx->tinfo.task;
+			irqctx->tinfo.previous_esp = current_stack_pointer();
+
+			*--isp = (u32) action;
+			*--isp = (u32) &regs;
+			*--isp = (u32) irq;
+
+			asm volatile(
+				"       xchgl   %%ebx,%%esp     \n"
+				"       call    handle_IRQ_event \n"
+				"       xchgl   %%ebx,%%esp     \n"
+				: "=a"(action_ret)
+				: "b"(isp)
+				: "memory", "cc", "edx", "ecx"
+			);
+
+
+		}
+		spin_lock(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret);
+		if (curctx != irqctx)
+			irqctx->tinfo.task = NULL;
+		if (likely(!(desc->status & IRQ_PENDING)))
+			break;
+		desc->status &= ~IRQ_PENDING;
+	}
+
+#else
+
+	for (;;) {
+		irqreturn_t action_ret;
+
+		spin_unlock(&desc->lock);
+
 		action_ret = handle_IRQ_event(irq, &regs, action);
+
 		spin_lock(&desc->lock);
 		if (!noirqdebug)
 			note_interrupt(irq, desc, action_ret);
@@ -492,6 +557,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs regs)
 			break;
 		desc->status &= ~IRQ_PENDING;
 	}
+#endif
 	desc->status &= ~IRQ_INPROGRESS;
 
 out:
@@ -1049,3 +1115,79 @@ void init_irq_proc (void)
 		register_irq_proc(i);
 }
 
+
+#ifdef CONFIG_4KSTACKS
+static char softirq_stack[NR_CPUS * THREAD_SIZE]  __attribute__((__aligned__(THREAD_SIZE)));
+static char hardirq_stack[NR_CPUS * THREAD_SIZE]  __attribute__((__aligned__(THREAD_SIZE)));
+
+/*
+ * allocate per-cpu stacks for hardirq and for softirq processing
+ */
+void irq_ctx_init(int cpu)
+{
+	union irq_ctx *irqctx;
+
+	if (hardirq_ctx[cpu])
+		return;
+
+	irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
+	irqctx->tinfo.task              = NULL;
+	irqctx->tinfo.exec_domain       = NULL;
+	irqctx->tinfo.cpu               = cpu;
+	irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
+	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+
+	hardirq_ctx[cpu] = irqctx;
+
+	irqctx = (union irq_ctx*) &softirq_stack[cpu*THREAD_SIZE];
+	irqctx->tinfo.task              = NULL;
+	irqctx->tinfo.exec_domain       = NULL;
+	irqctx->tinfo.cpu               = cpu;
+	irqctx->tinfo.preempt_count     = SOFTIRQ_OFFSET;
+	irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
+
+	softirq_ctx[cpu] = irqctx;
+
+	printk("CPU %u irqstacks, hard=%p soft=%p\n",
+		cpu,hardirq_ctx[cpu],softirq_ctx[cpu]);
+}
+
+extern asmlinkage void __do_softirq(void);
+
+asmlinkage void do_softirq(void)
+{
+	unsigned long flags;
+	struct thread_info *curctx;
+	union irq_ctx *irqctx;
+	u32 *isp;
+
+	if (in_interrupt())
+		return;
+
+	local_irq_save(flags);
+
+	if (local_softirq_pending()) {
+		curctx = current_thread_info();
+		irqctx = softirq_ctx[smp_processor_id()];
+		irqctx->tinfo.task = curctx->task;
+		irqctx->tinfo.previous_esp = current_stack_pointer();
+
+		/* build the stack frame on the softirq stack */
+		isp = (u32*) ((char*)irqctx + sizeof(*irqctx));
+
+
+		asm volatile(
+			"       xchgl   %%ebx,%%esp     \n"
+			"       call    __do_softirq    \n"
+			"       movl    %%ebx,%%esp     \n"
+			: "=b"(isp)
+			: "0"(isp)
+			: "memory", "cc", "edx", "ecx", "eax"
+		);
+	}
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(do_softirq);
+#endif
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index a15be84152c4..7baa4d420b73 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -815,6 +815,8 @@ static int __init do_boot_cpu(int apicid)
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	stack_start.esp = (void *) idle->thread.esp;
 
+	irq_ctx_init(cpu);
+
 	/*
 	 * This grunge runs the startup process for
 	 * the targeted processor.
diff --git a/arch/i386/kernel/traps.c b/arch/i386/kernel/traps.c
index fe976abcdea9..cf8da7ba4cdb 100644
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -105,12 +105,20 @@ void show_trace(struct task_struct *task, unsigned long * stack)
 #ifdef CONFIG_KALLSYMS
 	printk("\n");
 #endif
-	while (!kstack_end(stack)) {
-		addr = *stack++;
-		if (kernel_text_address(addr)) {
-			printk(" [<%08lx>] ", addr);
-			print_symbol("%s\n", addr);
+	while (1) {
+		struct thread_info *context;
+		context = (struct thread_info*) ((unsigned long)stack & (~(THREAD_SIZE - 1)));
+		while (!kstack_end(stack)) {
+			addr = *stack++;
+			if (kernel_text_address(addr)) {
+				printk(" [<%08lx>] ", addr);
+				print_symbol("%s\n", addr);
+			}
 		}
+		stack = (unsigned long*)context->previous_esp;
+		if (!stack)
+			break;
+		printk(" =======================\n");
 	}
 	printk("\n");
 }
diff --git a/include/asm-alpha/irq.h b/include/asm-alpha/irq.h
index 551c7308c642..566db720000a 100644
--- a/include/asm-alpha/irq.h
+++ b/include/asm-alpha/irq.h
@@ -93,5 +93,8 @@ extern void enable_irq(unsigned int);
 struct pt_regs;
 extern void (*perf_irq)(unsigned long, struct pt_regs *);
 
+struct irqaction;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 
 #endif /* _ALPHA_IRQ_H */
diff --git a/include/asm-arm/irq.h b/include/asm-arm/irq.h
index a89f7345ed39..286be7cf7c63 100644
--- a/include/asm-arm/irq.h
+++ b/include/asm-arm/irq.h
@@ -44,5 +44,9 @@ void disable_irq_wake(unsigned int irq);
 void enable_irq_wake(unsigned int irq);
 int setup_irq(unsigned int, struct irqaction *);
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
 
diff --git a/include/asm-arm26/irq.h b/include/asm-arm26/irq.h
index 68712e576c6f..06bd5a543d13 100644
--- a/include/asm-arm26/irq.h
+++ b/include/asm-arm26/irq.h
@@ -45,6 +45,8 @@ extern void enable_irq(unsigned int);
 int set_irq_type(unsigned int irq, unsigned int type);
 
 int setup_irq(unsigned int, struct irqaction *);
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
 
 #endif
 
diff --git a/include/asm-cris/irq.h b/include/asm-cris/irq.h
index caa45facb1b2..87f342517bb1 100644
--- a/include/asm-cris/irq.h
+++ b/include/asm-cris/irq.h
@@ -14,6 +14,10 @@ extern void enable_irq(unsigned int);
 #define disable_irq_nosync      disable_irq
 #define enable_irq_nosync       enable_irq
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif  /* _ASM_IRQ_H */
 
 
diff --git a/include/asm-h8300/irq.h b/include/asm-h8300/irq.h
index fabde1dd34a1..5027181ed067 100644
--- a/include/asm-h8300/irq.h
+++ b/include/asm-h8300/irq.h
@@ -68,4 +68,8 @@ extern void disable_irq(unsigned int);
 #define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _H8300_IRQ_H_ */
diff --git a/include/asm-i386/irq.h b/include/asm-i386/irq.h
index 69cb661b012a..5649b4a79bb2 100644
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -14,6 +14,7 @@
 #include <linux/sched.h>
 /* include comes from machine specific directory */
 #include "irq_vectors.h"
+#include <asm/thread_info.h>
 
 static __inline__ int irq_canonicalize(int irq)
 {
@@ -30,4 +31,28 @@ extern int can_request_irq(unsigned int, unsigned long flags);
 #define ARCH_HAS_NMI_WATCHDOG		/* See include/linux/nmi.h */
 #endif
 
+#ifdef CONFIG_4KSTACKS
+/*
+ * per-CPU IRQ handling contexts (thread information and stack)
+ */
+union irq_ctx {
+	struct thread_info      tinfo;
+	u32                     stack[THREAD_SIZE/sizeof(u32)];
+};
+
+extern union irq_ctx *hardirq_ctx[NR_CPUS];
+extern union irq_ctx *softirq_ctx[NR_CPUS];
+
+extern void irq_ctx_init(int cpu);
+
+#define __ARCH_HAS_DO_SOFTIRQ
+#else
+#define irq_ctx_init(cpu) do { ; } while (0)
+#endif
+
+struct irqaction;
+struct pt_regs;
+asmlinkage int handle_IRQ_event(unsigned int, struct pt_regs *,
+				struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/asm-i386/module.h b/include/asm-i386/module.h
index 76fc36f60ebe..8ec1dae638cb 100644
--- a/include/asm-i386/module.h
+++ b/include/asm-i386/module.h
@@ -60,6 +60,12 @@ struct mod_arch_specific
 #define MODULE_REGPARM ""
 #endif
 
-#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM
+#ifdef CONFIG_4KSTACKS
+#define MODULE_STACKSIZE "4KSTACKS "
+#else
+#define MODULE_STACKSIZE ""
+#endif
+
+#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_REGPARM MODULE_STACKSIZE
 
 #endif /* _ASM_I386_MODULE_H */
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index 75f940011daa..da5c780f2c5c 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -9,6 +9,9 @@
 
 #ifdef __KERNEL__
 
+#include <linux/config.h>
+#include <asm/page.h>
+
 #ifndef __ASSEMBLY__
 #include <asm/processor.h>
 #endif
@@ -29,12 +32,16 @@ struct thread_info {
 	__u32			cpu;		/* current CPU */
 	__s32			preempt_count; /* 0 => preemptable, <0 => BUG */
 
+
 	mm_segment_t		addr_limit;	/* thread address space:
 					 	   0-0xBFFFFFFF for user-thead
 						   0-0xFFFFFFFF for kernel-thread
 						*/
 	struct restart_block    restart_block;
 
+	unsigned long           previous_esp;   /* ESP of the previous stack in case
+						   of nested (IRQ) stacks
+						*/
 	__u8			supervisor_stack[0];
 };
 
@@ -53,7 +60,13 @@ struct thread_info {
 #endif
 
 #define PREEMPT_ACTIVE		0x4000000
+#ifdef CONFIG_4KSTACKS
+#define THREAD_SIZE            (4096)
+#else
+#define THREAD_SIZE		(8192)
+#endif
 
+#define STACK_WARN             (THREAD_SIZE/8)
 /*
  * macros/functions for gaining access to the thread information structure
  *
@@ -77,7 +90,6 @@ struct thread_info {
 #define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
 
-#define THREAD_SIZE (2*PAGE_SIZE)
 
 /* how to get the thread information struct from C */
 static inline struct thread_info *current_thread_info(void)
@@ -87,6 +99,14 @@ static inline struct thread_info *current_thread_info(void)
 	return ti;
 }
 
+/* how to get the current stack pointer from C */
+static inline unsigned long current_stack_pointer(void)
+{
+	unsigned long ti;
+	__asm__("movl %%esp,%0; ":"=r" (ti) : );
+	return ti;
+}
+
 /* thread information allocation */
 #ifdef CONFIG_DEBUG_STACK_USAGE
 #define alloc_thread_info(tsk)					\
@@ -108,8 +128,6 @@ static inline struct thread_info *current_thread_info(void)
 
 #else /* !__ASSEMBLY__ */
 
-#define THREAD_SIZE	8192
-
 /* how to get the thread information struct from ASM */
 #define GET_THREAD_INFO(reg) \
 	movl $-THREAD_SIZE, reg; \
diff --git a/include/asm-ia64/irq.h b/include/asm-ia64/irq.h
index 79479e2c6966..5d930fdc0bea 100644
--- a/include/asm-ia64/irq.h
+++ b/include/asm-ia64/irq.h
@@ -30,4 +30,8 @@ extern void disable_irq_nosync (unsigned int);
 extern void enable_irq (unsigned int);
 extern void set_irq_affinity_info (unsigned int irq, int dest, int redir);
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IA64_IRQ_H */
diff --git a/include/asm-m68k/irq.h b/include/asm-m68k/irq.h
index 02855ca536b0..5889bc919e80 100644
--- a/include/asm-m68k/irq.h
+++ b/include/asm-m68k/irq.h
@@ -124,4 +124,8 @@ extern volatile unsigned int num_spurious;
  */
 extern irq_node_t *new_irq_node(void);
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _M68K_IRQ_H_ */
diff --git a/include/asm-m68knommu/irq.h b/include/asm-m68knommu/irq.h
index 4c66ba93201a..208ccd969e4b 100644
--- a/include/asm-m68knommu/irq.h
+++ b/include/asm-m68knommu/irq.h
@@ -121,4 +121,8 @@ extern irq_node_t *new_irq_node(void);
 #define enable_irq_nosync(x)	enable_irq(x)
 #define disable_irq_nosync(x)	disable_irq(x)
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _M68K_IRQ_H_ */
diff --git a/include/asm-mips/irq.h b/include/asm-mips/irq.h
index 90b4ae1258a8..d9667a8fbbfb 100644
--- a/include/asm-mips/irq.h
+++ b/include/asm-mips/irq.h
@@ -31,4 +31,7 @@ extern asmlinkage unsigned int do_IRQ(int irq, struct pt_regs *regs);
 
 extern void init_generic_irq(void);
 
+struct irqaction;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/asm-parisc/irq.h b/include/asm-parisc/irq.h
index 39db70230740..b7acca7de670 100644
--- a/include/asm-parisc/irq.h
+++ b/include/asm-parisc/irq.h
@@ -96,4 +96,7 @@ extern unsigned long txn_alloc_addr(int);
 /* soft power switch support (power.c) */
 extern struct tasklet_struct power_tasklet;
 
+struct irqaction;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif	/* _ASM_PARISC_IRQ_H */
diff --git a/include/asm-ppc/irq.h b/include/asm-ppc/irq.h
index bfa3de404d27..df5b76306f7a 100644
--- a/include/asm-ppc/irq.h
+++ b/include/asm-ppc/irq.h
@@ -211,5 +211,9 @@ extern unsigned long ppc_cached_irq_mask[NR_MASK_WORDS];
 extern unsigned long ppc_lost_interrupts[NR_MASK_WORDS];
 extern atomic_t ppc_n_lost_interrupts;
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
 #endif /* __KERNEL__ */
diff --git a/include/asm-ppc64/irq.h b/include/asm-ppc64/irq.h
index 949e19f96be1..2cd77b4935fb 100644
--- a/include/asm-ppc64/irq.h
+++ b/include/asm-ppc64/irq.h
@@ -75,5 +75,9 @@ static __inline__ int irq_canonicalize(int irq)
 	return irq;
 }
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
 #endif /* __KERNEL__ */
diff --git a/include/asm-s390/irq.h b/include/asm-s390/irq.h
index 25f1808531cc..cac6b3080725 100644
--- a/include/asm-s390/irq.h
+++ b/include/asm-s390/irq.h
@@ -21,6 +21,10 @@ enum interruption_class {
 
 #define touch_nmi_watchdog() do { } while(0)
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* __KERNEL__ */
 #endif
 
diff --git a/include/asm-sh/irq.h b/include/asm-sh/irq.h
index f470f758057a..7dd2a5ae10b5 100644
--- a/include/asm-sh/irq.h
+++ b/include/asm-sh/irq.h
@@ -329,4 +329,8 @@ static inline int generic_irq_demux(int irq)
 #define irq_canonicalize(irq)	(irq)
 #define irq_demux(irq)		__irq_demux(sh_mv.mv_irq_demux(irq))
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* __ASM_SH_IRQ_H */
diff --git a/include/asm-sparc/irq.h b/include/asm-sparc/irq.h
index 5423905ffb40..cee356b0dae3 100644
--- a/include/asm-sparc/irq.h
+++ b/include/asm-sparc/irq.h
@@ -184,4 +184,8 @@ extern struct sun4m_intregs *sun4m_interrupts;
 #define SUN4M_INT_SBUS(x)	(1 << (x+7))
 #define SUN4M_INT_VME(x)	(1 << (x))
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
diff --git a/include/asm-sparc64/irq.h b/include/asm-sparc64/irq.h
index e3ba6bc2cc3e..3aef0ca67750 100644
--- a/include/asm-sparc64/irq.h
+++ b/include/asm-sparc64/irq.h
@@ -150,4 +150,8 @@ static __inline__ unsigned long get_softint(void)
 	return retval;
 }
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
diff --git a/include/asm-um/irq.h b/include/asm-um/irq.h
index cd580acadc71..8300c209a1bc 100644
--- a/include/asm-um/irq.h
+++ b/include/asm-um/irq.h
@@ -32,4 +32,9 @@ extern int um_request_irq(unsigned int irq, int fd, int type,
 			  void (*handler)(int, void *, struct pt_regs *),
 			  unsigned long irqflags,  const char * devname,
 			  void *dev_id);
+
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif
diff --git a/include/asm-v850/irq.h b/include/asm-v850/irq.h
index 63e682d70de1..90c83aa053c8 100644
--- a/include/asm-v850/irq.h
+++ b/include/asm-v850/irq.h
@@ -65,4 +65,8 @@ extern void disable_irq_nosync (unsigned int irq);
 
 #endif /* !__ASSEMBLY__ */
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* __V850_IRQ_H__ */
diff --git a/include/asm-x86_64/irq.h b/include/asm-x86_64/irq.h
index ad5445ee7460..37c9fd65c97f 100644
--- a/include/asm-x86_64/irq.h
+++ b/include/asm-x86_64/irq.h
@@ -53,4 +53,8 @@ extern int can_request_irq(unsigned int, unsigned long flags);
 #define ARCH_HAS_NMI_WATCHDOG		/* See include/linux/nmi.h */
 #endif
 
+struct irqaction;
+struct pt_regs;
+int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+
 #endif /* _ASM_IRQ_H */
diff --git a/include/linux/compiler-gcc3.h b/include/linux/compiler-gcc3.h
index c472cac3029d..265dad4c3cb4 100644
--- a/include/linux/compiler-gcc3.h
+++ b/include/linux/compiler-gcc3.h
@@ -3,7 +3,7 @@
 /* These definitions are for GCC v3.x.  */
 #include <linux/compiler-gcc.h>
 
-#if __GNUC_MINOR__ >= 1
+#if __GNUC_MINOR__ >= 1  && __GNUC_MINOR__ < 4
 # define inline		__inline__ __attribute__((always_inline))
 # define __inline__	__inline__ __attribute__((always_inline))
 # define __inline	__inline__ __attribute__((always_inline))
diff --git a/include/linux/irq.h b/include/linux/irq.h
index fa03b836c29a..5bc740d9bc47 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -71,7 +71,6 @@ extern irq_desc_t irq_desc [NR_IRQS];
 
 #include <asm/hw_irq.h> /* the arch dependent stuff */
 
-extern int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
 extern int setup_irq(unsigned int , struct irqaction * );
 
 extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 81c79736ff9e..58c915c202ff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -16,6 +16,7 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>
 
+#include <asm/irq.h>
 /*
    - No shared variables, all the data are CPU local.
    - If a softirq needs serialization, let it serialize itself
@@ -69,53 +70,66 @@ static inline void wakeup_softirqd(void)
  */
 #define MAX_SOFTIRQ_RESTART 10
 
-asmlinkage void do_softirq(void)
+asmlinkage void __do_softirq(void)
 {
-	int max_restart = MAX_SOFTIRQ_RESTART;
+	struct softirq_action *h;
 	__u32 pending;
-	unsigned long flags;
+	int max_restart = MAX_SOFTIRQ_RESTART;
 
-	if (in_interrupt())
-		return;
+	pending = local_softirq_pending();
 
-	local_irq_save(flags);
+	local_bh_disable();
+restart:
+	/* Reset the pending bitmask before enabling irqs */
+	local_softirq_pending() = 0;
+
+	local_irq_enable();
+
+	h = softirq_vec;
+
+	do {
+		if (pending & 1)
+			h->action(h);
+		h++;
+		pending >>= 1;
+	} while (pending);
+
+	local_irq_disable();
 
 	pending = local_softirq_pending();
+	if (pending && --max_restart)
+		goto restart;
 
-	if (pending) {
-		struct softirq_action *h;
+	if (pending)
+		wakeup_softirqd();
 
-		local_bh_disable();
-restart:
-		/* Reset the pending bitmask before enabling irqs */
-		local_softirq_pending() = 0;
+	__local_bh_enable();
+}
 
-		local_irq_enable();
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+asmlinkage void do_softirq(void)
+{
+	__u32 pending;
+	unsigned long flags;
 
-		h = softirq_vec;
+	if (in_interrupt())
+		return;
 
-		do {
-			if (pending & 1)
-				h->action(h);
-			h++;
-			pending >>= 1;
-		} while (pending);
+	local_irq_save(flags);
 
-		local_irq_disable();
+	pending = local_softirq_pending();
 
-		pending = local_softirq_pending();
-		if (pending && --max_restart)
-			goto restart;
-		if (pending)
-			wakeup_softirqd();
-		__local_bh_enable();
-	}
+	if (pending)
+		__do_softirq();
 
 	local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(do_softirq);
 
+#endif
+
 void local_bh_enable(void)
 {
 	__local_bh_enable();
-- 
cgit v1.2.3


From 4c886627378d0efc943442e4ec19df4a27bd1636 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:19:09 -0700
Subject: [PATCH] ppc64: NUMA fix for 16MB LMBs

From: Olof Johansson <olof@austin.ibm.com>

As discussed on the ppc64 list yesterday and today:

On some ppc64 systems, Open Firmware will give memory device nodes that are
only 16MB in size, instead of the 256MB that our NUMA code currently
expects (see MEMORY_INCREMENT in mmzone.h).

Just changing the defines from 256MB to 16MB makes the table blow up from
32KB to 512KB, so this patch also makes it dynamically allocated based on
actual memory size.  Since all this is done before (well, during) bootmem
init, we need to use lmb_alloc().

Finally, there's no need to use a full int for node ID. Current max is 16
nodes, so a signed char still leaves plenty of room to grow.
---
 arch/ppc64/mm/numa.c       | 21 ++++++++++++++++++---
 include/asm-ppc64/mmzone.h |  6 +++---
 2 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/arch/ppc64/mm/numa.c b/arch/ppc64/mm/numa.c
index 2044d4ee479e..3166cad799c2 100644
--- a/arch/ppc64/mm/numa.c
+++ b/arch/ppc64/mm/numa.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <asm/lmb.h>
 #include <asm/machdep.h>
+#include <asm/abs_addr.h>
 
 #if 1
 #define dbg(args...) udbg_printf(args)
@@ -31,9 +32,7 @@
 
 int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
 	ARRAY_INITIALISER};
-int numa_memory_lookup_table[MAX_MEMORY >> MEMORY_INCREMENT_SHIFT] =
-	{ [ 0 ... ((MAX_MEMORY >> MEMORY_INCREMENT_SHIFT) - 1)] =
-	ARRAY_INITIALISER};
+char *numa_memory_lookup_table;
 cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
 int nr_cpus_in_node[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES -1)] = 0};
 
@@ -65,12 +64,20 @@ static int __init parse_numa_properties(void)
 	int *memory_associativity;
 	int depth;
 	int max_domain = 0;
+	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
+	long i;
 
 	if (strstr(saved_command_line, "numa=off")) {
 		printk(KERN_WARNING "NUMA disabled by user\n");
 		return -1;
 	}
 
+	numa_memory_lookup_table =
+		(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+
+	for (i = 0; i < entries ; i++)
+		numa_memory_lookup_table[i] = ARRAY_INITIALISER;
+
 	cpu = of_find_node_by_type(NULL, "cpu");
 	if (!cpu)
 		goto err;
@@ -235,6 +242,14 @@ static void __init setup_nonnuma(void)
 	printk(KERN_INFO "Memory hole size: %ldMB\n",
 	       (top_of_ram - total_ram) >> 20);
 
+	if (!numa_memory_lookup_table) {
+		long entries = top_of_ram >> MEMORY_INCREMENT_SHIFT;
+		numa_memory_lookup_table =
+			(char *)abs_to_virt(lmb_alloc(entries * sizeof(char), 1));
+		for (i = 0; i < entries ; i++)
+			numa_memory_lookup_table[i] = ARRAY_INITIALISER;
+	}
+
 	for (i = 0; i < NR_CPUS; i++)
 		map_cpu_to_node(i, 0);
 
diff --git a/include/asm-ppc64/mmzone.h b/include/asm-ppc64/mmzone.h
index 7338bbb055a8..4182c647f4e6 100644
--- a/include/asm-ppc64/mmzone.h
+++ b/include/asm-ppc64/mmzone.h
@@ -19,13 +19,13 @@ extern struct pglist_data node_data[];
  */
 
 extern int numa_cpu_lookup_table[];
-extern int numa_memory_lookup_table[];
+extern char *numa_memory_lookup_table;
 extern cpumask_t numa_cpumask_lookup_table[];
 extern int nr_cpus_in_node[];
 
 #define MAX_MEMORY (1UL << 41)
-/* 256MB regions */
-#define MEMORY_INCREMENT_SHIFT 28
+/* 16MB regions */
+#define MEMORY_INCREMENT_SHIFT 24
 #define MEMORY_INCREMENT (1UL << MEMORY_INCREMENT_SHIFT)
 
 /* NUMA debugging, will not work on a DLPAR machine */
-- 
cgit v1.2.3


From 2e061730cce0ec9d6157ee2f548625336647b7db Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:20:27 -0700
Subject: [PATCH] v4l: cropcap ioctl fix

From: Gerd Knorr <kraxel@bytesex.org>

The VIDIOC_CROPCAP ioctl had wrong R/W bits, this patch fixes it.
---
 drivers/media/video/videodev.c | 3 +++
 include/linux/videodev2.h      | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/media/video/videodev.c b/drivers/media/video/videodev.c
index 526fec24ad6c..532c1b4fa3f3 100644
--- a/drivers/media/video/videodev.c
+++ b/drivers/media/video/videodev.c
@@ -160,6 +160,9 @@ video_fix_command(unsigned int cmd)
 	case VIDIOC_G_AUDOUT_OLD:
 		cmd = VIDIOC_G_AUDOUT;
 		break;
+	case VIDIOC_CROPCAP_OLD:
+		cmd = VIDIOC_CROPCAP;
+		break;
 	}
 	return cmd;
 }
diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h
index 1bfc45a4a430..a4ab8e826bbe 100644
--- a/include/linux/videodev2.h
+++ b/include/linux/videodev2.h
@@ -869,7 +869,7 @@ struct v4l2_streamparm
 #define VIDIOC_S_MODULATOR	_IOW  ('V', 55, struct v4l2_modulator)
 #define VIDIOC_G_FREQUENCY	_IOWR ('V', 56, struct v4l2_frequency)
 #define VIDIOC_S_FREQUENCY	_IOW  ('V', 57, struct v4l2_frequency)
-#define VIDIOC_CROPCAP		_IOR  ('V', 58, struct v4l2_cropcap)
+#define VIDIOC_CROPCAP		_IOWR ('V', 58, struct v4l2_cropcap)
 #define VIDIOC_G_CROP		_IOWR ('V', 59, struct v4l2_crop)
 #define VIDIOC_S_CROP		_IOW  ('V', 60, struct v4l2_crop)
 #define VIDIOC_G_JPEGCOMP	_IOR  ('V', 61, struct v4l2_jpegcompression)
@@ -887,6 +887,7 @@ struct v4l2_streamparm
 #define VIDIOC_S_CTRL_OLD      	_IOW  ('V', 28, struct v4l2_control)
 #define VIDIOC_G_AUDIO_OLD     	_IOWR ('V', 33, struct v4l2_audio)
 #define VIDIOC_G_AUDOUT_OLD    	_IOWR ('V', 49, struct v4l2_audioout)
+#define VIDIOC_CROPCAP_OLD     	_IOR  ('V', 58, struct v4l2_cropcap)
 
 #define BASE_VIDIOC_PRIVATE	192		/* 192-255 are private */
 
-- 
cgit v1.2.3


From 831434861116756312a982d2082d91d20fed1de0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:20:39 -0700
Subject: [PATCH] v4l: v4l1-compat fix

From: Gerd Knorr <kraxel@bytesex.org>

Minor tweak in the v4l1 compatibility layer: Make sure that capture actually
is active before going to wait for a frame so we don't block forever.
---
 drivers/media/video/v4l1-compat.c | 15 +++++++++++----
 include/linux/videodev.h          |  1 +
 2 files changed, 12 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/drivers/media/video/v4l1-compat.c b/drivers/media/video/v4l1-compat.c
index 0f15efb6de56..441733d0b1b8 100644
--- a/drivers/media/video/v4l1-compat.c
+++ b/drivers/media/video/v4l1-compat.c
@@ -289,6 +289,7 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 {
 	struct v4l2_capability  *cap2 = NULL;
 	struct v4l2_format	*fmt2 = NULL;
+	enum v4l2_buf_type      captype = V4L2_BUF_TYPE_VIDEO_CAPTURE;
 
 	struct v4l2_framebuffer fbuf2;
 	struct v4l2_input	input2;
@@ -465,6 +466,7 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 		fmt2 = kmalloc(sizeof(*fmt2),GFP_KERNEL);
 		memset(fmt2,0,sizeof(*fmt2));
 		fmt2->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
+		drv(inode, file, VIDIOC_STREAMOFF, &fmt2->type);
 		err1 = drv(inode, file, VIDIOC_G_FMT, fmt2);
 		if (err1 < 0)
 			dprintk("VIDIOCSWIN / VIDIOC_G_FMT: %d\n",err);
@@ -503,11 +505,10 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 		int *on = arg;
 
 		if (0 == *on) {
-			enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
 			/* dirty hack time.  But v4l1 has no STREAMOFF
 			 * equivalent in the API, and this one at
 			 * least comes close ... */
-			drv(inode, file, VIDIOC_STREAMOFF, &type);
+			drv(inode, file, VIDIOC_STREAMOFF, &captype);
 		}
 		err = drv(inode, file, VIDIOC_OVERLAY, arg);
 		if (err < 0)
@@ -858,7 +859,6 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 	case VIDIOCMCAPTURE: /*  capture a frame  */
 	{
 		struct video_mmap	*mm = arg;
-		enum v4l2_buf_type	type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
 
 		fmt2 = kmalloc(sizeof(*fmt2),GFP_KERNEL);
 		memset(&buf2,0,sizeof(buf2));
@@ -899,7 +899,7 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 			dprintk("VIDIOCMCAPTURE / VIDIOC_QBUF: %d\n",err);
 			break;
 		}
-		err = drv(inode, file, VIDIOC_STREAMON, &type);
+		err = drv(inode, file, VIDIOC_STREAMON, &captype);
 		if (err < 0)
 			dprintk("VIDIOCMCAPTURE / VIDIOC_STREAMON: %d\n",err);
 		break;
@@ -922,6 +922,13 @@ v4l_compat_translate_ioctl(struct inode         *inode,
 			break;
 		}
 
+		/* make sure capture actually runs so we don't block forever */
+		err = drv(inode, file, VIDIOC_STREAMON, &captype);
+		if (err < 0) {
+			dprintk("VIDIOCSYNC / VIDIOC_STREAMON: %d\n",err);
+			break;
+		}
+
 		/*  Loop as long as the buffer is queued, but not done  */
 		while ((buf2.flags &
 			(V4L2_BUF_FLAG_QUEUED | V4L2_BUF_FLAG_DONE))
diff --git a/include/linux/videodev.h b/include/linux/videodev.h
index 4e421d3d25ed..cfcf6f1cd0e2 100644
--- a/include/linux/videodev.h
+++ b/include/linux/videodev.h
@@ -430,6 +430,7 @@ struct video_code
 #define VID_HARDWARE_VICAM      34
 #define VID_HARDWARE_SF16FMR2	35
 #define VID_HARDWARE_W9968CF    36
+#define VID_HARDWARE_SAA7114H   37
 #endif /* __LINUX_VIDEODEV_H */
 
 /*
-- 
cgit v1.2.3


From 3cfba98413fa46ce20da284d8bda4fc1d314ce49 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:21:05 -0700
Subject: [PATCH] v4l: msp3400 update

From: Gerd Knorr <kraxel@bytesex.org>

This patch allows switching to the second external input of the msp34xx
chips.  It also has some minor cleanups and more verbose debug info.
---
 drivers/media/video/msp3400.c | 47 ++++++++++++++++++++++++-------------------
 include/media/audiochip.h     |  2 ++
 2 files changed, 28 insertions(+), 21 deletions(-)

(limited to 'include')

diff --git a/drivers/media/video/msp3400.c b/drivers/media/video/msp3400.c
index 75d77dd56b4f..3576ba504168 100644
--- a/drivers/media/video/msp3400.c
+++ b/drivers/media/video/msp3400.c
@@ -1384,34 +1384,30 @@ static int msp_command(struct i2c_client *client, unsigned int cmd, void *arg)
 {
 	struct msp3400c *msp  = i2c_get_clientdata(client);
         __u16           *sarg = arg;
-#if 0
-	int             *iarg = (int*)arg;
-#endif
+	int scart = 0;
 
 	switch (cmd) {
 
 	case AUDC_SET_INPUT:
-		/* scart switching
-		     - IN1 is often used for external input
-		     - Hauppauge uses IN2 for the radio */
 		dprintk(KERN_DEBUG "msp34xx: AUDC_SET_INPUT(%d)\n",*sarg);
 		if (*sarg == msp->input)
 			break;
 		msp->input = *sarg;
 		switch (*sarg) {
 		case AUDIO_RADIO:
+			/* Hauppauge uses IN2 for the radio */
 			msp->mode   = MSP_MODE_FM_RADIO;
-			msp->stereo = VIDEO_SOUND_STEREO;
-			msp3400c_set_scart(client,SCART_IN2,0);
-			msp3400c_write(client,I2C_MSP3400C_DFP,0x000d,0x1900);
-			msp3400c_setstereo(client,msp->stereo);
+			scart       = SCART_IN2;
 			break;
-		case AUDIO_EXTERN:
+		case AUDIO_EXTERN_1:
+			/* IN1 is often used for external input ... */
 			msp->mode   = MSP_MODE_EXTERN;
-			msp->stereo = VIDEO_SOUND_STEREO;
-			msp3400c_set_scart(client,SCART_IN1,0);
-			msp3400c_write(client,I2C_MSP3400C_DFP,0x000d,0x1900);
-			msp3400c_setstereo(client,msp->stereo);
+			scart       = SCART_IN1;
+			break;
+		case AUDIO_EXTERN_2:
+			/* ... sometimes it is IN2 through ;) */
+			msp->mode   = MSP_MODE_EXTERN;
+			scart       = SCART_IN2;
 			break;
 		case AUDIO_TUNER:
 			msp->mode   = -1;
@@ -1422,6 +1418,12 @@ static int msp_command(struct i2c_client *client, unsigned int cmd, void *arg)
 				msp3400c_set_scart(client,SCART_MUTE,0);
 			break;
 		}
+		if (scart) {
+			msp->stereo = VIDEO_SOUND_STEREO;
+			msp3400c_set_scart(client,scart,0);
+			msp3400c_write(client,I2C_MSP3400C_DFP,0x000d,0x1900);
+			msp3400c_setstereo(client,msp->stereo);
+		}
 		if (msp->active)
 			msp->restart = 1;
 		break;
@@ -1487,12 +1489,15 @@ static int msp_command(struct i2c_client *client, unsigned int cmd, void *arg)
 		if (msp->muted)
 			va->flags |= VIDEO_AUDIO_MUTE;
 		va->volume=max(msp->left,msp->right);
-		va->balance=(32768*min(msp->left,msp->right))/
-			(va->volume ? va->volume : 1);
-		va->balance=(msp->left<msp->right)?
-			(65535-va->balance) : va->balance;
-		if (0 == va->volume)
+
+		if (0 == va->volume) {
 			va->balance = 32768;
+		} else {
+			va->balance = (32768 * min(msp->left,msp->right))
+				/ va->volume;
+			va->balance = (msp->left<msp->right) ?
+				(65535 - va->balance) : va->balance;
+		}
 		va->bass = msp->bass;
 		va->treble = msp->treble;
 
@@ -1530,7 +1535,7 @@ static int msp_command(struct i2c_client *client, unsigned int cmd, void *arg)
 	{
 		struct video_channel *vc = arg;
 		
-		dprintk(KERN_DEBUG "msp34xx: VIDIOCSCHAN\n");
+		dprintk(KERN_DEBUG "msp34xx: VIDIOCSCHAN (norm=%d)\n",vc->norm);
 		msp->norm = vc->norm;
 		break;
 	}
diff --git a/include/media/audiochip.h b/include/media/audiochip.h
index 1fbc90e53232..fadc32dab628 100644
--- a/include/media/audiochip.h
+++ b/include/media/audiochip.h
@@ -15,6 +15,8 @@
 #define AUDIO_INTERN       0x03
 #define AUDIO_OFF          0x04 
 #define AUDIO_ON           0x05
+#define AUDIO_EXTERN_1     AUDIO_EXTERN
+#define AUDIO_EXTERN_2     0x06
 #define AUDIO_MUTE         0x80
 #define AUDIO_UNMUTE       0x81
 
-- 
cgit v1.2.3


From 0109dc6d2d874ea81bc1e9e34bed612959d70365 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:24:32 -0700
Subject: [PATCH] Improve list.h documentation for _rcu() primitives

From: "Paul E. McKenney" <paulmck@us.ibm.com>

The attached patch improves the documentation of the _rcu list primitives.
---
 include/linux/list.h | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'include')

diff --git a/include/linux/list.h b/include/linux/list.h
index 5388098449cc..1269d0c64d33 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -104,6 +104,14 @@ static __inline__ void __list_add_rcu(struct list_head * new,
  *
  * Insert a new entry after the specified head.
  * This is good for implementing stacks.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_add_rcu()
+ * or list_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
  */
 static __inline__ void list_add_rcu(struct list_head *new, struct list_head *head)
 {
@@ -117,6 +125,14 @@ static __inline__ void list_add_rcu(struct list_head *new, struct list_head *hea
  *
  * Insert a new entry before the specified head.
  * This is useful for implementing queues.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_add_tail_rcu()
+ * or list_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
  */
 static __inline__ void list_add_tail_rcu(struct list_head *new, struct list_head *head)
 {
@@ -159,6 +175,19 @@ static inline void list_del(struct list_head *entry)
  *
  * In particular, it means that we can not poison the forward 
  * pointers that may still be used for walking the list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as list_del_rcu()
+ * or list_add_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * list_for_each_entry_rcu().
+ *
+ * Note that the caller is not permitted to immediately free
+ * the newly deleted entry.  Instead, either synchronize_kernel()
+ * or call_rcu() must be used to defer freeing until an RCU
+ * grace period has elapsed.
  */
 static inline void list_del_rcu(struct list_head *entry)
 {
@@ -384,6 +413,10 @@ static inline void list_splice_init(struct list_head *list,
  * list_for_each_rcu	-	iterate over an rcu-protected list
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_rcu(pos, head) \
 	for (pos = (head)->next, prefetch(pos->next); pos != (head); \
@@ -399,6 +432,10 @@ static inline void list_splice_init(struct list_head *list,
  * @pos:	the &struct list_head to use as a loop counter.
  * @n:		another &struct list_head to use as temporary storage
  * @head:	the head for your list.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_safe_rcu(pos, n, head) \
 	for (pos = (head)->next, n = pos->next; pos != (head); \
@@ -409,6 +446,10 @@ static inline void list_splice_init(struct list_head *list,
  * @pos:	the type * to use as a loop counter.
  * @head:	the head for your list.
  * @member:	the name of the list_struct within the struct.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_entry_rcu(pos, head, member)			\
 	for (pos = list_entry((head)->next, typeof(*pos), member),	\
@@ -424,6 +465,10 @@ static inline void list_splice_init(struct list_head *list,
  *			continuing after existing point.
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
+ *
+ * This list-traversal primitive may safely run concurrently with
+ * the _rcu list-mutation primitives such as list_add_rcu()
+ * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_continue_rcu(pos, head) \
 	for ((pos) = (pos)->next, prefetch((pos)->next); (pos) != (head); \
@@ -485,6 +530,14 @@ static __inline__ void hlist_del(struct hlist_node *n)
  *
  * In particular, it means that we can not poison the forward
  * pointers that may still be used for walking the hash list.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_entry().
  */
 static inline void hlist_del_rcu(struct hlist_node *n)
 {
@@ -512,6 +565,26 @@ static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h
 	n->pprev = &h->first; 
 } 
 
+
+/**
+ * hlist_add_head_rcu - adds the specified element to the specified hlist,
+ * while permitting racing traversals.
+ * @n: the element to add to the hash list.
+ * @h: the list to add to.
+ *
+ * The caller must take whatever precautions are necessary
+ * (such as holding appropriate locks) to avoid racing
+ * with another list-mutation primitive, such as hlist_add_head_rcu()
+ * or hlist_del_rcu(), running on this same list.
+ * However, it is perfectly legal to run concurrently with
+ * the _rcu list-traversal primitives, such as
+ * hlist_for_each_entry(), but only if smp_read_barrier_depends()
+ * is used to prevent memory-consistency problems on Alpha CPUs.
+ * Regardless of the type of CPU, the list-traversal primitive
+ * must be guarded by rcu_read_lock().
+ *
+ * OK, so why don't we have an hlist_for_each_entry_rcu()???
+ */
 static __inline__ void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) 
 { 
 	struct hlist_node *first = h->first;
-- 
cgit v1.2.3


From 492361a6d915137590a8eba787dd878d71137358 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:24:44 -0700
Subject: [PATCH] list.h cleanup

- s/__inline__/inline/

- Remove lots of extraneous andi-was-here trailing whitespace
---
 include/linux/list.h | 120 ++++++++++++++++++++++++++-------------------------
 1 file changed, 61 insertions(+), 59 deletions(-)

(limited to 'include')

diff --git a/include/linux/list.h b/include/linux/list.h
index 1269d0c64d33..34fd74e050df 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -39,7 +39,7 @@ struct list_head {
 } while (0)
 
 /*
- * Insert a new entry between two known consecutive entries. 
+ * Insert a new entry between two known consecutive entries.
  *
  * This is only for internal list manipulation where we know
  * the prev/next entries already!
@@ -81,14 +81,13 @@ static inline void list_add_tail(struct list_head *new, struct list_head *head)
 }
 
 /*
- * Insert a new entry between two known consecutive entries. 
+ * Insert a new entry between two known consecutive entries.
  *
  * This is only for internal list manipulation where we know
  * the prev/next entries already!
  */
-static __inline__ void __list_add_rcu(struct list_head * new,
-	struct list_head * prev,
-	struct list_head * next)
+static inline void __list_add_rcu(struct list_head * new,
+		struct list_head * prev, struct list_head * next)
 {
 	new->next = next;
 	new->prev = prev;
@@ -113,7 +112,7 @@ static __inline__ void __list_add_rcu(struct list_head * new,
  * the _rcu list-traversal primitives, such as
  * list_for_each_entry_rcu().
  */
-static __inline__ void list_add_rcu(struct list_head *new, struct list_head *head)
+static inline void list_add_rcu(struct list_head *new, struct list_head *head)
 {
 	__list_add_rcu(new, head, head->next);
 }
@@ -134,7 +133,8 @@ static __inline__ void list_add_rcu(struct list_head *new, struct list_head *hea
  * the _rcu list-traversal primitives, such as
  * list_for_each_entry_rcu().
  */
-static __inline__ void list_add_tail_rcu(struct list_head *new, struct list_head *head)
+static inline void list_add_tail_rcu(struct list_head *new,
+					struct list_head *head)
 {
 	__list_add_rcu(new, head->prev, head);
 }
@@ -169,11 +169,11 @@ static inline void list_del(struct list_head *entry)
  * list_del_rcu - deletes entry from list without re-initialization
  * @entry: the element to delete from the list.
  *
- * Note: list_empty on entry does not return true after this, 
+ * Note: list_empty on entry does not return true after this,
  * the entry is in an undefined state. It is useful for RCU based
  * lockfree traversal.
  *
- * In particular, it means that we can not poison the forward 
+ * In particular, it means that we can not poison the forward
  * pointers that may still be used for walking the list.
  *
  * The caller must take whatever precautions are necessary
@@ -202,7 +202,7 @@ static inline void list_del_rcu(struct list_head *entry)
 static inline void list_del_init(struct list_head *entry)
 {
 	__list_del(entry->prev, entry->next);
-	INIT_LIST_HEAD(entry); 
+	INIT_LIST_HEAD(entry);
 }
 
 /**
@@ -335,7 +335,7 @@ static inline void list_splice_init(struct list_head *list,
 #define list_for_each_prev(pos, head) \
 	for (pos = (head)->prev, prefetch(pos->prev); pos != (head); \
         	pos = pos->prev, prefetch(pos->prev))
-        	
+
 /**
  * list_for_each_safe	-	iterate over a list safe against removal of list entry
  * @pos:	the &struct list_head to use as a loop counter.
@@ -421,11 +421,11 @@ static inline void list_splice_init(struct list_head *list,
 #define list_for_each_rcu(pos, head) \
 	for (pos = (head)->next, prefetch(pos->next); pos != (head); \
         	pos = pos->next, ({ smp_read_barrier_depends(); 0;}), prefetch(pos->next))
-        	
+
 #define __list_for_each_rcu(pos, head) \
 	for (pos = (head)->next; pos != (head); \
         	pos = pos->next, ({ smp_read_barrier_depends(); 0;}))
-        	
+
 /**
  * list_for_each_safe_rcu	-	iterate over an rcu-protected list safe
  *					against removal of list entry
@@ -461,7 +461,7 @@ static inline void list_splice_init(struct list_head *list,
 
 
 /**
- * list_for_each_continue_rcu	-	iterate over an rcu-protected list 
+ * list_for_each_continue_rcu	-	iterate over an rcu-protected list
  *			continuing after existing point.
  * @pos:	the &struct list_head to use as a loop counter.
  * @head:	the head for your list.
@@ -474,46 +474,46 @@ static inline void list_splice_init(struct list_head *list,
 	for ((pos) = (pos)->next, prefetch((pos)->next); (pos) != (head); \
         	(pos) = (pos)->next, ({ smp_read_barrier_depends(); 0;}), prefetch((pos)->next))
 
-/* 
- * Double linked lists with a single pointer list head. 
- * Mostly useful for hash tables where the two pointer list head is 
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
  * too wasteful.
  * You lose the ability to access the tail in O(1).
- */ 
+ */
 
-struct hlist_head { 
-	struct hlist_node *first; 
-}; 
+struct hlist_head {
+	struct hlist_node *first;
+};
 
-struct hlist_node { 
-	struct hlist_node *next, **pprev; 
-}; 
+struct hlist_node {
+	struct hlist_node *next, **pprev;
+};
 
-#define HLIST_HEAD_INIT { .first = NULL } 
+#define HLIST_HEAD_INIT { .first = NULL }
 #define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
-#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) 
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
 #define INIT_HLIST_NODE(ptr) ((ptr)->next = NULL, (ptr)->pprev = NULL)
 
-static __inline__ int hlist_unhashed(const struct hlist_node *h) 
-{ 
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
 	return !h->pprev;
-} 
+}
 
-static __inline__ int hlist_empty(const struct hlist_head *h) 
-{ 
+static inline int hlist_empty(const struct hlist_head *h)
+{
 	return !h->first;
-} 
+}
 
-static __inline__ void __hlist_del(struct hlist_node *n) 
+static inline void __hlist_del(struct hlist_node *n)
 {
 	struct hlist_node *next = n->next;
 	struct hlist_node **pprev = n->pprev;
-	*pprev = next;  
-	if (next) 
+	*pprev = next;
+	if (next)
 		next->pprev = pprev;
-}  
+}
 
-static __inline__ void hlist_del(struct hlist_node *n)
+static inline void hlist_del(struct hlist_node *n)
 {
 	__hlist_del(n);
 	n->next = LIST_POISON1;
@@ -524,7 +524,7 @@ static __inline__ void hlist_del(struct hlist_node *n)
  * hlist_del_rcu - deletes entry from hash list without re-initialization
  * @n: the element to delete from the hash list.
  *
- * Note: list_unhashed() on entry does not return true after this, 
+ * Note: list_unhashed() on entry does not return true after this,
  * the entry is in an undefined state. It is useful for RCU based
  * lockfree traversal.
  *
@@ -545,25 +545,25 @@ static inline void hlist_del_rcu(struct hlist_node *n)
 	n->pprev = LIST_POISON2;
 }
 
-static __inline__ void hlist_del_init(struct hlist_node *n) 
+static inline void hlist_del_init(struct hlist_node *n)
 {
 	if (n->pprev)  {
 		__hlist_del(n);
 		INIT_HLIST_NODE(n);
 	}
-}  
+}
 
 #define hlist_del_rcu_init hlist_del_init
 
-static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h) 
-{ 
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
 	struct hlist_node *first = h->first;
-	n->next = first; 
-	if (first) 
+	n->next = first;
+	if (first)
 		first->pprev = &n->next;
-	h->first = n; 
-	n->pprev = &h->first; 
-} 
+	h->first = n;
+	n->pprev = &h->first;
+}
 
 
 /**
@@ -585,28 +585,30 @@ static __inline__ void hlist_add_head(struct hlist_node *n, struct hlist_head *h
  *
  * OK, so why don't we have an hlist_for_each_entry_rcu()???
  */
-static __inline__ void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) 
-{ 
+static inline void hlist_add_head_rcu(struct hlist_node *n,
+					struct hlist_head *h)
+{
 	struct hlist_node *first = h->first;
 	n->next = first;
-	n->pprev = &h->first; 
+	n->pprev = &h->first;
 	smp_wmb();
-	if (first) 
+	if (first)
 		first->pprev = &n->next;
-	h->first = n; 
-} 
+	h->first = n;
+}
 
 /* next must be != NULL */
-static __inline__ void hlist_add_before(struct hlist_node *n, struct hlist_node *next)
+static inline void hlist_add_before(struct hlist_node *n,
+					struct hlist_node *next)
 {
 	n->pprev = next->pprev;
-	n->next = next; 
-	next->pprev = &n->next; 
+	n->next = next;
+	next->pprev = &n->next;
 	*(n->pprev) = n;
 }
 
-static __inline__ void hlist_add_after(struct hlist_node *n,
-				       struct hlist_node *next)
+static inline void hlist_add_after(struct hlist_node *n,
+					struct hlist_node *next)
 {
 	next->next	= n->next;
 	*(next->pprev)	= n;
@@ -618,7 +620,7 @@ static __inline__ void hlist_add_after(struct hlist_node *n,
 /* Cannot easily do prefetch unfortunately */
 #define hlist_for_each(pos, head) \
 	for (pos = (head)->first; pos && ({ prefetch(pos->next); 1; }); \
-	     pos = pos->next) 
+	     pos = pos->next)
 
 #define hlist_for_each_safe(pos, n, head) \
 	for (pos = (head)->first; n = pos ? pos->next : 0, pos; \
-- 
cgit v1.2.3


From 01cc53b25e1883ff537d19adc87097e1833deeaa Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:24:57 -0700
Subject: [PATCH] Non-Exec stack support

From: Kurt Garloff <garloff@suse.de>

A patch to parse the elf binaries for a PT_GNU_STACK section to set the stack
non-executable if possible.  Most parts have been shamelessly stolen from
Ingo Molnar's more ambitious stackshield
http://people.redhat.com/mingo/exec-shield/exec-shield-2.6.4-C9

The toolchain meanwhile has support for marking the binaries with a
PT_GNU_STACK section without the x bit as needed.

If no such section is found, we leave the stack to whatever the arch defaults
to.  If there is one, we explicitly disable the VM_EXEC bit if no x bit is
found, otherwise we explicitly enable it.
---
 arch/ia64/ia32/binfmt_elf32.c       | 16 +++++++++++-----
 arch/ia64/ia32/ia32priv.h           |  2 +-
 arch/mips/kernel/irixelf.c          |  2 +-
 arch/s390/kernel/binfmt_elf32.c     |  4 ++--
 arch/s390/kernel/compat_exec.c      |  3 ++-
 arch/sparc64/kernel/binfmt_aout32.c |  2 +-
 arch/x86_64/ia32/ia32_aout.c        |  4 ++--
 arch/x86_64/ia32/ia32_binfmt.c      | 15 ++++++++++-----
 fs/binfmt_aout.c                    |  2 +-
 fs/binfmt_elf.c                     | 12 +++++++++++-
 fs/binfmt_som.c                     |  2 +-
 fs/exec.c                           | 14 +++++++++++---
 include/asm-ia64/pgtable.h          |  3 ++-
 include/linux/binfmts.h             |  8 +++++++-
 include/linux/elf.h                 |  2 ++
 15 files changed, 65 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/ia32/binfmt_elf32.c b/arch/ia64/ia32/binfmt_elf32.c
index 7d8624cef402..679e68afd653 100644
--- a/arch/ia64/ia32/binfmt_elf32.c
+++ b/arch/ia64/ia32/binfmt_elf32.c
@@ -35,7 +35,7 @@ extern void ia64_elf32_init (struct pt_regs *regs);
 
 static void elf32_set_personality (void);
 
-#define setup_arg_pages(bprm)		ia32_setup_arg_pages(bprm)
+#define setup_arg_pages(bprm,exec)		ia32_setup_arg_pages(bprm,exec)
 #define elf_map				elf32_map
 
 #undef SET_PERSONALITY
@@ -149,7 +149,7 @@ ia64_elf32_init (struct pt_regs *regs)
 }
 
 int
-ia32_setup_arg_pages (struct linux_binprm *bprm)
+ia32_setup_arg_pages (struct linux_binprm *bprm, int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -178,8 +178,14 @@ ia32_setup_arg_pages (struct linux_binprm *bprm)
 		mpnt->vm_mm = current->mm;
 		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
 		mpnt->vm_end = IA32_STACK_TOP;
-		mpnt->vm_page_prot = PAGE_COPY;
-		mpnt->vm_flags = VM_STACK_FLAGS;
+		if (executable_stack == EXSTACK_ENABLE_X)
+			mpnt->vm_flags = VM_STACK_FLAGS |  VM_EXEC;
+		else if (executable_stack == EXSTACK_DISABLE_X)
+			mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
+		else
+			mpnt->vm_flags = VM_STACK_FLAGS;
+		mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC)?
+					PAGE_COPY_EXEC: PAGE_COPY;
 		mpnt->vm_ops = NULL;
 		mpnt->vm_pgoff = 0;
 		mpnt->vm_file = NULL;
@@ -192,7 +198,7 @@ ia32_setup_arg_pages (struct linux_binprm *bprm)
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
-			put_dirty_page(current, page, stack_base, PAGE_COPY);
+			put_dirty_page(current, page, stack_base, mpnt->vm_page_prot);
 		}
 		stack_base += PAGE_SIZE;
 	}
diff --git a/arch/ia64/ia32/ia32priv.h b/arch/ia64/ia32/ia32priv.h
index a445206a8553..e6f95af15972 100644
--- a/arch/ia64/ia32/ia32priv.h
+++ b/arch/ia64/ia32/ia32priv.h
@@ -494,7 +494,7 @@ struct ia32_user_desc {
 struct linux_binprm;
 
 extern void ia32_init_addr_space (struct pt_regs *regs);
-extern int ia32_setup_arg_pages (struct linux_binprm *bprm);
+extern int ia32_setup_arg_pages (struct linux_binprm *bprm, int exec_stack);
 extern unsigned long ia32_do_mmap (struct file *, unsigned long, unsigned long, int, int, loff_t);
 extern void ia32_load_segment_descriptors (struct task_struct *task);
 
diff --git a/arch/mips/kernel/irixelf.c b/arch/mips/kernel/irixelf.c
index f925c6514aad..eac9e07f5d0f 100644
--- a/arch/mips/kernel/irixelf.c
+++ b/arch/mips/kernel/irixelf.c
@@ -688,7 +688,7 @@ static int load_irix_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	 * change some of these later.
 	 */
 	current->mm->rss = 0;
-	setup_arg_pages(bprm);
+	setup_arg_pages(bprm, EXSTACK_DEFAULT);
 	current->mm->start_stack = bprm->p;
 
 	/* At this point, we assume that the image should be loaded at
diff --git a/arch/s390/kernel/binfmt_elf32.c b/arch/s390/kernel/binfmt_elf32.c
index 8f80960cdda8..0cc8e5be48ba 100644
--- a/arch/s390/kernel/binfmt_elf32.c
+++ b/arch/s390/kernel/binfmt_elf32.c
@@ -115,7 +115,7 @@ static inline int dump_regs32(struct pt_regs *ptregs, elf_gregset_t *regs)
 #include <linux/binfmts.h>
 #include <linux/compat.h>
 
-int setup_arg_pages32(struct linux_binprm *bprm);
+int setup_arg_pages32(struct linux_binprm *bprm, int executable_stack);
 
 #define elf_prstatus elf_prstatus32
 struct elf_prstatus32
@@ -166,7 +166,7 @@ struct elf_prpsinfo32
 
 #undef start_thread
 #define start_thread                    start_thread31 
-#define setup_arg_pages(bprm)           setup_arg_pages32(bprm)
+#define setup_arg_pages(bprm, exec)     setup_arg_pages32(bprm, exec)
 #define elf_map				elf_map32
 
 MODULE_DESCRIPTION("Binary format loader for compatibility with 32bit Linux for S390 binaries,"
diff --git a/arch/s390/kernel/compat_exec.c b/arch/s390/kernel/compat_exec.c
index 33832846833f..162deb2bb007 100644
--- a/arch/s390/kernel/compat_exec.c
+++ b/arch/s390/kernel/compat_exec.c
@@ -37,7 +37,7 @@
 #undef STACK_TOP
 #define STACK_TOP TASK31_SIZE
 
-int setup_arg_pages32(struct linux_binprm *bprm)
+int setup_arg_pages32(struct linux_binprm *bprm, int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -66,6 +66,7 @@ int setup_arg_pages32(struct linux_binprm *bprm)
 		mpnt->vm_mm = mm;
 		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
 		mpnt->vm_end = STACK_TOP;
+		/* executable stack setting would be applied here */
 		mpnt->vm_page_prot = PAGE_COPY;
 		mpnt->vm_flags = VM_STACK_FLAGS;
 		mpnt->vm_ops = NULL;
diff --git a/arch/sparc64/kernel/binfmt_aout32.c b/arch/sparc64/kernel/binfmt_aout32.c
index dcae86a7228b..4ba5d4801bae 100644
--- a/arch/sparc64/kernel/binfmt_aout32.c
+++ b/arch/sparc64/kernel/binfmt_aout32.c
@@ -310,7 +310,7 @@ beyond_if:
 	orig_thr_flags = current_thread_info()->flags;
 	current_thread_info()->flags |= _TIF_32BIT;
 
-	retval = setup_arg_pages(bprm);
+	retval = setup_arg_pages(bprm, EXSTACK_DEFAULT);
 	if (retval < 0) { 
 		current_thread_info()->flags = orig_thr_flags;
 
diff --git a/arch/x86_64/ia32/ia32_aout.c b/arch/x86_64/ia32/ia32_aout.c
index 4c2d1100d2d7..040adf6991a2 100644
--- a/arch/x86_64/ia32/ia32_aout.c
+++ b/arch/x86_64/ia32/ia32_aout.c
@@ -35,7 +35,7 @@
 #undef WARN_OLD
 #undef CORE_DUMP /* probably broken */
 
-extern int ia32_setup_arg_pages(struct linux_binprm *bprm);
+extern int ia32_setup_arg_pages(struct linux_binprm *bprm, int exec_stack);
 
 static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
 static int load_aout_library(struct file*);
@@ -395,7 +395,7 @@ beyond_if:
 
 	set_brk(current->mm->start_brk, current->mm->brk);
 
-	retval = ia32_setup_arg_pages(bprm); 
+	retval = ia32_setup_arg_pages(bprm, EXSTACK_DEFAULT);
 	if (retval < 0) { 
 		/* Someone check-me: is this error path enough? */ 
 		send_sig(SIGKILL, current, 0); 
diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c
index 82e9bc2ddc3f..92817f18e39b 100644
--- a/arch/x86_64/ia32/ia32_binfmt.c
+++ b/arch/x86_64/ia32/ia32_binfmt.c
@@ -272,8 +272,8 @@ do {							\
 #define load_elf_binary load_elf32_binary
 
 #define ELF_PLAT_INIT(r, load_addr)	elf32_init(r)
-#define setup_arg_pages(bprm)		ia32_setup_arg_pages(bprm)
-int ia32_setup_arg_pages(struct linux_binprm *bprm);
+#define setup_arg_pages(bprm, exec_stack)	ia32_setup_arg_pages(bprm, exec_stack)
+int ia32_setup_arg_pages(struct linux_binprm *bprm, int executable_stack);
 
 #undef start_thread
 #define start_thread(regs,new_rip,new_rsp) do { \
@@ -325,7 +325,7 @@ static void elf32_init(struct pt_regs *regs)
 	me->thread.es = __USER_DS;
 }
 
-int setup_arg_pages(struct linux_binprm *bprm)
+int setup_arg_pages(struct linux_binprm *bprm, int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -354,7 +354,12 @@ int setup_arg_pages(struct linux_binprm *bprm)
 		mpnt->vm_mm = mm;
 		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
 		mpnt->vm_end = IA32_STACK_TOP;
-		mpnt->vm_flags = vm_stack_flags32; 
+		if (executable_stack == EXSTACK_ENABLE_X)
+			mpnt->vm_flags = vm_stack_flags32 |  VM_EXEC;
+		else if (executable_stack == EXSTACK_DISABLE_X)
+			mpnt->vm_flags = vm_stack_flags32 & ~VM_EXEC;
+		else
+			mpnt->vm_flags = vm_stack_flags32;
  		mpnt->vm_page_prot = (mpnt->vm_flags & VM_EXEC) ? 
  			PAGE_COPY_EXEC : PAGE_COPY;
 		mpnt->vm_ops = NULL;
@@ -370,7 +375,7 @@ int setup_arg_pages(struct linux_binprm *bprm)
 		struct page *page = bprm->page[i];
 		if (page) {
 			bprm->page[i] = NULL;
-			put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC);
+			put_dirty_page(current,page,stack_base,mpnt->vm_page_prot);
 		}
 		stack_base += PAGE_SIZE;
 	}
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 849324bbf3e3..7827c1255848 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -413,7 +413,7 @@ beyond_if:
 
 	set_brk(current->mm->start_brk, current->mm->brk);
 
-	retval = setup_arg_pages(bprm); 
+	retval = setup_arg_pages(bprm, EXSTACK_DEFAULT);
 	if (retval < 0) { 
 		/* Someone check-me: is this error path enough? */ 
 		send_sig(SIGKILL, current, 0); 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index e5b79a294c80..c420a261e5f3 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -476,6 +476,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
   	struct exec interp_ex;
 	char passed_fileno[6];
 	struct files_struct *files;
+	int executable_stack = EXSTACK_DEFAULT;
 	
 	/* Get the exec-header */
 	elf_ex = *((struct elfhdr *) bprm->buf);
@@ -599,6 +600,15 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		elf_ppnt++;
 	}
 
+	elf_ppnt = elf_phdata;
+	for (i = 0; i < elf_ex.e_phnum; i++, elf_ppnt++)
+		if (elf_ppnt->p_type == PT_GNU_STACK) {
+			if (elf_ppnt->p_flags & PF_X)
+				executable_stack = EXSTACK_ENABLE_X;
+			else
+				executable_stack = EXSTACK_DISABLE_X;
+		}
+
 	/* Some simple consistency checks for the interpreter */
 	if (elf_interpreter) {
 		interpreter_type = INTERPRETER_ELF | INTERPRETER_AOUT;
@@ -674,7 +684,7 @@ static int load_elf_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	   change some of these later */
 	current->mm->rss = 0;
 	current->mm->free_area_cache = TASK_UNMAPPED_BASE;
-	retval = setup_arg_pages(bprm);
+	retval = setup_arg_pages(bprm, executable_stack);
 	if (retval < 0) {
 		send_sig(SIGKILL, current, 0);
 		goto out_free_dentry;
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index a4e7b03ff836..cabf3ccc09e2 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -254,7 +254,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 	set_binfmt(&som_format);
 	compute_creds(bprm);
-	setup_arg_pages(bprm);
+	setup_arg_pages(bprm, EXSTACK_DEFAULT);
 
 	create_som_tables(bprm);
 
diff --git a/fs/exec.c b/fs/exec.c
index 62bf2c537abd..26e3392b6369 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -342,7 +342,7 @@ out_sig:
 	return;
 }
 
-int setup_arg_pages(struct linux_binprm *bprm)
+int setup_arg_pages(struct linux_binprm *bprm, int executable_stack)
 {
 	unsigned long stack_base;
 	struct vm_area_struct *mpnt;
@@ -425,8 +425,16 @@ int setup_arg_pages(struct linux_binprm *bprm)
 		mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p;
 		mpnt->vm_end = STACK_TOP;
 #endif
-		mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7];
-		mpnt->vm_flags = VM_STACK_FLAGS;
+		/* Adjust stack execute permissions; explicitly enable
+		 * for EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X
+		 * and leave alone (arch default) otherwise. */
+		if (unlikely(executable_stack == EXSTACK_ENABLE_X))
+			mpnt->vm_flags = VM_STACK_FLAGS |  VM_EXEC;
+		else if (executable_stack == EXSTACK_DISABLE_X)
+			mpnt->vm_flags = VM_STACK_FLAGS & ~VM_EXEC;
+		else
+			mpnt->vm_flags = VM_STACK_FLAGS;
+		mpnt->vm_page_prot = protection_map[mpnt->vm_flags & 0x7];
 		mpnt->vm_ops = NULL;
 		mpnt->vm_pgoff = 0;
 		mpnt->vm_file = NULL;
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index 86d7f7f91bfb..bec5c8cd0079 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -119,7 +119,8 @@
 #define PAGE_NONE	__pgprot(_PAGE_PROTNONE | _PAGE_A)
 #define PAGE_SHARED	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RW)
 #define PAGE_READONLY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R)
-#define PAGE_COPY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX)
+#define PAGE_COPY	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_R)
+#define PAGE_COPY_EXEC	__pgprot(__ACCESS_BITS | _PAGE_PL_3 | _PAGE_AR_RX)
 #define PAGE_GATE	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX)
 #define PAGE_KERNEL	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX)
 #define PAGE_KERNELRX	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX)
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 3d9a86eff6ab..60726b29603c 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -58,7 +58,13 @@ extern int prepare_binprm(struct linux_binprm *);
 extern void remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *,struct pt_regs *);
 extern int flush_old_exec(struct linux_binprm * bprm);
-extern int setup_arg_pages(struct linux_binprm * bprm);
+
+/* Stack area protections */
+#define EXSTACK_DEFAULT   0	/* Whatever the arch defaults to */
+#define EXSTACK_DISABLE_X 1	/* Disable executable stacks */
+#define EXSTACK_ENABLE_X  2	/* Enable executable stacks */
+
+extern int setup_arg_pages(struct linux_binprm * bprm, int executable_stack);
 extern int copy_strings(int argc,char __user * __user * argv,struct linux_binprm *bprm); 
 extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
 extern void compute_creds(struct linux_binprm *binprm);
diff --git a/include/linux/elf.h b/include/linux/elf.h
index 2f8005729fb6..7f21bfaa2e71 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -35,6 +35,8 @@ typedef __s64	Elf64_Sxword;
 #define PT_HIPROC  0x7fffffff
 #define PT_GNU_EH_FRAME		0x6474e550
 
+#define PT_GNU_STACK	(PT_LOOS + 0x474e551)
+
 /* These constants define the different elf file types */
 #define ET_NONE   0
 #define ET_REL    1
-- 
cgit v1.2.3


From 7a10b433f0911d25b4fd1d1b033cbd119be8fc5f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:25:23 -0700
Subject: [PATCH] reiserfs: support for nested transactions

From: Chris Mason <mason@suse.com>

reiserfs support for nested transactions.  This originally came from Peter
Braam for 2.4.x and was ported forward by Jeff Mahoney.
---
 fs/reiserfs/inode.c            |  4 +++
 fs/reiserfs/journal.c          | 78 +++++++++++++++++++++++++++++++++++++++---
 fs/reiserfs/namei.c            |  1 -
 include/linux/reiserfs_fs_sb.h | 12 ++++---
 4 files changed, 86 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index c01847228d2c..d1c8a83a7d66 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -206,6 +206,10 @@ static int file_capable (struct inode * inode, long block)
   struct super_block *s = th->t_super ;
   int len = th->t_blocks_allocated ;
 
+  /* we cannot restart while nested */
+  if (th->t_refcount > 1) {
+      return  ;
+  }
   pathrelse(path) ;
   reiserfs_update_sd(th, inode) ;
   journal_end(th, s, len) ;
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 02351f60808a..53d425fd8ea5 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2157,6 +2157,9 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int n
   time_t now = get_seconds() ;
   if (reiserfs_dont_log(th->t_super)) 
     return 0 ;
+  /* cannot restart while nested */
+  if (th->t_refcount > 1)
+    return 0 ;
   if ( SB_JOURNAL(th->t_super)->j_must_wait > 0 ||
        (SB_JOURNAL(th->t_super)->j_len_alloc + new_alloc) >= SB_JOURNAL_MAX_BATCH(th->t_super) || 
        atomic_read(&(SB_JOURNAL(th->t_super)->j_jlock)) ||
@@ -2212,6 +2215,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct sup
     return 0 ;
   }
   PROC_INFO_INC( p_s_sb, journal.journal_being );
+  /* set here for journal_join */
+  th->t_refcount = 1;
+  th->t_super = p_s_sb ;
 
 relock:
   lock_journal(p_s_sb) ;
@@ -2268,9 +2274,7 @@ relock:
   SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ;
   th->t_blocks_logged = 0 ;
   th->t_blocks_allocated = nblocks ;
-  th->t_super = p_s_sb ;
   th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
-  th->t_caller = "Unknown" ;
   unlock_journal(p_s_sb) ;
   p_s_sb->s_dirt = 1; 
   return 0 ;
@@ -2278,11 +2282,47 @@ relock:
 
 
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
+  struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+  /* this keeps do_journal_end from NULLing out the current->journal_info
+  ** pointer
+  */
+  th->t_handle_save = cur_th ;
+  if (cur_th && cur_th->t_refcount > 1) {
+      BUG() ;
+  }
   return do_journal_begin_r(th, p_s_sb, nblocks, 1) ;
 }
 
 int journal_begin(struct reiserfs_transaction_handle *th, struct super_block  * p_s_sb, unsigned long nblocks) {
-  return do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
+    struct reiserfs_transaction_handle *cur_th = current->journal_info ;
+    int ret ;
+
+    th->t_handle_save = NULL ;
+    if (cur_th) {
+	/* we are nesting into the current transaction */
+	if (cur_th->t_super == p_s_sb) {
+	      cur_th->t_refcount++ ;
+	      memcpy(th, cur_th, sizeof(*th));
+	      if (th->t_refcount <= 1)
+		      printk("BAD: refcount <= 1, but journal_info != 0\n");
+	      return 0;
+	} else {
+	    /* we've ended up with a handle from a different filesystem.
+	    ** save it and restore on journal_end.  This should never
+	    ** really happen...
+	    */
+	    reiserfs_warning("clm-2100: nesting info a different FS\n") ;
+	    th->t_handle_save = current->journal_info ;
+	    current->journal_info = th;
+	}
+    } else {
+	current->journal_info = th;
+    }
+    ret = do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
+    if (current->journal_info != th)
+        BUG() ;
+    return ret ;
 }
 
 /* not used at all */
@@ -2422,7 +2462,26 @@ int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct supe
 }
 
 int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
-  return do_journal_end(th, p_s_sb, nblocks, 0) ;
+  if (!current->journal_info && th->t_refcount > 1)
+    printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount);
+  if (th->t_refcount > 1) {
+    struct reiserfs_transaction_handle *cur_th = current->journal_info ;
+
+    /* we aren't allowed to close a nested transaction on a different
+    ** filesystem from the one in the task struct
+    */
+    if (cur_th->t_super != th->t_super)
+      BUG() ;
+
+    th->t_refcount--;
+    if (th != cur_th) {
+      memcpy(current->journal_info, th, sizeof(*th));
+      th->t_trans_id = 0;
+    }
+    return 0;
+  } else {
+    return do_journal_end(th, p_s_sb, nblocks, 0) ;
+  }
 }
 
 /* removes from the current transaction, relsing and descrementing any counters.  
@@ -2520,6 +2579,10 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) {
 */
 int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
 
+  /* you can sync while nested, very, very bad */
+  if (th->t_refcount > 1) {
+    BUG() ;
+  }
   if (SB_JOURNAL(p_s_sb)->j_len == 0) {
     reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
     journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
@@ -2901,6 +2964,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   struct reiserfs_super_block *rs ; 
   int trans_half ;
 
+  if (th->t_refcount > 1)
+    BUG() ;
+
+  current->journal_info = th->t_handle_save;
   if (reiserfs_dont_log(th->t_super)) {
     return 0 ;
   }
@@ -2938,8 +3005,11 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   }
 
 #ifdef REISERFS_PREALLOCATE
+  /* quota ops might need to nest, setup the journal_info pointer for them */
+  current->journal_info = th ;
   reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into
 				      * the transaction */
+  current->journal_info = th->t_handle_save ;
 #endif
   
   rs = SB_DISK_SUPER_BLOCK(p_s_sb) ;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index bdbe89bf99f1..70dec0317a1f 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -575,7 +575,6 @@ static int reiserfs_create (struct inode * dir, struct dentry *dentry, int mode,
 
     reiserfs_write_lock(dir->i_sb);
     journal_begin(&th, dir->i_sb, jbegin_count) ;
-    th.t_caller = "create" ;
     retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode);
     if (retval) {
         goto out_failed;
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index 4c675f30a8ae..e6d9fefce42c 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -157,13 +157,17 @@ struct reiserfs_list_bitmap {
 ** transaction handle which is passed around for all journal calls
 */
 struct reiserfs_transaction_handle {
-				/* ifdef it. -Hans */
-  char *t_caller ;              /* debugging use */
+  struct super_block *t_super ; /* super for this FS when journal_begin was
+				   called. saves calls to reiserfs_get_super
+				   also used by nested transactions to make
+				   sure they are nesting on the right FS
+				   _must_ be first in the handle
+				*/
+  int t_refcount;
   int t_blocks_logged ;         /* number of blocks this writer has logged */
   int t_blocks_allocated ;      /* number of blocks this writer allocated */
   unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
-  struct super_block *t_super ; /* super for this FS when journal_begin was 
-                                   called. saves calls to reiserfs_get_super */
+  void *t_handle_save ;		/* save existing current->journal_info */
   int displace_new_blocks:1;	/* if new block allocation occurres, that block
 				   should be displaced from others */
 
-- 
cgit v1.2.3


From 8f57688237995959aee38d48a0c92e203dbec676 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:25:37 -0700
Subject: [PATCH] reiserfs: cleanups

From: Chris Mason <mason@suse.com>

reiserfs cleanup, get rid of old debugging code.
---
 fs/reiserfs/do_balan.c         |  2 --
 fs/reiserfs/file.c             |  3 ---
 fs/reiserfs/fix_node.c         |  5 ----
 fs/reiserfs/inode.c            | 12 ---------
 fs/reiserfs/journal.c          | 55 ------------------------------------------
 fs/reiserfs/namei.c            | 18 --------------
 fs/reiserfs/prints.c           |  1 -
 include/linux/reiserfs_fs.h    |  2 --
 include/linux/reiserfs_fs_sb.h |  1 -
 9 files changed, 99 deletions(-)

(limited to 'include')

diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index eb23eb4ceda7..c90f0edbc167 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -54,9 +54,7 @@ inline void do_balance_mark_leaf_dirty (struct tree_balance * tb,
 	    tb->need_balance_dirty = 1;
 	}
     } else {
-	int windex = push_journal_writer("do_balance") ;
 	journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ;
-	pop_journal_writer(windex) ;
     }
 }
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 47503bb3a3b3..99321f2fcdf6 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -29,7 +29,6 @@ static int reiserfs_file_release (struct inode * inode, struct file * filp)
 {
 
     struct reiserfs_transaction_handle th ;
-    int windex ;
 
     if (!S_ISREG (inode->i_mode))
 	BUG ();
@@ -59,9 +58,7 @@ static int reiserfs_file_release (struct inode * inode, struct file * filp)
 	   appended (we append by unformatted node only) or its direct
 	   item(s) had to be converted, then it may have to be
 	   indirect2direct converted */
-	windex = push_journal_writer("file_release") ;
 	reiserfs_truncate_file(inode, 0) ;
-	pop_journal_writer(windex) ;
     }
     up (&inode->i_sem); 
     reiserfs_write_unlock(inode->i_sb);
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index e16c276dc82a..95a429ab77d6 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2280,7 +2280,6 @@ int fix_nodes (int n_op_mode,
     ** during wait_tb_buffers_run
     */
     int wait_tb_buffers_run = 0 ; 
-    int windex ;
     struct buffer_head  * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path);
 
     ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes;
@@ -2407,10 +2406,7 @@ int fix_nodes (int n_op_mode,
 		p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1);
     }
 
-    
-    windex = push_journal_writer("fix_nodes") ;
     if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) {
-	pop_journal_writer(windex) ;
 	if (FILESYSTEM_CHANGED_TB(p_s_tb)) {
 	    wait_tb_buffers_run = 1 ;
 	    n_ret_value = REPEAT_SEARCH ;
@@ -2420,7 +2416,6 @@ int fix_nodes (int n_op_mode,
 	}
     } else {
 	wait_tb_buffers_run = 1 ;
-	pop_journal_writer(windex) ;
 	goto repeat; 
     }
 
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index d1c8a83a7d66..e1c7928d0633 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -29,9 +29,7 @@ static int reiserfs_get_block (struct inode * inode, sector_t block,
 void reiserfs_delete_inode (struct inode * inode)
 {
     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2; 
-    int windex ;
     struct reiserfs_transaction_handle th ;
-
   
     reiserfs_write_lock(inode->i_sb);
 
@@ -41,10 +39,8 @@ void reiserfs_delete_inode (struct inode * inode)
 
 	journal_begin(&th, inode->i_sb, jbegin_count) ;
 	reiserfs_update_inode_transaction(inode) ;
-	windex = push_journal_writer("delete_inode") ;
 
 	reiserfs_delete_object (&th, inode);
-	pop_journal_writer(windex) ;
 
 	journal_end(&th, inode->i_sb, jbegin_count) ;
 
@@ -561,7 +557,6 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
     __u32 * item;
     int done;
     int fs_gen;
-    int windex ;
     struct reiserfs_transaction_handle th ;
     /* space reserved in transaction batch: 
         . 3 balancings in direct->indirect conversion
@@ -607,8 +602,6 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	 (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) )
 	REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
 
-    windex = push_journal_writer("reiserfs_get_block") ;
-  
     /* set the key of the first byte in the 'block'-th block of file */
     make_cpu_key (&key, inode, new_offset,
 		  TYPE_ANY, 3/*key length*/);
@@ -687,7 +680,6 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	}
 	set_block_dev_mapped(bh_result, unfm_ptr, inode);
 	pathrelse (&path);
-	pop_journal_writer(windex) ;
 	if (transaction_started)
 	    journal_end(&th, inode->i_sb, jbegin_count) ;
 
@@ -933,7 +925,6 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
       reiserfs_update_sd(&th, inode) ;
       journal_end(&th, inode->i_sb, jbegin_count) ;
     }
-    pop_journal_writer(windex) ;
     reiserfs_write_unlock(inode->i_sb);
     reiserfs_check_path(&path) ;
     return retval;
@@ -1836,7 +1827,6 @@ unlock:
 */
 void reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
     struct reiserfs_transaction_handle th ;
-    int windex ;
     /* we want the offset for the first byte after the end of the file */
     unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ;
     unsigned blocksize = p_s_inode->i_sb->s_blocksize ;
@@ -1871,14 +1861,12 @@ void reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) {
        cut_from_item. 1 is for update_sd */
     journal_begin(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
     reiserfs_update_inode_transaction(p_s_inode) ;
-    windex = push_journal_writer("reiserfs_vfs_truncate_file") ;
     if (update_timestamps)
 	    /* we are doing real truncate: if the system crashes before the last
 	       transaction of truncating gets committed - on reboot the file
 	       either appears truncated properly or not truncated at all */
 	add_save_link (&th, p_s_inode, 1);
     reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ;
-    pop_journal_writer(windex) ;
     journal_end(&th, p_s_inode->i_sb,  JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ;
 
     if (update_timestamps)
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 53d425fd8ea5..95cf46212d68 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -408,7 +408,6 @@ void reiserfs_check_lock_depth(char *caller) {
 #ifdef CONFIG_SMP
   if (current->lock_depth < 0) {
     printk("%s called without kernel lock held\n", caller) ;
-    show_reiserfs_locks() ;
     BUG() ;
   }
 #else
@@ -444,52 +443,6 @@ static inline struct reiserfs_journal_cnode *get_journal_hash(struct super_block
   return cn ;
 }
 
-/* once upon a time, the journal would deadlock.  a lot.  Now, when
-** CONFIG_REISERFS_CHECK is defined, anytime someone enters a
-** transaction, it pushes itself into this ugly static list, and pops
-** itself off before calling journal_end.  I made a SysRq key to dump
-** the list, and tell me what the writers are when I'm deadlocked.  */
-
-				/* are you depending on the compiler
-                                   to optimize this function away
-                                   everywhere it is called? It is not
-                                   obvious how this works, but I
-                                   suppose debugging code need not be
-                                   clear.  -Hans */
-static char *journal_writers[512] ;
-int push_journal_writer(char *s) {
-#ifdef CONFIG_REISERFS_CHECK
-  int i ;
-  for (i = 0 ; i < 512 ; i++) {
-    if (!journal_writers[i]) {
-      journal_writers[i] = s ;
-      return i ;
-    }
-  }
-  return -1 ;
-#else
-  return 0 ;
-#endif
-}
-int pop_journal_writer(int index) {
-#ifdef CONFIG_REISERFS_CHECK
-  if (index >= 0) {
-    journal_writers[index] = NULL ;
-  }
-#endif
-  return 0 ;
-}
-
-int dump_journal_writers(void) {
-  int i ;
-  for (i = 0 ; i < 512 ; i++) {
-    if (journal_writers[i]) {
-      printk("%d: %s\n", i, journal_writers[i]) ;
-    }
-  }
-  return 0 ;
-}
-
 /*
 ** this actually means 'can this block be reallocated yet?'.  If you set search_all, a block can only be allocated
 ** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever
@@ -2095,7 +2048,6 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   journal_list_init(p_s_sb) ;
 
   memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
-  memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */
 
   INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_dirty_buffers) ;
   spin_lock_init(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock) ;
@@ -2372,7 +2324,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
                             buffer_locked(bh) ? ' ' : '!',
 			    buffer_dirty(bh) ? ' ' : '!',
 			    buffer_journal_dirty(bh) ? ' ' : '!') ;
-    show_reiserfs_locks() ;
   }
   count_already_incd = clear_prepared_bits(bh) ;
 
@@ -2590,12 +2541,6 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block
   return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ;
 }
 
-int show_reiserfs_locks(void) {
-
-  dump_journal_writers() ;
-  return 0 ;
-}
-
 /*
 ** used to get memory back from async commits that are floating around
 ** and to reclaim any blocks deleted but unusable because their commits
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 70dec0317a1f..5dae18f5b8e9 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -737,7 +737,6 @@ static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
 {
     int retval;
     struct inode * inode;
-    int windex ;
     struct reiserfs_transaction_handle th ;
     int jbegin_count; 
     INITIALIZE_PATH (path);
@@ -749,7 +748,6 @@ static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
 
     reiserfs_write_lock(dir->i_sb);
     journal_begin(&th, dir->i_sb, jbegin_count) ;
-    windex = push_journal_writer("reiserfs_rmdir") ;
 
     de.de_gen_number_bit_string = 0;
     if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) {
@@ -798,7 +796,6 @@ static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
     /* prevent empty directory from getting lost */
     add_save_link (&th, inode, 0/* not truncate */);
 
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_check_path(&path) ;
     reiserfs_write_unlock(dir->i_sb);
@@ -809,7 +806,6 @@ static int reiserfs_rmdir (struct inode * dir, struct dentry *dentry)
        reiserfs_cut_from_item, or reiserfs_cut_from_item does not
        release path if operation was not complete */
     pathrelse (&path);
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_write_unlock(dir->i_sb);
     return retval;	
@@ -821,7 +817,6 @@ static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
     struct inode * inode;
     struct reiserfs_dir_entry de;
     INITIALIZE_PATH (path);
-    int windex ;
     struct reiserfs_transaction_handle th ;
     int jbegin_count;
     unsigned long savelink;
@@ -834,7 +829,6 @@ static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
 
     reiserfs_write_lock(dir->i_sb);
     journal_begin(&th, dir->i_sb, jbegin_count) ;
-    windex = push_journal_writer("reiserfs_unlink") ;
 	
     de.de_gen_number_bit_string = 0;
     if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) {
@@ -887,7 +881,6 @@ static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
        /* prevent file from getting lost */
        add_save_link (&th, inode, 0/* not truncate */);
 
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_check_path(&path) ;
     reiserfs_write_unlock(dir->i_sb);
@@ -895,7 +888,6 @@ static int reiserfs_unlink (struct inode * dir, struct dentry *dentry)
 
  end_unlink:
     pathrelse (&path);
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_check_path(&path) ;
     reiserfs_write_unlock(dir->i_sb);
@@ -978,7 +970,6 @@ static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct
 {
     int retval;
     struct inode *inode = old_dentry->d_inode;
-    int windex ;
     struct reiserfs_transaction_handle th ;
     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; 
 
@@ -996,7 +987,6 @@ static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct
     inode->i_nlink++;
 
     journal_begin(&th, dir->i_sb, jbegin_count) ;
-    windex = push_journal_writer("reiserfs_link") ;
 
     /* create new entry */
     retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len,
@@ -1007,7 +997,6 @@ static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct
 
     if (retval) {
 	inode->i_nlink--;
-	pop_journal_writer(windex) ;
 	journal_end(&th, dir->i_sb, jbegin_count) ;
 	reiserfs_write_unlock(dir->i_sb);
 	return retval;
@@ -1018,7 +1007,6 @@ static int reiserfs_link (struct dentry * old_dentry, struct inode * dir, struct
 
     atomic_inc(&inode->i_count) ;
     d_instantiate(dentry, inode);
-    pop_journal_writer(windex) ;
     journal_end(&th, dir->i_sb, jbegin_count) ;
     reiserfs_write_unlock(dir->i_sb);
     return 0;
@@ -1082,7 +1070,6 @@ static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
     struct item_head new_entry_ih, old_entry_ih, dot_dot_ih ;
     struct reiserfs_dir_entry old_de, new_de, dot_dot_de;
     struct inode * old_inode, * new_dentry_inode;
-    int windex ;
     struct reiserfs_transaction_handle th ;
     int jbegin_count ; 
     umode_t old_inode_mode;
@@ -1150,7 +1137,6 @@ static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
     }
 
     journal_begin(&th, old_dir->i_sb, jbegin_count) ;
-    windex = push_journal_writer("reiserfs_rename") ;
 
     /* add new entry (or find the existing one) */
     retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len, 
@@ -1161,7 +1147,6 @@ static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
 			    "vs-7050: new entry is found, new inode == 0\n");
 	}
     } else if (retval) {
-	pop_journal_writer(windex) ;
 	journal_end(&th, old_dir->i_sb, jbegin_count) ;
 	reiserfs_write_unlock(old_dir->i_sb);
 	return retval;
@@ -1314,14 +1299,11 @@ static int reiserfs_rename (struct inode * old_dir, struct dentry *old_dentry,
 	reiserfs_update_sd (&th, new_dentry_inode);
     }
 
-    pop_journal_writer(windex) ;
     journal_end(&th, old_dir->i_sb, jbegin_count) ;
     reiserfs_write_unlock(old_dir->i_sb);
     return 0;
 }
 
-
-
 /*
  * directories can handle most operations...
  */
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 862311f0365e..ac20f2dc94af 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -333,7 +333,6 @@ extern struct tree_balance * cur_tb;
 
 void reiserfs_panic (struct super_block * sb, const char * fmt, ...)
 {
-  show_reiserfs_locks() ;
   do_reiserfs_warning(fmt);
   printk ( KERN_EMERG "%s", error_buf);
   BUG ();
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 4e2b898a8d98..e4695e7b7ba3 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1727,8 +1727,6 @@ int journal_end(struct reiserfs_transaction_handle *, struct super_block *, unsi
 int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
 int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
 int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr) ;
-int push_journal_writer(char *w) ;
-int pop_journal_writer(int windex) ;
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index e6d9fefce42c..b848ccd7ed41 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -475,7 +475,6 @@ void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
 int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
 int flush_old_commits(struct super_block *s, int) ;
-int show_reiserfs_locks(void) ;
 int reiserfs_resize(struct super_block *, unsigned long) ;
 
 #define CARRY_ON                0
-- 
cgit v1.2.3


From 7c563ced265e3134a5c5c5b7ca2b31218993a204 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:25:50 -0700
Subject: [PATCH] reiserfs: logging rework

From: Chris Mason <mason@suse.com>

reiserfs logging rework, making things much faster for small transactions.
Metadata buffers are dirtied when they are safe to write, so normal kernel
mechanisms can contribute to log cleaning.
---
 fs/reiserfs/do_balan.c         |   25 +-
 fs/reiserfs/fix_node.c         |   34 +-
 fs/reiserfs/ibalance.c         |    2 -
 fs/reiserfs/inode.c            |    4 +-
 fs/reiserfs/journal.c          | 1606 ++++++++++++++++++++++------------------
 fs/reiserfs/objectid.c         |    3 -
 fs/reiserfs/procfs.c           |    5 +-
 fs/reiserfs/super.c            |   31 +-
 include/linux/reiserfs_fs.h    |   29 +-
 include/linux/reiserfs_fs_i.h  |    4 +-
 include/linux/reiserfs_fs_sb.h |   70 +-
 11 files changed, 967 insertions(+), 846 deletions(-)

(limited to 'include')

diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index c90f0edbc167..60baf14b580b 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -30,32 +30,11 @@ struct tree_balance * cur_tb = NULL; /* detects whether more than one
                                         is interrupting do_balance */
 #endif
 
-/*
- * AKPM: The __mark_buffer_dirty() call here will not
- * put the buffer on the dirty buffer LRU because we've just
- * set BH_Dirty.  That's a thinko in reiserfs.
- *
- * I'm reluctant to "fix" this bug because that would change
- * behaviour.  Using mark_buffer_dirty() here would make the
- * buffer eligible for VM and periodic writeback, which may
- * violate ordering constraints.  I'll just leave the code
- * as-is by removing the __mark_buffer_dirty call altogether.
- *
- * Chris says this code has "probably never been run" anyway.
- * It is due to go away.
- */
-
 inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, 
 					struct buffer_head * bh, int flag)
 {
-    if (reiserfs_dont_log(tb->tb_sb)) {
-	if (!test_set_buffer_dirty(bh)) {
-//	    __mark_buffer_dirty(bh) ;
-	    tb->need_balance_dirty = 1;
-	}
-    } else {
-	journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ;
-    }
+    journal_mark_dirty(tb->transaction_handle,
+                       tb->transaction_handle->t_super, bh) ;
 }
 
 #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 95a429ab77d6..b40c7de1c96f 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -2106,9 +2106,9 @@ static void tb_buffer_sanity_check (struct super_block * p_s_sb,
 {;}
 #endif
 
-static void clear_all_dirty_bits(struct super_block *s, 
+static int clear_all_dirty_bits(struct super_block *s,
                                  struct buffer_head *bh) {
-  reiserfs_prepare_for_journal(s, bh, 0) ;
+  return reiserfs_prepare_for_journal(s, bh, 0) ;
 }
 
 static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
@@ -2137,11 +2137,11 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 					    p_s_tb->tb_path->path_length - i);
 		}
 #endif
-		clear_all_dirty_bits(p_s_tb->tb_sb, 
-				     PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ;
-
-		if ( buffer_locked (PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) )
+		if (!clear_all_dirty_bits(p_s_tb->tb_sb,
+				     PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)))
+		{
 		    locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i);
+		}
 	    }
 	}
 
@@ -2151,22 +2151,19 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 
 		if ( p_s_tb->L[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]) ;
-		    if ( buffer_locked (p_s_tb->L[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]))
 			locked = p_s_tb->L[i];
 		}
 
 		if ( !locked && p_s_tb->FL[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]) ;
-		    if ( buffer_locked (p_s_tb->FL[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]))
 			locked = p_s_tb->FL[i];
 		}
 
 		if ( !locked && p_s_tb->CFL[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFL[i], "CFL", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]) ;
-		    if ( buffer_locked (p_s_tb->CFL[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]))
 			locked = p_s_tb->CFL[i];
 		}
 
@@ -2176,23 +2173,20 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 
 		if ( p_s_tb->R[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]) ;
-		    if ( buffer_locked (p_s_tb->R[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]))
 			locked = p_s_tb->R[i];
 		}
 
        
 		if ( !locked && p_s_tb->FR[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]) ;
-		    if ( buffer_locked (p_s_tb->FR[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]))
 			locked = p_s_tb->FR[i];
 		}
 
 		if ( !locked && p_s_tb->CFR[i] ) {
 		    tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i);
-		    clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]) ;
-		    if ( buffer_locked (p_s_tb->CFR[i]) )
+		    if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]))
 			locked = p_s_tb->CFR[i];
 		}
 	    }
@@ -2207,10 +2201,8 @@ static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb)
 	*/
 	for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { 
 	    if ( p_s_tb->FEB[i] ) {
-		clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]) ;
-		if (buffer_locked(p_s_tb->FEB[i])) {
+		if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]))
 		    locked = p_s_tb->FEB[i] ;
-		}
 	    }
 	}
 
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index 78fc3b301c22..3df6dda7d776 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -633,7 +633,6 @@ static void balance_internal_when_delete (struct tree_balance * tb,
 		/* use check_internal if new root is an internal node */
 		check_internal (new_root);
 	    /*&&&&&&&&&&&&&&&&&&&&&&*/
-	    tb->tb_sb->s_dirt = 1;
 
 	    /* do what is needed for buffer thrown from tree */
 	    reiserfs_invalidate_buffer(tb, tbSh);
@@ -951,7 +950,6 @@ int balance_internal (struct tree_balance * tb,			/* tree_balance structure 		*/
         PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr );
         PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 );
 	do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1);
-	tb->tb_sb->s_dirt = 1;
     }
 	
     if ( tb->blknum[h] == 2 ) {
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index e1c7928d0633..06635c7f18a9 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -964,7 +964,7 @@ static void init_inode (struct inode * inode, struct path * path)
     REISERFS_I(inode)->i_prealloc_block = 0;
     REISERFS_I(inode)->i_prealloc_count = 0;
     REISERFS_I(inode)->i_trans_id = 0;
-    REISERFS_I(inode)->i_trans_index = 0;
+    REISERFS_I(inode)->i_jl = NULL;
 
     if (stat_data_v1 (ih)) {
 	struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih);
@@ -1621,7 +1621,7 @@ int reiserfs_new_inode (struct reiserfs_transaction_handle *th,
     REISERFS_I(inode)->i_prealloc_block = 0;
     REISERFS_I(inode)->i_prealloc_count = 0;
     REISERFS_I(inode)->i_trans_id = 0;
-    REISERFS_I(inode)->i_trans_index = 0;
+    REISERFS_I(inode)->i_jl = 0;
     REISERFS_I(inode)->i_attrs =
 	REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK;
     sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode );
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 95cf46212d68..cfff6ec0871f 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -32,13 +32,6 @@
 **                      around too long.
 **		     -- Note, if you call this as an immediate flush from 
 **		        from within kupdate, it will ignore the immediate flag
-**
-** The commit thread -- a writer process for async commits.  It allows a 
-**                      a process to request a log flush on a task queue.
-**                      the commit will happen once the commit thread wakes up.
-**                      The benefit here is the writer (with whatever
-**                      related locks it has) doesn't have to wait for the
-**                      log blocks to hit disk if it doesn't want to.
 */
 
 #include <linux/config.h>
@@ -60,6 +53,14 @@
 #include <linux/suspend.h>
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
+#include <linux/writeback.h>
+
+
+/* gets a struct reiserfs_journal_list * from a list head */
+#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_list))
+#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \
+                               j_working_list))
 
 /* the number of mounted filesystems.  This is used to decide when to
 ** start and kill the commit workqueue
@@ -78,6 +79,12 @@ static struct workqueue_struct *commit_wq;
 #define BLOCK_FREED_HOLDER 3    /* this block was freed during this transaction, and can't be written */
 
 #define BLOCK_NEEDS_FLUSH 4	/* used in flush_journal_list */
+#define BLOCK_DIRTIED 5
+
+
+/* journal list state bits */
+#define LIST_TOUCHED 1
+#define LIST_DIRTY   2
 
 /* flags for do_journal_end */
 #define FLUSH_ALL   1		/* flush commit and real blocks */
@@ -86,6 +93,9 @@ static struct workqueue_struct *commit_wq;
 
 /* state bits for the journal */
 #define WRITERS_BLOCKED 1      /* set when new writers not allowed */
+#define WRITERS_QUEUED 2       /* set when log is full due to too many
+				* writers
+				*/
 
 static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ;
 static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ;
@@ -94,6 +104,9 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) ;
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks);
 static int release_journal_dev( struct super_block *super,
 				struct reiserfs_journal *journal );
+static int dirty_one_transaction(struct super_block *s,
+                                 struct reiserfs_journal_list *jl);
+static void flush_async_commits(void *p);
 
 static void init_journal_hash(struct super_block *p_s_sb) {
   memset(SB_JOURNAL(p_s_sb)->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
@@ -105,8 +118,10 @@ static void init_journal_hash(struct super_block *p_s_sb) {
 ** more details.
 */
 static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) {
-  if (bh)
+  if (bh) {
     clear_buffer_dirty(bh);
+    clear_bit(BH_JTest, &bh->b_state);
+  }
   return 0 ;
 }
 
@@ -367,6 +382,7 @@ static void free_cnode(struct super_block *p_s_sb, struct reiserfs_journal_cnode
 
 static int clear_prepared_bits(struct buffer_head *bh) {
   clear_bit(BH_JPrepared, &bh->b_state) ;
+  clear_bit(BH_JRestore_dirty, &bh->b_state) ;
   return 0 ;
 }
 
@@ -471,11 +487,6 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
 
   *next_zero_bit = 0 ; /* always start this at zero. */
 
-  /* we aren't logging all blocks are safe for reuse */
-  if (reiserfs_dont_log(p_s_sb)) {
-    return 0 ;
-  }
-
   PROC_INFO_INC( p_s_sb, journal.in_journal );
   /* If we aren't doing a search_all, this is a metablock, and it will be logged before use.
   ** if we crash before the transaction that freed it commits,  this transaction won't
@@ -503,6 +514,7 @@ int reiserfs_in_journal(struct super_block *p_s_sb,
 
   /* is it in the current transaction.  This should never happen */
   if ((cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, bl))) {
+    BUG();
     return 1; 
   }
 
@@ -527,18 +539,30 @@ inline void insert_journal_hash(struct reiserfs_journal_cnode **table, struct re
 
 /* lock the current transaction */
 inline static void lock_journal(struct super_block *p_s_sb) {
-  PROC_INFO_INC( p_s_sb, journal.lock_journal );
-  while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) {
-    PROC_INFO_INC( p_s_sb, journal.lock_journal_wait );
-    sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
-  }
-  atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ;
+    PROC_INFO_INC( p_s_sb, journal.lock_journal );
+    down(&SB_JOURNAL(p_s_sb)->j_lock);
 }
 
 /* unlock the current transaction */
 inline static void unlock_journal(struct super_block *p_s_sb) {
-  atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ;
-  wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ;
+    up(&SB_JOURNAL(p_s_sb)->j_lock);
+}
+
+static inline void get_journal_list(struct reiserfs_journal_list *jl)
+{
+    jl->j_refcount++;
+}
+
+static inline void put_journal_list(struct super_block *s,
+                                   struct reiserfs_journal_list *jl)
+{
+    if (jl->j_refcount < 1) {
+        printk("trans id %lu, refcount at %d\n", jl->j_trans_id,
+	                                         jl->j_refcount);
+        BUG();
+    }
+    if (--jl->j_refcount == 0)
+        reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s);
 }
 
 /*
@@ -556,6 +580,83 @@ static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct re
   jl->j_list_bitmap = NULL ;
 }
 
+static int journal_list_still_alive(struct super_block *s,
+                                    unsigned long trans_id)
+{
+    struct list_head *entry = &SB_JOURNAL(s)->j_journal_list;
+    struct reiserfs_journal_list *jl;
+
+    if (!list_empty(entry)) {
+        jl = JOURNAL_LIST_ENTRY(entry->next);
+	if (jl->j_trans_id <= trans_id) {
+	    return 1;
+	}
+    }
+    return 0;
+}
+
+static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
+    struct reiserfs_journal_list *other_jl;
+    struct reiserfs_journal_list *first_jl;
+    struct list_head *entry;
+    unsigned long trans_id = jl->j_trans_id;
+    unsigned long other_trans_id;
+    unsigned long first_trans_id;
+
+find_first:
+    /*
+     * first we walk backwards to find the oldest uncommitted transaction
+     */
+    first_jl = jl;
+    entry = jl->j_list.prev;
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	if (entry == &SB_JOURNAL(s)->j_journal_list ||
+	    atomic_read(&other_jl->j_older_commits_done))
+	    break;
+
+        first_jl = other_jl;
+	entry = other_jl->j_list.prev;
+    }
+
+    /* if we didn't find any older uncommitted transactions, return now */
+    if (first_jl == jl) {
+        return 0;
+    }
+
+    first_trans_id = first_jl->j_trans_id;
+
+    entry = &first_jl->j_list;
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	other_trans_id = other_jl->j_trans_id;
+
+	if (other_trans_id < trans_id) {
+	    if (atomic_read(&other_jl->j_commit_left) != 0) {
+		flush_commit_list(s, other_jl, 0);
+
+		/* list we were called with is gone, return */
+		if (!journal_list_still_alive(s, trans_id))
+		    return 1;
+
+		/* the one we just flushed is gone, this means all
+		 * older lists are also gone, so first_jl is no longer
+		 * valid either.  Go back to the beginning.
+		 */
+		if (!journal_list_still_alive(s, other_trans_id)) {
+		    goto find_first;
+		}
+	    }
+	    entry = entry->next;
+	    if (entry == &SB_JOURNAL(s)->j_journal_list)
+		return 0;
+	} else {
+	    return 0;
+	}
+    }
+    return 0;
+}
+
 /*
 ** if this journal list still has commit blocks unflushed, send them to disk.
 **
@@ -564,13 +665,10 @@ static void cleanup_freed_for_journal_list(struct super_block *p_s_sb, struct re
 **
 */
 static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) {
-  int i, count ;
-  int index = 0 ;
+  int i;
   int bn ;
-  int retry_count = 0 ;
-  int orig_commit_left = 0 ;
   struct buffer_head *tbh = NULL ;
-  struct reiserfs_journal_list *other_jl ;
+  unsigned long trans_id = jl->j_trans_id;
 
   reiserfs_check_lock_depth("flush_commit_list") ;
 
@@ -581,133 +679,100 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
   /* before we can put our commit blocks on disk, we have to make sure everyone older than
   ** us is on disk too
   */
-  if (jl->j_len <= 0) {
-    return 0 ;
-  }
+  if (jl->j_len <= 0)
+    BUG();
+  if (trans_id == SB_JOURNAL(s)->j_trans_id)
+    BUG();
+
+  get_journal_list(jl);
   if (flushall) {
-    /* we _must_ make sure the transactions are committed in order.  Start with the
-    ** index after this one, wrap all the way around 
-    */
-    index = (jl - SB_JOURNAL_LIST(s)) + 1 ;
-    for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-      other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ;
-      if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 && 
-          other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) {
-        flush_commit_list(s, other_jl, 0) ;
-      }
+    if (flush_older_commits(s, jl) == 1) {
+      /* list disappeared during flush_older_commits.  return */
+      goto put_jl;
     }
   }
 
-  count = 0 ;
-  /* don't flush the commit list for the current transactoin */
-  if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) {
-    return 0 ;
-  }
-
   /* make sure nobody is trying to flush this one at the same time */
-  if (atomic_read(&(jl->j_commit_flushing))) {
-    sleep_on(&(jl->j_commit_wait)) ;
-    if (flushall) {
-      atomic_set(&(jl->j_older_commits_done), 1) ;
-    }
-    return 0 ;
+  down(&jl->j_commit_lock);
+  if (!journal_list_still_alive(s, trans_id)) {
+    up(&jl->j_commit_lock);
+    goto put_jl;
   }
-  
+  if (jl->j_trans_id == 0)
+    BUG();
+
   /* this commit is done, exit */
   if (atomic_read(&(jl->j_commit_left)) <= 0) {
     if (flushall) {
       atomic_set(&(jl->j_older_commits_done), 1) ;
     }
-    return 0 ;
+    up(&jl->j_commit_lock);
+    goto put_jl;
   }
-  /* keeps others from flushing while we are flushing */
-  atomic_set(&(jl->j_commit_flushing), 1) ; 
 
-
-  if (jl->j_len > SB_JOURNAL_TRANS_MAX(s)) {
-    reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ;
-    return 0 ;
+  /*
+   * for the description block and all the log blocks, submit any buffers
+   * that haven't already reached the disk
+   */
+  for (i = 0 ; i < (jl->j_len + 1) ; i++) {
+    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %
+         SB_ONDISK_JOURNAL_SIZE(s);
+    tbh = journal_find_get_block(s, bn) ;
+    wait_on_buffer(tbh) ;
+    ll_rw_block(WRITE, 1, &tbh) ;
+    put_bh(tbh) ;
   }
 
-  orig_commit_left = atomic_read(&(jl->j_commit_left)) ; 
-
-  /* start by checking all the commit blocks in this transaction.  
-  ** Add anyone not on disk into tbh.  Stop checking once commit_left <= 1, because that means we
-  ** only have the commit block left 
-  */
-retry:
-  count = 0 ;
-  for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) {  /* everything but commit_bh */
-    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %  SB_ONDISK_JOURNAL_SIZE(s);
+  /* wait on everything written so far before writing the commit */
+  for (i = 0 ;  i < (jl->j_len + 1) ; i++) {
+    bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
+	 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
     tbh = journal_find_get_block(s, bn) ;
 
-/* kill this sanity check */
-if (count > (orig_commit_left + 2)) {
-reiserfs_panic(s, "journal-539: flush_commit_list: BAD count(%d) > orig_commit_left(%d)!\n", count, orig_commit_left) ;
-}
-    if (tbh) {
-      if (buffer_locked(tbh)) { /* wait on it, redo it just to make sure */
-	wait_on_buffer(tbh) ;
-	if (!buffer_uptodate(tbh)) {
-	  reiserfs_panic(s, "journal-584, buffer write failed\n") ;
-	}
-      } 
-      if (buffer_dirty(tbh)) {
-	printk("journal-569: flush_commit_list, block already dirty!\n") ;
-      } else {				
-	mark_buffer_dirty(tbh) ;
-      }
-      ll_rw_block(WRITE, 1, &tbh) ;
-      count++ ;
-      put_bh(tbh) ; /* once for our get_hash */
-    } 
-  }
-
-  /* wait on everyone in tbh before writing commit block*/
-  if (count > 0) {
-    for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && 
-                 i < (jl->j_len + 1) ; i++) {  /* everything but commit_bh */
-      bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
-      tbh = journal_find_get_block(s, bn) ;
-
-      wait_on_buffer(tbh) ;
-      if (!buffer_uptodate(tbh)) {
-	reiserfs_panic(s, "journal-601, buffer write failed\n") ;
-      }
-      put_bh(tbh) ; /* once for our get_hash */
-      bforget(tbh) ;    /* once due to original getblk in do_journal_end */
-      atomic_dec(&(jl->j_commit_left)) ;
+    wait_on_buffer(tbh) ;
+    if (buffer_dirty(tbh))
+      BUG();
+    if (!buffer_uptodate(tbh)) {
+      reiserfs_panic(s, "journal-601, buffer write failed\n") ;
     }
+    put_bh(tbh) ; /* once for journal_find_get_block */
+    put_bh(tbh) ;    /* once due to original getblk in do_journal_end */
+    atomic_dec(&(jl->j_commit_left)) ;
   }
 
-  if (atomic_read(&(jl->j_commit_left)) != 1) { /* just the commit_bh left, flush it without calling getblk for everyone */
-    if (retry_count < 2) {
-      printk("journal-582: flush_commit_list, not all log blocks on disk yet, trying again\n") ;
-      retry_count++ ;
-      goto retry;
-    }
-    reiserfs_panic(s, "journal-563: flush_commit_list: BAD, j_commit_left is %u, should be 1\n", 
-		   atomic_read(&(jl->j_commit_left)));
-  }
+  if (atomic_read(&(jl->j_commit_left)) != 1)
+    BUG();
 
+  if (buffer_dirty(jl->j_commit_bh))
+    BUG();
   mark_buffer_dirty(jl->j_commit_bh) ;
   sync_dirty_buffer(jl->j_commit_bh) ;
   if (!buffer_uptodate(jl->j_commit_bh)) {
     reiserfs_panic(s, "journal-615: buffer write failed\n") ;
   }
-  atomic_dec(&(jl->j_commit_left)) ;
   bforget(jl->j_commit_bh) ;
+  if (SB_JOURNAL(s)->j_last_commit_id != 0 &&
+     (jl->j_trans_id - SB_JOURNAL(s)->j_last_commit_id) != 1) {
+      reiserfs_warning("clm-2200: last commit %lu, current %lu\n",
+                       SB_JOURNAL(s)->j_last_commit_id,
+		       jl->j_trans_id);
+  }
+  SB_JOURNAL(s)->j_last_commit_id = jl->j_trans_id;
 
   /* now, every commit block is on the disk.  It is safe to allow blocks freed during this transaction to be reallocated */
   cleanup_freed_for_journal_list(s, jl) ;
 
+  /* mark the metadata dirty */
+  dirty_one_transaction(s, jl);
+  atomic_dec(&(jl->j_commit_left)) ;
+
   if (flushall) {
     atomic_set(&(jl->j_older_commits_done), 1) ;
   }
-  atomic_set(&(jl->j_commit_flushing), 0) ;
-  wake_up(&(jl->j_commit_wait)) ;
+  up(&jl->j_commit_lock);
+put_jl:
+  put_journal_list(s, jl);
 
-  s->s_dirt = 1 ;
   return 0 ;
 }
 
@@ -804,22 +869,27 @@ static int update_journal_header_block(struct super_block *p_s_sb,
 ** flush any and all journal lists older than you are 
 ** can only be called from flush_journal_list
 */
-static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) {
-  int i, index ;
-  struct reiserfs_journal_list *other_jl ;
-
-  index = jl - SB_JOURNAL_LIST(p_s_sb) ;
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ;
-    if (other_jl && other_jl->j_len > 0 && 
-        other_jl->j_trans_id > 0 && 
-	other_jl->j_trans_id < trans_id && 
-        other_jl != jl) {
-      /* do not flush all */
-      flush_journal_list(p_s_sb, other_jl, 0) ; 
+static int flush_older_journal_lists(struct super_block *p_s_sb,
+                                     struct reiserfs_journal_list *jl)
+{
+    struct list_head *entry;
+    struct reiserfs_journal_list *other_jl ;
+    unsigned long trans_id = jl->j_trans_id;
+
+    /* we know we are the only ones flushing things, no extra race
+     * protection is required.
+     */
+restart:
+    entry = SB_JOURNAL(p_s_sb)->j_journal_list.next;
+    other_jl = JOURNAL_LIST_ENTRY(entry);
+    if (other_jl->j_trans_id < trans_id) {
+	/* do not flush all */
+	flush_journal_list(p_s_sb, other_jl, 0) ;
+
+	/* other_jl is now deleted from the list */
+	goto restart;
     }
-  }
-  return 0 ;
+    return 0 ;
 }
 
 static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
@@ -836,15 +906,27 @@ static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
     unlock_buffer(bh) ;
     put_bh(bh) ;
 }
+
 static void submit_logged_buffer(struct buffer_head *bh) {
-    lock_buffer(bh) ;
     get_bh(bh) ;
     bh->b_end_io = reiserfs_end_buffer_io_sync ;
     mark_buffer_notjournal_new(bh) ;
     clear_buffer_dirty(bh) ;
+    if (!test_and_clear_bit(BH_JTest, &bh->b_state))
+        BUG();
+    if (!buffer_uptodate(bh))
+        BUG();
     submit_bh(WRITE, bh) ;
 }
 
+static void del_from_work_list(struct super_block *s,
+                               struct reiserfs_journal_list *jl) {
+    if (!list_empty(&jl->j_working_list)) {
+	list_del_init(&jl->j_working_list);
+	SB_JOURNAL(s)->j_num_work_lists--;
+    }
+}
+
 /* flush a journal list, both commit and real blocks
 **
 ** always set flushall to 1, unless you are calling from inside
@@ -865,29 +947,26 @@ static int flush_journal_list(struct super_block *s,
   unsigned long j_len_saved = jl->j_len ;
 
   if (j_len_saved <= 0) {
-    return 0 ;
+    BUG();
   }
 
   if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) {
     reiserfs_warning("clm-2048: flush_journal_list called with wcount %d\n",
                       atomic_read(&SB_JOURNAL(s)->j_wcount)) ;
   }
-  /* if someone is getting the commit list, we must wait for them */
-  while (atomic_read(&(jl->j_commit_flushing))) { 
-    sleep_on(&(jl->j_commit_wait)) ;
-  }
-  /* if someone is flushing this list, we must wait for them */
-  while (atomic_read(&(jl->j_flushing))) {
-    sleep_on(&(jl->j_flush_wait)) ;
-  }
+  if (jl->j_trans_id == 0)
+    BUG();
 
-  /* this list is now ours, we can change anything we want */
-  atomic_set(&(jl->j_flushing), 1) ;
+  /* if flushall == 0, the caller must already hold j_flush_sem */
+  if (flushall) {
+      down(&SB_JOURNAL(s)->j_flush_sem);
+  } else if (!down_trylock(&SB_JOURNAL(s)->j_flush_sem)) {
+      BUG();
+  }
 
   count = 0 ;
   if (j_len_saved > SB_JOURNAL_TRANS_MAX(s)) {
-    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ;
-    atomic_dec(&(jl->j_flushing)) ;
+    reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id);
     return 0 ;
   }
 
@@ -902,6 +981,9 @@ static int flush_journal_list(struct super_block *s,
   */
   flush_commit_list(s, jl, 1) ;
 
+  if (!(jl->j_state & LIST_DIRTY))
+      BUG();
+
   /* are we done now? */
   if (atomic_read(&(jl->j_nonzerolen)) <= 0 && 
       atomic_read(&(jl->j_commit_left)) <= 0) {
@@ -937,13 +1019,13 @@ static int flush_journal_list(struct super_block *s,
       get_bh(saved_bh) ;
 
       if (buffer_journal_dirty(saved_bh)) {
+	if (!can_dirty(cn))
+	  BUG();
         was_jwait = 1 ;
-	mark_buffer_notjournal_dirty(saved_bh) ;
-        /* undo the inc from journal_mark_dirty */
-	put_bh(saved_bh) ;
-      }
-      if (can_dirty(cn)) {
         was_dirty = 1 ;
+      } else if (can_dirty(cn)) {
+        /* everything with !pjl && jwait should be writable */
+	BUG();
       }
     }
 
@@ -951,7 +1033,8 @@ static int flush_journal_list(struct super_block *s,
     ** sure they are commited, and don't try writing it to disk
     */
     if (pjl) {
-      flush_commit_list(s, pjl, 1) ;
+      if (atomic_read(&pjl->j_commit_left))
+        flush_commit_list(s, pjl, 1) ;
       goto free_cnode ;
     }
 
@@ -970,22 +1053,17 @@ static int flush_journal_list(struct super_block *s,
 printk("journal-813: BAD! buffer %llu %cdirty %cjwait, not in a newer tranasction\n", (unsigned long long)saved_bh->b_blocknr,
         was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ;
     }
-    /* kupdate_one_transaction waits on the buffers it is writing, so we
-    ** should never see locked buffers here
-    */
-    if (buffer_locked(saved_bh)) {
-      printk("clm-2083: locked buffer %llu in flush_journal_list\n", 
-              (unsigned long long)saved_bh->b_blocknr) ;
-      wait_on_buffer(saved_bh) ;
-      if (!buffer_uptodate(saved_bh)) {
-        reiserfs_panic(s, "journal-923: buffer write failed\n") ;
-      }
-    } 
     if (was_dirty) { 
       /* we inc again because saved_bh gets decremented at free_cnode */
       get_bh(saved_bh) ;
       set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
-      submit_logged_buffer(saved_bh) ;
+      lock_buffer(saved_bh);
+      if (cn->blocknr != saved_bh->b_blocknr)
+        BUG();
+      if (buffer_dirty(saved_bh))
+        submit_logged_buffer(saved_bh) ;
+      else
+        unlock_buffer(saved_bh);
       count++ ;
     } else {
       printk("clm-2082: Unable to flush buffer %llu in flush_journal_list\n",
@@ -1016,6 +1094,14 @@ free_cnode:
 	if (!buffer_uptodate(cn->bh)) {
 	  reiserfs_panic(s, "journal-949: buffer write failed\n") ;
 	}
+	/* note, we must clear the JDirty_wait bit after the up to date
+	** check, otherwise we race against our flushpage routine
+	*/
+	if (!test_and_clear_bit(BH_JDirty_wait, &cn->bh->b_state))
+	    BUG();
+
+        /* undo the inc from journal_mark_dirty */
+	put_bh(cn->bh) ;
         brelse(cn->bh) ;
       }
       cn = cn->next ;
@@ -1029,7 +1115,7 @@ flush_older_and_return:
   ** replayed after a crash
   */
   if (flushall) {
-    flush_older_journal_lists(s, jl, jl->j_trans_id) ;
+    flush_older_journal_lists(s, jl);
   } 
   
   /* before we can remove everything from the hash tables for this 
@@ -1044,181 +1130,246 @@ flush_older_and_return:
     update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ;
   }
   remove_all_from_journal_list(s, jl, 0) ;
+  list_del(&jl->j_list);
+  SB_JOURNAL(s)->j_num_lists--;
+  del_from_work_list(s, jl);
+
+  if (SB_JOURNAL(s)->j_last_flush_id != 0 &&
+     (jl->j_trans_id - SB_JOURNAL(s)->j_last_flush_id) != 1) {
+      reiserfs_warning("clm-2201: last flush %lu, current %lu\n",
+                       SB_JOURNAL(s)->j_last_flush_id,
+		       jl->j_trans_id);
+  }
+  SB_JOURNAL(s)->j_last_flush_id = jl->j_trans_id;
+
+  /* not strictly required since we are freeing the list, but it should
+   * help find code using dead lists later on
+   */
   jl->j_len = 0 ;
   atomic_set(&(jl->j_nonzerolen), 0) ;
   jl->j_start = 0 ;
   jl->j_realblock = NULL ;
   jl->j_commit_bh = NULL ;
   jl->j_trans_id = 0 ;
-  atomic_dec(&(jl->j_flushing)) ;
-  wake_up(&(jl->j_flush_wait)) ;
+  jl->j_state = 0;
+  put_journal_list(s, jl);
+  if (flushall)
+    up(&SB_JOURNAL(s)->j_flush_sem);
   return 0 ;
 } 
 
+#define CHUNK_SIZE 32
+struct buffer_chunk {
+    struct buffer_head *bh[CHUNK_SIZE];
+    int nr;
+};
 
-static int kupdate_one_transaction(struct super_block *s,
-                                    struct reiserfs_journal_list *jl) 
+static void write_chunk(struct buffer_chunk *chunk) {
+    int i;
+    for (i = 0; i < chunk->nr ; i++) {
+	submit_logged_buffer(chunk->bh[i]) ;
+    }
+    chunk->nr = 0;
+}
+
+static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) {
+    if (chunk->nr >= CHUNK_SIZE)
+        BUG();
+    chunk->bh[chunk->nr++] = bh;
+    if (chunk->nr >= CHUNK_SIZE)
+        write_chunk(chunk);
+}
+
+static int write_one_transaction(struct super_block *s,
+                                 struct reiserfs_journal_list *jl,
+				 struct buffer_chunk *chunk)
 {
-    struct reiserfs_journal_list *pjl ; /* previous list for this cn */
-    struct reiserfs_journal_cnode *cn, *walk_cn ;
-    b_blocknr_t blocknr ;
-    int run = 0 ;
-    int orig_trans_id = jl->j_trans_id ;
-    struct buffer_head *saved_bh ; 
+    struct reiserfs_journal_cnode *cn;
     int ret = 0 ;
 
-    /* if someone is getting the commit list, we must wait for them */
-    while (atomic_read(&(jl->j_commit_flushing))) {
-        sleep_on(&(jl->j_commit_wait)) ;
-    }
-    /* if someone is flushing this list, we must wait for them */
-    while (atomic_read(&(jl->j_flushing))) {
-        sleep_on(&(jl->j_flush_wait)) ;
+    jl->j_state |= LIST_TOUCHED;
+    del_from_work_list(s, jl);
+    if (jl->j_len == 0 || atomic_read(&jl->j_nonzerolen) == 0) {
+        return 0;
     }
-    /* was it flushed while we slept? */
-    if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) {
-        return 0 ;
-    }
-
-    /* this list is now ours, we can change anything we want */
-    atomic_set(&(jl->j_flushing), 1) ;
 
-loop_start:
     cn = jl->j_realblock ;
     while(cn) {
-        saved_bh = NULL ;
         /* if the blocknr == 0, this has been cleared from the hash,
         ** skip it
         */
         if (cn->blocknr == 0) {
             goto next ;
         }
+        if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) {
+	    struct buffer_head *tmp_bh;
+	    /* we can race against journal_mark_freed when we try
+	     * to lock_buffer(cn->bh), so we have to inc the buffer
+	     * count, and recheck things after locking
+	     */
+	    tmp_bh = cn->bh;
+	    get_bh(tmp_bh);
+	    lock_buffer(tmp_bh);
+	    if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) {
+		if (!buffer_journal_dirty(tmp_bh) ||
+		    reiserfs_buffer_prepared(tmp_bh))
+		    BUG();
+		add_to_chunk(chunk, tmp_bh);
+		ret++;
+	    } else {
+		/* note, cn->bh might be null now */
+		unlock_buffer(tmp_bh);
+	    }
+	    put_bh(tmp_bh);
+        }
+next:
+        cn = cn->next ;
+	cond_resched();
+    }
+    return ret ;
+}
+
+/* used by flush_commit_list */
+static int dirty_one_transaction(struct super_block *s,
+                                 struct reiserfs_journal_list *jl)
+{
+    struct reiserfs_journal_cnode *cn;
+    struct reiserfs_journal_list *pjl;
+    int ret = 0 ;
+
+    jl->j_state |= LIST_DIRTY;
+    cn = jl->j_realblock ;
+    while(cn) {
         /* look for a more recent transaction that logged this
         ** buffer.  Only the most recent transaction with a buffer in
         ** it is allowed to send that buffer to disk
         */
-        pjl = find_newer_jl_for_cn(cn) ;
-        if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) &&
-            can_dirty(cn)) 
-        {
-            if (!test_bit(BH_JPrepared, &cn->bh->b_state)) {
-                set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
-		submit_logged_buffer(cn->bh) ;
-            } else {
-                /* someone else is using this buffer.  We can't 
-                ** send it to disk right now because they might
-                ** be changing/logging it.
-                */
-                ret = 1 ;
-            }
-        } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) {
-            clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ;
-            if (!pjl && cn->bh) {
-                wait_on_buffer(cn->bh) ;
-            }
-            /* check again, someone could have logged while we scheduled */
-            pjl = find_newer_jl_for_cn(cn) ;
-
-            /* before the JDirty_wait bit is set, the 
-            ** buffer is added to the hash list.  So, if we are
-            ** run in the middle of a do_journal_end, we will notice
-            ** if this buffer was logged and added from the latest
-            ** transaction.  In this case, we don't want to decrement
-            ** b_count
-            */
-            if (!pjl && cn->bh && buffer_journal_dirty(cn->bh)) {
-                blocknr = cn->blocknr ;
-                walk_cn = cn ;
-                saved_bh= cn->bh ;
-                /* update all older transactions to show this block
-                ** was flushed
-                */
-                mark_buffer_notjournal_dirty(cn->bh) ;
-                while(walk_cn) {
-                    if (walk_cn->bh && walk_cn->blocknr == blocknr && 
-                         walk_cn->sb == cn->sb) {
-                        if (walk_cn->jlist) {
-                            atomic_dec(&(walk_cn->jlist->j_nonzerolen)) ;
-                        }
-                        walk_cn->bh = NULL ;
-                    }
-                    walk_cn = walk_cn->hnext ;
-                }
-                if (atomic_read(&saved_bh->b_count) < 1) {
-                    reiserfs_warning("clm-2081: bad count on %lu\n", 
-                                      saved_bh->b_blocknr) ;
-                }
-                brelse(saved_bh) ;
-            }
-        }
-        /*
-        ** if the more recent transaction is committed to the log,
-        ** this buffer can be considered flushed.  Decrement our
-        ** counters to reflect one less buffer that needs writing.
-        **
-        ** note, this relies on all of the above code being
-        ** schedule free once pjl comes back non-null.
-        */
-        if (pjl && cn->bh && atomic_read(&pjl->j_commit_left) == 0) {
-            atomic_dec(&cn->jlist->j_nonzerolen) ;
-            cn->bh = NULL ;
+	pjl = find_newer_jl_for_cn(cn) ;
+        if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh))
+	{
+	    if (!can_dirty(cn))
+	        BUG();
+	    /* if the buffer is prepared, it will either be logged
+	     * or restored.  If restored, we need to make sure
+	     * it actually gets marked dirty
+	     */
+	    mark_buffer_notjournal_new(cn->bh) ;
+	    if (test_bit(BH_JPrepared, &cn->bh->b_state)) {
+	        set_bit(BH_JRestore_dirty, &cn->bh->b_state);
+	    } else {
+	        set_bit(BH_JTest, &cn->bh->b_state);
+	        mark_buffer_dirty(cn->bh);
+	    }
         } 
-next:
         cn = cn->next ;
     }
-    /* the first run through the loop sends all the dirty buffers to
-    ** ll_rw_block.
-    ** the second run through the loop does all the accounting
-    */
-    if (run++ == 0) {
-        goto loop_start ;
+    return ret ;
+}
+
+static int kupdate_transactions(struct super_block *s,
+                                   struct reiserfs_journal_list *jl,
+				   struct reiserfs_journal_list **next_jl,
+				   unsigned long *next_trans_id,
+				   int num_blocks,
+				   int num_trans) {
+    int ret = 0;
+    int written = 0 ;
+    int transactions_flushed = 0;
+    unsigned long orig_trans_id = jl->j_trans_id;
+    struct buffer_chunk chunk;
+    struct list_head *entry;
+    chunk.nr = 0;
+
+    down(&SB_JOURNAL(s)->j_flush_sem);
+    if (!journal_list_still_alive(s, orig_trans_id)) {
+	goto done;
+    }
+
+    /* we've got j_flush_sem held, nobody is going to delete any
+     * of these lists out from underneath us
+     */
+    while((num_trans && transactions_flushed < num_trans) ||
+          (!num_trans && written < num_blocks)) {
+
+	if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) ||
+	    atomic_read(&jl->j_commit_left))
+	{
+	    del_from_work_list(s, jl);
+	    break;
+	}
+	ret = write_one_transaction(s, jl, &chunk);
+
+	if (ret < 0)
+	    goto done;
+	transactions_flushed++;
+	written += ret;
+	entry = jl->j_list.next;
+
+	/* did we wrap? */
+	if (entry == &SB_JOURNAL(s)->j_journal_list) {
+	    break;
+        }
+	jl = JOURNAL_LIST_ENTRY(entry);
+
+	/* don't bother with older transactions */
+	if (jl->j_trans_id <= orig_trans_id)
+	    break;
+    }
+    if (chunk.nr) {
+        write_chunk(&chunk);
     }
 
-    atomic_set(&(jl->j_flushing), 0) ;
-    wake_up(&(jl->j_flush_wait)) ;
-    return ret ;
+done:
+    up(&SB_JOURNAL(s)->j_flush_sem);
+    return ret;
 }
-/* since we never give dirty buffers to bdflush/kupdate, we have to
-** flush them ourselves.  This runs through the journal lists, finds
-** old metadata in need of flushing and sends it to disk.
-** this does not end transactions, commit anything, or free
-** cnodes.
+
+/* o_sync and fsync heavy applications tend to use up
+** all the journal list slots with tiny transactions.  These
+** trigger lots and lots of calls to update the header block, which
+** adds seeks and slows things down.
 **
-** returns the highest transaction id that was flushed last time
+** This function tries to clear out a large chunk of the journal lists
+** at once, which makes everything faster since only the newest journal
+** list updates the header block
 */
-static unsigned long reiserfs_journal_kupdate(struct super_block *s) {
-    struct reiserfs_journal_list *jl ;
-    int i ;
-    int start ;
-    time_t age ;
-    int ret = 0 ;
-
-    start = SB_JOURNAL_LIST_INDEX(s) ;
-
-    /* safety check to prevent flush attempts during a mount */
-    if (start < 0) {
-        return 0 ;
-    }
-    i = (start + 1) % JOURNAL_LIST_COUNT ;
-    while(i != start) {
-        jl = SB_JOURNAL_LIST(s) + i  ;
-        age = get_seconds() - jl->j_timestamp ;
-        if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) && 
-            atomic_read(&(jl->j_nonzerolen)) > 0 &&
-            atomic_read(&(jl->j_commit_left)) == 0) {
-
-            if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) {
-                break ;
-            }
-            /* if ret was already 1, we want to preserve that */
-            ret |= kupdate_one_transaction(s, jl) ;
-        } 
-        if (atomic_read(&(jl->j_nonzerolen)) > 0) {
-            ret |= 1 ;
-        }
-        i = (i + 1) % JOURNAL_LIST_COUNT ;
+static int flush_used_journal_lists(struct super_block *s,
+                                    struct reiserfs_journal_list *jl) {
+    unsigned long len = 0;
+    unsigned long cur_len;
+    int ret;
+    int i;
+    struct reiserfs_journal_list *tjl;
+    struct reiserfs_journal_list *flush_jl;
+    unsigned long trans_id;
+
+    flush_jl = tjl = jl;
+
+    /* flush for 256 transactions or 256 blocks, whichever comes first */
+    for(i = 0 ; i < 256 && len < 256 ; i++) {
+	if (atomic_read(&tjl->j_commit_left) ||
+	    tjl->j_trans_id < jl->j_trans_id) {
+	    break;
+	}
+	cur_len = atomic_read(&tjl->j_nonzerolen);
+	if (cur_len > 0) {
+	    tjl->j_state &= ~LIST_TOUCHED;
+	}
+	len += cur_len;
+	flush_jl = tjl;
+	if (tjl->j_list.next == &SB_JOURNAL(s)->j_journal_list)
+	    break;
+	tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
+    }
+    /* try to find a group of blocks we can flush across all the
+    ** transactions, but only bother if we've actually spanned
+    ** across multiple lists
+    */
+    if (flush_jl != jl) {
+        ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
     }
-    return ret ;
+    flush_journal_list(s, flush_jl, 1);
+    return 0;
 }
 
 /*
@@ -1262,6 +1413,10 @@ void remove_journal_hash(struct super_block *sb,
 }
 
 static void free_journal_ram(struct super_block *p_s_sb) {
+  reiserfs_kfree(SB_JOURNAL(p_s_sb)->j_current_jl,
+                 sizeof(struct reiserfs_journal_list), p_s_sb);
+  SB_JOURNAL(p_s_sb)->j_num_lists--;
+
   vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ;
   free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ;
   free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
@@ -1392,7 +1547,7 @@ static int journal_transaction_is_valid(struct super_block *p_s_sb, struct buffe
     }
     brelse(c_bh) ;
     reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid "
-                   "transaction start offset %lu, len %d id %d\n", 
+                   "transaction start offset %llu, len %d id %d\n",
 		   d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		   get_desc_trans_len(desc), get_desc_trans_id(desc)) ;
     return 1 ;
@@ -1432,7 +1587,7 @@ static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cu
   desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
   trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
   reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
-                 "journal_read_transaction, offset %lu, len %d mount_id %d\n", 
+                 "journal_read_transaction, offset %llu, len %d mount_id %d\n",
 		 d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		 get_desc_trans_len(desc), get_desc_mount_id(desc)) ;
   if (get_desc_trans_id(desc) < oldest_trans_id) {
@@ -1460,7 +1615,7 @@ static int journal_read_transaction(struct super_block *p_s_sb, unsigned long cu
   commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
   if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
     reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, "
-                   "commit offset %ld had bad time %d or length %d\n", 
+                   "commit offset %llu had bad time %d or length %d\n",
 		   c_bh->b_blocknr -  SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		   get_commit_trans_id(commit), get_commit_trans_len(commit));
     brelse(c_bh) ;
@@ -1628,7 +1783,7 @@ static int journal_read(struct super_block *p_s_sb) {
   printk("reiserfs: checking transaction log (%s) for (%s)\n",
 	 bdevname(SB_JOURNAL(p_s_sb)->j_dev_bd, b),
 	 reiserfs_bdevname(p_s_sb));
-  start = get_seconds() ;
+  start = get_seconds();
 
   /* step 1, read in the journal header block.  Check the transaction it says 
   ** is the first unflushed, and if that transaction is not valid, 
@@ -1688,7 +1843,7 @@ static int journal_read(struct super_block *p_s_sb) {
 	oldest_start = d_bh->b_blocknr ;
 	newest_mount_id = get_desc_mount_id(desc) ;
 	reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting "
-	               "oldest_start to offset %lu, trans_id %lu\n", 
+	               "oldest_start to offset %llu, trans_id %lu\n",
 		       oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		       oldest_trans_id) ;
       } else if (oldest_trans_id > get_desc_trans_id(desc)) { 
@@ -1716,7 +1871,7 @@ start_log_replay:
   cur_dblock = oldest_start ;
   if (oldest_trans_id)  {
     reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay "
-                   "from offset %lu, trans_id %lu\n", 
+                   "from offset %llu, trans_id %lu\n",
 		   cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), 
 		   oldest_trans_id) ;
 
@@ -1770,70 +1925,26 @@ start_log_replay:
   return 0 ;
 }
 
-
-struct reiserfs_journal_commit_task {
-  struct super_block *p_s_sb ;
-  int jindex ;
-  int wake_on_finish ; /* if this is one, we wake the task_done queue, if it
-                       ** is zero, we free the whole struct on finish
-		       */
-  struct reiserfs_journal_commit_task *self ;
-  struct work_struct work;
-} ;
-
-static void reiserfs_journal_commit_task_func(void *__ct) {
-  struct reiserfs_journal_commit_task *ct = __ct;
-  struct reiserfs_journal_list *jl ;
-
-  reiserfs_write_lock(ct->p_s_sb);
-
-  jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ;
-
-  flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ; 
-
-  if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 &&
-      atomic_read(&(jl->j_commit_left)) == 0) {
-    kupdate_one_transaction(ct->p_s_sb, jl) ;
-  }
-  reiserfs_kfree(ct->self, sizeof(struct reiserfs_journal_commit_task), ct->p_s_sb) ;
-  reiserfs_write_unlock(ct->p_s_sb);
-}
-
-static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct,
-                                  struct super_block *p_s_sb, 
-				  int jindex) {
-  if (!ct) {
-    reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ;
-  }
-  ct->p_s_sb = p_s_sb ;
-  ct->jindex = jindex ;
-  INIT_WORK(&ct->work, reiserfs_journal_commit_task_func, ct);
-  ct->self = ct ;
-}
-
-static void commit_flush_async(struct super_block *p_s_sb, int jindex) {
-  struct reiserfs_journal_commit_task *ct ;
-  /* using GFP_NOFS, GFP_KERNEL could try to flush inodes, which will try
-  ** to start/join a transaction, which will deadlock
-  */
-  ct = reiserfs_kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_NOFS, p_s_sb) ;
-  if (ct) {
-    setup_commit_task_arg(ct, p_s_sb, jindex) ;
-    queue_work(commit_wq, &ct->work) ;
-  } else {
-#ifdef CONFIG_REISERFS_CHECK
-    reiserfs_warning("journal-1540: kmalloc failed, doing sync commit\n") ;
-#endif
-    flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
-  }
+static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s)
+{
+    struct reiserfs_journal_list *jl;
+retry:
+    jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s);
+    if (!jl) {
+	yield();
+	goto retry;
+    }
+    memset(jl, 0, sizeof(*jl));
+    INIT_LIST_HEAD(&jl->j_list);
+    INIT_LIST_HEAD(&jl->j_working_list);
+    sema_init(&jl->j_commit_lock, 1);
+    SB_JOURNAL(s)->j_num_lists++;
+    get_journal_list(jl);
+    return jl;
 }
 
 static void journal_list_init(struct super_block *p_s_sb) {
-  int i ;
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ;
-    init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ;
-  }
+    SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
 }
 
 static int release_journal_dev( struct super_block *super,
@@ -1924,6 +2035,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
     struct reiserfs_super_block * rs;
     struct reiserfs_journal_header *jh;
     struct reiserfs_journal *journal;
+    struct reiserfs_journal_list *jl;
     char b[BDEVNAME_SIZE];
 
     journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ;
@@ -1934,6 +2046,8 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
     memset(journal, 0, sizeof(struct reiserfs_journal)) ;
     INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ;
     INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_prealloc_list);
+    INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_working_list);
+    INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_journal_list);
     reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap, 
  				   SB_BMAP_NR(p_s_sb)) ;
     allocate_bitmap_nodes(p_s_sb) ;
@@ -2041,10 +2155,6 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   brelse (bhjh);
      
   SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ;
-  SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */
-
-  /* clear out the journal list array */
-  memset(SB_JOURNAL_LIST(p_s_sb), 0, sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ; 
   journal_list_init(p_s_sb) ;
 
   memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ;
@@ -2061,13 +2171,13 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   SB_JOURNAL(p_s_sb)->j_last = NULL ;	  
   SB_JOURNAL(p_s_sb)->j_first = NULL ;     
   init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-  init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ; 
+  sema_init(&SB_JOURNAL(p_s_sb)->j_lock, 1);
+  sema_init(&SB_JOURNAL(p_s_sb)->j_flush_sem, 1);
 
   SB_JOURNAL(p_s_sb)->j_trans_id = 10 ;  
   SB_JOURNAL(p_s_sb)->j_mount_id = 10 ; 
   SB_JOURNAL(p_s_sb)->j_state = 0 ;
   atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
-  atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ;
   SB_JOURNAL(p_s_sb)->j_cnode_free_list = allocate_cnodes(num_cnodes) ;
   SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ;
   SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ? num_cnodes : 0 ;
@@ -2075,8 +2185,9 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
 
   init_journal_hash(p_s_sb) ;
-  SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ;
-  if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) {
+  jl = SB_JOURNAL(p_s_sb)->j_current_jl;
+  jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl);
+  if (!jl->j_list_bitmap) {
     reiserfs_warning("journal-2005, get_list_bitmap failed for journal list 0\n") ;
     goto free_and_return;
   }
@@ -2084,16 +2195,12 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
     reiserfs_warning("Replay Failure, unable to mount\n") ;
     goto free_and_return;
   }
-  SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this
-                                         where it belongs */
-
-  if (reiserfs_dont_log (p_s_sb))
-    return 0;
 
   reiserfs_mounted_fs_count++ ;
   if (reiserfs_mounted_fs_count <= 1)
     commit_wq = create_workqueue("reiserfs");
 
+  INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb);
   return 0 ;
 free_and_return:
   free_journal_ram(p_s_sb);
@@ -2107,8 +2214,6 @@ free_and_return:
 */
 int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) {
   time_t now = get_seconds() ;
-  if (reiserfs_dont_log(th->t_super)) 
-    return 0 ;
   /* cannot restart while nested */
   if (th->t_refcount > 1)
     return 0 ;
@@ -2148,6 +2253,35 @@ void reiserfs_wait_on_write_block(struct super_block *s) {
                !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ;
 }
 
+static void queue_log_writer(struct super_block *s) {
+    set_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state);
+    sleep_on(&SB_JOURNAL(s)->j_join_wait);
+}
+
+static void wake_queued_writers(struct super_block *s) {
+    if (test_and_clear_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state))
+        wake_up(&SB_JOURNAL(s)->j_join_wait);
+}
+
+static void let_transaction_grow(struct super_block *sb,
+                                 unsigned long trans_id)
+{
+    unsigned long bcount = SB_JOURNAL(sb)->j_bcount;
+    while(1) {
+	yield();
+        while ((atomic_read(&SB_JOURNAL(sb)->j_wcount) > 0 ||
+	        atomic_read(&SB_JOURNAL(sb)->j_jlock)) &&
+	       SB_JOURNAL(sb)->j_trans_id == trans_id) {
+	    queue_log_writer(sb);
+	}
+	if (SB_JOURNAL(sb)->j_trans_id != trans_id)
+	    break;
+	if (bcount == SB_JOURNAL(sb)->j_bcount)
+	    break;
+	bcount = SB_JOURNAL(sb)->j_bcount;
+    }
+}
+
 /* join == true if you must join an existing transaction.
 ** join == false if you can deal with waiting for others to finish
 **
@@ -2157,15 +2291,14 @@ void reiserfs_wait_on_write_block(struct super_block *s) {
 static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) {
   time_t now = get_seconds() ;
   int old_trans_id  ;
+  struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb);
+  struct reiserfs_transaction_handle myth;
+  int sched_count = 0;
 
   reiserfs_check_lock_depth("journal_begin") ;
   RFALSE( p_s_sb->s_flags & MS_RDONLY, 
 	  "clm-2078: calling journal_begin on readonly FS") ;
 
-  if (reiserfs_dont_log(p_s_sb)) {
-    th->t_super = p_s_sb ; /* others will check this for the don't log flag */
-    return 0 ;
-  }
   PROC_INFO_INC( p_s_sb, journal.journal_being );
   /* set here for journal_join */
   th->t_refcount = 1;
@@ -2173,66 +2306,76 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct sup
 
 relock:
   lock_journal(p_s_sb) ;
+  journal->j_bcount++;
 
-  if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) {
+  if (test_bit(WRITERS_BLOCKED, &journal->j_state)) {
     unlock_journal(p_s_sb) ;
     reiserfs_wait_on_write_block(p_s_sb) ;
     PROC_INFO_INC( p_s_sb, journal.journal_relock_writers );
     goto relock ;
   }
+  now = get_seconds();
 
   /* if there is no room in the journal OR
   ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning 
   ** we don't sleep if there aren't other writers
   */
 
-  if (  (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) ||
-     ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) || 
-     (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && 
-      (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) ||
-     (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) ||
-     (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) {
+  if ( (!join && journal->j_must_wait > 0) ||
+     ( !join && (journal->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) ||
+     (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 &&
+      (now - journal->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) ||
+     (!join && atomic_read(&journal->j_jlock)) ||
+     (!join && journal->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) {
 
+    old_trans_id = journal->j_trans_id;
     unlock_journal(p_s_sb) ; /* allow others to finish this transaction */
 
-    /* if writer count is 0, we can just force this transaction to end, and start
-    ** a new one afterwards.
-    */
-    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
-      struct reiserfs_transaction_handle myth ;
-      journal_join(&myth, p_s_sb, 1) ;
-      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-      journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-      do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ;
+    if (!join && (journal->j_len_alloc + nblocks + 2) >=
+        SB_JOURNAL_MAX_BATCH(p_s_sb) &&
+	((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75))
+    {
+	if (atomic_read(&journal->j_wcount) > 10) {
+	    sched_count++;
+	    queue_log_writer(p_s_sb);
+	    goto relock;
+	}
+    }
+    /* don't mess with joining the transaction if all we have to do is
+     * wait for someone else to do a commit
+     */
+    if (atomic_read(&journal->j_jlock)) {
+	while (journal->j_trans_id == old_trans_id &&
+	       atomic_read(&journal->j_jlock)) {
+	    queue_log_writer(p_s_sb);
+        }
+	goto relock;
+    }
+    journal_join(&myth, p_s_sb, 1) ;
+
+    /* someone might have ended the transaction while we joined */
+    if (old_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
+        do_journal_end(&myth, p_s_sb, 1, 0) ;
     } else {
-      /* but if the writer count isn't zero, we have to wait for the current writers to finish.
-      ** They won't batch on transaction end once we set j_jlock
-      */
-      atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
-      old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
-      while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) &&
-            SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) {
-	sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-      }
+        do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ;
     }
+
     PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount );
     goto relock ;
   }
-
-  if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */
-    SB_JOURNAL(p_s_sb)->j_trans_start_time = now ;
+  /* we are the first writer, record the transaction start time */
+  if (journal->j_trans_start_time == 0) {
+    journal->j_trans_start_time = get_seconds();
   }
-  atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
-  SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ;
+  atomic_inc(&(journal->j_wcount)) ;
+  journal->j_len_alloc += nblocks ;
   th->t_blocks_logged = 0 ;
   th->t_blocks_allocated = nblocks ;
-  th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
+  th->t_trans_id = journal->j_trans_id ;
   unlock_journal(p_s_sb) ;
-  p_s_sb->s_dirt = 1; 
   return 0 ;
 }
 
-
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
   struct reiserfs_transaction_handle *cur_th = current->journal_info;
 
@@ -2277,11 +2420,6 @@ int journal_begin(struct reiserfs_transaction_handle *th, struct super_block  *
     return ret ;
 }
 
-/* not used at all */
-int journal_prepare(struct super_block  * p_s_sb, struct buffer_head *bh) {
-  return 0 ;
-}
-
 /*
 ** puts bh into the current transaction.  If it was already there, reorders removes the
 ** old pointers from the hash, and puts new ones in (to make sure replay happen in the right order).
@@ -2297,18 +2435,14 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
   int prepared = 0 ;
 
   PROC_INFO_INC( p_s_sb, journal.mark_dirty );
-  if (reiserfs_dont_log(th->t_super)) {
-    mark_buffer_dirty(bh) ;
-    return 0 ;
-  }
-
   if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
     reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", 
                    th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id);
   }
-  p_s_sb->s_dirt = 1 ;
+  p_s_sb->s_dirt = 1;
 
   prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ;
+  clear_bit(BH_JRestore_dirty, &bh->b_state);
   /* already in this transaction, we are done */
   if (buffer_journaled(bh)) {
     PROC_INFO_INC( p_s_sb, journal.mark_dirty_already );
@@ -2319,13 +2453,12 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
   ** a dirty or journal_dirty or locked buffer to be logged, as some changes
   ** could get to disk too early.  NOT GOOD.
   */
-  if (!prepared || buffer_locked(bh)) {
+  if (!prepared || buffer_locked(bh) || buffer_dirty(bh)) {
     printk("journal-1777: buffer %llu bad state %cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT\n", (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!', 
                             buffer_locked(bh) ? ' ' : '!',
 			    buffer_dirty(bh) ? ' ' : '!',
 			    buffer_journal_dirty(bh) ? ' ' : '!') ;
   }
-  count_already_incd = clear_prepared_bits(bh) ;
 
   if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) {
     printk("journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ;
@@ -2344,14 +2477,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
     mark_buffer_notjournal_dirty(bh) ;
   }
 
-  if (buffer_dirty(bh)) {
-    clear_buffer_dirty(bh) ;
-  }
-
-  if (buffer_journaled(bh)) { /* must double check after getting lock */
-    goto done ;
-  }
-
   if (SB_JOURNAL(p_s_sb)->j_len > SB_JOURNAL(p_s_sb)->j_len_alloc) {
     SB_JOURNAL(p_s_sb)->j_len_alloc = SB_JOURNAL(p_s_sb)->j_len + JOURNAL_PER_BALANCE_CNT ;
   }
@@ -2391,24 +2516,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
     SB_JOURNAL(p_s_sb)->j_first = cn ;
     SB_JOURNAL(p_s_sb)->j_last = cn ;
   }
-done:
-  return 0 ;
-}
-
-/*
-** if buffer already in current transaction, do a journal_mark_dirty
-** otherwise, just mark it dirty and move on.  Used for writes to meta blocks
-** that don't need journaling
-*/
-int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) {
-  if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) || 
-      buffer_journal_dirty(bh)) {
-    return journal_mark_dirty(th, p_s_sb, bh) ;
-  }
-  if (get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_blocknr)) {
-    return journal_mark_dirty(th, p_s_sb, bh) ;
-  }
-  mark_buffer_dirty(bh) ;
   return 0 ;
 }
 
@@ -2474,7 +2581,6 @@ static int remove_from_transaction(struct super_block *p_s_sb, b_blocknr_t block
     if (atomic_read(&(bh->b_count)) < 0) {
       printk("journal-1752: remove from trans, b_count < 0\n") ;
     }
-    if (!buffer_locked(bh)) reiserfs_clean_and_file_buffer(bh) ; 
     ret = 1 ;
   }
   SB_JOURNAL(p_s_sb)->j_len-- ;
@@ -2500,7 +2606,7 @@ static int can_dirty(struct reiserfs_journal_cnode *cn) {
   int can_dirty = 1 ;
   
   /* first test hprev.  These are all newer than cn, so any node here
-  ** with the name block number and dev means this node can't be sent
+  ** with the same block number and dev means this node can't be sent
   ** to disk right now.
   */
   while(cur && can_dirty) {
@@ -2551,72 +2657,56 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block
 ** change flush_commit_lists to have a repeat parameter too.
 **
 */
-void flush_async_commits(struct super_block *p_s_sb) {
-  int i ;
+static void flush_async_commits(void *p) {
+  struct super_block *p_s_sb = p;
+  struct reiserfs_journal_list *jl;
+  struct list_head *entry;
 
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) {
-      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; 
-    }
+  lock_kernel();
+  if (!list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
+      /* last entry is the youngest, commit it and you get everything */
+      entry = SB_JOURNAL(p_s_sb)->j_journal_list.prev;
+      jl = JOURNAL_LIST_ENTRY(entry);
+      flush_commit_list(p_s_sb, jl, 1);
   }
+  unlock_kernel();
 }
 
 /*
 ** flushes any old transactions to disk
 ** ends the current transaction if it is too old
-**
-** also calls flush_journal_list with old_only == 1, which allows me to reclaim
-** memory and such from the journal lists whose real blocks are all on disk.
-**
-** called by sync_dev_journal from buffer.c
 */
-int flush_old_commits(struct super_block *p_s_sb, int immediate) {
-  int i ;
-  int count = 0;
-  int start ; 
-  time_t now ; 
-  struct reiserfs_transaction_handle th ; 
-
-  start =  SB_JOURNAL_LIST_INDEX(p_s_sb) ;
-  now = get_seconds() ;
-
-  /* safety check so we don't flush while we are replaying the log during mount */
-  if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) {
-    return 0  ;
-  }
-  /* starting with oldest, loop until we get to the start */
-  i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ;
-  while(i != start) {
-    if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > SB_JOURNAL_MAX_COMMIT_AGE(p_s_sb) ||
-       immediate)) {
-      /* we have to check again to be sure the current transaction did not change */
-      if (i != SB_JOURNAL_LIST_INDEX(p_s_sb))  {
-	flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ;
-      }
-    }
-    i = (i + 1) % JOURNAL_LIST_COUNT ;
-    count++ ;
-  }
-  /* now, check the current transaction.  If there are no writers, and it is too old, finish it, and
-  ** force the commit blocks to disk
-  */
-  if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&  
-     SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && 
-     SB_JOURNAL(p_s_sb)->j_len > 0 && 
-     (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) {
-    journal_join(&th, p_s_sb, 1) ;
-    reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-    journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-    do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ;
-  } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case.  If they say to 
-                             flush, we must be sure old transactions hit the disk too. */
-    journal_join(&th, p_s_sb, 1) ;
-    reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
-    journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
-    do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
-  }
-   reiserfs_journal_kupdate(p_s_sb) ;
-   return 0 ;
+int reiserfs_flush_old_commits(struct super_block *p_s_sb) {
+    time_t now ;
+    struct reiserfs_transaction_handle th ;
+
+    now = get_seconds();
+    /* safety check so we don't flush while we are replaying the log during
+     * mount
+     */
+    if (list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) {
+	return 0  ;
+    }
+
+    /* check the current transaction.  If there are no writers, and it is
+     * too old, finish it, and force the commit blocks to disk
+     */
+    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 &&
+        SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 &&
+        SB_JOURNAL(p_s_sb)->j_len > 0 &&
+        (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) >
+	SB_JOURNAL_MAX_TRANS_AGE(p_s_sb))
+    {
+	journal_join(&th, p_s_sb, 1) ;
+	reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+	journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
+
+	/* we're only being called from kreiserfsd, it makes no sense to do
+	** an async commit so that kreiserfsd can do it later
+	*/
+	do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ;
+    }
+    return p_s_sb->s_dirt;
 }
 
 /*
@@ -2637,6 +2727,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
   int flush = flags & FLUSH_ALL ;
   int commit_now = flags & COMMIT_NOW ;
   int wait_on_commit = flags & WAIT ;
+  struct reiserfs_journal_list *jl;
 
   if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
     reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", 
@@ -2653,13 +2744,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
   ** care of in this trans
   */
   if (SB_JOURNAL(p_s_sb)->j_len == 0) {
-    int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ;
-    unlock_journal(p_s_sb) ;
-    if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock))  > 0 && wcount <= 0) {
-      atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ;
-      wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-    }
-    return 0 ;
+    BUG();
   }
   /* if wcount > 0, and we are called to with flush or commit_now,
   ** we wait on j_join_wait.  We will wake up when the last writer has
@@ -2669,24 +2754,37 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
   */
   if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) {
     if (flush || commit_now) {
-      int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
+      unsigned trans_id ;
+
+      jl = SB_JOURNAL(p_s_sb)->j_current_jl;
+      trans_id = jl->j_trans_id;
+
       atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
       if (flush) {
         SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ;
       }
       unlock_journal(p_s_sb) ;
+
       /* sleep while the current transaction is still j_jlocked */
-      while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && 
-            SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) {
-	sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-      }
-      if (commit_now) {
-	if (wait_on_commit) {
-	  flush_commit_list(p_s_sb,  SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
-	} else {
-	  commit_flush_async(p_s_sb, orig_jindex) ; 
+      while(SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
+	if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) {
+	    queue_log_writer(p_s_sb);
+        } else {
+	    lock_journal(p_s_sb);
+	    if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
+	        atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ;
+	    }
+	    unlock_journal(p_s_sb);
 	}
       }
+      if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) {
+          BUG();
+      }
+      if (commit_now && journal_list_still_alive(p_s_sb, trans_id) &&
+          wait_on_commit)
+      {
+	  flush_commit_list(p_s_sb, jl, 1) ;
+      }
       return 0 ;
     } 
     unlock_journal(p_s_sb) ;
@@ -2694,7 +2792,7 @@ static int check_journal_end(struct reiserfs_transaction_handle *th, struct supe
   }
 
   /* deal with old transactions where we are the last writers */
-  now = get_seconds() ;
+  now = get_seconds();
   if ((now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) {
     commit_now = 1 ;
     SB_JOURNAL(p_s_sb)->j_next_async_flush = 1 ;
@@ -2734,25 +2832,21 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
   struct buffer_head *bh = NULL ;
   struct reiserfs_list_bitmap *jb = NULL ;
   int cleaned = 0 ;
-  
-  if (reiserfs_dont_log(th->t_super)) {
-    bh = sb_find_get_block(p_s_sb, blocknr) ;
-    if (bh && buffer_dirty (bh)) {
-      printk ("journal_mark_freed(dont_log): dirty buffer on hash list: %lx %d\n", bh->b_state, blocknr);
-      BUG ();
-    }
-    brelse (bh);
-    return 0 ;
+
+  cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, blocknr);
+  if (cn && cn->bh) {
+      bh = cn->bh ;
+      get_bh(bh) ;
   }
-  bh = sb_find_get_block(p_s_sb, blocknr) ;
   /* if it is journal new, we just remove it from this transaction */
   if (bh && buffer_journal_new(bh)) {
     mark_buffer_notjournal_new(bh) ;
     clear_prepared_bits(bh) ;
+    reiserfs_clean_and_file_buffer(bh) ;
     cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
   } else {
     /* set the bit for this block in the journal bitmap for this transaction */
-    jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ;
+    jb = SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap;
     if (!jb) {
       reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ;
     }
@@ -2762,6 +2856,7 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
 
     if (bh) {
       clear_prepared_bits(bh) ;
+      reiserfs_clean_and_file_buffer(bh) ;
     }
     cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ;
 
@@ -2793,7 +2888,6 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
   }
 
   if (bh) {
-    reiserfs_clean_and_file_buffer(bh) ;
     put_bh(bh) ; /* get_hash grabs the buffer */
     if (atomic_read(&(bh->b_count)) < 0) {
       printk("journal-2165: bh->b_count < 0\n") ;
@@ -2803,50 +2897,84 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
 }
 
 void reiserfs_update_inode_transaction(struct inode *inode) {
-  
-  REISERFS_I(inode)->i_trans_index = SB_JOURNAL_LIST_INDEX(inode->i_sb);
-
+  REISERFS_I(inode)->i_jl = SB_JOURNAL(inode->i_sb)->j_current_jl;
   REISERFS_I(inode)->i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ;
 }
 
-static int reiserfs_inode_in_this_transaction(struct inode *inode) {
-  if (REISERFS_I(inode)->i_trans_id == SB_JOURNAL(inode->i_sb)->j_trans_id || 
-      REISERFS_I(inode)->i_trans_id == 0) {
-    return 1; 
-  } 
-  return 0 ;
+static void __commit_trans_jl(struct inode *inode, unsigned long id,
+                                 struct reiserfs_journal_list *jl)
+{
+    struct reiserfs_transaction_handle th ;
+    struct super_block *sb = inode->i_sb ;
+
+    /* is it from the current transaction, or from an unknown transaction? */
+    if (id == SB_JOURNAL(sb)->j_trans_id) {
+	jl = SB_JOURNAL(sb)->j_current_jl;
+	/* try to let other writers come in and grow this transaction */
+	let_transaction_grow(sb, id);
+	if (SB_JOURNAL(sb)->j_trans_id != id) {
+	    goto flush_commit_only;
+	}
+
+	journal_begin(&th, sb, 1) ;
+
+	/* someone might have ended this transaction while we joined */
+	if (SB_JOURNAL(sb)->j_trans_id != id) {
+	    reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ;
+	    journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ;
+	    journal_end(&th, sb, 1) ;
+	    goto flush_commit_only;
+	}
+
+	journal_end_sync(&th, sb, 1) ;
+
+    } else {
+	/* this gets tricky, we have to make sure the journal list in
+	 * the inode still exists.  We know the list is still around
+	 * if we've got a larger transaction id than the oldest list
+	 */
+flush_commit_only:
+	if (journal_list_still_alive(inode->i_sb, id)) {
+	    flush_commit_list(sb, jl, 1) ;
+	}
+    }
+    /* otherwise the list is gone, and long since committed */
 }
 
 void reiserfs_commit_for_inode(struct inode *inode) {
-  struct reiserfs_journal_list *jl ;
-  struct reiserfs_transaction_handle th ;
-  struct super_block *sb = inode->i_sb ;
-
-  jl = SB_JOURNAL_LIST(sb) + REISERFS_I(inode)->i_trans_index ;
-
-  /* is it from the current transaction, or from an unknown transaction? */
-  if (reiserfs_inode_in_this_transaction(inode)) {
-    journal_join(&th, sb, 1) ;
-    reiserfs_update_inode_transaction(inode) ;
-    journal_end_sync(&th, sb, 1) ;
-  } else if (jl->j_trans_id == REISERFS_I(inode)->i_trans_id) {
-    flush_commit_list(sb, jl, 1) ;
-  }
-  /* if the transaction id does not match, this list is long since flushed
-  ** and we don't have to do anything here
-  */
+    unsigned long id = REISERFS_I(inode)->i_trans_id;
+    struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl;
+
+    /* for the whole inode, assume an unset id (or journal list) means it
+     * was changed in the current transaction; this is the conservative choice
+     */
+    if (!id || !jl) {
+	reiserfs_update_inode_transaction(inode) ;
+	id = REISERFS_I(inode)->i_trans_id;
+	/* jl will be updated in __commit_trans_jl */
+    }
+
+    __commit_trans_jl(inode, id, jl);
 }
 
 void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, 
                                       struct buffer_head *bh) {
-  PROC_INFO_INC( p_s_sb, journal.restore_prepared );
-  if (reiserfs_dont_log (p_s_sb))
-    return;
-
-  if (!bh) {
-    return ;
-  }
-  clear_bit(BH_JPrepared, &bh->b_state) ;
+    PROC_INFO_INC( p_s_sb, journal.restore_prepared );
+    if (!bh) {
+	return ;
+    }
+    if (test_and_clear_bit(BH_JRestore_dirty, &bh->b_state) &&
+	buffer_journal_dirty(bh)) {
+	struct reiserfs_journal_cnode *cn;
+	cn = get_journal_hash_dev(p_s_sb,
+	                          SB_JOURNAL(p_s_sb)->j_list_hash_table,
+				  bh->b_blocknr);
+	if (cn && can_dirty(cn)) {
+	    set_bit(BH_JTest, &bh->b_state);
+	    mark_buffer_dirty(bh);
+        }
+    }
+    clear_bit(BH_JPrepared, &bh->b_state) ;
 }
 
 extern struct tree_balance *cur_tb ;
@@ -2857,29 +2985,39 @@ extern struct tree_balance *cur_tb ;
 ** wait on it.
 ** 
 */
-void reiserfs_prepare_for_journal(struct super_block *p_s_sb, 
+int reiserfs_prepare_for_journal(struct super_block *p_s_sb,
                                   struct buffer_head *bh, int wait) {
-  int retry_count = 0 ;
-
   PROC_INFO_INC( p_s_sb, journal.prepare );
-  if (reiserfs_dont_log (p_s_sb))
-    return;
 
-  while(!test_bit(BH_JPrepared, &bh->b_state) ||
-        (wait && buffer_locked(bh))) {
-    if (buffer_journaled(bh)) {
-      set_bit(BH_JPrepared, &bh->b_state) ;
-      return ;
+    if (test_set_buffer_locked(bh)) {
+	if (!wait)
+	    return 0;
+	lock_buffer(bh);
     }
-    set_bit(BH_JPrepared, &bh->b_state) ;
-    if (wait) {
-      RFALSE( buffer_locked(bh) && cur_tb != NULL,
-	      "waiting while do_balance was running\n") ;
-      wait_on_buffer(bh) ;
+    set_bit(BH_JPrepared, &bh->b_state);
+    if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh))  {
+	clear_bit(BH_JTest, &bh->b_state);
+	set_bit(BH_JRestore_dirty, &bh->b_state);
+    }
+    unlock_buffer(bh);
+    return 1;
+}
+
+static void flush_old_journal_lists(struct super_block *s) {
+    struct reiserfs_journal_list *jl;
+    struct list_head *entry;
+    time_t now = get_seconds();
+
+    while(!list_empty(&SB_JOURNAL(s)->j_journal_list)) {
+        entry = SB_JOURNAL(s)->j_journal_list.next;
+	jl = JOURNAL_LIST_ENTRY(entry);
+	/* this check should always be run, to send old lists to disk */
+	if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) {
+	    flush_used_journal_lists(s, jl);
+	} else {
+	    break;
+	}
     }
-    PROC_INFO_INC( p_s_sb, journal.prepare_retry );
-    retry_count++ ;
-  }
 }
 
 /* 
@@ -2898,23 +3036,24 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   struct buffer_head *c_bh ; /* commit bh */
   struct buffer_head *d_bh ; /* desc bh */
   int cur_write_start = 0 ; /* start index of current log write */
-  int cur_blocks_left = 0 ; /* number of journal blocks left to write */
   int old_start ;
   int i ;
-  int jindex ;
-  int orig_jindex ;
   int flush = flags & FLUSH_ALL ;
-  int commit_now = flags & COMMIT_NOW ;
   int wait_on_commit = flags & WAIT ;
-  struct reiserfs_super_block *rs ; 
-  int trans_half ;
+  struct reiserfs_journal_list *jl, *temp_jl;
+  struct list_head *entry, *safe;
+  unsigned long jindex;
+  unsigned long commit_trans_id;
+  int trans_half;
 
   if (th->t_refcount > 1)
     BUG() ;
 
   current->journal_info = th->t_handle_save;
-  if (reiserfs_dont_log(th->t_super)) {
-    return 0 ;
+  reiserfs_check_lock_depth("journal end");
+  if (SB_JOURNAL(p_s_sb)->j_len == 0) {
+      reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ;
+      journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ;
   }
 
   lock_journal(p_s_sb) ;
@@ -2923,24 +3062,24 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
     flush = 1 ;
   }
   if (SB_JOURNAL(p_s_sb)->j_next_async_flush) {
-    flags |= COMMIT_NOW ;
-    commit_now = 1 ;
+    flags |= COMMIT_NOW | WAIT;
+    wait_on_commit = 1;
   }
 
   /* check_journal_end locks the journal, and unlocks if it does not return 1 
   ** it tells us if we should continue with the journal_end, or just return
   */
   if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
-    return 0 ;
+    p_s_sb->s_dirt = 1;
+    wake_queued_writers(p_s_sb);
+    goto out ;
   }
 
   /* check_journal_end might set these, check again */
   if (SB_JOURNAL(p_s_sb)->j_next_full_flush) {
     flush = 1 ;
   }
-  if (SB_JOURNAL(p_s_sb)->j_next_async_flush) {
-    commit_now = 1 ;
-  }
+
   /*
   ** j must wait means we have to flush the log blocks, and the real blocks for
   ** this transaction
@@ -2957,10 +3096,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   current->journal_info = th->t_handle_save ;
 #endif
   
-  rs = SB_DISK_SUPER_BLOCK(p_s_sb) ;
   /* setup description block */
   d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_JOURNAL(p_s_sb)->j_start) ; 
-  set_buffer_uptodate(d_bh) ;
+  set_buffer_uptodate(d_bh);
   desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ;
   memset(d_bh->b_data, 0, d_bh->b_size) ;
   memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ;
@@ -2975,28 +3113,33 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   set_buffer_uptodate(c_bh) ;
 
   /* init this journal list */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ;  
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2);
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
-
-  /* which is faster, locking/unlocking at the start and end of the for
-  ** or locking once per iteration around the insert_journal_hash?
-  ** eitherway, we are write locking insert_journal_hash.  The ENTIRE FOR
-  ** LOOP MUST not cause schedule to occur.
-  */
+  jl = SB_JOURNAL(p_s_sb)->j_current_jl;
 
-  /* for each real block, add it to the journal list hash,
+  /* we lock the commit before doing anything because
+   * we want to make sure nobody tries to run flush_commit_list until
+   * the new transaction is fully set up, and we've already flushed the
+   * ordered bh list
+   */
+  down(&jl->j_commit_lock);
+
+  /* save the transaction id in case we need to commit it later */
+  commit_trans_id = jl->j_trans_id;
+
+  atomic_set(&jl->j_older_commits_done, 0) ;
+  jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ;
+  jl->j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ;
+  jl->j_commit_bh = c_bh ;
+  jl->j_start = SB_JOURNAL(p_s_sb)->j_start ;
+  jl->j_len = SB_JOURNAL(p_s_sb)->j_len ;
+  atomic_set(&jl->j_nonzerolen, SB_JOURNAL(p_s_sb)->j_len) ;
+  atomic_set(&jl->j_commit_left, SB_JOURNAL(p_s_sb)->j_len + 2);
+  jl->j_realblock = NULL ;
+
+  /* The ENTIRE FOR LOOP MUST not cause schedule to occur.
+  **  for each real block, add it to the journal list hash,
   ** copy into real block index array in the commit or desc block
   */
-  trans_half = journal_trans_half(p_s_sb->s_blocksize) ;
+  trans_half = journal_trans_half(p_s_sb->s_blocksize);
   for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) {
     if (test_bit(BH_JDirty, &cn->bh->b_state) ) {
       jl_cn = get_cnode(p_s_sb) ;
@@ -3004,7 +3147,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
         reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ;
       }
       if (i == 0) {
-        SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ;
+        jl->j_realblock = jl_cn ;
       }
       jl_cn->prev = last_cn ;
       jl_cn->next = NULL ;
@@ -3020,9 +3163,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
       }
       jl_cn->blocknr = cn->bh->b_blocknr ; 
       jl_cn->state = 0 ;
-      jl_cn->sb = p_s_sb ;
+      jl_cn->sb = p_s_sb;
       jl_cn->bh = cn->bh ;
-      jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ;
+      jl_cn->jlist = jl;
       insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ; 
       if (i < trans_half) {
 	desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ;
@@ -3033,7 +3176,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
       i-- ;
     }
   }
-  
   set_desc_trans_len(desc, SB_JOURNAL(p_s_sb)->j_len) ;
   set_desc_mount_id(desc, SB_JOURNAL(p_s_sb)->j_mount_id) ;
   set_desc_trans_id(desc, SB_JOURNAL(p_s_sb)->j_trans_id) ;
@@ -3041,53 +3183,35 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
 
   /* special check in case all buffers in the journal were marked for not logging */
   if (SB_JOURNAL(p_s_sb)->j_len == 0) {
-    brelse(d_bh) ;
-    brelse(c_bh) ;
-    unlock_journal(p_s_sb) ;
-    printk("journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ;
-    atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
-    wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
-    return 0 ;
+    BUG();
   }
 
+  /* we're about to dirty all the log blocks, mark the description block
+   * dirty now too.  Don't mark the commit block dirty until all the
+   * others are on disk
+   */
+  mark_buffer_dirty(d_bh);
+
   /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */
   cur_write_start = SB_JOURNAL(p_s_sb)->j_start ;
-  cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len  ;
   cn = SB_JOURNAL(p_s_sb)->j_first ;
   jindex = 1 ; /* start at one so we don't get the desc again */
-  while(cur_blocks_left > 0) {
+  while(cn) {
+    clear_bit(BH_JNew, &(cn->bh->b_state)) ;
     /* copy all the real blocks into log area.  dirty log blocks */
     if (test_bit(BH_JDirty, &cn->bh->b_state)) {
       struct buffer_head *tmp_bh ;
       tmp_bh =  journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + 
 		       ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ;
-      set_buffer_uptodate(tmp_bh) ;
+      set_buffer_uptodate(tmp_bh);
       memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ;  
+      mark_buffer_dirty(tmp_bh);
       jindex++ ;
-    } else {
-      /* JDirty cleared sometime during transaction.  don't log this one */
-      printk("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ;
-    }
-    cn = cn->next ;
-    cur_blocks_left-- ;
-  }
-
-  /* we are done  with both the c_bh and d_bh, but
-  ** c_bh must be written after all other commit blocks,
-  ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
-  */
-
-  /* now loop through and mark all buffers from this transaction as JDirty_wait
-  ** clear the JDirty bit, clear BH_JNew too.  
-  ** if they weren't JDirty, they weren't logged, just relse them and move on
-  */
-  cn = SB_JOURNAL(p_s_sb)->j_first ; 
-  while(cn) {
-    clear_bit(BH_JNew, &(cn->bh->b_state)) ;
-    if (test_bit(BH_JDirty, &(cn->bh->b_state))) {
       set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ; 
       clear_bit(BH_JDirty, &(cn->bh->b_state)) ;
     } else {
+      /* JDirty cleared sometime during transaction.  don't log this one */
+      reiserfs_warning("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ;
       brelse(cn->bh) ;
     }
     next = cn->next ;
@@ -3095,30 +3219,17 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
     cn = next ;
   }
 
-  /* unlock the journal list for committing and flushing */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ;
-
-  orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ;
-  jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; 
-  SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ;
+  /* we are done  with both the c_bh and d_bh, but
+  ** c_bh must be written after all other commit blocks,
+  ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1.
+  */
 
-  /* write any buffers that must hit disk before this commit is done */
-  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock),
-		     &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+  SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb);
 
-  /* honor the flush and async wishes from the caller */
-  if (flush) {
-  
-    flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
-    flush_journal_list(p_s_sb,  SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ;  
-  } else if (commit_now) {
-    if (wait_on_commit) {
-      flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ;
-    } else {
-      commit_flush_async(p_s_sb, orig_jindex) ; 
-    }
-  }
+  /* now it is safe to insert this transaction on the main list */
+  list_add_tail(&jl->j_list, &SB_JOURNAL(p_s_sb)->j_journal_list);
+  list_add_tail(&jl->j_working_list, &SB_JOURNAL(p_s_sb)->j_working_list);
+  SB_JOURNAL(p_s_sb)->j_num_work_lists++;
 
   /* reset journal values for the next transaction */
   old_start = SB_JOURNAL(p_s_sb)->j_start ;
@@ -3130,57 +3241,96 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   SB_JOURNAL(p_s_sb)->j_len = 0 ;
   SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;
   SB_JOURNAL(p_s_sb)->j_trans_id++ ;
+  SB_JOURNAL(p_s_sb)->j_current_jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id;
   SB_JOURNAL(p_s_sb)->j_must_wait = 0 ;
   SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
   SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ;
   SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ;
   init_journal_hash(p_s_sb) ; 
 
+  /* tail conversion targets have to hit the disk before we end the
+   * transaction.  Otherwise a later transaction might repack the tail
+   * before this transaction commits, leaving the data block unflushed and
+   * clean.  If we crash before the later transaction commits, the data
+   * block is lost.
+   */
+  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock),
+		     &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+  up(&jl->j_commit_lock);
+
+  /* honor the flush wishes from the caller.  Simple commits can
+  ** be done outside the journal lock, so they are done below.
+  */
+  if (flush) {
+    flush_commit_list(p_s_sb, jl, 1) ;
+    flush_journal_list(p_s_sb, jl, 1) ;
+  }
+
+
   /* if the next transaction has any chance of wrapping, flush 
   ** transactions that might get overwritten.  If any journal lists are very 
   ** old flush them as well.  
   */
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    jindex = i ;
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ; 
+first_jl:
+  list_for_each_safe(entry, safe, &SB_JOURNAL(p_s_sb)->j_journal_list) {
+    temp_jl = JOURNAL_LIST_ENTRY(entry);
+    if (SB_JOURNAL(p_s_sb)->j_start <= temp_jl->j_start) {
+      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >=
+          temp_jl->j_start)
+      {
+	flush_used_journal_lists(p_s_sb, temp_jl);
+	goto first_jl;
+      } else if ((SB_JOURNAL(p_s_sb)->j_start +
+                  SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) <
+		  SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+      {
+          /* if we don't cross into the next transaction and we don't
+	   * wrap, there is no way we can overlap any later transactions
+	   * break now
+	   */
+	  break;
       }
-    } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && 
-              (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
-      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= 
-            SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ; 
+    } else if ((SB_JOURNAL(p_s_sb)->j_start +
+                SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >
+		SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+    {
+      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) %
+            SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
+      {
+	flush_used_journal_lists(p_s_sb, temp_jl);
+	goto first_jl;
+      } else {
+	  /* we don't overlap anything from our start to the end of the
+	   * log, and our wrapped portion doesn't overlap anything at
+	   * the start of the log.  We can break
+	   */
+	  break;
       }
-    } 
-    /* this check should always be run, to send old lists to disk */
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && 
-              SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp < 
-	      (get_seconds() - (SB_JOURNAL_MAX_TRANS_AGE(p_s_sb) * 4))) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ; 
     }
   }
+  flush_old_journal_lists(p_s_sb);
 
-  /* if the next journal_list is still in use, flush it */
-  if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) {
-    flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ; 
-  }
-
-  /* we don't want anyone flushing the new transaction's list */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + 
-											 SB_JOURNAL_LIST_INDEX(p_s_sb)) ;
+  SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL(p_s_sb)->j_current_jl) ;
 
-  if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) {
+  if (!(SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap)) {
     reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
   }
-  unlock_journal(p_s_sb) ;
+
   atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
+  unlock_journal(p_s_sb) ;
   /* wake up any body waiting to join. */
+  clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state);
   wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
+
+  if (!flush) {
+      if (wait_on_commit) {
+	  if (journal_list_still_alive(p_s_sb, commit_trans_id))
+	      flush_commit_list(p_s_sb, jl, 1) ;
+      } else {
+          queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work);
+      }
+  }
+out:
+  reiserfs_check_lock_depth("journal end2");
   return 0 ;
 }
-
-
-
diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c
index 8d47a4edabd9..f6a289f4532c 100644
--- a/fs/reiserfs/objectid.c
+++ b/fs/reiserfs/objectid.c
@@ -86,7 +86,6 @@ __u32 reiserfs_get_unused_objectid (struct reiserfs_transaction_handle *th)
     }
 
     journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s));
-    s->s_dirt = 1;
     return unused_objectid;
 }
 
@@ -105,8 +104,6 @@ void reiserfs_release_objectid (struct reiserfs_transaction_handle *th,
 
     reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
     journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); 
-    s->s_dirt = 1;
-
 
     /* start at the beginning of the objectid map (i = 0) and go to
        the end of it (i = disk_sb->s_oid_cursize).  Linear search is
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 0b4db272a566..d7c20a7c0e46 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -87,7 +87,7 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
     
 	seq_printf(m,	"state: \t%s\n"
-			"mount options: \t%s%s%s%s%s%s%s%s%s%s%s%s\n"
+			"mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n"
 			"gen. counter: \t%i\n"
 			"s_kmallocs: \t%i\n"
 			"s_disk_reads: \t%i\n"
@@ -131,7 +131,6 @@ static int show_super(struct seq_file *m, struct super_block *sb)
 			reiserfs_test4( sb ) ? "TEST4 " : "",
 			have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ",
 			replay_only( sb ) ? "REPLAY_ONLY " : "",
-			reiserfs_dont_log( sb ) ? "DONT_LOG " : "LOG ",
 			convert_reiserfs( sb ) ? "CONV " : "",
 
 			atomic_read( &r -> s_generation_counter ),
@@ -370,7 +369,6 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 			"j_first_unflushed_offset: \t%lu\n"
 			"j_last_flush_trans_id: \t%lu\n"
 			"j_trans_start_time: \t%li\n"
-			"j_journal_list_index: \t%i\n"
 			"j_list_bitmap_index: \t%i\n"
 			"j_must_wait: \t%i\n"
 			"j_next_full_flush: \t%i\n"
@@ -416,7 +414,6 @@ static int show_journal(struct seq_file *m, struct super_block *sb)
 			JF( j_first_unflushed_offset ),
 			JF( j_last_flush_trans_id ),
 			JF( j_trans_start_time ),
-			JF( j_journal_list_index ),
 			JF( j_list_bitmap_index ),
 			JF( j_must_wait ),
 			JF( j_next_full_flush ),
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index df7baf79e889..f75349fe4787 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -59,22 +59,26 @@ static int is_any_reiserfs_magic_string (struct reiserfs_super_block * rs)
 static int reiserfs_remount (struct super_block * s, int * flags, char * data);
 static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf);
 
-static void reiserfs_write_super (struct super_block * s)
+static void reiserfs_sync_fs (struct super_block * s)
 {
+    if (!(s->s_flags & MS_RDONLY)) {
+        struct reiserfs_transaction_handle th;
+	reiserfs_write_lock(s);
+	journal_begin(&th, s, 1);
+	journal_end_sync(&th, s, 1);
+	reiserfs_flush_old_commits(s);
+	s->s_dirt = 0;
+	reiserfs_write_unlock(s);
+    }
+}
 
-  int dirty = 0 ;
-  reiserfs_write_lock(s);
-  if (!(s->s_flags & MS_RDONLY)) {
-    dirty = flush_old_commits(s, 1) ;
-  }
-  s->s_dirt = dirty;
-  reiserfs_write_unlock(s);
+static void reiserfs_write_super(struct super_block *s)
+{
+    reiserfs_sync_fs(s);
 }
 
 static void reiserfs_write_super_lockfs (struct super_block * s)
 {
-
-  int dirty = 0 ;
   struct reiserfs_transaction_handle th ;
   reiserfs_write_lock(s);
   if (!(s->s_flags & MS_RDONLY)) {
@@ -84,7 +88,7 @@ static void reiserfs_write_super_lockfs (struct super_block * s)
     reiserfs_block_writes(&th) ;
     journal_end(&th, s, 1) ;
   }
-  s->s_dirt = dirty;
+  s->s_dirt = 0;
   reiserfs_write_unlock(s);
 }
 
@@ -805,7 +809,6 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
     reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ;
     set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state );
     journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-    s->s_dirt = 0;
   } else {
     /* remount read-write */
     if (!(s->s_flags & MS_RDONLY))
@@ -822,12 +825,12 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
     set_sb_umount_state( rs, REISERFS_ERROR_FS );
     /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */
     journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s));
-    s->s_dirt = 0;
     REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ;
   }
   /* this will force a full flush of all journal lists */
   SB_JOURNAL(s)->j_must_wait = 1 ;
   journal_end(&th, s, 10) ;
+  s->s_dirt = 0;
 
   if (!( *mount_flags & MS_RDONLY ) )
     finish_unfinished( s );
@@ -1392,8 +1395,6 @@ static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
 	
 	/* look for files which were to be removed in previous session */
 	finish_unfinished (s);
-
-	s->s_dirt = 0;
     } else {
 	if ( old_format_only(s) && !silent) {
 	    reiserfs_warning("reiserfs: using 3.5.x disk format\n") ;
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index e4695e7b7ba3..fb0bf2af7fd7 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1702,23 +1702,39 @@ struct reiserfs_journal_header {
 	 (((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
 #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
 
-/* finds n'th buffer with 0 being the start of this commit.  Needs to go away, j_ap_blocks has changed
-** since I created this.  One chunk of code in journal.c needs changing before deleting it
-*/
-#define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
-
 // We need these to make journal.c code more readable
 #define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 
+/*
+** transaction handle which is passed around for all journal calls
+*/
+struct reiserfs_transaction_handle {
+  struct super_block *t_super ; /* super for this FS when journal_begin was
+				   called.  Saves calls to reiserfs_get_super;
+				   also used by nested transactions to make
+				   sure they are nesting on the right FS.
+				   _must_ be first in the handle
+				*/
+  int t_refcount;
+  int t_blocks_logged ;         /* number of blocks this writer has logged */
+  int t_blocks_allocated ;      /* number of blocks this writer allocated */
+  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
+  void *t_handle_save ;		/* save existing current->journal_info */
+  int displace_new_blocks:1;	/* if new block allocation occurs, that block
+				   should be displaced from others */
+} ;
+
+int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
+int reiserfs_flush_old_commits(struct super_block *);
 void reiserfs_commit_for_inode(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
 void reiserfs_wait_on_write_block(struct super_block *s) ;
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
 void reiserfs_allow_writes(struct super_block *s) ;
 void reiserfs_check_lock_depth(char *caller) ;
-void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
+int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
 void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
 int journal_init(struct super_block *, const char * j_dev_name, int old_format, unsigned int) ;
 int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
@@ -1730,7 +1746,6 @@ int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
-void flush_async_commits(struct super_block *p_s_sb) ;
 
 int buffer_journaled(const struct buffer_head *bh) ;
 int mark_buffer_journal_new(struct buffer_head *bh) ;
diff --git a/include/linux/reiserfs_fs_i.h b/include/linux/reiserfs_fs_i.h
index 87e1b74e1125..e689a12bcb9b 100644
--- a/include/linux/reiserfs_fs_i.h
+++ b/include/linux/reiserfs_fs_i.h
@@ -3,6 +3,8 @@
 
 #include <linux/list.h>
 
+struct reiserfs_journal_list;
+
 /** bitmasks for i_flags field in reiserfs-specific part of inode */
 typedef enum {
     /** this says what format of key do all items (but stat data) of
@@ -48,7 +50,7 @@ struct reiserfs_inode_info {
     ** needs to be committed in order for this inode to be properly
     ** flushed */
     unsigned long i_trans_id ;
-    unsigned long i_trans_index ;
+    struct reiserfs_journal_list *i_jl;
     struct inode vfs_inode;
 };
 
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index b848ccd7ed41..e1fe3ebe33c0 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -106,7 +106,6 @@ typedef enum {
 #define JOURNAL_MAX_CNODE   1500 /* max cnodes to allocate. */
 #define JOURNAL_HASH_SIZE 8192   
 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating.  Must be >= 2 */
-#define JOURNAL_LIST_COUNT 64
 
 /* these are bh_state bit flag offset numbers, for use in the buffer head */
 
@@ -121,6 +120,7 @@ typedef enum {
 */
 #define BH_JPrepared 20		/* block has been prepared for the log */
 #define BH_JRestore_dirty 22    /* restore the dirty bit later */
+#define BH_JTest 23             /* debugging use only */
 
 /* One of these for every block in every transaction
 ** Each one is in two hash tables.  First, a hash of the current transaction, and after journal_end, a
@@ -153,26 +153,6 @@ struct reiserfs_list_bitmap {
   struct reiserfs_bitmap_node **bitmaps ;
 } ;
 
-/*
-** transaction handle which is passed around for all journal calls
-*/
-struct reiserfs_transaction_handle {
-  struct super_block *t_super ; /* super for this FS when journal_begin was
-				   called. saves calls to reiserfs_get_super
-				   also used by nested transactions to make
-				   sure they are nesting on the right FS
-				   _must_ be first in the handle
-				*/
-  int t_refcount;
-  int t_blocks_logged ;         /* number of blocks this writer has logged */
-  int t_blocks_allocated ;      /* number of blocks this writer allocated */
-  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
-  void *t_handle_save ;		/* save existing current->journal_info */
-  int displace_new_blocks:1;	/* if new block allocation occurres, that block
-				   should be displaced from others */
-
-} ;
-
 /*
 ** one of these for each transaction.  The most important part here is the j_realblock.
 ** this list of cnodes is used to hash all the blocks in all the commits, to mark all the
@@ -181,23 +161,25 @@ struct reiserfs_transaction_handle {
 ** to be overwritten */
 struct reiserfs_journal_list {
   unsigned long j_start ;
+  unsigned long j_state;
   unsigned long j_len ;
   atomic_t j_nonzerolen ;
   atomic_t j_commit_left ;
-  atomic_t j_flushing ;
-  atomic_t j_commit_flushing ;
   atomic_t j_older_commits_done ;      /* all commits older than this on disk*/
+  struct semaphore j_commit_lock;
   unsigned long j_trans_id ;
   time_t j_timestamp ;
   struct reiserfs_list_bitmap *j_list_bitmap ;
   struct buffer_head *j_commit_bh ; /* commit buffer head */
   struct reiserfs_journal_cnode *j_realblock  ;
   struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans.  free each of these on flush */
-  wait_queue_head_t j_commit_wait ; /* wait for all the commit blocks to be flushed */
-  wait_queue_head_t j_flush_wait ; /* wait for all the real blocks to be flushed */
-} ;
+  /* time ordered list of all active transactions */
+  struct list_head j_list;
 
-struct reiserfs_page_list  ; /* defined in reiserfs_fs.h */
+  /* time ordered list of all transactions we haven't tried to flush yet */
+  struct list_head j_working_list;
+  int j_refcount;
+} ;
 
 struct reiserfs_journal {
   struct buffer_head ** j_ap_blocks ; /* journal blocks on disk */
@@ -220,16 +202,11 @@ struct reiserfs_journal {
   unsigned long j_last_flush_trans_id ;    /* last fully flushed journal timestamp */
   struct buffer_head *j_header_bh ;   
 
-  /* j_flush_pages must be flushed before the current transaction can
-  ** commit
-  */
-  struct reiserfs_page_list *j_flush_pages ;
   time_t j_trans_start_time ;         /* time this transaction started */
-  wait_queue_head_t j_wait ;         /* wait  journal_end to finish I/O */
-  atomic_t j_wlock ;                       /* lock for j_wait */
+  struct semaphore j_lock;
+  struct semaphore j_flush_sem;
   wait_queue_head_t j_join_wait ;    /* wait for current transaction to finish before starting new one */
   atomic_t j_jlock ;                       /* lock for j_join_wait */
-  int j_journal_list_index ;	      /* journal list number of the current trans */
   int j_list_bitmap_index ;	      /* number of next list bitmap to use */
   int j_must_wait ;		       /* no more journal begins allowed. MUST sleep on j_join_wait */
   int j_next_full_flush ;             /* next journal_end will flush all journal list */
@@ -246,19 +223,37 @@ struct reiserfs_journal {
   struct reiserfs_journal_cnode *j_cnode_free_list ;
   struct reiserfs_journal_cnode *j_cnode_free_orig ; /* orig pointer returned from vmalloc */
 
+  struct reiserfs_journal_list *j_current_jl;
   int j_free_bitmap_nodes ;
   int j_used_bitmap_nodes ;
+
+  int j_num_lists;      /* total number of active transactions */
+  int j_num_work_lists; /* number that need attention from kreiserfsd */
+
+  /* debugging to make sure things are flushed in order */
+  int j_last_flush_id;
+
+  /* debugging to make sure things are committed in order */
+  int j_last_commit_id;
+
   struct list_head j_bitmap_nodes ;
   struct list_head j_dirty_buffers ;
   spinlock_t j_dirty_buffers_lock ; /* protects j_dirty_buffers */
+
+  /* list of all active transactions */
+  struct list_head j_journal_list;
+  /* lists that haven't been touched by writeback attempts */
+  struct list_head j_working_list;
+
   struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ;	/* array of bitmaps to record the deleted blocks */
-  struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ;	    /* array of all the journal lists */
   struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ; 	    /* hash table for real buffer heads in current trans */ 
   struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all 
   										the transactions */
   struct list_head j_prealloc_list;     /* list of inodes which have preallocated blocks */
   unsigned long j_max_trans_size ;
   unsigned long j_max_batch_size ;
+
+  struct work_struct j_work;
 };
 
 #define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick.  magic string to find desc blocks in the journal */
@@ -417,7 +412,6 @@ struct reiserfs_sb_info
 #define REISERFS_LARGETAIL 0  /* large tails will be created in a session */
 #define REISERFS_SMALLTAIL 17  /* small (for files less than block size) tails will be created in a session */
 #define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
-#define REISERFS_NOLOG 4      /* -o nolog: turn journalling off */
 #define REISERFS_CONVERT 5    /* -o conv: causes conversion of old
                                  format super block to the new
                                  format. If not specified - old
@@ -473,8 +467,6 @@ struct reiserfs_sb_info
 
 void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
-int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
-int flush_old_commits(struct super_block *s, int) ;
 int reiserfs_resize(struct super_block *, unsigned long) ;
 
 #define CARRY_ON                0
@@ -484,8 +476,6 @@ int reiserfs_resize(struct super_block *, unsigned long) ;
 #define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
 #define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
 #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
-#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list)
-#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index) 
 #define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free) 
 #define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
 
-- 
cgit v1.2.3


From bb0d96728fb63cf1d2294bb1dcafd60926e49cd5 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:26:08 -0700
Subject: [PATCH] reiserfs: data=ordered support

From: Chris Mason <mason@suse.com>

reiserfs data=ordered support.
---
 fs/buffer.c                    |   1 +
 fs/reiserfs/file.c             | 167 ++++++++++++-----
 fs/reiserfs/inode.c            | 285 +++++++++++++++++++++-------
 fs/reiserfs/ioctl.c            |   7 +-
 fs/reiserfs/journal.c          | 411 +++++++++++++++++++++++++++++++++--------
 fs/reiserfs/super.c            |  52 ++++++
 include/linux/reiserfs_fs.h    |  44 ++++-
 include/linux/reiserfs_fs_sb.h |  65 ++++---
 8 files changed, 803 insertions(+), 229 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 605ce2099aa5..be9cc963a178 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1583,6 +1583,7 @@ int try_to_release_page(struct page *page, int gfp_mask)
 		return mapping->a_ops->releasepage(page, gfp_mask);
 	return try_to_free_buffers(page);
 }
+EXPORT_SYMBOL(try_to_release_page);
 
 /**
  * block_invalidatepage - invalidate part of all of a buffer-backed page
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 99321f2fcdf6..4b461667b231 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -9,6 +9,8 @@
 #include <asm/uaccess.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
 
 /*
 ** We pack the tails of files on file close, not at the time they are written.
@@ -150,6 +152,7 @@ out:
    Maps all unmapped but prepared pages from the list.
    Updates metadata with newly allocated blocknumbers as needed */
 int reiserfs_allocate_blocks_for_region(
+				struct reiserfs_transaction_handle *th,
 				struct inode *inode, /* Inode we work with */
 				loff_t pos, /* Writing position */
 				int num_pages, /* number of pages write going
@@ -167,7 +170,6 @@ int reiserfs_allocate_blocks_for_region(
     struct cpu_key key; // cpu key of item that we are going to deal with
     struct item_head *ih; // pointer to item head that we are going to deal with
     struct buffer_head *bh; // Buffer head that contains items that we are going to deal with
-    struct reiserfs_transaction_handle th; // transaction handle for transaction we are going to create.
     __u32 * item; // pointer to item we are going to deal with
     INITIALIZE_PATH(path); // path to item, that we are going to deal with.
     b_blocknr_t allocated_blocks[blocks_to_allocate]; // Pointer to a place where allocated blocknumbers would be stored. Right now statically allocated, later that will change.
@@ -194,7 +196,7 @@ int reiserfs_allocate_blocks_for_region(
     /* If we came here, it means we absolutely need to open a transaction,
        since we need to allocate some blocks */
     reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that.
-    journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough
+    journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough
     reiserfs_update_inode_transaction(inode) ;
 
     /* Look for the in-tree position of our write, need path for block allocator */
@@ -206,7 +208,7 @@ int reiserfs_allocate_blocks_for_region(
    
     /* Allocate blocks */
     /* First fill in "hint" structure for block allocator */
-    hint.th = &th; // transaction handle.
+    hint.th = th; // transaction handle.
     hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine.
     hint.inode = inode; // Inode is needed by block allocator too.
     hint.search_start = 0; // We have no hint on where to search free blocks for block allocator.
@@ -222,7 +224,7 @@ int reiserfs_allocate_blocks_for_region(
 	    /* We flush the transaction in case of no space. This way some
 	       blocks might become free */
 	    SB_JOURNAL(inode->i_sb)->j_must_wait = 1;
-	    restart_transaction(&th, inode, &path);
+	    restart_transaction(th, inode, &path);
 
 	    /* We might have scheduled, so search again */
 	    res = search_for_position_by_key(inode->i_sb, &key, &path);
@@ -296,7 +298,7 @@ int reiserfs_allocate_blocks_for_region(
 		    /* Ok, there is existing indirect item already. Need to append it */
 		    /* Calculate position past inserted item */
 		    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
-		    res = reiserfs_paste_into_item( &th, &path, &key, (char *)zeros, UNFM_P_SIZE*to_paste);
+		    res = reiserfs_paste_into_item( th, &path, &key, (char *)zeros, UNFM_P_SIZE*to_paste);
 		    if ( res ) {
 			kfree(zeros);
 			goto error_exit_free_blocks;
@@ -326,7 +328,7 @@ int reiserfs_allocate_blocks_for_region(
 		        kfree(zeros);
 			goto error_exit_free_blocks;
 		    }
-		    res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)zeros);
+		    res = reiserfs_insert_item( th, &path, &key, &ins_ih, (char *)zeros);
 		} else {
 		    reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key);
 		}
@@ -336,8 +338,8 @@ int reiserfs_allocate_blocks_for_region(
 		}
 		/* Now we want to check if transaction is too full, and if it is
 		   we restart it. This will also free the path. */
-		if (journal_transaction_should_end(&th, th.t_blocks_allocated))
-		    restart_transaction(&th, inode, &path);
+		if (journal_transaction_should_end(th, th->t_blocks_allocated))
+		    restart_transaction(th, inode, &path);
 
 		/* Well, need to recalculate path and stuff */
 		set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits));
@@ -368,7 +370,7 @@ retry:
 	       one. */
 	    /* First if we are already modifying current item, log it */
 	    if ( modifying_this_item ) {
-		journal_mark_dirty (&th, inode->i_sb, bh);
+		journal_mark_dirty (th, inode->i_sb, bh);
 		modifying_this_item = 0;
 	    }
 	    /* Then set the key to look for a new indirect item (offset of old
@@ -432,7 +434,7 @@ retry:
 
     if ( modifying_this_item ) { // We need to log last-accessed block, if it
 				 // was modified, but not logged yet.
-	journal_mark_dirty (&th, inode->i_sb, bh);
+	journal_mark_dirty (th, inode->i_sb, bh);
     }
 
     if ( curr_block < blocks_to_allocate ) {
@@ -443,7 +445,7 @@ retry:
 	    // position. We do not need to recalculate path as it should
 	    // already point to correct place.
 	    make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3);
-	    res = reiserfs_paste_into_item( &th, &path, &key, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
+	    res = reiserfs_paste_into_item( th, &path, &key, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block));
 	    if ( res ) {
 		goto error_exit_free_blocks;
 	    }
@@ -474,29 +476,18 @@ retry:
 		goto error_exit_free_blocks;
 	    }
 	    /* Insert item into the tree with the data as its body */
-	    res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)(allocated_blocks+curr_block));
+	    res = reiserfs_insert_item( th, &path, &key, &ins_ih, (char *)(allocated_blocks+curr_block));
 	} else {
 	    reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key);
 	}
     }
 
-    /* Now the final thing, if we have grew the file, we must update it's size*/
-    if ( pos + write_bytes > inode->i_size) {
-	inode->i_size = pos + write_bytes; // Set new size
-	/* If the file have grown so much that tail packing is no longer possible, reset
-	   "need to pack" flag */
-	if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) ||
-	     (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
-	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
-    }
-
-    /* Amount of on-disk blocks used by file have changed, update it */
+    // Unless we return an error, the caller is responsible for
+    // closing the transaction and for logging the inode; on error
+    // both are done at error_exit below.
+    //
     inode->i_blocks += blocks_to_allocate << (inode->i_blkbits - 9);
-    reiserfs_update_sd(&th, inode); // And update on-disk metadata
-    // finish all journal stuff now, We are not going to play with metadata
-    // anymore.
     pathrelse(&path);
-    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
     reiserfs_write_unlock(inode->i_sb);
 
     // go through all the pages/buffers and map the buffers to newly allocated
@@ -527,6 +518,7 @@ retry:
 	    if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it
 		map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block]));
 		curr_block++;
+		set_buffer_new(bh);
 	    }
 	}
     }
@@ -540,10 +532,11 @@ error_exit_free_blocks:
     pathrelse(&path);
     // free blocks
     for( i = 0; i < blocks_to_allocate; i++ )
-	reiserfs_free_block( &th, le32_to_cpu(allocated_blocks[i]));
+	reiserfs_free_block(th, le32_to_cpu(allocated_blocks[i]));
 
 error_exit:
-    journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
+    reiserfs_update_sd(th, inode); // update any changes we made to blk count
+    journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1);
     reiserfs_write_unlock(inode->i_sb);
 
     return res;
@@ -603,12 +596,63 @@ int reiserfs_copy_from_user_to_file_region(
     return page_fault?-EFAULT:0;
 }
 
+/* taken from fs/buffer.c:__block_commit_write; dirties buffers in [from, to) and always returns 0 */
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+		unsigned from, unsigned to)
+{
+    unsigned block_start, block_end;
+    int partial = 0;
+    unsigned blocksize;
+    struct buffer_head *bh, *head;
+    unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT;
+    int new;
+
+    blocksize = 1 << inode->i_blkbits;
+
+    for(bh = head = page_buffers(page), block_start = 0;
+        bh != head || !block_start;
+	block_start=block_end, bh = bh->b_this_page)
+    {
+
+	new = buffer_new(bh);
+	clear_buffer_new(bh);
+	block_end = block_start + blocksize;
+	if (block_end <= from || block_start >= to) {
+	    if (!buffer_uptodate(bh))
+		    partial = 1;
+	} else {
+	    set_buffer_uptodate(bh);
+	    if (!buffer_dirty(bh)) {
+		mark_buffer_dirty(bh);
+		/* data=ordered: queue a newly dirtied buffer when it is
+		 * BH_New or its page is at or past the i_size page.
+		 */
+		if (reiserfs_data_ordered(inode->i_sb) &&
+		    (new || page->index >= i_size_index)) {
+		    reiserfs_add_ordered_list(inode, bh);
+	        }
+	    }
+	}
+    }
+
+    /*
+     * If this is a partial write which happened to make all buffers
+     * uptodate then we can optimize away a bogus readpage() for
+     * the next read(). Here we 'discover' whether the page went
+     * uptodate as a result of this (potentially partial) write.
+     */
+    if (!partial)
+	SetPageUptodate(page);
+    return 0;
+}
 
 
 /* Submit pages for write. This was separated from actual file copying
    because we might want to allocate block numbers in-between.
    This function assumes that caller will adjust file size to correct value. */
 int reiserfs_submit_file_region_for_write(
+				struct reiserfs_transaction_handle *th,
+				struct inode *inode,
 				loff_t pos, /* Writing position offset */
 				int num_pages, /* Number of pages to write */
 				int write_bytes, /* number of bytes to write */
@@ -619,12 +663,14 @@ int reiserfs_submit_file_region_for_write(
     int retval = 0; // Return value we are going to return.
     int i; // loop counter
     int offset; // Writing offset in page.
+    int orig_write_bytes = write_bytes;
+    int sd_update = 0;
 
     for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) {
 	int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page
 	struct page *page=prepared_pages[i]; // Current page we process.
 
-	status = block_commit_write(page, offset, offset+count);
+	status = reiserfs_commit_page(inode, page, offset, offset+count);
 	if ( status )
 	    retval = status; // To not overcomplicate matters We are going to
 			     // submit all the pages even if there was error.
@@ -636,6 +682,41 @@ int reiserfs_submit_file_region_for_write(
 			  // to grab_cache_page
 	page_cache_release(page);
     }
+    /* now that we've gotten all the ordered buffers marked dirty,
+     * we can safely update i_size and close any running transaction
+     */
+    if ( pos + orig_write_bytes > inode->i_size) {
+	inode->i_size = pos + orig_write_bytes; // Set new size
+	/* If the file has grown so much that tail packing is no
+	 * longer possible, reset "need to pack" flag */
+	if ( (have_large_tails (inode->i_sb) &&
+	      inode->i_size > i_block_size (inode)*4) ||
+	     (have_small_tails (inode->i_sb) &&
+	     inode->i_size > i_block_size(inode)) )
+	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
+        else if ( (have_large_tails (inode->i_sb) &&
+	          inode->i_size < i_block_size (inode)*4) ||
+	          (have_small_tails (inode->i_sb) &&
+		  inode->i_size < i_block_size(inode)) )
+	    REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ;
+
+	if (th->t_trans_id) {
+	    reiserfs_write_lock(inode->i_sb);
+	    reiserfs_update_sd(th, inode); // And update on-disk metadata
+	    reiserfs_write_unlock(inode->i_sb);
+	} else
+	    inode->i_sb->s_op->dirty_inode(inode);
+
+        sd_update = 1;
+    }
+    if (th->t_trans_id) {
+	reiserfs_write_lock(inode->i_sb);
+	if (!sd_update)
+	    reiserfs_update_sd(th, inode);
+	journal_end(th, th->t_super, th->t_blocks_allocated);
+	reiserfs_write_unlock(inode->i_sb);
+    }
+    th->t_trans_id = 0;
     return retval;
 }
 
@@ -1003,19 +1084,18 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
     loff_t pos; // Current position in the file.
     size_t res; // return value of various functions that we call.
     struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to.
-    struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
 				/* To simplify coding at this time, we store
 				   locked pages in array for now */
-    if ( count <= PAGE_CACHE_SIZE )
-        return generic_file_write(file, buf, count, ppos);
+    struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME];
+    struct reiserfs_transaction_handle th;
+    th.t_trans_id = 0;
 
-    if ( file->f_flags & O_DIRECT) { // Direct IO needs some special threating.
+    if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment
 	int result, after_file_end = 0;
 	if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) {
 	    /* If we are appending a file, we need to put this savelink in here.
 	       If we will crash while doing direct io, finish_unfinished will
 	       cut the garbage from the file end. */
-	    struct reiserfs_transaction_handle th;
 	    reiserfs_write_lock(inode->i_sb);
 	    journal_begin(&th, inode->i_sb,  JOURNAL_PER_BALANCE_CNT );
 	    reiserfs_update_inode_transaction(inode);
@@ -1040,7 +1120,6 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
 	return result;
     }
 
-
     if ( unlikely((ssize_t) count < 0 ))
         return -EINVAL;
 
@@ -1146,11 +1225,7 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
 
 	if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/
 	    /* Fill in all the possible holes and append the file if needed */
-	    res = reiserfs_allocate_blocks_for_region(inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
-	} else if ( pos + write_bytes > inode->i_size ) {
-	    /* File might have grown even though no new blocks were added */
-	    inode->i_size = pos + write_bytes;
-	    inode->i_sb->s_op->dirty_inode(inode);
+	    res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate);
 	}
 
 	/* well, we have allocated the blocks, so it is time to free
@@ -1173,7 +1248,8 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
 	}
 
 	/* Send the pages to disk and unlock them. */
-	res = reiserfs_submit_file_region_for_write(pos, num_pages, write_bytes, prepared_pages);
+	res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
+	                                            write_bytes,prepared_pages);
 	if ( res )
 	    break;
 
@@ -1184,10 +1260,17 @@ ssize_t reiserfs_file_write( struct file *file, /* the file we are going to writ
 	balance_dirty_pages_ratelimited(inode->i_mapping);
     }
 
+    /* this is only true on error */
+    if (th.t_trans_id) {
+        reiserfs_write_lock(inode->i_sb);
+	journal_end(&th, th.t_super, th.t_blocks_allocated);
+        reiserfs_write_unlock(inode->i_sb);
+    }
     if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
 	res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA);
 
     up(&inode->i_sem);
+    reiserfs_async_progress_wait(inode->i_sb);
     return (already_written != 0)?already_written:res;
 
 out:
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 06635c7f18a9..cf88e52a2cfc 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -22,9 +22,12 @@ extern int reiserfs_default_io_size; /* default io size devuned in super.c */
 #define GET_BLOCK_NO_HOLE 2   /* return -ENOENT for file holes */
 #define GET_BLOCK_READ_DIRECT 4  /* read the tail if indirect item not found */
 #define GET_BLOCK_NO_ISEM     8 /* i_sem is not held, don't preallocate */
+#define GET_BLOCK_NO_DANGLE   16 /* don't leave any transactions running */
 
 static int reiserfs_get_block (struct inode * inode, sector_t block,
 			       struct buffer_head * bh_result, int create);
+static int reiserfs_commit_write(struct file *f, struct page *page,
+                                 unsigned from, unsigned to);
 
 void reiserfs_delete_inode (struct inode * inode)
 {
@@ -103,12 +106,6 @@ inline void make_le_item_head (struct item_head * ih, const struct cpu_key * key
     put_ih_entry_count( ih, entry_count );
 }
 
-static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) {
-    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
-
-    buffer_insert_list(&j->j_dirty_buffers_lock, bh, &j->j_dirty_buffers) ;
-}
-
 //
 // FIXME: we might cache recently accessed indirect item
 
@@ -437,7 +434,8 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
        reiserfs_get_block() */
     bh_result->b_size = (1 << inode->i_blkbits);
 
-    ret = reiserfs_get_block(inode, iblock, bh_result, create) ;
+    ret = reiserfs_get_block(inode, iblock, bh_result,
+                             create | GET_BLOCK_NO_DANGLE) ;
 
     /* don't allow direct io onto tail pages */
     if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) {
@@ -510,15 +508,14 @@ static int convert_tail_for_hole(struct inode *inode,
     ** won't trigger a get_block in this case.
     */
     fix_tail_page_for_writing(tail_page) ;
-    retval = block_prepare_write(tail_page, tail_start, tail_end, 
-                                 reiserfs_get_block) ; 
+    retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end);
     if (retval)
         goto unlock ;
 
     /* tail conversion might change the data in the page */
     flush_dcache_page(tail_page) ;
 
-    retval = generic_commit_write(NULL, tail_page, tail_start, tail_end) ;
+    retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ;
 
 unlock:
     if (tail_page != hole_page) {
@@ -557,7 +554,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
     __u32 * item;
     int done;
     int fs_gen;
-    struct reiserfs_transaction_handle th ;
+    struct reiserfs_transaction_handle *th = NULL;
     /* space reserved in transaction batch: 
         . 3 balancings in direct->indirect conversion
         . 1 block involved into reiserfs_update_sd()
@@ -565,12 +562,11 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
        can incur (much) more that 3 balancings. */
     int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1;
     int version;
-    int transaction_started = 0 ;
+    int dangle = 1;
     loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ;
 
 				/* bad.... */
     reiserfs_write_lock(inode->i_sb);
-    th.t_trans_id = 0 ;
     version = get_inode_item_key_version (inode);
 
     if (block < 0) {
@@ -594,6 +590,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	reiserfs_write_unlock(inode->i_sb);
 	return ret;
     }
+    /*
+     * if we're already in a transaction, make sure to close
+     * any new transactions we start in this func
+     */
+    if ((create & GET_BLOCK_NO_DANGLE) ||
+        reiserfs_transaction_running(inode->i_sb))
+        dangle = 0;
 
     /* If file is of such a size, that it might have a tail and tails are enabled
     ** we should mark it as possibly needing tail packing on close
@@ -606,9 +609,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
     make_cpu_key (&key, inode, new_offset,
 		  TYPE_ANY, 3/*key length*/);
     if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) {
-	journal_begin(&th, inode->i_sb, jbegin_count) ;
+start_trans:
+	th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count);
+	if (!th) {
+	    retval = -ENOMEM;
+	    goto failure;
+	}
 	reiserfs_update_inode_transaction(inode) ;
-	transaction_started = 1 ;
     }
  research:
 
@@ -628,23 +635,21 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 
     if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) {
 	/* we have to allocate block for the unformatted node */
-	if (!transaction_started) {
+	if (!th) {
 	    pathrelse(&path) ;
-	    journal_begin(&th, inode->i_sb, jbegin_count) ;
-	    reiserfs_update_inode_transaction(inode) ;
-	    transaction_started = 1 ;
-	    goto research ;
+	    goto start_trans;
 	}
 
-	repeat = _allocate_block(&th, block, inode, &allocated_block_nr, &path, create);
+	repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create);
 
 	if (repeat == NO_DISK_SPACE) {
 	    /* restart the transaction to give the journal a chance to free
 	    ** some blocks.  releases the path, so we have to go back to
 	    ** research if we succeed on the second try
 	    */
-	    restart_transaction(&th, inode, &path) ; 
-	    repeat = _allocate_block(&th, block, inode, &allocated_block_nr, NULL, create);
+	    SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1;
+	    restart_transaction(th, inode, &path) ;
+	    repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create);
 
 	    if (repeat != NO_DISK_SPACE) {
 		goto research ;
@@ -672,16 +677,18 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 		goto research;
 	    }
 	    set_buffer_new(bh_result);
+	    if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb))
+	    	reiserfs_add_ordered_list(inode, bh_result);
 	    put_block_num(item, pos_in_item, allocated_block_nr) ;
             unfm_ptr = allocated_block_nr;
-	    journal_mark_dirty (&th, inode->i_sb, bh);
+	    journal_mark_dirty (th, inode->i_sb, bh);
 	    inode->i_blocks += (inode->i_sb->s_blocksize / 512) ;
-	    reiserfs_update_sd(&th, inode) ;
+	    reiserfs_update_sd(th, inode) ;
 	}
 	set_block_dev_mapped(bh_result, unfm_ptr, inode);
 	pathrelse (&path);
-	if (transaction_started)
-	    journal_end(&th, inode->i_sb, jbegin_count) ;
+	if (!dangle && th)
+	    reiserfs_end_persistent_transaction(th);
 
 	reiserfs_write_unlock(inode->i_sb);
 	 
@@ -692,16 +699,9 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	return 0;
     }
 
-    if (!transaction_started) {
-	/* if we don't pathrelse, we could vs-3050 on the buffer if
-	** someone is waiting for it (they can't finish until the buffer
-	** is released, we can start a new transaction until they finish)
-	*/
+    if (!th) {
 	pathrelse(&path) ;
-	journal_begin(&th, inode->i_sb, jbegin_count) ;
-	reiserfs_update_inode_transaction(inode) ;
-	transaction_started = 1 ;
-	goto research;
+	goto start_trans;
     }
 
     /* desired position is not found or is in the direct item. We have
@@ -729,9 +729,9 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	    set_cpu_key_k_offset (&tmp_key, 1);
 	    PATH_LAST_POSITION(&path) ++;
 
-	    retval = reiserfs_insert_item (&th, &path, &tmp_key, &tmp_ih, (char *)&unp);
+	    retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, (char *)&unp);
 	    if (retval) {
-		reiserfs_free_block (&th, allocated_block_nr);
+		reiserfs_free_block (th, allocated_block_nr);
 		goto failure; // retval == -ENOSPC or -EIO or -EEXIST
 	    }
 	    if (unp)
@@ -755,8 +755,14 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 		   node. FIXME: this should also get into page cache */
 
 		pathrelse(&path) ;
-		journal_end(&th, inode->i_sb, jbegin_count) ;
-		transaction_started = 0 ;
+		/*
+		 * ugly, but we can only end the transaction if
+		 * we aren't nested
+		 */
+		if (th->t_refcount == 1) {
+		    reiserfs_end_persistent_transaction(th);
+		    th = NULL;
+		}
 
 		retval = convert_tail_for_hole(inode, bh_result, tail_offset) ;
 		if (retval) {
@@ -764,18 +770,19 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 			printk("clm-6004: convert tail failed inode %lu, error %d\n", inode->i_ino, retval) ;
 		    if (allocated_block_nr) {
 			/* the bitmap, the super, and the stat data == 3 */
-			journal_begin(&th, inode->i_sb, 3) ;
-			reiserfs_free_block (&th, allocated_block_nr);
-			transaction_started = 1 ;
+			if (!th)
+			    th = reiserfs_persistent_transaction(inode->i_sb,3);
+			if (th)
+			    reiserfs_free_block (th, allocated_block_nr);
 		    }
 		    goto failure ;
 		}
 		goto research ;
 	    }
-	    retval = direct2indirect (&th, inode, &path, unbh, tail_offset);
+	    retval = direct2indirect (th, inode, &path, unbh, tail_offset);
 	    if (retval) {
 		reiserfs_unmap_buffer(unbh);
-		reiserfs_free_block (&th, allocated_block_nr);
+		reiserfs_free_block (th, allocated_block_nr);
 		goto failure;
 	    }
 	    /* it is important the set_buffer_uptodate is done after
@@ -795,7 +802,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 		/* we've converted the tail, so we must
 		** flush unbh before the transaction commits
 		*/
-		add_to_flushlist(inode, unbh) ;
+		reiserfs_add_tail_list(inode, unbh) ;
 
 		/* mark it dirty now to prevent commit_write from adding
 		** this buffer to the inode's dirty buffer list
@@ -858,13 +865,13 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 		   only have space for one block */
 		blocks_needed=max_to_insert?max_to_insert:1;
 	    }
-	    retval = reiserfs_paste_into_item (&th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed);
+	    retval = reiserfs_paste_into_item (th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed);
 
 	    if (blocks_needed != 1)
 		kfree(un);
 
 	    if (retval) {
-		reiserfs_free_block (&th, allocated_block_nr);
+		reiserfs_free_block (th, allocated_block_nr);
 		goto failure;
 	    }
 	    if (done) {
@@ -889,8 +896,8 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 	** release the path so that anybody waiting on the path before
 	** ending their transaction will be able to continue.
 	*/
-	if (journal_transaction_should_end(&th, th.t_blocks_allocated)) {
-	  restart_transaction(&th, inode, &path) ; 
+	if (journal_transaction_should_end(th, th->t_blocks_allocated)) {
+	  restart_transaction(th, inode, &path) ;
 	}
 	/* inserting indirect pointers for a hole can take a 
 	** long time.  reschedule if needed
@@ -907,7 +914,7 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
 			      "%K should not be found\n", &key);
 	    retval = -EEXIST;
 	    if (allocated_block_nr)
-	        reiserfs_free_block (&th, allocated_block_nr);
+	        reiserfs_free_block (th, allocated_block_nr);
 	    pathrelse(&path) ;
 	    goto failure;
 	}
@@ -921,9 +928,9 @@ int reiserfs_get_block (struct inode * inode, sector_t block,
     retval = 0;
 
  failure:
-    if (transaction_started) {
-      reiserfs_update_sd(&th, inode) ;
-      journal_end(&th, inode->i_sb, jbegin_count) ;
+    if (th && !dangle) {
+      reiserfs_update_sd(th, inode) ;
+      reiserfs_end_persistent_transaction(th);
     }
     reiserfs_write_unlock(inode->i_sb);
     reiserfs_check_path(&path) ;
@@ -2007,7 +2014,8 @@ out:
     /* this is where we fill in holes in the file. */
     if (use_get_block) {
 	retval = reiserfs_get_block(inode, block, bh_result, 
-	                            GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM) ;
+	                            GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM |
+				    GET_BLOCK_NO_DANGLE);
 	if (!retval) {
 	    if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) {
 	        /* get_block failed to find a mapped unformatted node. */
@@ -2219,13 +2227,43 @@ static int reiserfs_writepage (struct page * page, struct writeback_control *wbc
     return reiserfs_write_full_page(page, wbc) ;
 }
 
-
 int reiserfs_prepare_write(struct file *f, struct page *page, 
 			   unsigned from, unsigned to) {
     struct inode *inode = page->mapping->host ;
+    int ret;
+    int old_ref = 0;
+
     reiserfs_wait_on_write_block(inode->i_sb) ;
     fix_tail_page_for_writing(page) ;
-    return block_prepare_write(page, from, to, reiserfs_get_block) ;
+    if (reiserfs_transaction_running(inode->i_sb)) {
+	struct reiserfs_transaction_handle *th;
+        th = (struct reiserfs_transaction_handle *)current->journal_info;
+	old_ref = th->t_refcount;
+	th->t_refcount++;
+    }
+
+    ret = block_prepare_write(page, from, to, reiserfs_get_block) ;
+    if (ret && reiserfs_transaction_running(inode->i_sb)) {
+    	struct reiserfs_transaction_handle *th = current->journal_info;
+	/* this gets a little ugly.  If reiserfs_get_block returned an
+	 * error and left a transaction running, we've got to close it,
+	 * and we've got to free the handle if it was a persistent transaction.
+	 *
+	 * But, if we had nested into an existing transaction, we need
+	 * to just drop the ref count on the handle.
+	 *
+	 * If old_ref == 0, the transaction is from reiserfs_get_block,
+	 * and it was a persistent trans.  Otherwise, it was nested above.
+	 */
+	if (th->t_refcount > old_ref) {
+	    if (old_ref)
+	    	th->t_refcount--;
+	    else
+		reiserfs_end_persistent_transaction(th);
+	}
+    }
+    return ret;
+
 }
 
 
@@ -2237,16 +2275,21 @@ static int reiserfs_commit_write(struct file *f, struct page *page,
                                  unsigned from, unsigned to) {
     struct inode *inode = page->mapping->host ;
     loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
-    int ret ; 
+    int ret = 0;
+    struct reiserfs_transaction_handle *th = NULL;
     
     reiserfs_wait_on_write_block(inode->i_sb) ;
+    if (reiserfs_transaction_running(inode->i_sb)) {
+        th = current->journal_info;
+    }
+    reiserfs_commit_page(inode, page, from, to);
  
     /* generic_commit_write does this for us, but does not update the
     ** transaction tracking stuff when the size changes.  So, we have
     ** to do the i_size updates here.
     */
     if (pos > inode->i_size) {
-	struct reiserfs_transaction_handle th ;
+	struct reiserfs_transaction_handle myth ;
 	reiserfs_write_lock(inode->i_sb);
 	/* If the file have grown beyond the border where it
 	   can have a tail, unmark it as needing a tail
@@ -2255,16 +2298,19 @@ static int reiserfs_commit_write(struct file *f, struct page *page,
 	     (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
 	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;
 
-	journal_begin(&th, inode->i_sb, 1) ;
+	journal_begin(&myth, inode->i_sb, 1) ;
 	reiserfs_update_inode_transaction(inode) ;
 	inode->i_size = pos ;
-	reiserfs_update_sd(&th, inode) ;
-	journal_end(&th, inode->i_sb, 1) ;
+	reiserfs_update_sd(&myth, inode) ;
+	journal_end(&myth, inode->i_sb, 1) ;
+	reiserfs_write_unlock(inode->i_sb);
+    }
+    if (th) {
+	reiserfs_write_lock(inode->i_sb);
+        reiserfs_end_persistent_transaction(th);
 	reiserfs_write_unlock(inode->i_sb);
     }
  
-    ret = generic_commit_write(f, page, from, to) ;
-
     /* we test for O_SYNC here so we can commit the transaction
     ** for any packed tails the file might have had
     */
@@ -2324,16 +2370,110 @@ void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
 	}
 }
 
+/* decide if this buffer needs to stay around for data logging or ordered
+** write purposes.  returns 1 if bh may be dropped (any jh is freed), else 0
+*/
+static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+{
+    int ret = 1 ;
+    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
+
+    spin_lock(&j->j_dirty_buffers_lock) ;
+    if (!buffer_mapped(bh)) {
+        goto free_jh;
+    }
+    /* the page is locked, and the only places that log a data buffer
+     * also lock the page.
+     */
+#if 0
+    if (reiserfs_file_data_log(inode)) {
+	/* very conservative, leave the buffer pinned if anyone might need it.
+	** this should be changed to drop the buffer if it is only in the
+	** current transaction
+	*/
+        if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
+	    ret = 0 ;
+	}
+    } else
+#endif
+    if (buffer_dirty(bh) || buffer_locked(bh)) {
+	struct reiserfs_journal_list *jl;
+	struct reiserfs_jh *jh = bh->b_private;
+
+	/* why is this safe?
+	 * reiserfs_setattr updates i_size in the on disk
+	 * stat data before allowing vmtruncate to be called.
+	 *
+	 * If buffer was put onto the ordered list for this
+	 * transaction, we know for sure either this transaction
+	 * or an older one already has updated i_size on disk,
+	 * and this ordered data won't be referenced in the file
+	 * if we crash.
+	 *
+	 * if the buffer was put onto the ordered list for an older
+	 * transaction, we need to leave it around
+	 */
+	if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
+	    ret = 0;
+    }
+free_jh:
+    if (ret && bh->b_private) {
+        reiserfs_free_jh(bh);
+    }
+    spin_unlock(&j->j_dirty_buffers_lock) ;
+    return ret ;
+}
+
+/* clm -- taken from fs/buffer.c:block_invalidate_page; returns 0 if any buffer had to be kept */
+static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
+{
+    struct buffer_head *head, *bh, *next;
+    struct inode *inode = page->mapping->host;
+    unsigned int curr_off = 0;
+    int ret = 1;
+
+    BUG_ON(!PageLocked(page));
+    if (!page_has_buffers(page))
+	goto out;
+
+    head = page_buffers(page);
+    bh = head;
+    do {
+	unsigned int next_off = curr_off + bh->b_size;
+	next = bh->b_this_page;
+
+	/*
+	 * is this block fully invalidated?
+	 */
+	if (offset <= curr_off) {
+	    if (invalidatepage_can_drop(inode, bh))
+		reiserfs_unmap_buffer(bh);
+	    else
+	        ret = 0;
+	}
+	curr_off = next_off;
+	bh = next;
+    } while (bh != head);
+
+    /*
+     * We release buffers only if the entire page is being invalidated
+     * and every buffer could be dropped above; a buffer that was kept
+     * (ret == 0) leaves the page's buffers in place.
+     */
+    if (!offset && ret)
+	ret = try_to_release_page(page, 0);
+out:
+    return ret;
+}
+
 /*
  * Returns 1 if the page's buffers were dropped.  The page is locked.
  *
  * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads
  * in the buffers at page_buffers(page).
  *
- * FIXME: Chris says the buffer list is not used with `mount -o notail',
- * so in that case the fs can avoid the extra locking.  Create a second
- * address_space_operations with a NULL ->releasepage and install that
- * into new address_spaces.
+ * even in -o notail mode, we can't be sure an old mount without -o notail
+ * didn't create files with tails.
  */
 static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
 {
@@ -2347,11 +2487,13 @@ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags)
     head = page_buffers(page) ;
     bh = head ;
     do {
-	if (!buffer_dirty(bh) && !buffer_locked(bh)) {
-		list_del_init(&bh->b_assoc_buffers) ;
-	} else {
+	if (bh->b_private) {
+	    if (!buffer_dirty(bh) && !buffer_locked(bh)) {
+		reiserfs_free_jh(bh);
+	    } else {
 		ret = 0 ;
 		break ;
+	    }
 	}
 	bh = bh->b_this_page ;
     } while (bh != head) ;
@@ -2379,6 +2521,7 @@ struct address_space_operations reiserfs_address_space_operations = {
     .readpage = reiserfs_readpage, 
     .readpages = reiserfs_readpages, 
     .releasepage = reiserfs_releasepage,
+    .invalidatepage = reiserfs_invalidatepage,
     .sync_page = block_sync_page,
     .prepare_write = reiserfs_prepare_write,
     .commit_write = reiserfs_commit_write,
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 9d7a018c366f..ec59e074416a 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -92,6 +92,7 @@ int reiserfs_unpack (struct inode * inode, struct file * filp)
     int retval = 0;
     int index ;
     struct page *page ;
+    struct address_space *mapping ;
     unsigned long write_from ;
     unsigned long blocksize = inode->i_sb->s_blocksize ;
     	
@@ -122,17 +123,19 @@ int reiserfs_unpack (struct inode * inode, struct file * filp)
     ** reiserfs_get_block to unpack the tail for us.
     */
     index = inode->i_size >> PAGE_CACHE_SHIFT ;
-    page = grab_cache_page(inode->i_mapping, index) ;
+    mapping = inode->i_mapping ;
+    page = grab_cache_page(mapping, index) ;
     retval = -ENOMEM;
     if (!page) {
         goto out ;
     }
-    retval = reiserfs_prepare_write(NULL, page, write_from, blocksize) ;
+    retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ;
     if (retval)
         goto out_unlock ;
 
     /* conversion can change page contents, must flush */
     flush_dcache_page(page) ;
+    retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ;
     REISERFS_I(inode)->i_flags |= i_nopack_mask;
 
 out_unlock:
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index cfff6ec0871f..17278f415916 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -54,6 +54,7 @@
 #include <linux/buffer_head.h>
 #include <linux/workqueue.h>
 #include <linux/writeback.h>
+#include <linux/blkdev.h>
 
 
 /* gets a struct reiserfs_journal_list * from a list head */
@@ -595,6 +596,248 @@ static int journal_list_still_alive(struct super_block *s,
     return 0;
 }
 
+static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
+    char b[BDEVNAME_SIZE];
+
+    if (buffer_journaled(bh)) {
+        reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n",
+	                 bh->b_blocknr, bdevname(bh->b_bdev, b)) ;
+    }
+    if (uptodate)
+    	set_buffer_uptodate(bh) ;
+    else
+    	clear_buffer_uptodate(bh) ;
+    unlock_buffer(bh) ;
+    put_bh(bh) ;
+}
+
+static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) {
+    if (uptodate)
+    	set_buffer_uptodate(bh) ;
+    else
+    	clear_buffer_uptodate(bh) ;
+    unlock_buffer(bh) ;
+    put_bh(bh) ;
+}
+
+static void submit_logged_buffer(struct buffer_head *bh) {
+    get_bh(bh) ;
+    bh->b_end_io = reiserfs_end_buffer_io_sync ;
+    mark_buffer_notjournal_new(bh) ;
+    clear_buffer_dirty(bh) ;
+    if (!test_and_clear_bit(BH_JTest, &bh->b_state))
+        BUG();
+    if (!buffer_uptodate(bh))
+        BUG();
+    submit_bh(WRITE, bh) ;
+}
+
+static void submit_ordered_buffer(struct buffer_head *bh) {
+    get_bh(bh) ;
+    bh->b_end_io = reiserfs_end_ordered_io;
+    clear_buffer_dirty(bh) ;
+    if (!buffer_uptodate(bh))
+        BUG();
+    submit_bh(WRITE, bh) ;
+}
+
+#define CHUNK_SIZE 32
+struct buffer_chunk {
+    struct buffer_head *bh[CHUNK_SIZE];
+    int nr;
+};
+
+static void write_chunk(struct buffer_chunk *chunk) {
+    int i;
+    for (i = 0; i < chunk->nr ; i++) {
+	submit_logged_buffer(chunk->bh[i]) ;
+    }
+    chunk->nr = 0;
+}
+
+static void write_ordered_chunk(struct buffer_chunk *chunk) {
+    int i;
+    for (i = 0; i < chunk->nr ; i++) {
+	submit_ordered_buffer(chunk->bh[i]) ;
+    }
+    chunk->nr = 0;
+}
+
+static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
+			 spinlock_t *lock,
+			 void (fn)(struct buffer_chunk *))
+{
+    int ret = 0;
+    if (chunk->nr >= CHUNK_SIZE)
+        BUG();
+    chunk->bh[chunk->nr++] = bh;
+    if (chunk->nr >= CHUNK_SIZE) {
+	ret = 1;
+        if (lock)
+	    spin_unlock(lock);
+        fn(chunk);
+        if (lock)
+	    spin_lock(lock);
+    }
+    return ret;
+}
+
+
+atomic_t nr_reiserfs_jh = ATOMIC_INIT(0);
+static struct reiserfs_jh *alloc_jh(void) {
+    struct reiserfs_jh *jh;
+    while(1) {
+	jh = kmalloc(sizeof(*jh), GFP_NOFS);
+	if (jh) {
+	    atomic_inc(&nr_reiserfs_jh);
+	    return jh;
+	}
+        yield();
+    }
+}
+
+/*
+ * we want to free the jh when the buffer has been written
+ * and waited on
+ */
+void reiserfs_free_jh(struct buffer_head *bh) {
+    struct reiserfs_jh *jh;
+
+    jh = bh->b_private;
+    if (jh) {
+	bh->b_private = NULL;
+	jh->bh = NULL;
+	list_del_init(&jh->list);
+	kfree(jh);
+	if (atomic_read(&nr_reiserfs_jh) <= 0)
+	    BUG();
+	atomic_dec(&nr_reiserfs_jh);
+	put_bh(bh);
+    }
+}
+
+static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh,
+                           int tail)
+{
+    struct reiserfs_jh *jh;
+
+    if (bh->b_private) {
+	spin_lock(&j->j_dirty_buffers_lock);
+	if (!bh->b_private) {
+	    spin_unlock(&j->j_dirty_buffers_lock);
+	    goto no_jh;
+	}
+        jh = bh->b_private;
+	list_del_init(&jh->list);
+    } else {
+no_jh:
+	get_bh(bh);
+	jh = alloc_jh();
+	spin_lock(&j->j_dirty_buffers_lock);
+	/* buffer must be locked for __add_jh, so we should not be able
+	 * to have two adds at the same time
+	 */
+	if (bh->b_private)
+	    BUG();
+	jh->bh = bh;
+	bh->b_private = jh;
+    }
+    jh->jl = j->j_current_jl;
+    if (tail)
+	list_add_tail(&jh->list, &jh->jl->j_tail_bh_list);
+    else {
+	list_add_tail(&jh->list, &jh->jl->j_bh_list);
+    }
+    spin_unlock(&j->j_dirty_buffers_lock);
+    return 0;
+}
+
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) {
+    return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1);
+}
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) {
+    return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0);
+}
+
+#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list)
+static int write_ordered_buffers(spinlock_t *lock,
+				 struct reiserfs_journal *j,
+                                 struct reiserfs_journal_list *jl,
+				 struct list_head *list)
+{
+    struct buffer_head *bh;
+    struct reiserfs_jh *jh;
+    int ret = 0;	/* set to -EIO if a waited-on buffer is not uptodate */
+    struct buffer_chunk chunk;
+    struct list_head tmp;	/* jh entries parked here until pass 2 waits on them */
+    INIT_LIST_HEAD(&tmp);
+    /* Submit every dirty buffer on 'list', wait for them, free each jh; returns 0 or -EIO. */
+    chunk.nr = 0;
+    spin_lock(lock);
+    while(!list_empty(list)) {	/* pass 1: submit in chunks, 'lock' held while walking */
+        jh = JH_ENTRY(list->next);
+	bh = jh->bh;
+	get_bh(bh);
+	if (test_set_buffer_locked(bh)) {
+	    if (!buffer_dirty(bh)) {	/* locked but clean: just wait on it in pass 2 */
+		list_del_init(&jh->list);
+		list_add(&jh->list, &tmp);
+		goto loop_next;
+	    }
+	    spin_unlock(lock);
+	    if (chunk.nr)
+		write_ordered_chunk(&chunk);
+	    wait_on_buffer(bh);
+	    if (need_resched())	/* must be called: the bare name is an always-true address */
+	        schedule();
+	    spin_lock(lock);
+	    goto loop_next;	/* jh is still on 'list', so it is retried */
+	}
+	if (buffer_dirty(bh)) {
+	    list_del_init(&jh->list);
+	    list_add(&jh->list, &tmp);
+	    add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
+	} else {	/* we took the buffer lock and it is clean: nothing to write */
+	    reiserfs_free_jh(bh);
+	    unlock_buffer(bh);
+	}
+loop_next:
+	put_bh(bh);
+	if (chunk.nr == 0 && need_resched()) {	/* only yield when nothing is queued */
+	    spin_unlock(lock);
+	    schedule();
+	    spin_lock(lock);
+	}
+    }
+    if (chunk.nr) {	/* flush the final partial chunk */
+	spin_unlock(lock);
+        write_ordered_chunk(&chunk);
+	spin_lock(lock);
+    }
+    while(!list_empty(&tmp)) {	/* pass 2: wait for the I/O and collect errors */
+        jh = JH_ENTRY(tmp.prev);
+	bh = jh->bh;
+	get_bh(bh);
+	reiserfs_free_jh(bh);
+	/* reiserfs_free_jh dropped the jh's reference; the get_bh above keeps bh pinned */
+	if (buffer_locked(bh)) {
+	    spin_unlock(lock);
+	    wait_on_buffer(bh);
+	    spin_lock(lock);
+	}
+	if (!buffer_uptodate(bh))
+	    ret = -EIO;
+	put_bh(bh);
+	if (need_resched()) {
+	    spin_unlock(lock);
+	    schedule();
+	    spin_lock(lock);
+	}
+    }
+    spin_unlock(lock);
+    return ret;
+}
+
 static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
     struct reiserfs_journal_list *other_jl;
     struct reiserfs_journal_list *first_jl;
@@ -656,6 +899,13 @@ find_first:
     }
     return 0;
 }
+int reiserfs_async_progress_wait(struct super_block *s) {
+    DEFINE_WAIT(wait);
+    struct reiserfs_journal *j = SB_JOURNAL(s);
+    if (atomic_read(&j->j_async_throttle))
+    	blk_congestion_wait(WRITE, HZ/10);
+    return 0;
+}
 
 /*
 ** if this journal list still has commit blocks unflushed, send them to disk.
@@ -710,28 +960,40 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
     goto put_jl;
   }
 
+  if (!list_empty(&jl->j_bh_list)) {
+      unlock_kernel();
+      write_ordered_buffers(&SB_JOURNAL(s)->j_dirty_buffers_lock,
+                            SB_JOURNAL(s), jl, &jl->j_bh_list);
+      lock_kernel();
+  }
+  if (!list_empty(&jl->j_bh_list))
+      BUG();
   /*
    * for the description block and all the log blocks, submit any buffers
    * that haven't already reached the disk
    */
+  atomic_inc(&SB_JOURNAL(s)->j_async_throttle);
   for (i = 0 ; i < (jl->j_len + 1) ; i++) {
     bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) %
          SB_ONDISK_JOURNAL_SIZE(s);
     tbh = journal_find_get_block(s, bn) ;
-    wait_on_buffer(tbh) ;
-    ll_rw_block(WRITE, 1, &tbh) ;
+    if (buffer_dirty(tbh))
+	ll_rw_block(WRITE, 1, &tbh) ;
     put_bh(tbh) ;
   }
+  atomic_dec(&SB_JOURNAL(s)->j_async_throttle);
 
   /* wait on everything written so far before writing the commit */
   for (i = 0 ;  i < (jl->j_len + 1) ; i++) {
     bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
 	 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
     tbh = journal_find_get_block(s, bn) ;
-
     wait_on_buffer(tbh) ;
+    // since we're using ll_rw_block above, it might have skipped over
+    // a locked buffer.  Double check here
+    //
     if (buffer_dirty(tbh))
-      BUG();
+      sync_dirty_buffer(tbh);
     if (!buffer_uptodate(tbh)) {
       reiserfs_panic(s, "journal-601, buffer write failed\n") ;
     }
@@ -892,33 +1154,6 @@ restart:
     return 0 ;
 }
 
-static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) {
-    char b[BDEVNAME_SIZE];
-
-    if (buffer_journaled(bh)) {
-        reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n",
-	                 bh->b_blocknr, bdevname(bh->b_bdev, b)) ;
-    }
-    if (uptodate)
-    	set_buffer_uptodate(bh) ;
-    else
-    	clear_buffer_uptodate(bh) ;
-    unlock_buffer(bh) ;
-    put_bh(bh) ;
-}
-
-static void submit_logged_buffer(struct buffer_head *bh) {
-    get_bh(bh) ;
-    bh->b_end_io = reiserfs_end_buffer_io_sync ;
-    mark_buffer_notjournal_new(bh) ;
-    clear_buffer_dirty(bh) ;
-    if (!test_and_clear_bit(BH_JTest, &bh->b_state))
-        BUG();
-    if (!buffer_uptodate(bh))
-        BUG();
-    submit_bh(WRITE, bh) ;
-}
-
 static void del_from_work_list(struct super_block *s,
                                struct reiserfs_journal_list *jl) {
     if (!list_empty(&jl->j_working_list)) {
@@ -1158,28 +1393,6 @@ flush_older_and_return:
   return 0 ;
 } 
 
-#define CHUNK_SIZE 32
-struct buffer_chunk {
-    struct buffer_head *bh[CHUNK_SIZE];
-    int nr;
-};
-
-static void write_chunk(struct buffer_chunk *chunk) {
-    int i;
-    for (i = 0; i < chunk->nr ; i++) {
-	submit_logged_buffer(chunk->bh[i]) ;
-    }
-    chunk->nr = 0;
-}
-
-static void add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh) {
-    if (chunk->nr >= CHUNK_SIZE)
-        BUG();
-    chunk->bh[chunk->nr++] = bh;
-    if (chunk->nr >= CHUNK_SIZE)
-        write_chunk(chunk);
-}
-
 static int write_one_transaction(struct super_block *s,
                                  struct reiserfs_journal_list *jl,
 				 struct buffer_chunk *chunk)
@@ -1214,7 +1427,7 @@ static int write_one_transaction(struct super_block *s,
 		if (!buffer_journal_dirty(tmp_bh) ||
 		    reiserfs_buffer_prepared(tmp_bh))
 		    BUG();
-		add_to_chunk(chunk, tmp_bh);
+		add_to_chunk(chunk, tmp_bh, NULL, write_chunk);
 		ret++;
 	    } else {
 		/* note, cn->bh might be null now */
@@ -1937,6 +2150,8 @@ retry:
     memset(jl, 0, sizeof(*jl));
     INIT_LIST_HEAD(&jl->j_list);
     INIT_LIST_HEAD(&jl->j_working_list);
+    INIT_LIST_HEAD(&jl->j_tail_bh_list);
+    INIT_LIST_HEAD(&jl->j_bh_list);
     sema_init(&jl->j_commit_lock, 1);
     SB_JOURNAL(s)->j_num_lists++;
     get_journal_list(jl);
@@ -2166,6 +2381,7 @@ int journal_init(struct super_block *p_s_sb, const char * j_dev_name, int old_fo
   SB_JOURNAL(p_s_sb)->j_len = 0 ;
   SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ;
   atomic_set(&(SB_JOURNAL(p_s_sb)->j_wcount), 0) ;
+  atomic_set(&(SB_JOURNAL(p_s_sb)->j_async_throttle), 0) ;
   SB_JOURNAL(p_s_sb)->j_bcount = 0 ;	  
   SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ;	  
   SB_JOURNAL(p_s_sb)->j_last = NULL ;	  
@@ -2376,6 +2592,43 @@ relock:
   return 0 ;
 }
 
+struct reiserfs_transaction_handle *
+reiserfs_persistent_transaction(struct super_block *s, int nblocks) {
+    int ret ;
+    struct reiserfs_transaction_handle *th ;
+
+    /* if we're nesting into an existing transaction.  It will be
+    ** persistent on its own
+    */
+    if (reiserfs_transaction_running(s)) {
+        th = current->journal_info ;
+	th->t_refcount++ ;
+	if (th->t_refcount < 2) {
+	    BUG() ;
+	}
+	return th ;
+    }
+    th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
+    if (!th)
+       return NULL;
+    ret = journal_begin(th, s, nblocks) ;
+    if (ret) {
+	reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+        return NULL;
+    }
+    return th ;
+}
+
+int
+reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) {
+    struct super_block *s = th->t_super;
+    int ret;
+    ret = journal_end(th, th->t_super, th->t_blocks_allocated);
+    if (th->t_refcount == 0)
+	reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+    return ret;
+}
+
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
   struct reiserfs_transaction_handle *cur_th = current->journal_info;
 
@@ -2522,7 +2775,9 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th, struct super_bloc
 int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
   if (!current->journal_info && th->t_refcount > 1)
     printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount);
-  if (th->t_refcount > 1) {
+
+  th->t_refcount--;
+  if (th->t_refcount > 0) {
     struct reiserfs_transaction_handle *cur_th = current->journal_info ;
 
     /* we aren't allowed to close a nested transaction on a different
@@ -2531,7 +2786,6 @@ int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_
     if (cur_th->t_super != th->t_super)
       BUG() ;
 
-    th->t_refcount--;
     if (th != cur_th) {
       memcpy(current->journal_info, th, sizeof(*th));
       th->t_trans_id = 0;
@@ -2648,14 +2902,7 @@ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block
 }
 
 /*
-** used to get memory back from async commits that are floating around
-** and to reclaim any blocks deleted but unusable because their commits
-** haven't hit disk yet.  called from bitmap.c
-**
-** if it starts flushing things, it ors SCHEDULE_OCCURRED into repeat.
-** note, this is just if schedule has a chance of occurring.  I need to 
-** change flush_commit_lists to have a repeat parameter too.
-**
+** writeback the pending async commits to disk
 */
 static void flush_async_commits(void *p) {
   struct super_block *p_s_sb = p;
@@ -2670,6 +2917,9 @@ static void flush_async_commits(void *p) {
       flush_commit_list(p_s_sb, jl, 1);
   }
   unlock_kernel();
+  atomic_inc(&SB_JOURNAL(p_s_sb)->j_async_throttle);
+  filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping);
+  atomic_dec(&SB_JOURNAL(p_s_sb)->j_async_throttle);
 }
 
 /*
@@ -3072,6 +3322,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   if (!check_journal_end(th, p_s_sb, nblocks, flags)) {
     p_s_sb->s_dirt = 1;
     wake_queued_writers(p_s_sb);
+    reiserfs_async_progress_wait(p_s_sb);
     goto out ;
   }
 
@@ -3248,23 +3499,38 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
   SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ;
   init_journal_hash(p_s_sb) ; 
 
+  // make sure __add_jh sees the new current_jl before we
+  // write out the tails
+  smp_mb();
+
   /* tail conversion targets have to hit the disk before we end the
    * transaction.  Otherwise a later transaction might repack the tail
    * before this transaction commits, leaving the data block unflushed and
    * clean, if we crash before the later transaction commits, the data block
    * is lost.
    */
-  fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock),
-		     &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ;
+  if (!list_empty(&jl->j_tail_bh_list)) {
+      unlock_kernel();
+      write_ordered_buffers(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock,
+			    SB_JOURNAL(p_s_sb), jl, &jl->j_tail_bh_list);
+      lock_kernel();
+  }
+  if (!list_empty(&jl->j_tail_bh_list))
+      BUG();
   up(&jl->j_commit_lock);
 
   /* honor the flush wishes from the caller, simple commits can
   ** be done outside the journal lock, they are done below
+  **
+  ** if we don't flush the commit list right now, we put it into
+  ** the work queue so the people waiting on the async progress work
+  ** queue don't wait for this proc to flush journal lists and such.
   */
   if (flush) {
     flush_commit_list(p_s_sb, jl, 1) ;
     flush_journal_list(p_s_sb, jl, 1) ;
-  }
+  } else
+    queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work);
 
 
   /* if the next transaction has any chance of wrapping, flush 
@@ -3322,15 +3588,12 @@ first_jl:
   clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state);
   wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ;
 
-  if (!flush) {
-      if (wait_on_commit) {
-	  if (journal_list_still_alive(p_s_sb, commit_trans_id))
-	      flush_commit_list(p_s_sb, jl, 1) ;
-      } else {
-          queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work);
-      }
+  if (!flush && wait_on_commit &&
+      journal_list_still_alive(p_s_sb, commit_trans_id)) {
+	  flush_commit_list(p_s_sb, jl, 1) ;
   }
 out:
   reiserfs_check_lock_depth("journal end2");
+  th->t_trans_id = 0;
   return 0 ;
 }
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f75349fe4787..57991831eeef 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -510,6 +510,14 @@ typedef struct {
 		    applied BEFORE setmask */
 } opt_desc_t;
 
+/* possible values for -o data= */
+static const arg_desc_t logging_mode[] = {
+    {"ordered", 1<<REISERFS_DATA_ORDERED, (1<<REISERFS_DATA_LOG|1<<REISERFS_DATA_WRITEBACK)},
+    {"journal", 1<<REISERFS_DATA_LOG, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_WRITEBACK)},
+    {"writeback", 1<<REISERFS_DATA_WRITEBACK, (1<<REISERFS_DATA_ORDERED|1<<REISERFS_DATA_LOG)},
+    {NULL, 0}
+};
+
 /* possible values for "-o block-allocator=" and bits which are to be set in
    s_mount_opt of reiserfs specific part of in-core super block */
 static const arg_desc_t balloc[] = {
@@ -664,6 +672,7 @@ static int reiserfs_parse_options (struct super_block * s, char * options, /* st
 	{"nolog", 0, 0, 0, 0}, /* This is unsupported */
 	{"replayonly", 0, 0, 1<<REPLAYONLY, 0},
 	{"block-allocator", 'a', balloc, 0, 0},
+	{"data", 'd', logging_mode, 0, 0},
 	{"resize", 'r', 0, 0, 0},
 	{"jdev", 'j', 0, 0, 0},
 	{"nolargeio", 'w', 0, 0, 0},
@@ -737,6 +746,33 @@ static int reiserfs_parse_options (struct super_block * s, char * options, /* st
     return 1;
 }
 
+static void switch_data_mode(struct super_block *s, unsigned long mode) {
+    REISERFS_SB(s)->s_mount_opt &= ~((1 << REISERFS_DATA_LOG) |
+                                       (1 << REISERFS_DATA_ORDERED) |
+				       (1 << REISERFS_DATA_WRITEBACK));
+    REISERFS_SB(s)->s_mount_opt |= (1 << mode);
+}
+
+static void handle_data_mode(struct super_block *s, unsigned long mount_options)
+{
+    if (mount_options & (1 << REISERFS_DATA_LOG)) {
+        if (!reiserfs_data_log(s)) {
+	    switch_data_mode(s, REISERFS_DATA_LOG);
+	    printk("reiserfs: switching to journaled data mode\n");
+	}
+    } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) {
+        if (!reiserfs_data_ordered(s)) {
+	    switch_data_mode(s, REISERFS_DATA_ORDERED);
+	    printk("reiserfs: switching to ordered data mode\n");
+	}
+    } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) {
+        if (!reiserfs_data_writeback(s)) {
+	    switch_data_mode(s, REISERFS_DATA_WRITEBACK);
+	    printk("reiserfs: switching to writeback data mode\n");
+	}
+    }
+}
+
 static void handle_attrs( struct super_block *s )
 {
 	struct reiserfs_super_block * rs;
@@ -814,6 +850,7 @@ static int reiserfs_remount (struct super_block * s, int * mount_flags, char * a
     if (!(s->s_flags & MS_RDONLY))
 	return 0; /* We are read-write already */
 
+    handle_data_mode(s, mount_options);
     REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ;
     s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */
     journal_begin(&th, s, 10) ;
@@ -1306,6 +1343,21 @@ static int reiserfs_fill_super (struct super_block * s, void * data, int silent)
     SPRINTK(silent, "reiserfs:warning: - it is slow mode for debugging.\n");
 #endif
 
+    /* make data=ordered the default */
+    if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
+        !reiserfs_data_writeback(s))
+    {
+         REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
+    }
+
+    if (reiserfs_data_log(s)) {
+        printk("reiserfs: using journaled data mode\n");
+    } else if (reiserfs_data_ordered(s)) {
+        printk("reiserfs: using ordered data mode\n");
+    } else {
+        printk("reiserfs: using writeback data mode\n");
+    }
+
     // set_device_ro(s->s_dev, 1) ;
     if( journal_init(s, jdev_name, old_format, commit_max_age) ) {
 	SPRINTK(silent, "sh-2022: reiserfs_fill_super: unable to initialize journal space\n") ;
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index fb0bf2af7fd7..31e8047f0f41 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1707,6 +1707,15 @@ struct reiserfs_journal_header {
 #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 
+enum reiserfs_bh_state_bits {
+    BH_JDirty = BH_PrivateStart,
+    BH_JDirty_wait,
+    BH_JNew,
+    BH_JPrepared,
+    BH_JRestore_dirty,
+    BH_JTest, // debugging only will go away
+};
+
 /*
 ** transaction handle which is passed around for all journal calls
 */
@@ -1726,7 +1735,36 @@ struct reiserfs_transaction_handle {
 				   should be displaced from others */
 } ;
 
+/* used to keep track of ordered and tail writes, attached to the buffer
+ * head through b_private.
+ */
+struct reiserfs_jh {
+    struct reiserfs_journal_list *jl;
+    struct buffer_head *bh;
+    struct list_head list;
+};
+
+void reiserfs_free_jh(struct buffer_head *bh);
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
 int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
+
+static inline int reiserfs_transaction_running(struct super_block *s) {
+    struct reiserfs_transaction_handle *th = current->journal_info ;
+    if (th && th->t_super == s)
+        return 1 ;
+    if (th && th->t_super == NULL)
+        BUG();
+    return 0 ;
+}
+
+int reiserfs_async_progress_wait(struct super_block *s);
+
+struct reiserfs_transaction_handle *
+reiserfs_persistent_transaction(struct super_block *, int count);
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+		unsigned from, unsigned to);
 int reiserfs_flush_old_commits(struct super_block *);
 void reiserfs_commit_for_inode(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
@@ -1741,7 +1779,6 @@ int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
 int journal_release_error(struct reiserfs_transaction_handle*, struct super_block *) ;
 int journal_end(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
 int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
-int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
 int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr) ;
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
@@ -1749,11 +1786,6 @@ int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_
 
 int buffer_journaled(const struct buffer_head *bh) ;
 int mark_buffer_journal_new(struct buffer_head *bh) ;
-int reiserfs_add_page_to_flush_list(struct reiserfs_transaction_handle *,
-                                    struct inode *, struct buffer_head *) ;
-int reiserfs_remove_page_from_flush_list(struct reiserfs_transaction_handle *,
-                                         struct inode *) ;
-
 int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, int) ;
 
 				/* why is this kerplunked right here? */
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index e1fe3ebe33c0..3248dcf369f2 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -107,21 +107,6 @@ typedef enum {
 #define JOURNAL_HASH_SIZE 8192   
 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating.  Must be >= 2 */
 
-/* these are bh_state bit flag offset numbers, for use in the buffer head */
-
-#define BH_JDirty       16      /* journal data needs to be written before buffer can be marked dirty */
-#define BH_JDirty_wait 18	/* commit is done, buffer marked dirty */
-#define BH_JNew 19		/* buffer allocated during this transaction, no need to write if freed during this trans too */
-
-/* ugly.  metadata blocks must be prepared before they can be logged.  
-** prepared means unlocked and cleaned.  If the block is prepared, but not
-** logged for some reason, any bits cleared while preparing it must be 
-** set again.
-*/
-#define BH_JPrepared 20		/* block has been prepared for the log */
-#define BH_JRestore_dirty 22    /* restore the dirty bit later */
-#define BH_JTest 23             /* debugging use only */
-
 /* One of these for every block in every transaction
 ** Each one is in two hash tables.  First, a hash of the current transaction, and after journal_end, a
 ** hash of all the in memory transactions.
@@ -178,6 +163,11 @@ struct reiserfs_journal_list {
 
   /* time ordered list of all transactions we haven't tried to flush yet */
   struct list_head j_working_list;
+
+  /* list of tail conversion targets in need of flush before commit */
+  struct list_head j_tail_bh_list;
+  /* list of data=ordered buffers in need of flush before commit */
+  struct list_head j_bh_list;
   int j_refcount;
 } ;
 
@@ -253,7 +243,9 @@ struct reiserfs_journal {
   unsigned long j_max_trans_size ;
   unsigned long j_max_batch_size ;
 
+  /* when flushing ordered buffers, throttle new ordered writers */
   struct work_struct j_work;
+  atomic_t j_async_throttle;
 };
 
 #define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick.  magic string to find desc blocks in the journal */
@@ -408,11 +400,12 @@ struct reiserfs_sb_info
 #define REISERFS_3_5 0
 #define REISERFS_3_6 1
 
+enum reiserfs_mount_options {
 /* Mount options */
-#define REISERFS_LARGETAIL 0  /* large tails will be created in a session */
-#define REISERFS_SMALLTAIL 17  /* small (for files less than block size) tails will be created in a session */
-#define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
-#define REISERFS_CONVERT 5    /* -o conv: causes conversion of old
+    REISERFS_LARGETAIL,  /* large tails will be created in a session */
+    REISERFS_SMALLTAIL,  /* small (for files less than block size) tails will be created in a session */
+    REPLAYONLY, /* replay journal and return 0. Used by fsck */
+    REISERFS_CONVERT,    /* -o conv: causes conversion of old
                                  format super block to the new
                                  format. If not specified - old
                                  partition will be dealt with in a
@@ -426,26 +419,29 @@ struct reiserfs_sb_info
 ** the existing hash on the FS, so if you have a tea hash disk, and mount
 ** with -o hash=rupasov, the mount will fail.
 */
-#define FORCE_TEA_HASH 6      /* try to force tea hash on mount */
-#define FORCE_RUPASOV_HASH 7  /* try to force rupasov hash on mount */
-#define FORCE_R5_HASH 8       /* try to force rupasov hash on mount */
-#define FORCE_HASH_DETECT 9   /* try to detect hash function on mount */
+    FORCE_TEA_HASH,      /* try to force tea hash on mount */
+    FORCE_RUPASOV_HASH,  /* try to force rupasov hash on mount */
+    FORCE_R5_HASH,       /* try to force r5 hash on mount */
+    FORCE_HASH_DETECT,   /* try to detect hash function on mount */
 
+    REISERFS_DATA_LOG,
+    REISERFS_DATA_ORDERED,
+    REISERFS_DATA_WRITEBACK,
 
 /* used for testing experimental features, makes benchmarking new
    features with and without more convenient, should never be used by
    users in any code shipped to users (ideally) */
 
-#define REISERFS_NO_BORDER 11
-#define REISERFS_NO_UNHASHED_RELOCATION 12
-#define REISERFS_HASHED_RELOCATION 13
-
-#define REISERFS_ATTRS 15
+    REISERFS_NO_BORDER,
+    REISERFS_NO_UNHASHED_RELOCATION,
+    REISERFS_HASHED_RELOCATION,
+    REISERFS_ATTRS,
 
-#define REISERFS_TEST1 11
-#define REISERFS_TEST2 12
-#define REISERFS_TEST3 13
-#define REISERFS_TEST4 14 
+    REISERFS_TEST1,
+    REISERFS_TEST2,
+    REISERFS_TEST3,
+    REISERFS_TEST4,
+};
 
 #define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
 #define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
@@ -459,11 +455,12 @@ struct reiserfs_sb_info
 #define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
 #define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
 #define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
-#define reiserfs_dont_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NOLOG))
 #define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
 #define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
 #define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
-
+#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
+#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
+#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
 
 void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
-- 
cgit v1.2.3


From b566678f923387aa0cf3ec6d56ff368c3053ea0f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:28:19 -0700
Subject: [PATCH] reiserfs_kfree warning fix

fs/reiserfs/journal.c: In function `reiserfs_end_persistent_transaction':
fs/reiserfs/journal.c:2616: warning: unused variable `s'

Make the functions static inline so that typechecking is enabled if
!CONFIG_REISERFS_CHECK.
---
 include/linux/reiserfs_fs.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 31e8047f0f41..dfb46b513712 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -2028,8 +2028,17 @@ extern struct address_space_operations reiserfs_address_space_operations ;
 void * reiserfs_kmalloc (size_t size, int flags, struct super_block * s);
 void reiserfs_kfree (const void * vp, size_t size, struct super_block * s);
 #else
-#define reiserfs_kmalloc(x, y, z) kmalloc(x, y)
-#define reiserfs_kfree(x, y, z) kfree(x)
+static inline void *reiserfs_kmalloc(size_t size, int flags,
+					struct super_block *s)
+{
+	return kmalloc(size, flags);
+}
+
+static inline void reiserfs_kfree(const void *vp, size_t size,
+					struct super_block *s)
+{
+	kfree(vp);
+}
 #endif
 
 int fix_nodes (int n_op_mode, struct tree_balance * p_s_tb, 
-- 
cgit v1.2.3


From f85a96f63f300878dcc785cf2333cab15eef48f0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:29:12 -0700
Subject: [PATCH] Light-weight Auditing Framework

From: Rik Faith <faith@redhat.com>

This patch provides a low-overhead system-call auditing framework for Linux
that is usable by LSM components (e.g., SELinux).  This is an update of the
patch discussed in this thread:

    http://marc.theaimsgroup.com/?t=107815888100001&r=1&w=2

In brief, it provides for netlink-based logging of audit records that have
been generated in other parts of the kernel (e.g., SELinux) as well as the
ability to audit system calls, either independently (using simple
filtering) or as a complement to the audit record that another part of the
kernel generated.

The main goals were to provide system call auditing with 1) as low overhead
as possible, and 2) without duplicating functionality that is already
provided by SELinux (and/or other security infrastructures).  This
framework will work "stand-alone", but is not designed to provide, e.g.,
CAPP functionality without another security component in place.

This updated patch includes changes from feedback I have received,
including the ability to compile without CONFIG_NET (and better use of
tabs, so use -w if you diff against the older patch).

Please see http://people.redhat.com/faith/audit/ for an early example
user-space client (auditd-0.4.tar.gz) and instructions on how to try it.

My future intentions at the kernel level include improving filtering (e.g.,
syscall personality/exit codes) and syscall support for more architectures.
First, though, I'm going to work on documentation, a (real) audit daemon,
and patches for other user-space tools so that people can play with the
framework and understand how it can be used with and without SELinux.


Update:

Light-weight Auditing Framework receive filter fixes
From: Rik Faith <faith@redhat.com>

Since audit_receive_filter() is only called with audit_netlink_sem held, it
cannot race with either audit_del_rule() or audit_add_rule(), so the
list_for_each_entry_rcu()s may be replaced by list_for_each_entry()s, and
the rcu_read_{un,}lock()s removed.  A fix for this is part of the attached
patch.

Other features of the attached patch are:

1) generalized the ability to test for inequality

2) added syscall exit status reporting and testing

3) added ability to report and test first 4 syscall arguments (this adds
   a large amount of flexibility for little cost; not implemented or tested
   on ppc64)

4) added ability to report and test personality

User-space demo program enhanced for new fields and inequality testing:
http://people.redhat.com/faith/audit/auditd-0.5.tar.gz
---
 arch/i386/kernel/entry.S         |   6 +-
 arch/i386/kernel/ptrace.c        |  10 +
 arch/ppc64/kernel/entry.S        |  15 +-
 arch/ppc64/kernel/ptrace.c       |  29 +-
 arch/x86_64/ia32/ia32entry.S     |  18 +-
 arch/x86_64/kernel/entry.S       |  21 +-
 arch/x86_64/kernel/ptrace.c      |  30 +-
 fs/namei.c                       |  15 +-
 include/asm-i386/thread_info.h   |   6 +-
 include/asm-ppc64/thread_info.h  |   3 +
 include/asm-x86_64/thread_info.h |   5 +-
 include/linux/audit.h            | 211 +++++++++
 include/linux/fs.h               |  14 +-
 include/linux/netlink.h          |   1 +
 include/linux/sched.h            |   3 +
 init/Kconfig                     |  20 +
 kernel/Makefile                  |   2 +
 kernel/audit.c                   | 825 +++++++++++++++++++++++++++++++++++
 kernel/auditsc.c                 | 922 +++++++++++++++++++++++++++++++++++++++
 kernel/fork.c                    |  10 +-
 security/selinux/avc.c           | 168 ++++---
 security/selinux/include/avc.h   |   7 +-
 security/selinux/ss/services.c   |   2 +-
 23 files changed, 2199 insertions(+), 144 deletions(-)
 create mode 100644 include/linux/audit.h
 create mode 100644 kernel/audit.c
 create mode 100644 kernel/auditsc.c

(limited to 'include')

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 14e64d3ea25c..afa02ea3592c 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -264,7 +264,7 @@ sysenter_past_esp:
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 
-	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 	call *sys_call_table(,%eax,4)
 	movl %eax,EAX(%esp)
@@ -287,7 +287,7 @@ ENTRY(system_call)
 	cmpl $(nr_syscalls), %eax
 	jae syscall_badsys
 					# system call tracing in operation
-	testb $_TIF_SYSCALL_TRACE,TI_FLAGS(%ebp)
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_FLAGS(%ebp)
 	jnz syscall_trace_entry
 syscall_call:
 	call *sys_call_table(,%eax,4)
@@ -354,7 +354,7 @@ syscall_trace_entry:
 	# perform syscall exit tracing
 	ALIGN
 syscall_exit_work:
-	testb $_TIF_SYSCALL_TRACE, %cl
+	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT), %cl
 	jz work_pending
 	sti				# could let do_syscall_trace() call
 					# schedule() instead
diff --git a/arch/i386/kernel/ptrace.c b/arch/i386/kernel/ptrace.c
index a77061138b0c..9f9b32a3f228 100644
--- a/arch/i386/kernel/ptrace.c
+++ b/arch/i386/kernel/ptrace.c
@@ -14,6 +14,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -524,6 +525,15 @@ out:
 __attribute__((regparm(3)))
 void do_syscall_trace(struct pt_regs *regs, int entryexit)
 {
+	if (unlikely(current->audit_context)) {
+		if (!entryexit)
+			audit_syscall_entry(current, regs->orig_eax,
+					    regs->ebx, regs->ecx,
+					    regs->edx, regs->esi);
+		else
+			audit_syscall_exit(current, regs->eax);
+	}
+
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return;
 	if (!(current->ptrace & PT_PTRACED))
diff --git a/arch/ppc64/kernel/entry.S b/arch/ppc64/kernel/entry.S
index 027967ba3ae4..4ad95bcc5b3e 100644
--- a/arch/ppc64/kernel/entry.S
+++ b/arch/ppc64/kernel/entry.S
@@ -95,7 +95,7 @@ _GLOBAL(DoSyscall)
 #endif /* SHOW_SYSCALLS */
 	clrrdi	r10,r1,THREAD_SHIFT
 	ld	r10,TI_FLAGS(r10)
-	andi.	r11,r10,_TIF_SYSCALL_TRACE
+	andi.	r11,r10,_TIF_SYSCALL_T_OR_A
 	bne-	50f
 	cmpli	0,r0,NR_syscalls
 	bge-	66f
@@ -151,7 +151,8 @@ _GLOBAL(ret_from_syscall_1)
 	b	22b
         
 /* Traced system call support */
-50:	bl	.do_syscall_trace
+50:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	.do_syscall_trace_enter
 	ld	r0,GPR0(r1)	/* Restore original registers */
 	ld	r3,GPR3(r1)
 	ld	r4,GPR4(r1)
@@ -201,7 +202,7 @@ _GLOBAL(ret_from_syscall_2)
 	oris	r10,r10,0x1000
 	std	r10,_CCR(r1)
 60:	std	r3,GPR3(r1)	/* Update return value */
-	bl	.do_syscall_trace
+	bl	.do_syscall_trace_leave
 	b	.ret_from_except
 66:	li	r3,ENOSYS
 	b	57b
@@ -234,14 +235,14 @@ _GLOBAL(ppc64_rt_sigreturn)
 
 80:	clrrdi	r4,r1,THREAD_SHIFT
 	ld	r4,TI_FLAGS(r4)
-	andi.	r4,r4,_TIF_SYSCALL_TRACE
+	andi.	r4,r4,_TIF_SYSCALL_T_OR_A
 	bne-	81f
 	cmpi	0,r3,0
 	bge	.ret_from_except
 	b	.ret_from_syscall_1
 81:	cmpi	0,r3,0
 	blt	.ret_from_syscall_2
-	bl	.do_syscall_trace
+	bl	.do_syscall_trace_leave
 	b	.ret_from_except
 
 /*
@@ -352,9 +353,9 @@ _GLOBAL(ret_from_fork)
 	bl	.schedule_tail
 	clrrdi	r4,r1,THREAD_SHIFT
 	ld	r4,TI_FLAGS(r4)
-	andi.	r4,r4,_TIF_SYSCALL_TRACE
+	andi.	r4,r4,_TIF_SYSCALL_T_OR_A
 	beq+	.ret_from_except
-	bl	.do_syscall_trace
+	bl	.do_syscall_trace_leave
 	b	.ret_from_except
 
 _GLOBAL(ret_from_except)
diff --git a/arch/ppc64/kernel/ptrace.c b/arch/ppc64/kernel/ptrace.c
index 6bf102811810..6afe71a7d56c 100644
--- a/arch/ppc64/kernel/ptrace.c
+++ b/arch/ppc64/kernel/ptrace.c
@@ -26,6 +26,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -286,12 +287,8 @@ out:
 	return ret;
 }
 
-void do_syscall_trace(void)
+static void do_syscall_trace(void)
 {
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return;
-	if (!(current->ptrace & PT_PTRACED))
-		return;
 	/* the 0x80 provides a way for the tracing parent to distinguish
 	   between a syscall stop and SIGTRAP delivery */
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
@@ -307,3 +304,25 @@ void do_syscall_trace(void)
 		current->exit_code = 0;
 	}
 }
+
+void do_syscall_trace_enter(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(current, regs->gpr[0],
+				    regs->gpr[3], regs->gpr[4],
+				    regs->gpr[5], regs->gpr[6]);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+}
+
+void do_syscall_trace_leave(void)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, 0);	/* FIXME: pass pt_regs */
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		do_syscall_trace();
+}
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index aea26e4a8405..4e7ab108e8ac 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -78,8 +78,8 @@ ENTRY(ia32_sysenter_target)
  	.quad 1b,ia32_badarg
  	.previous	
 	GET_THREAD_INFO(%r10)
-	bt  $TIF_SYSCALL_TRACE,threadinfo_flags(%r10)
-	jc  sysenter_tracesys
+	testl  $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	jnz  sysenter_tracesys
 sysenter_do_call:	
 	cmpl	$(IA32_NR_syscalls),%eax
 	jae	ia32_badsys
@@ -106,7 +106,7 @@ sysenter_tracesys:
 	CLEAR_RREGS
 	movq	$-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
-	call	syscall_trace
+	call	syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	movl	%ebp, %ebp
@@ -163,8 +163,8 @@ ENTRY(ia32_cstar_target)
 	.quad 1b,ia32_badarg
 	.previous	
 	GET_THREAD_INFO(%r10)
-	bt  $TIF_SYSCALL_TRACE,threadinfo_flags(%r10)
-	jc  cstar_tracesys
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	jnz   cstar_tracesys
 cstar_do_call:	
 	cmpl $IA32_NR_syscalls,%eax
 	jae  ia32_badsys
@@ -187,7 +187,7 @@ cstar_tracesys:
 	CLEAR_RREGS
 	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace
+	call syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	movl RSP-ARGOFFSET(%rsp), %r8d
@@ -236,8 +236,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	SAVE_ARGS 0,0,1
 	GET_THREAD_INFO(%r10)
-	bt $TIF_SYSCALL_TRACE,threadinfo_flags(%r10)
-	jc ia32_tracesys
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%r10)
+	jnz ia32_tracesys
 ia32_do_syscall:	
 	cmpl $(IA32_NR_syscalls),%eax
 	jae  ia32_badsys
@@ -251,7 +251,7 @@ ia32_tracesys:
 	SAVE_REST
 	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
-	call syscall_trace
+	call syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	jmp ia32_do_syscall
diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index a6309212038d..89f74f738a2a 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -131,8 +131,8 @@ ENTRY(ret_from_fork)
 	CFI_DEFAULT_STACK
 	call schedule_tail
 	GET_THREAD_INFO(%rcx)
-	bt $TIF_SYSCALL_TRACE,threadinfo_flags(%rcx)
-	jc rff_trace
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	jnz rff_trace
 rff_action:	
 	RESTORE_REST
 	testl $3,CS-ARGOFFSET(%rsp)	# from kernel_thread?
@@ -143,7 +143,7 @@ rff_action:
 	jmp ret_from_sys_call
 rff_trace:
 	movq %rsp,%rdi
-	call syscall_trace
+	call syscall_trace_leave
 	GET_THREAD_INFO(%rcx)	
 	jmp rff_action
 	CFI_ENDPROC
@@ -185,8 +185,8 @@ ENTRY(system_call)
 	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp) 
 	movq  %rcx,RIP-ARGOFFSET(%rsp)  
 	GET_THREAD_INFO(%rcx)
-	bt    $TIF_SYSCALL_TRACE,threadinfo_flags(%rcx) 
-	jc    tracesys
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),threadinfo_flags(%rcx)
+	jnz tracesys
 	cmpq $__NR_syscall_max,%rax
 	ja badsys
 	movq %r10,%rcx
@@ -244,7 +244,7 @@ tracesys:
 	movq $-ENOSYS,RAX(%rsp)
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
-	call syscall_trace
+	call syscall_trace_enter
 	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	cmpq $__NR_syscall_max,%rax
@@ -254,7 +254,7 @@ tracesys:
 	movq %rax,RAX-ARGOFFSET(%rsp)
 1:	SAVE_REST
 	movq %rsp,%rdi
-	call syscall_trace
+	call syscall_trace_leave
 	RESTORE_TOP_OF_STACK %rbx
 	RESTORE_REST
 	jmp ret_from_sys_call
@@ -297,13 +297,14 @@ int_very_careful:
 	sti
 	SAVE_REST
 	/* Check for syscall exit trace */	
-	bt $TIF_SYSCALL_TRACE,%edx
-	jnc int_signal
+	testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),%edx
+	jz int_signal
 	pushq %rdi
 	leaq 8(%rsp),%rdi	# &ptregs -> arg1	
-	call syscall_trace
+	call syscall_trace_leave
 	popq %rdi
 	btr  $TIF_SYSCALL_TRACE,%edi
+	btr  $TIF_SYSCALL_AUDIT,%edi
 	jmp int_restore_rest
 	
 int_signal:
diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c
index 8aad386340dc..a97bee6f29fa 100644
--- a/arch/x86_64/kernel/ptrace.c
+++ b/arch/x86_64/kernel/ptrace.c
@@ -16,6 +16,7 @@
 #include <linux/ptrace.h>
 #include <linux/user.h>
 #include <linux/security.h>
+#include <linux/audit.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -486,7 +487,7 @@ out:
 	return ret;
 }
 
-asmlinkage void syscall_trace(struct pt_regs *regs)
+static void syscall_trace(struct pt_regs *regs)
 {
 
 #if 0
@@ -496,11 +497,6 @@ asmlinkage void syscall_trace(struct pt_regs *regs)
 	       current_thread_info()->flags, current->ptrace); 
 #endif
 
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return; 
-	if (!(current->ptrace & PT_PTRACED))
-		return;
-	
 	ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
 				? 0x80 : 0));
 	/*
@@ -513,3 +509,25 @@ asmlinkage void syscall_trace(struct pt_regs *regs)
 		current->exit_code = 0;
 	}
 }
+
+asmlinkage void syscall_trace_enter(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_entry(current, regs->orig_rax,
+				    regs->rdi, regs->rsi,
+				    regs->rdx, regs->r10);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace(regs);
+}
+
+asmlinkage void syscall_trace_leave(struct pt_regs *regs)
+{
+	if (unlikely(current->audit_context))
+		audit_syscall_exit(current, regs->rax);
+
+	if (test_thread_flag(TIF_SYSCALL_TRACE)
+	    && (current->ptrace & PT_PTRACED))
+		syscall_trace(regs);
+}
diff --git a/fs/namei.c b/fs/namei.c
index e6320d133c5f..d2cab643cf64 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -26,6 +26,7 @@
 #include <linux/personality.h>
 #include <linux/security.h>
 #include <linux/mount.h>
+#include <linux/audit.h>
 #include <asm/namei.h>
 #include <asm/uaccess.h>
 
@@ -141,10 +142,12 @@ char * getname(const char __user * filename)
 
 		result = tmp;
 		if (retval < 0) {
-			putname(tmp);
+			__putname(tmp);
 			result = ERR_PTR(retval);
 		}
 	}
+	if (unlikely(current->audit_context) && !IS_ERR(result) && result)
+		audit_getname(result);
 	return result;
 }
 
@@ -860,6 +863,8 @@ walk_init_root(const char *name, struct nameidata *nd)
 
 int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata *nd)
 {
+	int retval;
+
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
 
@@ -882,7 +887,13 @@ int fastcall path_lookup(const char *name, unsigned int flags, struct nameidata
 	}
 	read_unlock(&current->fs->lock);
 	current->total_link_count = 0;
-	return link_path_walk(name, nd);
+	retval = link_path_walk(name, nd);
+	if (unlikely(current->audit_context
+		     && nd && nd->dentry && nd->dentry->d_inode))
+		audit_inode(name,
+			    nd->dentry->d_inode->i_ino,
+			    nd->dentry->d_inode->i_rdev);
+	return retval;
 }
 
 /*
diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
index da5c780f2c5c..6f59e1fe345b 100644
--- a/include/asm-i386/thread_info.h
+++ b/include/asm-i386/thread_info.h
@@ -151,6 +151,7 @@ static inline unsigned long current_stack_pointer(void)
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_IRET		5	/* return with iret */
+#define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -159,9 +160,12 @@ static inline unsigned long current_stack_pointer(void)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_IRET		(1<<TIF_IRET)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
-#define _TIF_WORK_MASK		0x0000FFFE	/* work to do on interrupt/exception return */
+/* work to do on interrupt/exception return */
+#define _TIF_WORK_MASK \
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT))
 #define _TIF_ALLWORK_MASK	0x0000FFFF	/* work to do on any return to u-space */
 
 /*
diff --git a/include/asm-ppc64/thread_info.h b/include/asm-ppc64/thread_info.h
index 5b74b149f04f..297c974bf220 100644
--- a/include/asm-ppc64/thread_info.h
+++ b/include/asm-ppc64/thread_info.h
@@ -97,6 +97,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_32BIT		5	/* 32 bit binary */
 #define TIF_RUN_LIGHT		6	/* iSeries run light */
 #define TIF_ABI_PENDING		7	/* 32/64 bit switch needed */
+#define TIF_SYSCALL_AUDIT	8	/* syscall auditing active */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -107,6 +108,8 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_32BIT		(1<<TIF_32BIT)
 #define _TIF_RUN_LIGHT		(1<<TIF_RUN_LIGHT)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_SYSCALL_T_OR_A	(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT)
 
 #define _TIF_USER_WORK_MASK	(_TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \
 				 _TIF_NEED_RESCHED)
diff --git a/include/asm-x86_64/thread_info.h b/include/asm-x86_64/thread_info.h
index 0145da994590..73e4fa13ed0c 100644
--- a/include/asm-x86_64/thread_info.h
+++ b/include/asm-x86_64/thread_info.h
@@ -101,6 +101,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
 #define TIF_IRET		5	/* force IRET */
+#define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
@@ -112,13 +113,15 @@ static inline struct thread_info *stack_thread_info(void)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_IRET		(1<<TIF_IRET)
+#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
 
 /* work to do on interrupt/exception return */
-#define _TIF_WORK_MASK    (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SINGLESTEP))
+#define _TIF_WORK_MASK \
+  (0x0000FFFF & ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP))
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK 0x0000FFFF	
 
diff --git a/include/linux/audit.h b/include/linux/audit.h
new file mode 100644
index 000000000000..d766482451af
--- /dev/null
+++ b/include/linux/audit.h
@@ -0,0 +1,211 @@
+/* audit.h -- Auditing support -*- linux-c -*-
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ */
+
+#ifndef _LINUX_AUDIT_H_
+#define _LINUX_AUDIT_H_
+
+/* Request and reply types */
+#define AUDIT_GET      1000	/* Get status */
+#define AUDIT_SET      1001	/* Set status (enable/disable/auditd) */
+#define AUDIT_LIST     1002	/* List filtering rules */
+#define AUDIT_ADD      1003	/* Add filtering rule */
+#define AUDIT_DEL      1004	/* Delete filtering rule */
+#define AUDIT_USER     1005	/* Send a message from user-space */
+#define AUDIT_LOGIN    1006     /* Define the login id and information */
+#define AUDIT_KERNEL   2000	/* Asynchronous audit record. NOT A REQUEST. */
+
+/* Rule flags */
+#define AUDIT_PER_TASK 0x01	/* Apply rule at task creation (not syscall) */
+#define AUDIT_AT_ENTRY 0x02	/* Apply rule at syscall entry */
+#define AUDIT_AT_EXIT  0x04	/* Apply rule at syscall exit */
+#define AUDIT_PREPEND  0x10	/* Prepend to front of list */
+
+/* Rule actions */
+#define AUDIT_NEVER    0	/* Do not build context if rule matches */
+#define AUDIT_POSSIBLE 1	/* Build context if rule matches  */
+#define AUDIT_ALWAYS   2	/* Generate audit record if rule matches */
+
+/* Rule structure sizes -- if these change, different AUDIT_ADD and
+ * AUDIT_LIST commands must be implemented. */
+#define AUDIT_MAX_FIELDS   64
+#define AUDIT_BITMASK_SIZE 64
+#define AUDIT_WORD(nr) ((__u32)((nr)/32))
+#define AUDIT_BIT(nr)  (1 << ((nr) - AUDIT_WORD(nr)*32))
+
+/* Rule fields */
+				/* These are useful when checking the
+				 * task structure at task creation time
+				 * (AUDIT_PER_TASK).  */
+#define AUDIT_PID	0
+#define AUDIT_UID	1
+#define AUDIT_EUID	2
+#define AUDIT_SUID	3
+#define AUDIT_FSUID	4
+#define AUDIT_GID	5
+#define AUDIT_EGID	6
+#define AUDIT_SGID	7
+#define AUDIT_FSGID	8
+#define AUDIT_LOGINUID	9
+#define AUDIT_PERS	10
+
+				/* These are ONLY useful when checking
+				 * at syscall exit time (AUDIT_AT_EXIT). */
+#define AUDIT_DEVMAJOR	100
+#define AUDIT_DEVMINOR	101
+#define AUDIT_INODE	102
+#define AUDIT_EXIT	103
+#define AUDIT_SUCCESS   104	/* exit >= 0; value ignored */
+
+#define AUDIT_ARG0      200
+#define AUDIT_ARG1      (AUDIT_ARG0+1)
+#define AUDIT_ARG2      (AUDIT_ARG0+2)
+#define AUDIT_ARG3      (AUDIT_ARG0+3)
+
+#define AUDIT_NEGATE    0x80000000
+
+
+/* Status symbols */
+				/* Mask values */
+#define AUDIT_STATUS_ENABLED		0x0001
+#define AUDIT_STATUS_FAILURE		0x0002
+#define AUDIT_STATUS_PID		0x0004
+#define AUDIT_STATUS_RATE_LIMIT		0x0008
+#define AUDIT_STATUS_BACKLOG_LIMIT	0x0010
+				/* Failure-to-log actions */
+#define AUDIT_FAIL_SILENT	0
+#define AUDIT_FAIL_PRINTK	1
+#define AUDIT_FAIL_PANIC	2
+
+#ifndef __KERNEL__
+struct audit_message {
+	struct nlmsghdr nlh;
+	char		data[1200];
+};
+#endif
+
+struct audit_status {
+	__u32		mask;		/* Bit mask for valid entries */
+	__u32		enabled;	/* 1 = enabled, 0 = disabled */
+	__u32		failure;	/* Failure-to-log action */
+	__u32		pid;		/* pid of auditd process */
+	__u32		rate_limit;	/* messages rate limit (per second) */
+	__u32		backlog_limit;	/* waiting messages limit */
+	__u32		lost;		/* messages lost */
+	__u32		backlog;	/* messages waiting in queue */
+};
+
+struct audit_login {
+	__u32		loginuid;
+	int		msglen;
+	char		msg[1024];
+};
+
+struct audit_rule {		/* for AUDIT_LIST, AUDIT_ADD, and AUDIT_DEL */
+	__u32		flags;	/* AUDIT_PER_TASK, AUDIT_AT_*, AUDIT_PREPEND */
+	__u32		action;	/* AUDIT_NEVER, AUDIT_POSSIBLE, AUDIT_ALWAYS */
+	__u32		field_count;
+	__u32		mask[AUDIT_BITMASK_SIZE];
+	__u32		fields[AUDIT_MAX_FIELDS];
+	__u32		values[AUDIT_MAX_FIELDS];
+};
+
+#ifdef __KERNEL__
+
+#ifdef CONFIG_AUDIT
+struct audit_buffer;
+struct audit_context;
+#endif
+
+#ifdef CONFIG_AUDITSYSCALL
+/* These are defined in auditsc.c */
+				/* Public API */
+extern int  audit_alloc(struct task_struct *task);
+extern void audit_free(struct task_struct *task);
+extern void audit_syscall_entry(struct task_struct *task,
+				int major, unsigned long a0, unsigned long a1,
+				unsigned long a2, unsigned long a3);
+extern void audit_syscall_exit(struct task_struct *task, int return_code);
+extern void audit_getname(const char *name);
+extern void audit_putname(const char *name);
+extern void audit_inode(const char *name, unsigned long ino, dev_t rdev);
+
+				/* Private API (for audit.c only) */
+extern int  audit_receive_filter(int type, int pid, int uid, int seq,
+				 void *data);
+extern void audit_get_stamp(struct audit_context *ctx,
+			    struct timespec *t, int *serial);
+extern int  audit_set_loginuid(struct audit_context *ctx, uid_t loginuid);
+#else
+#define audit_alloc(t) ({ 0; })
+#define audit_free(t) do { ; } while (0)
+#define audit_syscall_entry(t,a,b,c,d,e) do { ; } while (0)
+#define audit_syscall_exit(t,r) do { ; } while (0)
+#define audit_getname(n) do { ; } while (0)
+#define audit_putname(n) do { ; } while (0)
+#define audit_inode(n,i,d) do { ; } while (0)
+#endif
+
+#ifdef CONFIG_AUDIT
+/* These are defined in audit.c */
+				/* Public API */
+extern void		    audit_log(struct audit_context *ctx,
+				      const char *fmt, ...)
+			    __attribute__((format(printf,2,3)));
+
+extern struct audit_buffer *audit_log_start(struct audit_context *ctx);
+extern void		    audit_log_format(struct audit_buffer *ab,
+					     const char *fmt, ...)
+			    __attribute__((format(printf,2,3)));
+extern void		    audit_log_end(struct audit_buffer *ab);
+extern void		    audit_log_end_fast(struct audit_buffer *ab);
+extern void		    audit_log_end_irq(struct audit_buffer *ab);
+extern void		    audit_log_d_path(struct audit_buffer *ab,
+					     const char *prefix,
+					     struct dentry *dentry,
+					     struct vfsmount *vfsmnt);
+extern int		    audit_set_rate_limit(int limit);
+extern int		    audit_set_backlog_limit(int limit);
+extern int		    audit_set_enabled(int state);
+extern int		    audit_set_failure(int state);
+
+				/* Private API (for auditsc.c only) */
+extern void		    audit_send_reply(int pid, int seq, int type,
+					     int done, int multi,
+					     void *payload, int size);
+extern void		    audit_log_lost(const char *message);
+#else
+#define audit_log(t,f,...) do { ; } while (0)
+#define audit_log_start(t) ({ NULL; })
+#define audit_log_vformat(b,f,a) do { ; } while (0)
+#define audit_log_format(b,f,...) do { ; } while (0)
+#define audit_log_end(b) do { ; } while (0)
+#define audit_log_end_fast(b) do { ; } while (0)
+#define audit_log_end_irq(b) do { ; } while (0)
+#define audit_log_d_path(b,p,d,v) do { ; } while (0)
+#define audit_set_rate_limit(l) do { ; } while (0)
+#define audit_set_backlog_limit(l) do { ; } while (0)
+#define audit_set_enabled(s) do { ; } while (0)
+#define audit_set_failure(s) do { ; } while (0)
+#endif
+#endif
+#endif
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bacf6bcbc7b7..39c893f8aa28 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -20,6 +20,7 @@
 #include <linux/radix-tree.h>
 #include <linux/kobject.h>
 #include <asm/atomic.h>
+#include <linux/audit.h>
 
 struct iovec;
 struct nameidata;
@@ -1159,7 +1160,18 @@ extern char * getname(const char __user *);
 extern void vfs_caches_init(unsigned long);
 
 #define __getname()	kmem_cache_alloc(names_cachep, SLAB_KERNEL)
-#define putname(name)	kmem_cache_free(names_cachep, (void *)(name))
+#define __putname(name) kmem_cache_free(names_cachep, (void *)(name))
+#ifndef CONFIG_AUDITSYSCALL
+#define putname(name)   __putname(name)
+#else
+#define putname(name)							\
+	do {								\
+		if (unlikely(current->audit_context))			\
+			audit_putname(name);				\
+		else							\
+			__putname(name);				\
+	} while (0)
+#endif
 
 extern int register_blkdev(unsigned int, const char *);
 extern int unregister_blkdev(unsigned int, const char *);
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index e5e15ddadab5..5adca479de6e 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -13,6 +13,7 @@
 #define NETLINK_XFRM		6	/* ipsec */
 #define NETLINK_SELINUX		7	/* SELinux event notifications */
 #define NETLINK_ARPD		8
+#define NETLINK_AUDIT		9	/* auditing */
 #define NETLINK_ROUTE6		11	/* af_inet6 route comm channel */
 #define NETLINK_IP6_FW		13
 #define NETLINK_DNRTMSG		14	/* DECnet routing messages */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 22080f919266..b72c38420d71 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -371,6 +371,8 @@ int set_current_groups(struct group_info *group_info);
     ((gi)->blocks[(i)/NGROUPS_PER_BLOCK][(i)%NGROUPS_PER_BLOCK])
 
 
+struct audit_context;		/* See audit.c */
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	struct thread_info *thread_info;
@@ -474,6 +476,7 @@ struct task_struct {
 	sigset_t *notifier_mask;
 	
 	void *security;
+	struct audit_context *audit_context;
 
 /* Thread group tracking */
    	u32 parent_exec_id;
diff --git a/init/Kconfig b/init/Kconfig
index ddd82dbad5dd..55261afdc3bf 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -137,6 +137,26 @@ config SYSCTL
 	  building a kernel for install/rescue disks or your system is very
 	  limited in memory.
 
+config AUDIT
+	bool "Auditing support"
+	default y if SECURITY_SELINUX
+	default n
+	help
+	  Enable auditing infrastructure that can be used with another
+	  kernel subsystem, such as SELinux (which requires this for
+	  logging of avc messages output).  Does not do system-call
+	  auditing without CONFIG_AUDITSYSCALL.
+
+config AUDITSYSCALL
+	bool "Enable system-call auditing support"
+	depends on AUDIT && (X86 || PPC64)
+	default y if SECURITY_SELINUX
+	default n
+	help
+	  Enable low-overhead system-call auditing infrastructure that
+	  can be used independently or with another kernel subsystem,
+	  such as SELinux.
+
 config LOG_BUF_SHIFT
 	int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" if DEBUG_KERNEL
 	range 12 20
diff --git a/kernel/Makefile b/kernel/Makefile
index 3a6484838748..238c65f60d9e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,6 +21,8 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 000000000000..765822b03b91
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,825 @@
+/* audit.c -- Auditing support -*- linux-c -*-
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Goals: 1) Integrate fully with SELinux.
+ *	  2) Minimal run-time overhead:
+ *	     a) Minimal when syscall auditing is disabled (audit_enable=0).
+ *	     b) Small when syscall auditing is enabled and no audit record
+ *		is generated (defer as much work as possible to record
+ *		generation time):
+ *		i) context is allocated,
+ *		ii) names from getname are stored without a copy, and
+ *		iii) inode information stored from path_lookup.
+ *	  3) Ability to disable syscall auditing at boot time (audit=0).
+ *	  4) Usable by other parts of the kernel (if audit_log* is called,
+ *	     then a syscall record will be generated automatically for the
+ *	     current syscall).
+ *	  5) Netlink interface to user-space.
+ *	  6) Support low-overhead kernel-based filtering to minimize the
+ *	     information that must be passed to user-space.
+ *
+ * Example user-space utilities: http://people.redhat.com/faith/audit/
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+
+#include <net/sock.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+
+/* No auditing will take place until audit_initialized != 0.
+ * (Initialization happens after skb_init is called.) */
+static int	audit_initialized;
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+int		audit_enabled;
+
+/* Default state when kernel boots without any parameters. */
+static int	audit_default;
+
+/* If auditing cannot proceed, audit_failure selects what happens. */
+static int	audit_failure = AUDIT_FAIL_PRINTK;
+
+/* If audit records are to be written to the netlink socket, audit_pid
+ * contains the (non-zero) pid. */
+static int	audit_pid;
+
+/* If audit_rate_limit is non-zero, limit the rate of sending audit
+ * records to that number per second.  This prevents DoS attacks, but
+ * results in audit records being dropped. */
+static int	audit_rate_limit;
+
+/* Number of outstanding audit_buffers allowed. */
+static int	audit_backlog_limit = 64;
+static atomic_t	audit_backlog	    = ATOMIC_INIT(0);
+
+/* Records can be lost in several ways:
+   0) [suppressed in audit_alloc]
+   1) out of memory in audit_log_start [kmalloc of struct audit_buffer]
+   2) out of memory in audit_log_move [alloc_skb]
+   3) suppressed due to audit_rate_limit
+   4) suppressed due to audit_backlog_limit
+*/
+static atomic_t    audit_lost = ATOMIC_INIT(0);
+
+/* The netlink socket. */
+static struct sock *audit_sock;
+
+/* There are two lists of audit buffers.  The txlist contains audit
+ * buffers that cannot be sent immediately to the netlink device because
+ * we are in an irq context (these are sent later in a tasklet).
+ *
+ * The second list is a list of pre-allocated audit buffers (if more
+ * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
+ * being placed on the freelist). */
+static spinlock_t  audit_txlist_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t  audit_freelist_lock = SPIN_LOCK_UNLOCKED;
+static int	   audit_freelist_count = 0;
+static LIST_HEAD(audit_txlist);
+static LIST_HEAD(audit_freelist);
+
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+/* The netlink socket is only to be read by 1 CPU, which lets us assume
+ * that list additions and deletions never happen simultaneously in
+ * auditsc.c */
+static DECLARE_MUTEX(audit_netlink_sem);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records.  Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist.  Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE  (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record.  The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue.  Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+	struct list_head     list;
+	struct sk_buff_head  sklist;	/* formatted skbs ready to send */
+	struct audit_context *ctx;	/* NULL or associated context */
+	int		     len;	/* used area of tmp */
+	char		     tmp[AUDIT_BUFSIZ];
+
+				/* Pointer to header and contents */
+	struct nlmsghdr      *nlh;
+	int		     total;
+	int		     type;
+	int		     pid;
+	int		     count; /* Times requeued */
+};
+
+struct audit_entry {
+	struct list_head  list;
+	struct audit_rule rule;
+};
+
+/* Act on an audit failure in the way selected by audit_failure. */
+static void audit_panic(const char *message)
+{
+	switch (audit_failure) {
+	case AUDIT_FAIL_SILENT:
+		break;
+	case AUDIT_FAIL_PRINTK:
+		printk(KERN_ERR "audit: %s\n", message);
+		break;
+	case AUDIT_FAIL_PANIC:
+		panic("%s", message);	/* message is data, not a format */
+		break;
+	}
+}
+
+/* Returns 1 if a record may be sent and 0 if it must be dropped.  Once
+ * over audit_rate_limit, the count restarts after more than HZ jiffies. */
+static inline int audit_rate_check(void)
+{
+	static unsigned long	window_start;
+	static int		sent;
+	static spinlock_t	rate_lock = SPIN_LOCK_UNLOCKED;
+	unsigned long		irqflags;
+	int			allowed;
+
+	if (!audit_rate_limit)
+		return 1;
+
+	spin_lock_irqsave(&rate_lock, irqflags);
+	allowed = (++sent < audit_rate_limit);
+	if (!allowed) {
+		unsigned long ticks = jiffies;
+
+		if (ticks - window_start > HZ) {
+			window_start = ticks;
+			sent	     = 0;
+			allowed	     = 1;
+		}
+	}
+	spin_unlock_irqrestore(&rate_lock, irqflags);
+
+	return allowed;
+}
+
+/* Emit at least 1 message per second, even if audit_rate_check is
+ * throttling. */
+void audit_log_lost(const char *message)
+{
+	static unsigned long	last_msg = 0;
+	static spinlock_t	lock     = SPIN_LOCK_UNLOCKED;
+	unsigned long		flags;
+	unsigned long		now;
+	int			print;
+
+	atomic_inc(&audit_lost);
+
+	print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+	if (!print) {
+		spin_lock_irqsave(&lock, flags);
+		now = jiffies;
+		if (now - last_msg > HZ) {
+			print = 1;
+			last_msg = now;
+		}
+		spin_unlock_irqrestore(&lock, flags);
+	}
+
+	if (print) {
+		printk(KERN_WARNING
+		       "audit: audit_lost=%d audit_backlog=%d"
+		       " audit_rate_limit=%d audit_backlog_limit=%d\n",
+		       atomic_read(&audit_lost),
+		       atomic_read(&audit_backlog),
+		       audit_rate_limit,
+		       audit_backlog_limit);
+		audit_panic(message);
+	}
+
+}
+
+int audit_set_rate_limit(int limit)
+{
+	int old		 = audit_rate_limit;
+	audit_rate_limit = limit;
+	audit_log(current->audit_context, "audit_rate_limit=%d old=%d",
+		  audit_rate_limit, old);
+	return old;
+}
+
+int audit_set_backlog_limit(int limit)
+{
+	int old		 = audit_backlog_limit;
+	audit_backlog_limit = limit;
+	audit_log(current->audit_context, "audit_backlog_limit=%d old=%d",
+		  audit_backlog_limit, old);
+	return old;
+}
+
+int audit_set_enabled(int state)
+{
+	int old		 = audit_enabled;
+	if (state != 0 && state != 1)
+		return -EINVAL;
+	audit_enabled = state;
+	audit_log(current->audit_context, "audit_enabled=%d old=%d",
+		  audit_enabled, old);
+	return old;
+}
+
+int audit_set_failure(int state)
+{
+	int old		 = audit_failure;
+	if (state != AUDIT_FAIL_SILENT
+	    && state != AUDIT_FAIL_PRINTK
+	    && state != AUDIT_FAIL_PANIC)
+		return -EINVAL;
+	audit_failure = state;
+	audit_log(current->audit_context, "audit_failure=%d old=%d",
+		  audit_failure, old);
+	return old;
+}
+
+#ifdef CONFIG_NET
+void audit_send_reply(int pid, int seq, int type, int done, int multi,
+		      void *payload, int size)
+{
+	struct sk_buff	*skb;
+	struct nlmsghdr	*nlh;
+	int		len = NLMSG_SPACE(size);
+	void		*data;
+	int		flags = multi ? NLM_F_MULTI : 0;
+	int		t     = done  ? NLMSG_DONE  : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		goto nlmsg_failure;
+
+	nlh		 = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+	nlh->nlmsg_flags = flags;
+	data		 = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+	return;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+}
+
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	u32			uid, pid, seq;
+	void			*data;
+	struct audit_status	*status_get, status_set;
+	struct audit_login	*login;
+	int			err = 0;
+	struct audit_buffer	*ab;
+
+	pid  = NETLINK_CREDS(skb)->pid;
+	uid  = NETLINK_CREDS(skb)->uid;
+	seq  = nlh->nlmsg_seq;
+	data = NLMSG_DATA(nlh);
+
+	switch (nlh->nlmsg_type) {
+	case AUDIT_GET:
+		status_set.enabled	 = audit_enabled;
+		status_set.failure	 = audit_failure;
+		status_set.pid		 = audit_pid;
+		status_set.rate_limit	 = audit_rate_limit;
+		status_set.backlog_limit = audit_backlog_limit;
+		status_set.lost		 = atomic_read(&audit_lost);
+		status_set.backlog	 = atomic_read(&audit_backlog);
+		audit_send_reply(pid, seq, AUDIT_GET, 0, 0,
+				 &status_set, sizeof(status_set));
+		break;
+	case AUDIT_SET:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		status_get   = (struct audit_status *)data;
+		if (status_get->mask & AUDIT_STATUS_ENABLED) {
+			err = audit_set_enabled(status_get->enabled);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_FAILURE) {
+			err = audit_set_failure(status_get->failure);
+			if (err < 0) return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_PID) {
+			int old   = audit_pid;
+			audit_pid = status_get->pid;
+			audit_log(current->audit_context,
+				  "audit_pid=%d old=%d", audit_pid, old);
+		}
+		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+			audit_set_rate_limit(status_get->rate_limit);
+		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
+			audit_set_backlog_limit(status_get->backlog_limit);
+		break;
+	case AUDIT_USER:
+		ab = audit_log_start(NULL);
+		if (!ab)
+			break;	/* audit_panic has been called */
+		audit_log_format(ab,
+				 "user pid=%d uid=%d length=%d msg='%.1024s'",
+				 pid, uid,
+				 (int)(nlh->nlmsg_len
+				       - ((char *)data - (char *)nlh)),
+				 (char *)data);
+		ab->type = AUDIT_USER;
+		ab->pid  = pid;
+		audit_log_end(ab);
+		break;
+	case AUDIT_LOGIN:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		login = (struct audit_login *)data;
+		ab = audit_log_start(NULL);
+		if (ab) {
+			audit_log_format(ab, "login pid=%d uid=%d loginuid=%d"
+					 " length=%d msg='%.1024s'",
+					 pid, uid,
+					 login->loginuid,
+					 login->msglen,
+					 login->msg);
+			ab->type = AUDIT_LOGIN;
+			ab->pid  = pid;
+			audit_log_end(ab);
+		}
+#ifdef CONFIG_AUDITSYSCALL
+		err = audit_set_loginuid(current->audit_context,
+					 login->loginuid);
+#endif
+		break;
+	case AUDIT_LIST:
+	case AUDIT_ADD:
+	case AUDIT_DEL:
+#ifdef CONFIG_AUDITSYSCALL
+		err = audit_receive_filter(nlh->nlmsg_type, pid, uid, seq,
+					   data);
+#else
+		err = -EOPNOTSUPP;
+#endif
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err < 0 ? err : 0;
+}
+
+/* Get message from skb (based on rtnetlink_rcv_skb).  Each message is
+ * processed by audit_receive_msg.  Malformed skbs with wrong length are
+ * discarded silently.  */
+static int audit_receive_skb(struct sk_buff *skb)
+{
+	int		err;
+	struct nlmsghdr	*nlh;
+	u32		rlen;
+
+	while (skb->len >= NLMSG_SPACE(0)) {
+		nlh = (struct nlmsghdr *)skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+			return 0;
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		if ((err = audit_receive_msg(skb, nlh))) {
+			netlink_ack(skb, nlh, -err);
+		} else if (nlh->nlmsg_flags & NLM_F_ACK)
+			netlink_ack(skb, nlh, 0);
+		skb_pull(skb, rlen);
+	}
+	return 0;
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sock *sk, int length)
+{
+	struct sk_buff  *skb;
+
+	if (down_trylock(&audit_netlink_sem))
+		return;
+
+				/* FIXME: this must not cause starvation */
+	while ((skb = skb_dequeue(&sk->sk_receive_queue))) {
+		if (audit_receive_skb(skb) && skb->len)
+			skb_queue_head(&sk->sk_receive_queue, skb);
+		else
+			kfree_skb(skb);
+	}
+	up(&audit_netlink_sem);
+}
+
+/* Move data from tmp buffer into an skb.  This is an extra copy, and
+ * that is unfortunate.  However, the copy will only occur when a record
+ * is being written to user space, which is already a high-overhead
+ * operation.  (Elimination of the copy is possible, for example, by
+ * writing directly into a pre-allocated skb, at the cost of wasting
+ * memory.) */
+static void audit_log_move(struct audit_buffer *ab)
+{
+	struct sk_buff	*skb;
+	char		*start;
+	int		extra = ab->nlh ? 0 : NLMSG_SPACE(0);
+
+	skb = skb_peek(&ab->sklist);
+	if (!skb || skb_tailroom(skb) <= ab->len + extra) {
+		skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
+		if (!skb) {
+			ab->len = 0; /* Lose information in ab->tmp */
+			audit_log_lost("out of memory in audit_log_move");
+			return;
+		}
+		__skb_queue_tail(&ab->sklist, skb);
+		if (!ab->nlh)
+			ab->nlh = (struct nlmsghdr *)skb_put(skb,
+							     NLMSG_SPACE(0));
+	}
+	start = skb_put(skb, ab->len);
+	memcpy(start, ab->tmp, ab->len);
+	ab->len = 0;
+}
+
+/* Iterate over the skbuffs in the audit_buffer, sending their contents
+ * to user space (or to printk when no audit daemon is registered).
+ * Returns 1 if ab was requeued for a later retry, 0 when it is drained. */
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&ab->sklist))) {
+		int retval = 0;
+
+		if (audit_pid) {
+			if (ab->nlh) {
+				ab->nlh->nlmsg_len   = ab->total;
+				ab->nlh->nlmsg_type  = ab->type;
+				ab->nlh->nlmsg_flags = 0;
+				ab->nlh->nlmsg_seq   = 0;
+				ab->nlh->nlmsg_pid   = ab->pid;
+			}
+			skb_get(skb); /* because netlink_* frees */
+			retval = netlink_unicast(audit_sock, skb, audit_pid,
+						 MSG_DONTWAIT);
+		}
+		if (retval == -EAGAIN && ab->count < 5) {
+			/* Requeue the skb so the retry can send it. */
+			++ab->count;
+			skb_queue_head(&ab->sklist, skb);
+			audit_log_end_irq(ab);
+			return 1;
+		}
+		if (retval == -ECONNREFUSED) {
+			printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n",
+			       audit_pid);
+			audit_pid = 0;
+		} else if (retval < 0)
+			audit_log_lost("netlink socket too busy");
+		if (!audit_pid) { /* No daemon */
+			int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
+			int len    = skb->len - offset;
+			printk(KERN_ERR "%*.*s\n",
+			       len, len, skb->data + offset);
+		}
+		kfree_skb(skb);
+		ab->nlh = NULL;
+	}
+	return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
+	       audit_default ? "enabled" : "disabled");
+	audit_sock = netlink_kernel_create(NETLINK_AUDIT, audit_receive);
+	if (!audit_sock)
+		audit_panic("cannot initialize netlink socket");
+
+	audit_initialized = 1;
+	audit_enabled = audit_default;
+	audit_log(NULL, "initialized");
+	return 0;
+}
+
+#else
+/* Without CONFIG_NET, we have no skbuffs.  For now, print what we have
+ * in the buffer. */
+static void audit_log_move(struct audit_buffer *ab)
+{
+	printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
+	ab->len = 0;
+}
+
+static inline int audit_log_drain(struct audit_buffer *ab)
+{
+	return 0;
+}
+
+/* Initialize audit support at boot time. */
+int __init audit_init(void)
+{
+	printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
+	audit_sock = NULL;
+	audit_pid  = 0;
+
+	audit_initialized = 1;
+	audit_enabled = audit_default;
+	audit_log(NULL, "initialized");
+	return 0;
+}
+#endif
+
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time.  audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+	audit_default = !!simple_strtol(str, NULL, 0);
+	printk(KERN_INFO "audit: %s%s\n",
+	       audit_default ? "enabled" : "disabled",
+	       audit_initialized ? "" : " (after initialization)");
+	if (audit_initialized)
+		audit_enabled = audit_default;
+	return 0;
+}
+
+__setup("audit=", audit_enable);
+
+
+/* Obtain an audit buffer.  This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format.  If ctx is non-NULL and syscall auditing is built
+ * in, the timestamp and serial number come from audit_get_stamp(ctx);
+ * otherwise the current time and serial 0 are used.  Returns NULL if
+ * auditing is not initialized or the record had to be dropped. */
+struct audit_buffer *audit_log_start(struct audit_context *ctx)
+{
+	struct audit_buffer	*ab	= NULL;
+	unsigned long		flags;
+	struct timespec		t;
+	int			serial	= 0;
+
+	if (!audit_initialized)
+		return NULL;
+
+	if (audit_backlog_limit
+	    && atomic_read(&audit_backlog) > audit_backlog_limit) {
+		if (audit_rate_check())
+			printk(KERN_WARNING
+			       "audit: audit_backlog=%d > "
+			       "audit_backlog_limit=%d\n",
+			       atomic_read(&audit_backlog),
+			       audit_backlog_limit);
+		audit_log_lost("backlog limit exceeded");
+		return NULL;
+	}
+
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (!list_empty(&audit_freelist)) {
+		ab = list_entry(audit_freelist.next,
+				struct audit_buffer, list);
+		list_del(&ab->list);
+		--audit_freelist_count;
+	}
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+	if (!ab)
+		ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+	if (!ab) {
+		audit_log_lost("out of memory in audit_log_start");
+		return NULL;
+	}
+
+	atomic_inc(&audit_backlog);
+	skb_queue_head_init(&ab->sklist);
+
+	ab->ctx   = ctx;
+	ab->len   = 0;
+	ab->nlh   = NULL;
+	ab->total = 0;
+	ab->type  = AUDIT_KERNEL;
+	ab->pid   = 0;
+	ab->count = 0;
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (ab->ctx)
+		audit_get_stamp(ab->ctx, &t, &serial);
+	else
+#endif
+		t = CURRENT_TIME;
+
+	audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+			 t.tv_sec, t.tv_nsec/1000000, serial);
+	return ab;
+}
+
+
+/* Format an audit message into the audit buffer.  If it does not fit in
+ * what is left of ab->tmp, tmp is flushed with audit_log_move and the
+ * message is formatted again; output beyond AUDIT_BUFSIZ is truncated. */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+			      va_list args)
+{
+	int len, avail;
+	va_list args2;
+
+	if (!ab)
+		return;
+
+	avail = sizeof(ab->tmp) - ab->len;
+	if (avail <= 0) {
+		audit_log_move(ab);
+		avail = sizeof(ab->tmp) - ab->len;
+	}
+	va_copy(args2, args);	/* a consumed va_list must not be reused */
+	len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+	if (len >= avail) {
+		/* Did not fit: retry with all of tmp and a fresh va_list. */
+		audit_log_move(ab);
+		avail = sizeof(ab->tmp) - ab->len;
+		len   = vsnprintf(ab->tmp + ab->len, avail, fmt, args2);
+	}
+	va_end(args2);
+	ab->len   += (len < avail) ? len : avail;
+	ab->total += (len < avail) ? len : avail;
+}
+
+/* Format a message into the audit buffer.  All the work is done in
+ * audit_log_vformat. */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+	va_list args;
+
+	if (!ab)
+		return;
+	va_start(args, fmt);
+	audit_log_vformat(ab, fmt, args);
+	va_end(args);
+}
+
+/* This is a helper-function to print the d_path without using a static
+ * buffer or allocating another buffer in addition to the one in
+ * audit_buffer.  Does nothing if ab is NULL. */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+		      struct dentry *dentry, struct vfsmount *vfsmnt)
+{
+	char *p;
+	int  len, avail;
+
+	if (!ab) return;	/* audit_log_start may have returned NULL */
+	if (prefix) audit_log_format(ab, " %s", prefix);
+	if (ab->len > 128)	/* presumably: keep room for the path */
+		audit_log_move(ab);
+	avail = sizeof(ab->tmp) - ab->len;
+	p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
+	if (p == ERR_PTR(-ENAMETOOLONG)) {
+		/* FIXME: can we save some information here? */
+		audit_log_format(ab, "<toolong>");
+	} else {
+		/* path isn't at start of buffer; move it down */
+		len	   = (ab->tmp + sizeof(ab->tmp) - 1) - p;
+		memmove(ab->tmp + ab->len, p, len);
+		ab->len   += len;
+		ab->total += len;
+	}
+}
+
+/* Remove queued messages from the audit_txlist and send them to userspace. */
+static void audit_tasklet_handler(unsigned long arg)
+{
+	LIST_HEAD(list);
+	struct audit_buffer *ab;
+	unsigned long	    flags;
+
+	spin_lock_irqsave(&audit_txlist_lock, flags);
+	list_splice_init(&audit_txlist, &list);
+	spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+	while (!list_empty(&list)) {
+		ab = list_entry(list.next, struct audit_buffer, list);
+		list_del(&ab->list);
+		audit_log_end_fast(ab);
+	}
+}
+
+static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
+
+/* The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is placed on a queue and a tasklet is scheduled to
+ * remove them from the queue outside the irq context.  May be called in
+ * any context. */
+void audit_log_end_irq(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	if (!ab)
+		return;
+	spin_lock_irqsave(&audit_txlist_lock, flags);
+	list_add_tail(&ab->list, &audit_txlist);
+	spin_unlock_irqrestore(&audit_txlist_lock, flags);
+
+	tasklet_schedule(&audit_tasklet);
+}
+
+/* Send the message in the audit buffer directly to user space, then put
+ * the buffer back on the freelist or free it.  No irq context allowed. */
+void audit_log_end_fast(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	BUG_ON(in_irq());
+	if (!ab)
+		return;
+	if (audit_rate_check()) {
+		audit_log_move(ab);
+		if (audit_log_drain(ab))
+			return;	/* requeued by audit_log_drain */
+	} else
+		audit_log_lost("rate limit exceeded");
+
+	atomic_dec(&audit_backlog);
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (audit_freelist_count < AUDIT_MAXFREE) {
+		++audit_freelist_count;	/* count only buffers we keep */
+		list_add(&ab->list, &audit_freelist);
+	} else
+		kfree(ab);
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+/* Send or queue the message in the audit buffer, depending on the
+ * current context.  (A convenience function that may be called in any
+ * context.) */
+void audit_log_end(struct audit_buffer *ab)
+{
+	if (in_irq())
+		audit_log_end_irq(ab);
+	else
+		audit_log_end_fast(ab);
+}
+
+/* Log an audit record.  This is a convenience function that calls
+ * audit_log_start, audit_log_vformat, and audit_log_end.  It may be
+ * called in any context. */
+void audit_log(struct audit_context *ctx, const char *fmt, ...)
+{
+	struct audit_buffer *ab;
+	va_list args;
+
+	ab = audit_log_start(ctx);
+	if (ab) {
+		va_start(args, fmt);
+		audit_log_vformat(ab, fmt, args);
+		va_end(args);
+		audit_log_end(ab);
+	}
+}
+
+EXPORT_SYMBOL_GPL(audit_set_rate_limit);
+EXPORT_SYMBOL_GPL(audit_set_backlog_limit);
+EXPORT_SYMBOL_GPL(audit_set_enabled);
+EXPORT_SYMBOL_GPL(audit_set_failure);
+
+EXPORT_SYMBOL_GPL(audit_log_start);
+EXPORT_SYMBOL_GPL(audit_log_format);
+EXPORT_SYMBOL_GPL(audit_log_end_irq);
+EXPORT_SYMBOL_GPL(audit_log_end_fast);
+EXPORT_SYMBOL_GPL(audit_log_end);
+EXPORT_SYMBOL_GPL(audit_log);
+EXPORT_SYMBOL_GPL(audit_log_d_path);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
new file mode 100644
index 000000000000..342b57141fd9
--- /dev/null
+++ b/kernel/auditsc.c
@@ -0,0 +1,922 @@
+/* auditsc.c -- System-call auditing support -*- linux-c -*-
+ * Handles all system-call specific auditing features.
+ *
+ * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Written by Rickard E. (Rik) Faith <faith@redhat.com>
+ *
+ * Many of the ideas implemented here are from Stephen C. Tweedie,
+ * especially the idea of avoiding a copy by using getname.
+ *
+ * The method for actual interception of syscall entry and exit (not in
+ * this file -- see entry.S) is based on a GPL'd patch written by
+ * okir@suse.de and Copyright 2003 SuSE Linux AG.
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/atomic.h>
+#include <asm/types.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+#include <linux/audit.h>
+#include <linux/personality.h>
+#include <linux/time.h>
+#include <asm/unistd.h>
+
+/* 0 = no checking
+   1 = put_count checking
+   2 = verbose put_count checking
+*/
+#define AUDIT_DEBUG 0
+
+/* No syscall auditing will take place unless audit_enabled != 0. */
+extern int audit_enabled;
+
+/* AUDIT_NAMES is the number of slots we reserve in the audit_context
+ * for saving names from getname(). */
+#define AUDIT_NAMES    20
+
+/* AUDIT_NAMES_RESERVED is the number of slots we reserve in the
+ * audit_context from being used for nameless inodes from
+ * path_lookup. */
+#define AUDIT_NAMES_RESERVED 7
+
+/* At task start time, the audit_state is set in the audit_context using
+   a per-task filter.  At syscall entry, the audit_state is augmented by
+   the syscall filter. */
+enum audit_state {
+	AUDIT_DISABLED,		/* Do not create per-task audit_context.
+				 * No syscall-specific audit records can
+				 * be generated. */
+	AUDIT_SETUP_CONTEXT,	/* Create the per-task audit_context,
+				 * but don't necessarily fill it in at
+				 * syscall entry time (i.e., filter
+				 * instead). */
+	AUDIT_BUILD_CONTEXT,	/* Create the per-task audit_context,
+				 * and always fill it in at syscall
+				 * entry time.  This makes a full
+				 * syscall record available if some
+				 * other part of the kernel decides it
+				 * should be recorded. */
+	AUDIT_RECORD_CONTEXT	/* Create the per-task audit_context,
+				 * always fill it in at syscall entry
+				 * time, and always write out the audit
+				 * record at syscall exit time.  */
+};
+
+/* When fs/namei.c:getname() is called, we store the pointer in name and
+ * we don't let putname() free it (instead we free all of the saved
+ * pointers at syscall exit time).
+ *
+ * Further, in fs/namei.c:path_lookup() we store the inode and device. */
+struct audit_names {
+	const char	*name;
+	unsigned long	ino;
+	dev_t		rdev;
+};
+
+/* The per-task audit context. */
+struct audit_context {
+	int		    in_syscall;	/* 1 if task is in a syscall */
+	enum audit_state    state;
+	unsigned int	    serial;     /* serial number for record */
+	struct timespec	    ctime;      /* time of syscall entry */
+	uid_t		    loginuid;   /* login uid (identity) */
+	int		    major;      /* syscall number */
+	unsigned long	    argv[4];    /* syscall arguments */
+	int		    return_valid; /* return code is valid */
+	int		    return_code;/* syscall return code */
+	int		    auditable;  /* 1 if record should be written */
+	int		    name_count;
+	struct audit_names  names[AUDIT_NAMES];
+	struct audit_context *previous; /* For nested syscalls */
+
+				/* Save things to print about task_struct */
+	pid_t		    pid;
+	uid_t		    uid, euid, suid, fsuid;
+	gid_t		    gid, egid, sgid, fsgid;
+	unsigned long	    personality;
+
+#if AUDIT_DEBUG
+	int		    put_count;
+	int		    ino_count;
+#endif
+};
+
+				/* Public API */
+/* There are three lists of rules -- one to search at task creation
+ * time, one to search at syscall entry time, and another to search at
+ * syscall exit time. */
+static LIST_HEAD(audit_tsklist);
+static LIST_HEAD(audit_entlist);
+static LIST_HEAD(audit_extlist);
+
+struct audit_entry {
+	struct list_head  list;
+	struct rcu_head   rcu;
+	struct audit_rule rule;
+};
+
+/* Compare two rules field by field, strcmp-style: returns 0 when they
+ * are identical and 1 when they differ.  Used by audit_del_rule to find
+ * the rule named in an AUDIT_DEL request. */
+static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
+{
+	int idx;
+
+	if (a->flags != b->flags
+	    || a->action != b->action
+	    || a->field_count != b->field_count)
+		return 1;
+
+	for (idx = 0; idx < a->field_count; idx++) {
+		if (a->fields[idx] != b->fields[idx])
+			return 1;
+		if (a->values[idx] != b->values[idx])
+			return 1;
+	}
+
+	idx = 0;
+	while (idx < AUDIT_BITMASK_SIZE) {
+		if (a->mask[idx] != b->mask[idx])
+			return 1;
+		idx++;
+	}
+	return 0;
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_add_rule(struct audit_entry *entry,
+				 struct list_head *list)
+{
+	if (entry->rule.flags & AUDIT_PREPEND) {
+		entry->rule.flags &= ~AUDIT_PREPEND;
+		list_add_rcu(&entry->list, list);
+	} else {
+		list_add_tail_rcu(&entry->list, list);
+	}
+	return 0;
+}
+
+static void audit_free_rule(void *arg)
+{
+	kfree(arg);
+}
+
+/* Note that audit_add_rule and audit_del_rule are called via
+ * audit_receive() in audit.c, and are protected by
+ * audit_netlink_sem. */
+static inline int audit_del_rule(struct audit_rule *rule,
+				 struct list_head *list)
+{
+	struct audit_entry  *e;
+
+	/* Do not use the _rcu iterator here, since this is the only
+	 * deletion routine. */
+	list_for_each_entry(e, list, list) {
+		if (!audit_compare_rule(rule, &e->rule)) {
+			list_del_rcu(&e->list);
+			call_rcu(&e->rcu, audit_free_rule, e);
+			return 0;
+		}
+	}
+	return -EFAULT;		/* No matching rule */
+}
+
+#ifdef CONFIG_NET
+/* Validate the rule in s (action and field_count) and copy it into d.
+ * Returns 0 on success, -1 if s is rejected.  Called for AUDIT_ADD. */
+static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
+{
+	int n;
+
+	if (!(s->action == AUDIT_NEVER
+	      || s->action == AUDIT_POSSIBLE
+	      || s->action == AUDIT_ALWAYS)
+	    || s->field_count < 0 || s->field_count > AUDIT_MAX_FIELDS)
+		return -1;
+
+	d->flags       = s->flags;
+	d->action      = s->action;
+	d->field_count = s->field_count;
+	for (n = 0; n < AUDIT_BITMASK_SIZE; n++)
+		d->mask[n] = s->mask[n];
+	for (n = 0; n < d->field_count; n++) {
+		d->fields[n] = s->fields[n];
+		d->values[n] = s->values[n];
+	}
+	return 0;
+}
+
+/* Handle AUDIT_LIST, AUDIT_ADD and AUDIT_DEL requests for the three rule
+ * lists.  Returns 0 or a negative errno; ADD/DEL need CAP_SYS_ADMIN. */
+int audit_receive_filter(int type, int pid, int uid, int seq, void *data)
+{
+	u32		   flags;
+	struct audit_entry *entry;
+	int		   err = 0;
+
+	if ((type == AUDIT_ADD || type == AUDIT_DEL) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;	/* changing the rule set is privileged */
+	switch (type) {
+	case AUDIT_LIST:
+		/* No *_rcu iterators needed: audit_netlink_sem is held. */
+		list_for_each_entry(entry, &audit_tsklist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		list_for_each_entry(entry, &audit_entlist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		list_for_each_entry(entry, &audit_extlist, list)
+			audit_send_reply(pid, seq, AUDIT_LIST, 0, 1,
+					 &entry->rule, sizeof(entry->rule));
+		audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
+		break;
+	case AUDIT_ADD:
+		if (!(entry = kmalloc(sizeof(*entry), GFP_KERNEL)))
+			return -ENOMEM;
+		if (audit_copy_rule(&entry->rule, data)) {
+			kfree(entry);
+			return -EINVAL;
+		}
+		flags = entry->rule.flags;
+		if (!err && (flags & AUDIT_PER_TASK))
+			err = audit_add_rule(entry, &audit_tsklist);
+		if (!err && (flags & AUDIT_AT_ENTRY))
+			err = audit_add_rule(entry, &audit_entlist);
+		if (!err && (flags & AUDIT_AT_EXIT))
+			err = audit_add_rule(entry, &audit_extlist);
+		break;
+	case AUDIT_DEL:
+		flags = ((struct audit_rule *)data)->flags;
+		if (!err && (flags & AUDIT_PER_TASK))
+			err = audit_del_rule(data, &audit_tsklist);
+		if (!err && (flags & AUDIT_AT_ENTRY))
+			err = audit_del_rule(data, &audit_entlist);
+		if (!err && (flags & AUDIT_AT_EXIT))
+			err = audit_del_rule(data, &audit_extlist);
+		break;
+	default:
+		return -EINVAL;
+	}
+	return err;
+}
+#endif
+
+/* Compare a task_struct (and optional syscall context) with an audit_rule.
+ * Return 1 and set *state from the rule's action on match, 0 otherwise. */
+static int audit_filter_rules(struct task_struct *tsk,
+			      struct audit_rule *rule,
+			      struct audit_context *ctx,
+			      enum audit_state *state)
+{
+	int i, j;
+
+	for (i = 0; i < rule->field_count; i++) {
+		u32 field  = rule->fields[i] & ~AUDIT_NEGATE;
+		u32 value  = rule->values[i];
+		int result = 0;	/* fields needing a missing ctx stay 0 */
+
+		switch (field) {
+		case AUDIT_PID:
+			result = (tsk->pid == value);
+			break;
+		case AUDIT_UID:
+			result = (tsk->uid == value);
+			break;
+		case AUDIT_EUID:
+			result = (tsk->euid == value);
+			break;
+		case AUDIT_SUID:
+			result = (tsk->suid == value);
+			break;
+		case AUDIT_FSUID:
+			result = (tsk->fsuid == value);
+			break;
+		case AUDIT_GID:
+			result = (tsk->gid == value);
+			break;
+		case AUDIT_EGID:
+			result = (tsk->egid == value);
+			break;
+		case AUDIT_SGID:
+			result = (tsk->sgid == value);
+			break;
+		case AUDIT_FSGID:
+			result = (tsk->fsgid == value);
+			break;
+		case AUDIT_PERS:
+			result = (tsk->personality == value);
+			break;
+
+		case AUDIT_EXIT:
+			if (ctx && ctx->return_valid)
+				result = (ctx->return_code == value);
+			break;
+		case AUDIT_SUCCESS:
+			if (ctx && ctx->return_valid)
+				result = (ctx->return_code >= 0);
+			break;
+		case AUDIT_DEVMAJOR:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (MAJOR(ctx->names[j].rdev)==value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_DEVMINOR:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (MINOR(ctx->names[j].rdev)==value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_INODE:
+			if (ctx) {
+				for (j = 0; j < ctx->name_count; j++) {
+					if (ctx->names[j].ino == value) {
+						++result;
+						break;
+					}
+				}
+			}
+			break;
+		case AUDIT_LOGINUID:
+			result = 0;
+			if (ctx)
+				result = (ctx->loginuid == value);
+			break;
+		case AUDIT_ARG0:
+		case AUDIT_ARG1:
+		case AUDIT_ARG2:
+		case AUDIT_ARG3:
+			if (ctx)
+				result = (ctx->argv[field-AUDIT_ARG0]==value);
+			break;
+		}
+
+		if (rule->fields[i] & AUDIT_NEGATE)
+			result = !result;
+		if (!result)
+			return 0;	/* every field must match */
+	}
+	switch (rule->action) {
+	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
+	case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT;  break;
+	case AUDIT_ALWAYS:   *state = AUDIT_RECORD_CONTEXT; break;
+	}
+	return 1;
+}
+
+/* At process creation time, we can determine if system-call auditing is
+ * completely disabled for this task.  Only the task structure exists at
+ * this point (no syscall context), so only task fields can be checked.
+ */
+static enum audit_state audit_filter_task(struct task_struct *tsk)
+{
+	struct audit_entry *entry;
+	enum audit_state   state, verdict = AUDIT_BUILD_CONTEXT;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &audit_tsklist, list) {
+		if (audit_filter_rules(tsk, &entry->rule, NULL, &state)) {
+			verdict = state;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return verdict;
+}
+
+/* Syscall entry/exit filter.  It is consulted when the audit_state is
+ * neither low enough to rule auditing out nor high enough that we already
+ * know an audit record must be written (i.e., the state is
+ * AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ */
+static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+					     struct audit_context *ctx,
+					     struct list_head *list)
+{
+	struct audit_entry *entry;
+	enum audit_state   state, verdict = AUDIT_BUILD_CONTEXT;
+	int		   word = AUDIT_WORD(ctx->major);
+	int		   bit  = AUDIT_BIT(ctx->major);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, list, list) {
+		if ((entry->rule.mask[word] & bit) == bit
+		    && audit_filter_rules(tsk, &entry->rule, ctx, &state)) {
+			verdict = state;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return verdict;
+}
+
+/* This should be called with task_lock() held. */
+static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+						      int return_valid,
+						      int return_code)
+{
+	struct audit_context *ctx = tsk->audit_context;
+
+	if (likely(!ctx))
+		return NULL;
+	ctx->return_code  = return_code;
+	ctx->return_valid = return_valid;
+
+	/* Give the exit-time rules a chance to make this syscall
+	 * auditable if nothing has done so yet. */
+	if (ctx->in_syscall && !ctx->auditable
+	    && audit_filter_syscall(tsk, ctx, &audit_extlist)
+					== AUDIT_RECORD_CONTEXT)
+		ctx->auditable = 1;
+
+	ctx->pid = tsk->pid;
+	ctx->uid = tsk->uid;
+	ctx->gid = tsk->gid;
+	ctx->euid = tsk->euid;
+	ctx->suid = tsk->suid;
+	ctx->fsuid = tsk->fsuid;
+	ctx->egid = tsk->egid;
+	ctx->sgid = tsk->sgid;
+	ctx->fsgid = tsk->fsgid;
+	ctx->personality = tsk->personality;
+	tsk->audit_context = NULL;	/* detach; the caller owns ctx now */
+	return ctx;
+}
+
+static inline void audit_free_names(struct audit_context *context)
+{
+	int i;
+
+#if AUDIT_DEBUG == 2
+	if (context->auditable
+	    ||context->put_count + context->ino_count != context->name_count) {
+		printk(KERN_ERR "audit.c:%d(:%d): major=%d in_syscall=%d"
+		       " name_count=%d put_count=%d"
+		       " ino_count=%d [NOT freeing]\n",
+		       __LINE__,
+		       context->serial, context->major, context->in_syscall,
+		       context->name_count, context->put_count,
+		       context->ino_count);
+		for (i = 0; i < context->name_count; i++)
+			printk(KERN_ERR "names[%d] = %p = %s\n", i,
+			       context->names[i].name,
+			       context->names[i].name);
+		dump_stack();
+		return;		/* debug level 2: leave the names in place */
+	}
+#endif
+#if AUDIT_DEBUG
+	context->put_count  = 0;	/* reset the debug counters */
+	context->ino_count  = 0;
+#endif
+
+	for (i = 0; i < context->name_count; i++)
+		if (context->names[i].name)	/* audit_inode() slots: NULL */
+			__putname(context->names[i].name);
+	context->name_count = 0;	/* every slot is free again */
+}
+
+static inline void audit_zero_context(struct audit_context *context,
+				      enum audit_state state)
+{
+	const uid_t saved_loginuid = context->loginuid;
+
+	memset(context, 0, sizeof(struct audit_context));
+	context->loginuid = saved_loginuid;	/* survives the reset */
+	context->state    = state;
+}
+
+static inline struct audit_context *audit_alloc_context(enum audit_state state)
+{
+	struct audit_context *ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+
+	/* NOTE(review): ->loginuid is preserved from raw kmalloc memory. */
+	if (ctx)
+		audit_zero_context(ctx, state);
+	return ctx;
+}
+
+/* Filter on the task information and allocate a per-task audit context
+ * if necessary; doing so turns on system call auditing for the task.
+ * Returns 0, or -ENOMEM if the context cannot be allocated.  Called
+ * from copy_process (in the parent's context), so no lock is needed. */
+int audit_alloc(struct task_struct *tsk)
+{
+	struct audit_context *context;
+	enum audit_state     state;
+
+	if (likely(!audit_enabled))
+		return 0; /* Return if not auditing. */
+
+	state = audit_filter_task(tsk);
+	if (likely(state == AUDIT_DISABLED))
+		return 0;
+
+	if (!(context = audit_alloc_context(state))) {
+		audit_log_lost("out of memory in audit_alloc");
+		return -ENOMEM;
+	}
+
+	/* Preserve the parent's login uid (tsk->audit_context is NULL here) */
+	context->loginuid = -1;
+	if (current->audit_context)
+		context->loginuid = current->audit_context->loginuid;
+
+	tsk->audit_context  = context;
+	set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
+	return 0;
+}
+
+static inline void audit_free_context(struct audit_context *context)
+{
+	struct audit_context *older;
+	int		     nested = 0;
+
+	do {	/* walk the ->previous chain, freeing each context */
+		older = context->previous;
+		if (older || (nested &&  nested < 10)) {
+			++nested;
+			printk(KERN_ERR "audit(:%d): major=%d name_count=%d:"
+			       " freeing multiple contexts (%d)\n",
+			       context->serial, context->major,
+			       context->name_count, nested);
+		}
+		audit_free_names(context);
+		kfree(context);
+		context  = older;
+	} while (context);
+	if (nested >= 10)
+		printk(KERN_ERR "audit: freed %d contexts\n", nested);
+}
+
+static void audit_log_exit(struct audit_context *context)
+{
+	int i;
+	struct audit_buffer *ab;
+
+	ab = audit_log_start(context);	/* first record: the syscall itself */
+	if (!ab)
+		return;		/* audit_panic has been called */
+	audit_log_format(ab, "syscall=%d", context->major);
+	if (context->personality != PER_LINUX)
+		audit_log_format(ab, " per=%lx", context->personality);
+	if (context->return_valid)	/* NOTE(review): %u hides negative codes? */
+		audit_log_format(ab, " exit=%u", context->return_code);
+	audit_log_format(ab,
+		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
+		  " pid=%d loginuid=%d uid=%d gid=%d"
+		  " euid=%d suid=%d fsuid=%d"
+		  " egid=%d sgid=%d fsgid=%d",
+		  context->argv[0],
+		  context->argv[1],
+		  context->argv[2],
+		  context->argv[3],
+		  context->name_count,
+		  context->pid,
+		  context->loginuid,
+		  context->uid,
+		  context->gid,
+		  context->euid, context->suid, context->fsuid,
+		  context->egid, context->sgid, context->fsgid);
+	audit_log_end(ab);
+	for (i = 0; i < context->name_count; i++) {	/* one record per name */
+		ab = audit_log_start(context);
+		if (!ab)
+			continue; /* audit_panic has been called */
+		audit_log_format(ab, "item=%d", i);
+		if (context->names[i].name)
+			audit_log_format(ab, " name=%s",
+					 context->names[i].name);
+		if (context->names[i].ino != (unsigned long)-1) /* -1: unset */
+			audit_log_format(ab, " inode=%lu",
+					 context->names[i].ino);
+		/* FIXME: should use format_dev_t, but ab structure is
+		 * opaque. */
+		if (context->names[i].rdev != -1)	/* -1: unset */
+			audit_log_format(ab, " dev=%02x:%02x",
+					 MAJOR(context->names[i].rdev),
+					 MINOR(context->names[i].rdev));
+		audit_log_end(ab);
+	}
+}
+
+/* Free a per-task audit context.  Called from copy_process and
+ * __put_task_struct. */
+void audit_free(struct task_struct *tsk)
+{
+	struct audit_context *ctx;
+
+	task_lock(tsk);
+	ctx = audit_get_context(tsk, 0, 0);
+	task_unlock(tsk);
+
+	if (likely(!ctx))
+		return;
+
+	/* Some system calls never reach the exit function (e.g.,
+	 * exit_group); log those here, then free the context block. */
+	if (ctx->in_syscall && ctx->auditable)
+		audit_log_exit(ctx);
+
+	audit_free_context(ctx);
+}
+
+/* Compute a serial number for the audit record.  Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces.  The timestamp of the
+ * record and this serial number are used by the user-space daemon to
+ * determine which pieces belong to the same audit record.  The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * Atomic values are only guaranteed to be 24-bit, so we count down.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit.  However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts). */
+static inline unsigned int audit_serial(void)
+{
+	static atomic_t serial = ATOMIC_INIT(0xffffff);	/* counts down */
+	unsigned int a, b;
+
+	do {
+		a = atomic_read(&serial);
+		if (atomic_dec_and_test(&serial))
+			atomic_set(&serial, 0xffffff);	/* hit zero: restart */
+		b = atomic_read(&serial);
+	} while (b != a - 1);	/* retry unless b is exactly one below a */
+
+	return 0xffffff - b;	/* report as an ascending number */
+}
+
+/* Fill in audit context at syscall entry.  This only happens if the
+ * audit context was created when the task was created and the state or
+ * filters demand the audit context be built.  If the state from the
+ * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT,
+ * then the record will be written at syscall exit time (otherwise, it
+ * will only be written if another part of the kernel requests that it
+ * be written). */
+void audit_syscall_entry(struct task_struct *tsk, int major,
+			 unsigned long a1, unsigned long a2,
+			 unsigned long a3, unsigned long a4)
+{
+	struct audit_context *context = tsk->audit_context;
+	enum audit_state     state;
+
+	BUG_ON(!context);
+
+	/* This happens only on certain architectures that make system
+	 * calls in kernel_thread via the entry.S interface, instead of
+	 * with direct calls.  (If you are porting to a new
+	 * architecture, hitting this condition can indicate that you
+	 * got the _exit/_leave calls backward in entry.S.)
+	 *
+	 * i386     no
+	 * x86_64   no
+	 * ppc64    yes (see arch/ppc64/kernel/misc.S)
+	 *
+	 * This also happens with vm86 emulation in a non-nested manner
+	 * (entries without exits), so this case must be caught. */
+	if (context->in_syscall) {
+		struct audit_context *newctx;
+
+#if defined(__NR_vm86) && defined(__NR_vm86old)
+		/* vm86 mode should only be entered once */
+		if (major == __NR_vm86 || major == __NR_vm86old)
+			return;
+#endif
+#if AUDIT_DEBUG
+		printk(KERN_ERR
+		       "audit(:%d) pid=%d in syscall=%d;"
+		       " entering syscall=%d\n",
+		       context->serial, tsk->pid, context->major, major);
+#endif
+		newctx = audit_alloc_context(context->state);
+		if (newctx) {
+			/* A fresh context has no valid loginuid; inherit it. */
+			newctx->loginuid   = context->loginuid;
+			newctx->previous   = context;
+			context		   = newctx;
+			tsk->audit_context = newctx;
+		} else	{
+			/* Can't alloc a new context: the best we can do
+			 * is leak memory (any pending putname is lost).
+			 * The only alternative is to abandon auditing. */
+			audit_zero_context(context, context->state);
+		}
+	}
+	BUG_ON(context->in_syscall || context->name_count);
+
+	if (!audit_enabled)
+		return;
+
+	context->major      = major;
+	context->argv[0]    = a1;
+	context->argv[1]    = a2;
+	context->argv[2]    = a3;
+	context->argv[3]    = a4;
+
+	state = context->state;
+	if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT)
+		state = audit_filter_syscall(tsk, context, &audit_entlist);
+	if (likely(state == AUDIT_DISABLED))
+		return;
+
+	context->serial     = audit_serial();
+	context->ctime      = CURRENT_TIME;
+	context->in_syscall = 1;
+	context->auditable  = !!(state == AUDIT_RECORD_CONTEXT);
+}
+
+/* Tear down after system call.  If the audit context has been marked as
+ * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * filtering, or because some other part of the kernel wrote an audit
+ * message), then write out the syscall information.  In all cases,
+ * free the names stored from getname(). */
+void audit_syscall_exit(struct task_struct *tsk, int return_code)
+{
+	struct audit_context *context;
+
+	get_task_struct(tsk);
+	task_lock(tsk);
+	context = audit_get_context(tsk, 1, return_code);
+	task_unlock(tsk);
+
+	/* Not having a context here is ok, since the parent may have
+	 * called __put_task_struct. */
+	if (likely(!context))
+		goto out;	/* still drop the reference taken above */
+
+	if (context->in_syscall && context->auditable)
+		audit_log_exit(context);
+	context->in_syscall = 0;
+	context->auditable  = 0;
+	if (context->previous) {
+		struct audit_context *new_context = context->previous;
+		context->previous  = NULL;
+		audit_free_context(context);
+		tsk->audit_context = new_context;
+	} else {
+		audit_free_names(context);
+		audit_zero_context(context, context->state);
+		tsk->audit_context = context;
+	}
+out:
+	put_task_struct(tsk);
+}
+
+/* Record a name in the context.  Called from fs/namei.c:getname(). */
+void audit_getname(const char *name)
+{
+	struct audit_context *ctx = current->audit_context;
+
+	BUG_ON(!ctx);
+	if (!ctx->in_syscall) {
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+		       __FILE__, __LINE__, ctx->serial, name);
+		dump_stack();
+#endif
+		return;
+	}
+	BUG_ON(ctx->name_count >= AUDIT_NAMES);
+	ctx->names[ctx->name_count].rdev = -1;
+	ctx->names[ctx->name_count].ino  = (unsigned long)-1;
+	ctx->names[ctx->name_count].name = name;
+	++ctx->name_count;
+}
+
+/* Intercept a putname request.  Called from
+ * include/linux/fs.h:putname().  If we have stored the name from
+ * getname in the audit context, then we delay the putname until syscall
+ * exit. */
+void audit_putname(const char *name)
+{
+	struct audit_context *context = current->audit_context;
+
+	BUG_ON(!context);
+	if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+		       __FILE__, __LINE__, context->serial, name);
+		if (context->name_count) {
+			int i;
+			for (i = 0; i < context->name_count; i++)
+				printk(KERN_ERR "name[%d] = %p = %s\n", i,
+				       context->names[i].name,
+				       context->names[i].name);
+		}
+#endif
+		__putname(name);	/* not in a tracked syscall: free now */
+	}
+#if AUDIT_DEBUG
+	else {	/* in a syscall: only count the put (debug builds) */
+		++context->put_count;
+		if (context->put_count > context->name_count) {
+			printk(KERN_ERR "%s:%d(:%d): major=%d"
+			       " in_syscall=%d putname(%p) name_count=%d"
+			       " put_count=%d\n",
+			       __FILE__, __LINE__,
+			       context->serial, context->major,
+			       context->in_syscall, name, context->name_count,
+			       context->put_count);
+			dump_stack();
+		}
+	}
+#endif
+}
+
+/* Store the inode and device from a lookup (fs/namei.c:path_lookup()).
+ * NOTE(review): assumes a non-NULL audit context; confirm the callers. */
+void audit_inode(const char *name, unsigned long ino, dev_t rdev)
+{
+	int idx;
+	struct audit_context *ctx = current->audit_context;
+
+	if (!ctx->in_syscall)
+		return;
+	if (ctx->name_count
+	    && ctx->names[ctx->name_count-1].name
+	    && ctx->names[ctx->name_count-1].name == name)
+		idx = ctx->name_count - 1;
+	else if (ctx->name_count > 1
+		 && ctx->names[ctx->name_count-2].name
+		 && ctx->names[ctx->name_count-2].name == name)
+		idx = ctx->name_count - 2;
+	else {
+		/* FIXME: how much do we care about inodes that have no
+		 * associated name? */
+		if (ctx->name_count >= AUDIT_NAMES - AUDIT_NAMES_RESERVED)
+			return;
+		idx = ctx->name_count++;
+		ctx->names[idx].name = NULL;
+#if AUDIT_DEBUG
+		++ctx->ino_count;
+#endif
+	}
+	ctx->names[idx].rdev = rdev;
+	ctx->names[idx].ino  = ino;
+}
+
+void audit_get_stamp(struct audit_context *ctx,
+		     struct timespec *t, int *serial)
+{
+	if (!ctx) {
+		*t      = CURRENT_TIME;
+		*serial = 0;
+		return;
+	}
+	*serial    = ctx->serial;
+	t->tv_sec  = ctx->ctime.tv_sec;
+	t->tv_nsec = ctx->ctime.tv_nsec;
+	ctx->auditable = 1;	/* force the syscall record out at exit */
+}
+
+int audit_set_loginuid(struct audit_context *ctx, uid_t loginuid)
+{
+	if (!ctx)
+		return 0;
+	if (loginuid < 0)	/* NOTE(review): always false if uid_t is unsigned */
+		return -EINVAL;
+	ctx->loginuid = loginuid;
+	return 0;
+}
+
+EXPORT_SYMBOL_GPL(audit_alloc);
+EXPORT_SYMBOL_GPL(audit_free);
+EXPORT_SYMBOL_GPL(audit_syscall_entry);
+EXPORT_SYMBOL_GPL(audit_syscall_exit);
+EXPORT_SYMBOL_GPL(audit_getname);
+EXPORT_SYMBOL_GPL(audit_putname);
+EXPORT_SYMBOL_GPL(audit_inode);
diff --git a/kernel/fork.c b/kernel/fork.c
index fc25a3a15d0e..6035db6957f8 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -32,6 +32,7 @@
 #include <linux/futex.h>
 #include <linux/ptrace.h>
 #include <linux/mount.h>
+#include <linux/audit.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -83,6 +84,8 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	if (unlikely(tsk->audit_context))
+		audit_free(tsk);
 	security_task_free(tsk);
 	free_uid(tsk->user);
 	put_group_info(tsk->group_info);
@@ -949,13 +952,16 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	p->start_time = get_jiffies_64();
 	p->security = NULL;
 	p->io_context = NULL;
+	p->audit_context = NULL;
 
 	retval = -ENOMEM;
 	if ((retval = security_task_alloc(p)))
 		goto bad_fork_cleanup;
+	if ((retval = audit_alloc(p)))
+		goto bad_fork_cleanup_security;
 	/* copy all the process information */
 	if ((retval = copy_semundo(clone_flags, p)))
-		goto bad_fork_cleanup_security;
+		goto bad_fork_cleanup_audit;
 	if ((retval = copy_files(clone_flags, p)))
 		goto bad_fork_cleanup_semundo;
 	if ((retval = copy_fs(clone_flags, p)))
@@ -1090,6 +1096,8 @@ bad_fork_cleanup_files:
 	exit_files(p); /* blocking */
 bad_fork_cleanup_semundo:
 	exit_sem(p);
+bad_fork_cleanup_audit:
+	audit_free(p);
 bad_fork_cleanup_security:
 	security_task_free(p);
 bad_fork_cleanup:
diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index dad7ae38cc91..2e431763dc73 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -22,11 +22,14 @@
 #include <linux/un.h>
 #include <net/af_unix.h>
 #include <linux/ip.h>
+#include <linux/audit.h>
 #include <linux/ipv6.h>
 #include <net/ipv6.h>
 #include "avc.h"
 #include "avc_ss.h"
+#ifdef CONFIG_AUDIT
 #include "class_to_string.h"
+#endif
 #include "common_perm_to_string.h"
 #include "av_inherit.h"
 #include "av_perm_to_string.h"
@@ -68,14 +71,10 @@ struct avc_callback_node {
 };
 
 static spinlock_t avc_lock = SPIN_LOCK_UNLOCKED;
-static spinlock_t avc_log_lock = SPIN_LOCK_UNLOCKED;
 static struct avc_node *avc_node_freelist = NULL;
 static struct avc_cache avc_cache;
-static char *avc_audit_buffer = NULL;
 static unsigned avc_cache_stats[AVC_NSTATS];
 static struct avc_callback_node *avc_callbacks = NULL;
-static unsigned int avc_log_level = 4; /* default:  KERN_WARNING */
-static char avc_level_string[4] = "< >";
 
 static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
 {
@@ -87,14 +86,14 @@ static inline int avc_hash(u32 ssid, u32 tsid, u16 tclass)
  * @tclass: target security class
  * @av: access vector
  */
-void avc_dump_av(u16 tclass, u32 av)
+void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av)
 {
 	char **common_pts = 0;
 	u32 common_base = 0;
 	int i, i2, perm;
 
 	if (av == 0) {
-		printk(" null");
+		audit_log_format(ab, " null");
 		return;
 	}
 
@@ -106,12 +105,12 @@ void avc_dump_av(u16 tclass, u32 av)
 		}
 	}
 
-	printk(" {");
+	audit_log_format(ab, " {");
 	i = 0;
 	perm = 1;
 	while (perm < common_base) {
 		if (perm & av)
-			printk(" %s", common_pts[i]);
+			audit_log_format(ab, " %s", common_pts[i]);
 		i++;
 		perm <<= 1;
 	}
@@ -124,13 +123,14 @@ void avc_dump_av(u16 tclass, u32 av)
 					break;
 			}
 			if (i2 < ARRAY_SIZE(av_perm_to_string))
-				printk(" %s", av_perm_to_string[i2].name);
+				audit_log_format(ab, " %s",
+						 av_perm_to_string[i2].name);
 		}
 		i++;
 		perm <<= 1;
 	}
 
-	printk(" }");
+	audit_log_format(ab, " }");
 }
 
 /**
@@ -139,7 +139,7 @@ void avc_dump_av(u16 tclass, u32 av)
  * @tsid: target security identifier
  * @tclass: target security class
  */
-void avc_dump_query(u32 ssid, u32 tsid, u16 tclass)
+void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tclass)
 {
 	int rc;
 	char *scontext;
@@ -147,20 +147,20 @@ void avc_dump_query(u32 ssid, u32 tsid, u16 tclass)
 
  	rc = security_sid_to_context(ssid, &scontext, &scontext_len);
 	if (rc)
-		printk("ssid=%d", ssid);
+		audit_log_format(ab, "ssid=%d", ssid);
 	else {
-		printk("scontext=%s", scontext);
+		audit_log_format(ab, "scontext=%s", scontext);
 		kfree(scontext);
 	}
 
 	rc = security_sid_to_context(tsid, &scontext, &scontext_len);
 	if (rc)
-		printk(" tsid=%d", tsid);
+		audit_log_format(ab, " tsid=%d", tsid);
 	else {
-		printk(" tcontext=%s", scontext);
+		audit_log_format(ab, " tcontext=%s", scontext);
 		kfree(scontext);
 	}
-	printk(" tclass=%s", class_to_string[tclass]);
+	audit_log_format(ab, " tclass=%s", class_to_string[tclass]);
 }
 
 /**
@@ -194,11 +194,7 @@ void __init avc_init(void)
 		avc_node_freelist = new;
 	}
 
-	avc_audit_buffer = (char *)__get_free_page(GFP_ATOMIC);
-	if (!avc_audit_buffer)
-		panic("AVC:  unable to allocate audit buffer\n");
-
-	avc_level_string[1] = '0' + avc_log_level;
+	audit_log(current->audit_context, "AVC INITIALIZED\n");
 }
 
 #if 0
@@ -430,12 +426,13 @@ static inline void avc_print_ipv6_addr(struct in6_addr *addr, u16 port,
 		printk(" %s=%d", name2, ntohs(port));
 }
 
-static inline void avc_print_ipv4_addr(u32 addr, u16 port, char *name1, char *name2)
+static inline void avc_print_ipv4_addr(struct audit_buffer *ab, u32 addr,
+				       u16 port, char *name1, char *name2)
 {
 	if (addr)
-		printk(" %s=%d.%d.%d.%d", name1, NIPQUAD(addr));
+		audit_log_format(ab, " %s=%d.%d.%d.%d", name1, NIPQUAD(addr));
 	if (port)
-		printk(" %s=%d", name2, ntohs(port));
+		audit_log_format(ab, " %s=%d", name2, ntohs(port));
 }
 
 /*
@@ -515,9 +512,8 @@ void avc_audit(u32 ssid, u32 tsid,
 {
 	struct task_struct *tsk = current;
 	struct inode *inode = NULL;
-	char *p;
 	u32 denied, audited;
-	unsigned long flags;
+	struct audit_buffer *ab;
 
 	denied = requested & ~avd->allowed;
 	if (denied) {
@@ -535,19 +531,18 @@ void avc_audit(u32 ssid, u32 tsid,
 	if (!check_avc_ratelimit())
 		return;
 
-	/* prevent overlapping printks */
-	spin_lock_irqsave(&avc_log_lock,flags);
-
-	printk("%s\n", avc_level_string);
-	printk("%savc:  %s ", avc_level_string, denied ? "denied" : "granted");
-	avc_dump_av(tclass,audited);
-	printk(" for ");
+	ab = audit_log_start(current->audit_context);
+	if (!ab)
+		return;		/* audit_panic has been called */
+	audit_log_format(ab, "avc:  %s ", denied ? "denied" : "granted");
+	avc_dump_av(ab, tclass,audited);
+	audit_log_format(ab, " for ");
 	if (a && a->tsk)
 		tsk = a->tsk;
 	if (tsk && tsk->pid) {
 		struct mm_struct *mm;
 		struct vm_area_struct *vma;
-		printk(" pid=%d", tsk->pid);
+		audit_log_format(ab, " pid=%d", tsk->pid);
 		if (tsk == current)
 			mm = current->mm;
 		else
@@ -558,11 +553,9 @@ void avc_audit(u32 ssid, u32 tsid,
 				while (vma) {
 					if ((vma->vm_flags & VM_EXECUTABLE) &&
 					    vma->vm_file) {
-						p = d_path(vma->vm_file->f_dentry,
-							   vma->vm_file->f_vfsmnt,
-							   avc_audit_buffer,
-							   PAGE_SIZE);
-						printk(" exe=%s", p);
+						audit_log_d_path(ab, "exe=",
+							vma->vm_file->f_dentry,
+							vma->vm_file->f_vfsmnt);
 						break;
 					}
 					vma = vma->vm_next;
@@ -572,29 +565,26 @@ void avc_audit(u32 ssid, u32 tsid,
 			if (tsk != current)
 				mmput(mm);
 		} else {
-			printk(" comm=%s", tsk->comm);
+			audit_log_format(ab, " comm=%s", tsk->comm);
 		}
 	}
 	if (a) {
 		switch (a->type) {
 		case AVC_AUDIT_DATA_IPC:
-			printk(" key=%d", a->u.ipc_id);
+			audit_log_format(ab, " key=%d", a->u.ipc_id);
 			break;
 		case AVC_AUDIT_DATA_CAP:
-			printk(" capability=%d", a->u.cap);
+			audit_log_format(ab, " capability=%d", a->u.cap);
 			break;
 		case AVC_AUDIT_DATA_FS:
 			if (a->u.fs.dentry) {
 				struct dentry *dentry = a->u.fs.dentry;
 				if (a->u.fs.mnt) {
-					p = d_path(dentry,
-						   a->u.fs.mnt,
-						   avc_audit_buffer,
-						   PAGE_SIZE);
-					if (p)
-						printk(" path=%s", p);
+					audit_log_d_path(ab, "path=", dentry,
+							a->u.fs.mnt);
 				} else {
-					printk(" name=%s", dentry->d_name.name);
+					audit_log_format(ab, " name=%s",
+							 dentry->d_name.name);
 				}
 				inode = dentry->d_inode;
 			} else if (a->u.fs.inode) {
@@ -602,29 +592,33 @@ void avc_audit(u32 ssid, u32 tsid,
 				inode = a->u.fs.inode;
 				dentry = d_find_alias(inode);
 				if (dentry) {
-					printk(" name=%s", dentry->d_name.name);
+					audit_log_format(ab, " name=%s",
+							 dentry->d_name.name);
 					dput(dentry);
 				}
 			}
 			if (inode)
-				printk(" dev=%s ino=%ld",
-				       inode->i_sb->s_id, inode->i_ino);
+				audit_log_format(ab, " dev=%s ino=%ld",
+						 inode->i_sb->s_id,
+						 inode->i_ino);
 			break;
 		case AVC_AUDIT_DATA_NET:
 			if (a->u.net.sk) {
 				struct sock *sk = a->u.net.sk;
 				struct unix_sock *u;
+				int len = 0;
+				char *p = NULL;
 
 				switch (sk->sk_family) {
 				case AF_INET: {
 					struct inet_opt *inet = inet_sk(sk);
 
-					avc_print_ipv4_addr(inet->rcv_saddr,
-					                    inet->sport,
-					                    "laddr", "lport");
-					avc_print_ipv4_addr(inet->daddr,
-					                    inet->dport,
-					                    "faddr", "fport");
+					avc_print_ipv4_addr(ab, inet->rcv_saddr,
+							    inet->sport,
+							    "laddr", "lport");
+					avc_print_ipv4_addr(ab, inet->daddr,
+							    inet->dport,
+							    "faddr", "fport");
 					break;
 				}
 				case AF_INET6: {
@@ -642,34 +636,32 @@ void avc_audit(u32 ssid, u32 tsid,
 				case AF_UNIX:
 					u = unix_sk(sk);
 					if (u->dentry) {
-						p = d_path(u->dentry,
-							   u->mnt,
-							   avc_audit_buffer,
-							   PAGE_SIZE);
-						printk(" path=%s", p);
-					} else if (u->addr) {
-						p = avc_audit_buffer;
-						memcpy(p,
-						       u->addr->name->sun_path,
-						       u->addr->len-sizeof(short));
-						if (*p == 0) {
-							*p = '@';
-							p += u->addr->len-sizeof(short);
-							*p = 0;
-						}
-						printk(" path=%s",
-						       avc_audit_buffer);
+						audit_log_d_path(ab, "path=",
+							u->dentry, u->mnt);
+						break;
 					}
+					if (!u->addr)
+						break;
+					len = u->addr->len-sizeof(short);
+					p = &u->addr->name->sun_path[0];
+					if (*p)
+						audit_log_format(ab,
+							"path=%*.*s", len,
+							len, p);
+					else
+						audit_log_format(ab,
+							"path=@%*.*s", len-1,
+							len-1, p+1);
 					break;
 				}
 			}
 			
 			switch (a->u.net.family) {
 			case AF_INET:
-				avc_print_ipv4_addr(a->u.net.v4info.saddr,
+				avc_print_ipv4_addr(ab, a->u.net.v4info.saddr,
 						    a->u.net.sport,
 						    "saddr", "src");
-				avc_print_ipv4_addr(a->u.net.v4info.daddr,
+				avc_print_ipv4_addr(ab, a->u.net.v4info.daddr,
 						    a->u.net.dport,
 						    "daddr", "dest");
 				break;
@@ -683,15 +675,14 @@ void avc_audit(u32 ssid, u32 tsid,
 				break;
 			}
 			if (a->u.net.netif)
-				printk(" netif=%s", a->u.net.netif);
+				audit_log_format(ab, " netif=%s",
+					a->u.net.netif);
 			break;
 		}
 	}
-	printk(" ");
-	avc_dump_query(ssid, tsid, tclass);
-	printk("\n");
-
-	spin_unlock_irqrestore(&avc_log_lock,flags);
+	audit_log_format(ab, " ");
+	avc_dump_query(ab, ssid, tsid, tclass);
+	audit_log_end(ab);
 }
 
 /**
@@ -1120,14 +1111,3 @@ int avc_has_perm(u32 ssid, u32 tsid, u16 tclass,
 	avc_audit(ssid, tsid, tclass, requested, &avd, rc, auditdata);
 	return rc;
 }
-
-static int __init avc_log_level_setup(char *str)
-{
-	avc_log_level = simple_strtol(str, NULL, 0);
-	if (avc_log_level > 7)
-		avc_log_level = 7;
-	return 1;
-}
-
-__setup("avc_log_level=", avc_log_level_setup);
-
diff --git a/security/selinux/include/avc.h b/security/selinux/include/avc.h
index c143db4ca685..86bdeef585a4 100644
--- a/security/selinux/include/avc.h
+++ b/security/selinux/include/avc.h
@@ -127,9 +127,10 @@ static inline void avc_cache_stats_add(int type, unsigned val)
 /*
  * AVC display support
  */
-void avc_dump_av(u16 tclass, u32 av);
-void avc_dump_query(u32 ssid, u32 tsid, u16 tclass);
-void avc_dump_cache(char *tag);
+struct audit_buffer;
+void avc_dump_av(struct audit_buffer *ab, u16 tclass, u32 av);
+void avc_dump_query(struct audit_buffer *ab, u32 ssid, u32 tsid, u16 tclass);
+void avc_dump_cache(struct audit_buffer *ab, char *tag);
 
 /*
  * AVC operations
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index d8c18ed0087b..f2a53e22b060 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -399,7 +399,7 @@ int security_sid_to_context(u32 sid, char **scontext, u32 *scontext_len)
 			char *scontextp;
 
 			*scontext_len = strlen(initial_sid_to_string[sid]) + 1;
-			scontextp = kmalloc(*scontext_len,GFP_KERNEL);
+			scontextp = kmalloc(*scontext_len,GFP_ATOMIC);
 			strcpy(scontextp, initial_sid_to_string[sid]);
 			*scontext = scontextp;
 			goto out;
-- 
cgit v1.2.3


From a804dbaf0e0feb822e0e72a8716c5a4829c7421c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:30:58 -0700
Subject: [PATCH] es1688 Definition redundancy

From: Fabian Frederick <Fabian.Frederick@skynet.be>

Here's a trivial patch to avoid definition redundancy in es1688.
---
 include/sound/es1688.h        | 2 ++
 sound/isa/es1688/es1688.c     | 2 --
 sound/isa/es1688/es1688_lib.c | 2 --
 3 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/sound/es1688.h b/include/sound/es1688.h
index 104bb9b022cc..d390c64768f5 100644
--- a/include/sound/es1688.h
+++ b/include/sound/es1688.h
@@ -55,6 +55,8 @@ struct _snd_es1688 {
 
 typedef struct _snd_es1688 es1688_t;
 
+#define chip_t es1688_t
+
 /* I/O ports */
 
 #define ES1688P(codec, x) ((codec)->port + e_s_s_ESS1688##x)
diff --git a/sound/isa/es1688/es1688.c b/sound/isa/es1688/es1688.c
index 20a949c0cd64..79a5dd6dc97d 100644
--- a/sound/isa/es1688/es1688.c
+++ b/sound/isa/es1688/es1688.c
@@ -34,8 +34,6 @@
 #define SNDRV_GET_ID
 #include <sound/initval.h>
 
-#define chip_t es1688_t
-
 MODULE_AUTHOR("Jaroslav Kysela <perex@suse.cz>");
 MODULE_DESCRIPTION("ESS ESx688 AudioDrive");
 MODULE_LICENSE("GPL");
diff --git a/sound/isa/es1688/es1688_lib.c b/sound/isa/es1688/es1688_lib.c
index 62267ccb9f5e..66d08b80467f 100644
--- a/sound/isa/es1688/es1688_lib.c
+++ b/sound/isa/es1688/es1688_lib.c
@@ -37,8 +37,6 @@ MODULE_DESCRIPTION("ESS ESx688 lowlevel module");
 MODULE_CLASSES("{sound}");
 MODULE_LICENSE("GPL");
 
-#define chip_t es1688_t
-
 static int snd_es1688_dsp_command(es1688_t *chip, unsigned char val)
 {
 	int i;
-- 
cgit v1.2.3


From 4f9ad28f535ebe88803e7129a455338e5949d02d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:32:16 -0700
Subject: [PATCH] H8/300 support update (1/3) - ptrace fix

From: Yoshinori Sato <ysato@users.sourceforge.jp>

- fix PTRACE_SINGLESTEP bug.
- separate CPU-dependent code into per-platform files.
---
 arch/h8300/kernel/ptrace.c                 | 231 ++---------------------
 arch/h8300/platform/h8300h/Makefile        |   3 +-
 arch/h8300/platform/h8300h/ptrace_h8300h.c | 282 +++++++++++++++++++++++++++++
 arch/h8300/platform/h8s/Makefile           |   2 +-
 arch/h8300/platform/h8s/ptrace_h8s.c       |  84 +++++++++
 include/asm-h8300/processor.h              |  15 +-
 include/asm-h8300/ptrace.h                 |  11 ++
 7 files changed, 407 insertions(+), 221 deletions(-)
 create mode 100644 arch/h8300/platform/h8300h/ptrace_h8300h.c
 create mode 100644 arch/h8300/platform/h8s/ptrace_h8s.c

(limited to 'include')

diff --git a/arch/h8300/kernel/ptrace.c b/arch/h8300/kernel/ptrace.c
index dd47698b8e51..0843013d149a 100644
--- a/arch/h8300/kernel/ptrace.c
+++ b/arch/h8300/kernel/ptrace.c
@@ -1,7 +1,7 @@
 /*
  *  linux/arch/h8300/kernel/ptrace.c
  *
- *  Yoshinori Sato <qzb04471@nifty.ne.jp>
+ *  Yoshinori Sato <ysato@users.sourceforge.jp>
  *
  *  Based on:
  *  linux/arch/m68k/kernel/ptrace.c
@@ -32,194 +32,17 @@
 #include <asm/processor.h>
 #include <asm/signal.h>
 
+/* CPU-dependent functions (implemented per platform) */
+extern long h8300_get_reg(struct task_struct *task, int regno);
+extern int  h8300_put_reg(struct task_struct *task, int regno, unsigned long data);
+extern void h8300_disable_trace(struct task_struct *child);
+extern void h8300_enable_trace(struct task_struct *child);
+
 /*
  * does not yet catch signals sent when the child dies.
  * in exit.c or in signal.c.
  */
 
-/* determines which bits in the SR the user has access to. */
-/* 1 = access 0 = no access */
-#define SR_MASK 0x001f
-
-/* sets the trace bits. */
-#define TRACE_BITS 0x8000
-
-/* Find the stack offset for a register, relative to thread.esp0. */
-#define PT_REG(reg)	((long)&((struct pt_regs *)0)->reg)
-/* Mapping from PT_xxx to the stack offset at which the register is
-   saved.  Notice that usp has no stack-slot and needs to be treated
-   specially (see get_reg/put_reg below). */
-static const int regoff[] = {
-	PT_REG(er1), PT_REG(er2), PT_REG(er3), PT_REG(er4),
-	PT_REG(er5), PT_REG(er6), PT_REG(er0), PT_REG(orig_er0),
-	PT_REG(ccr), PT_REG(pc)
-};
-
-/*
- * Get contents of register REGNO in task TASK.
- */
-static inline long get_reg(struct task_struct *task, int regno)
-{
-	unsigned long *addr;
-
-	if (regno == PT_USP)
-		addr = &task->thread.usp;
-	else if (regno < sizeof(regoff)/sizeof(regoff[0]))
-		addr = (unsigned long *)(task->thread.esp0 + regoff[regno]);
-	else
-		return 0;
-	return *addr;
-}
-
-/*
- * Write contents of register REGNO in task TASK.
- */
-static inline int put_reg(struct task_struct *task, int regno,
-			  unsigned long data)
-{
-	unsigned long *addr;
-
-	if (regno == PT_USP)
-		addr = &task->thread.usp;
-	else if (regno < sizeof(regoff)/sizeof(regoff[0]))
-		addr = (unsigned long *) (task->thread.esp0 + regoff[regno]);
-	else
-		return -1;
-	*addr = data;
-	return 0;
-}
-
-/*
- * Called by kernel/ptrace.c when detaching..
- *
- * Make sure the single step bit is not set.
- */
-int ptrace_cancel_bpt(struct task_struct *child)
-{
-        int i,r=0;
-
-	for(i=0; i<4; i++) {
-	        if (child->thread.debugreg[i]) {
-		        if (child->thread.debugreg[i] != ~0)
-		                put_user(child->thread.debugreg[i+4],
-                                         (unsigned short *)child->thread.debugreg[i]);
-			r = 1;
-			child->thread.debugreg[i] = 0;
-		}
-	}
-	return r;
-}
-
-const static unsigned char opcode0[]={
-  0x04,0x02,0x04,0x02,0x04,0x02,0x04,0x02,  /* 0x58 */
-  0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,  /* 0x60 */
-  0x02,0x02,0x11,0x11,0x02,0x02,0x04,0x04,  /* 0x68 */
-  0x02,0x02,0x02,0x02,0x02,0x02,0x02,0x02,  /* 0x70 */
-  0x08,0x04,0x06,0x04,0x04,0x04,0x04,0x04}; /* 0x78 */
-
-const static int table_parser01(unsigned char *pc);
-const static int table_parser02(unsigned char *pc);
-const static int table_parser100(unsigned char *pc);
-const static int table_parser101(unsigned char *pc);
-
-const static int (*parsers[])(unsigned char *pc)={table_parser01,table_parser02};
-
-static int insn_length(unsigned char *pc)
-{
-  if (*pc == 0x01)
-    return table_parser01(pc+1);
-  if (*pc < 0x58 || *pc>=0x80) 
-    return 2;
-  else
-    if (opcode0[*pc-0x58]<0x10)
-      return opcode0[*pc-0x58];
-    else
-      return (*parsers[opcode0[*pc-0x58]-0x10])(pc+1);
-}
-
-const static int table_parser01(unsigned char *pc)
-{
-  const unsigned char codelen[]={0x10,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-                                 0x02,0x00,0x00,0x00,0x04,0x04,0x00,0x04};
-  const static int (*parsers[])(unsigned char *)={table_parser100,table_parser101};
-  unsigned char second_index;
-  second_index = (*pc) >> 4;
-  if (codelen[second_index]<0x10)
-    return codelen[second_index];
-  else
-    return parsers[codelen[second_index]-0x10](pc);
-}
-
-const static int table_parser02(unsigned char *pc)
-{
-  return (*pc & 0x20)?0x06:0x04;
-}
-
-const static int table_parser100(unsigned char *pc)
-{
-  return (*(pc+2) & 0x02)?0x08:0x06;
-}
-
-const static int table_parser101(unsigned char *pc)
-{
-  return (*(pc+2) & 0x02)?0x08:0x06;
-}
-
-#define BREAK_INST 0x5730 /* TRAPA #3 */
-
-int ptrace_set_bpt(struct task_struct *child)
-{
-        unsigned long pc,next;
-	unsigned short insn;
-	pc = get_reg(child,PT_PC);
-	next = insn_length((unsigned char *)pc) + pc;
-	get_user(insn,(unsigned short *)pc);
-	if (insn == 0x5470) {
-	        /* rts */ 
-	        unsigned long sp;
-		sp = get_reg(child,PT_USP);
-		get_user(next,(unsigned long *)sp);
-	} else if ((insn & 0xfb00) != 0x5800) {
-	        /* jmp / jsr */
-	        int regs;
-		const short reg_tbl[]={PT_ER0,PT_ER1,PT_ER2,PT_ER3,
-                                       PT_ER4,PT_ER5,PT_ER6,PT_USP};
-	        switch(insn & 0xfb00) {
-		        case 0x5900:
-			       regs = (insn & 0x0070) >> 8;
-                               next = get_reg(child,reg_tbl[regs]);
-			       break;
-		        case 0x5a00:
-			       get_user(next,(unsigned long *)(pc+2));
-			       next &= 0x00ffffff;
-			       break;
-		        case 0x5b00:
-			       /* unneccessary? */
-			       next = *(unsigned long *)(insn & 0xff);
-                               break;
-		}
-	} else if (((insn & 0xf000) == 0x4000) || ((insn &0xff00) == 0x5500)) { 
-	        /* b**:8 */
-	        unsigned long dsp;
-		dsp = (long)(insn && 0xff)+pc+2;
-		child->thread.debugreg[1] = dsp;
-		get_user(child->thread.debugreg[5],(unsigned short *)dsp);
-		put_user(BREAK_INST,(unsigned short *)dsp);
-	} else if (((insn & 0xff00) == 0x5800) || ((insn &0xff00) == 0x5c00)) { 
-	        /* b**:16 */
-	        unsigned long dsp;
-		get_user(dsp,(unsigned short *)(pc+2));
-		dsp = (long)dsp+pc+4;
-		child->thread.debugreg[1] = dsp;
-		get_user(child->thread.debugreg[5],(unsigned short *)dsp);
-		put_user(BREAK_INST,(unsigned short *)dsp);
-	}
-	child->thread.debugreg[0] = next;
-	get_user(child->thread.debugreg[4],(unsigned short *)next);
-	put_user(BREAK_INST,(unsigned short *)next);
-	return 0;
-}
-
 inline
 static int read_long(struct task_struct * tsk, unsigned long addr,
 	unsigned long * result)
@@ -230,7 +53,7 @@ static int read_long(struct task_struct * tsk, unsigned long addr,
 
 void ptrace_disable(struct task_struct *child)
 {
-	ptrace_cancel_bpt(child);
+	h8300_disable_trace(child);
 }
 
 asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
@@ -298,8 +121,8 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 			
 			tmp = 0;  /* Default return condition */
 			addr = addr >> 2; /* temporary hack. */
-			if (addr < 10)
-				tmp = get_reg(child, addr);
+			if (addr < H8300_REGS_NO)
+				tmp = h8300_get_reg(child, addr);
 			else {
 				ret = -EIO;
 				break ;
@@ -328,14 +151,8 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 				ret = -EIO;
 				break ;
 			}
-			if (addr == PT_CCR) {
-				data &= SR_MASK;
-			}
-			if (addr < 10) {
-				if (put_reg(child, addr, data))
-					ret = -EIO;
-				else
-					ret = 0;
+			if (addr < H8300_REGS_NO) {
+				ret = h8300_put_reg(child, addr, data);
 				break ;
 			}
 			ret = -EIO;
@@ -352,7 +169,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 			child->exit_code = data;
 			wake_up_process(child);
 			/* make sure the single step bit is not set. */
-			ptrace_cancel_bpt(child);
+			h8300_disable_trace(child);
 			ret = 0;
 		}
 
@@ -367,7 +184,7 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 			if (child->state == TASK_ZOMBIE) /* already dead */
 				break;
 			child->exit_code = SIGKILL;
-			ptrace_cancel_bpt(child);
+			h8300_disable_trace(child);
 			wake_up_process(child);
 			break;
 		}
@@ -377,8 +194,8 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 			if ((unsigned long) data > _NSIG)
 				break;
 			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-			child->thread.debugreg[0]=-1;
 			child->exit_code = data;
+			h8300_enable_trace(child);
 			wake_up_process(child);
 			ret = 0;
 			break;
@@ -391,8 +208,8 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 		case PTRACE_GETREGS: { /* Get all gp regs from the child. */
 		  	int i;
 			unsigned long tmp;
-			for (i = 0; i < 19; i++) {
-			    tmp = get_reg(child, i);
+			for (i = 0; i < H8300_REGS_NO; i++) {
+			    tmp = h8300_get_reg(child, i);
 			    if (put_user(tmp, (unsigned long *) data)) {
 				ret = -EFAULT;
 				break;
@@ -406,12 +223,12 @@ asmlinkage int sys_ptrace(long request, long pid, long addr, long data)
 		case PTRACE_SETREGS: { /* Set all gp regs in the child. */
 			int i;
 			unsigned long tmp;
-			for (i = 0; i < 10; i++) {
+			for (i = 0; i < H8300_REGS_NO; i++) {
 			    if (get_user(tmp, (unsigned long *) data)) {
 				ret = -EFAULT;
 				break;
 			    }
-			    put_reg(child, i, tmp);
+			    h8300_put_reg(child, i, tmp);
 			    data += sizeof(long);
 			}
 			ret = 0;
@@ -449,13 +266,3 @@ asmlinkage void syscall_trace(void)
 		current->exit_code = 0;
 	}
 }
-
-asmlinkage void trace_trap(unsigned long bp)
-{
-	if (current->thread.debugreg[0] == bp ||
-            current->thread.debugreg[1] == bp) {
-	        ptrace_cancel_bpt(current);
-		force_sig(SIGTRAP,current);
-	} else
-	        force_sig(SIGILL,current);
-}
diff --git a/arch/h8300/platform/h8300h/Makefile b/arch/h8300/platform/h8300h/Makefile
index 9a920fe01921..5d42c772f75a 100644
--- a/arch/h8300/platform/h8300h/Makefile
+++ b/arch/h8300/platform/h8300h/Makefile
@@ -4,5 +4,4 @@
 # Reuse any files we can from the H8/300H
 #
 
-obj-y := entry.o ints_h8300h.o
-
+obj-y := entry.o ints_h8300h.o ptrace_h8300h.o
diff --git a/arch/h8300/platform/h8300h/ptrace_h8300h.c b/arch/h8300/platform/h8300h/ptrace_h8300h.c
new file mode 100644
index 000000000000..69f6ae19e1e2
--- /dev/null
+++ b/arch/h8300/platform/h8300h/ptrace_h8300h.c
@@ -0,0 +1,282 @@
+/*
+ *  linux/arch/h8300/platform/h8300h/ptrace_h8300h.c
+ *    CPU-dependent ptrace helper functions
+ *
+ *  Yoshinori Sato <ysato@users.sourceforge.jp>
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file COPYING in the main directory of
+ * this archive for more details.
+ */
+
+#include <linux/linkage.h>
+#include <linux/sched.h>
+#include <asm/ptrace.h>
+
+#define CCR_MASK 0x6f    /* CCR bits ptrace may change (mode/imask not set) */
+#define BREAKINST 0x5730 /* trapa #3, planted as the single-step breakpoint */
+
+/* Mapping from PT_xxx to the stack offset at which the register is
+   saved.  Notice that usp has no stack-slot and needs to be treated
+   specially (see h8300_get_reg/h8300_put_reg below). */
+static const int h8300_register_offset[] = {
+	PT_REG(er1), PT_REG(er2), PT_REG(er3), PT_REG(er4),
+	PT_REG(er5), PT_REG(er6), PT_REG(er0), PT_REG(orig_er0),
+	PT_REG(ccr), PT_REG(pc)
+};
+
+/* read register REGNO of TASK (regno indexes h8300_register_offset[] unchecked) */
+long h8300_get_reg(struct task_struct *task, int regno)
+{
+	switch (regno) {
+	case PT_USP:	/* kept in thread_struct, not in the saved frame */
+		return task->thread.usp + sizeof(long)*2;
+	case PT_CCR:	/* ccr is read as a 16-bit value */
+	    return *(unsigned short *)(task->thread.esp0 + h8300_register_offset[regno]);
+	default:
+	    return *(unsigned long *)(task->thread.esp0 + h8300_register_offset[regno]);
+	}
+}
+
+/* Write DATA to register REGNO of TASK; only the CCR_MASK bits of ccr change. */
+int h8300_put_reg(struct task_struct *task, int regno, unsigned long data)
+{
+	unsigned short oldccr;
+	switch (regno) {
+	case PT_USP:
+		task->thread.usp = data - sizeof(long)*2;
+		break;	/* usp has no stack slot: must not fall into PT_CCR */
+	case PT_CCR:
+		oldccr = *(unsigned short *)(task->thread.esp0 + h8300_register_offset[regno]);
+		oldccr &= ~CCR_MASK;
+		data = (data & CCR_MASK) | oldccr;	/* keep the protected bits */
+		*(unsigned short *)(task->thread.esp0 + h8300_register_offset[regno]) = data;
+		break;
+	default:
+		*(unsigned long *)(task->thread.esp0 + h8300_register_offset[regno]) = data;
+		break;
+	}
+	return 0;
+}
+
+/* disable singlestep: put back the instruction replaced by the breakpoint */
+void h8300_disable_trace(struct task_struct *child)
+{
+	if((long)child->thread.breakinfo.addr != -1L) {	/* -1 means no breakpoint is set */
+		*child->thread.breakinfo.addr = child->thread.breakinfo.inst;
+		child->thread.breakinfo.addr = (unsigned short *)-1L;
+	}
+}
+
+/* instruction kinds distinguished when calculating the next pc */
+enum jump_type {none,    /* normal instruction */
+		jabs,    /* absolute address jump */
+		ind,     /* indirect address jump */
+		ret,     /* return from subroutine */
+		reg,     /* register indexed jump */
+		relb,    /* pc relative jump (byte offset) */
+		relw,    /* pc relative jump (word offset) */
+               };
+
+/* opcode decode table entry (built with OPTABLE below)
+   ptn: opcode pattern, matched after masking the fetched byte with msk
+   msk: opcode bitmask
+   len: instruction length in 16-bit words (<0: negated next table index)
+   jmp: jump operation mode (enum jump_type) */
+struct optable {
+	unsigned char bitpattern;
+	unsigned char bitmask;
+	signed char length;
+	signed char type;
+} __attribute__((aligned(1),packed));
+
+#define OPTABLE(ptn,msk,len,jmp)   \
+        {                          \
+		.bitpattern = ptn, \
+		.bitmask    = msk, \
+		.length	    = len, \
+		.type       = jmp, \
+	}
+
+const static struct optable optable_0[] = {	/* matched against the first opcode byte */
+	OPTABLE(0x00,0xff, 1,none), /* 0x00 */
+	OPTABLE(0x01,0xff,-1,none), /* 0x01 */
+	OPTABLE(0x02,0xfe, 1,none), /* 0x02-0x03 */
+	OPTABLE(0x04,0xee, 1,none), /* 0x04-0x05/0x14-0x15 */
+	OPTABLE(0x06,0xfe, 1,none), /* 0x06-0x07 */
+	OPTABLE(0x08,0xea, 1,none), /* 0x08-0x09/0x0c-0x0d/0x18-0x19/0x1c-0x1d */
+	OPTABLE(0x0a,0xee, 1,none), /* 0x0a-0x0b/0x1a-0x1b */
+	OPTABLE(0x0e,0xee, 1,none), /* 0x0e-0x0f/0x1e-0x1f */
+	OPTABLE(0x10,0xfc, 1,none), /* 0x10-0x13 */
+	OPTABLE(0x16,0xfe, 1,none), /* 0x16-0x17 */
+	OPTABLE(0x20,0xe0, 1,none), /* 0x20-0x3f */
+	OPTABLE(0x40,0xf0, 1,relb), /* 0x40-0x4f */
+	OPTABLE(0x50,0xfc, 1,none), /* 0x50-0x53 */
+	OPTABLE(0x54,0xfd, 1,ret ), /* 0x54/0x56 */
+	OPTABLE(0x55,0xff, 1,relb), /* 0x55 */
+	OPTABLE(0x57,0xff, 1,none), /* 0x57 */
+	OPTABLE(0x58,0xfb, 2,relw), /* 0x58/0x5c */
+	OPTABLE(0x59,0xfb, 1,reg ), /* 0x59/0x5d */
+	OPTABLE(0x5a,0xfb, 2,jabs), /* 0x5a/0x5e */
+	OPTABLE(0x5b,0xfb, 2,ind ), /* 0x5b/0x5f */
+	OPTABLE(0x60,0xe8, 1,none), /* 0x60-0x67/0x70-0x77 */
+	OPTABLE(0x68,0xfa, 1,none), /* 0x68-0x69/0x6c-0x6d */
+	OPTABLE(0x6a,0xfe,-2,none), /* 0x6a-0x6b */
+	OPTABLE(0x6e,0xfe, 2,none), /* 0x6e-0x6f */
+	OPTABLE(0x78,0xff, 4,none), /* 0x78 */
+	OPTABLE(0x79,0xff, 2,none), /* 0x79 */
+	OPTABLE(0x7a,0xff, 3,none), /* 0x7a */
+	OPTABLE(0x7b,0xff, 2,none), /* 0x7b */
+	OPTABLE(0x7c,0xfc, 2,none), /* 0x7c-0x7f */
+	OPTABLE(0x80,0x80, 1,none), /* 0x80-0xff */
+};
+
+const static struct optable optable_1[] = {	/* second byte, after first byte 0x01 */
+	OPTABLE(0x00,0xff,-3,none), /* 0x0100 */
+	OPTABLE(0x40,0xf0,-3,none), /* 0x0140-0x014f */
+	OPTABLE(0x80,0xf0, 1,none), /* 0x0180-0x018f */
+	OPTABLE(0xc0,0xc0, 2,none), /* 0x01c0-0x01ff */
+};
+
+const static struct optable optable_2[] = {	/* second byte, after first byte 0x6a/0x6b */
+	OPTABLE(0x00,0x20, 2,none), /* 0x6a0?/0x6a8?/0x6b0?/0x6b8? */
+	OPTABLE(0x20,0x20, 3,none), /* 0x6a2?/0x6aa?/0x6b2?/0x6ba? */
+};
+
+const static struct optable optable_3[] = {	/* third byte, after 0x0100 or 0x014? */
+	OPTABLE(0x69,0xfb, 2,none), /* 0x010069/0x01006d/0x014069/0x01406d */
+	OPTABLE(0x6b,0xff,-4,none), /* 0x01006b/0x01406b */
+	OPTABLE(0x6f,0xff, 3,none), /* 0x01006f/0x01406f */
+	OPTABLE(0x78,0xff, 5,none), /* 0x010078/0x014078 */
+};
+
+const static struct optable optable_4[] = {	/* fourth byte; entered from optable_3's 0x6b row. NOTE(review): row comments cite 0x69/0x6d - confirm */
+	OPTABLE(0x00,0x78, 3,none), /* 0x0100690?/0x01006d0?/0x0140690?/0x01406d0?/0x0100698?/0x01006d8?/0x0140698?/0x01406d8? */
+	OPTABLE(0x20,0x78, 4,none), /* 0x0100692?/0x01006d2?/0x0140692?/0x01406d2?/0x010069a?/0x01006da?/0x014069a?/0x01406da? */
+};
+
+const static struct optables_list {	/* decode tables, indexed by table number */
+	const struct optable *ptr;	/* first entry of the table */
+	int size;			/* number of entries in the table */
+} optables[] = {
+#define OPTABLES(no)                                                   \
+        {                                                              \
+		.ptr  = optable_##no,                                  \
+		.size = sizeof(optable_##no) / sizeof(struct optable), \
+	}
+	OPTABLES(0),
+	OPTABLES(1),
+	OPTABLES(2),
+	OPTABLES(3),
+	OPTABLES(4),
+
+};
+
+const unsigned char condmask[] = {	/* bit of the encoded ccr tested per condition pair; indexed by (condition >> 1) in isbranch() */
+	0x00,0x40,0x01,0x04,0x02,0x08,0x10,0x20
+};
+
+/* Return nonzero if a branch with condition code RESON is taken for TASK's saved ccr. */
+static int isbranch(struct task_struct *task,int reson)
+{
+	unsigned char cond = h8300_get_reg(task, PT_CCR);
+	/* encode complex conditions into bits 4-6 of cond:
+	   B4: N^V, B5: Z|(N^V), B6: C|Z
+	   the asm both reads and writes cond, hence the "+r" constraint */
+	__asm__("bld #3,%w0\n\t"
+		"bxor #1,%w0\n\t"
+		"bst #4,%w0\n\t"
+		"bor #2,%w0\n\t"
+		"bst #5,%w0\n\t"
+		"bld #2,%w0\n\t"
+		"bor #0,%w0\n\t"
+		"bst #6,%w0\n\t"
+		:"+r"(cond)::"cc");
+	cond &= condmask[reson >> 1];
+	if (!(reson & 1))
+		return cond == 0;
+	else
+		return cond != 0;
+}
+
+/* Decode the instruction at PC and return the address CHILD executes next (NULL if no table entry matches). */
+static unsigned short *getnextpc(struct task_struct *child, unsigned short *pc)
+{
+	const struct optable *op;
+	unsigned char *fetch_p;
+	unsigned char inst;
+	unsigned long addr, *sp;
+	int op_len,regno;
+	op = optables[0].ptr;
+	op_len = optables[0].size;
+	fetch_p = (unsigned char *)pc;
+	inst = *fetch_p++;
+	do {
+		if ((inst & op->bitmask) == op->bitpattern) {
+			if (op->length < 0) {
+				/* read the size before op is repointed at the next table */
+				op_len = optables[-op->length].size + 1;
+				op = optables[-op->length].ptr;
+				inst = *fetch_p++;
+			} else {
+				switch (op->type) {
+				case none:
+					return pc + op->length;
+				case jabs:
+					addr = *(unsigned long *)pc;
+					return (unsigned short *)(addr & 0x00ffffff);
+				case ind:
+					addr = *pc & 0xff;
+					return (unsigned short *)(*(unsigned long *)addr);
+				case ret:
+					sp = (unsigned long *)h8300_get_reg(child, PT_USP);
+					/* user stack frames
+					   |   er0  | temporary saved
+					   +--------+
+					   |   exp  | exception stack frames
+					   +--------+
+					   | ret pc | userspace return address
+					*/
+					return (unsigned short *)(*(sp+2) & 0x00ffffff);
+				case reg:
+					regno = (*pc >> 4) & 0x07;
+					if (regno == 0)
+						addr = h8300_get_reg(child, PT_ER0);
+					else	/* NOTE(review): regno 7 (ER7) also takes this mapping - confirm */
+						addr = h8300_get_reg(child, regno-1+PT_ER1);
+					return (unsigned short *)addr;
+				case relb:
+					if ((inst == 0x55) || isbranch(child,inst & 0x0f))
+						pc = (unsigned short *)((unsigned char *)pc + (signed char)(*fetch_p));
+					return pc+1; /* skip myself */
+				case relw:
+					if ((inst == 0x5c) || isbranch(child,(*fetch_p & 0xf0) >> 4))
+						pc = (unsigned short *)((unsigned char *)pc + (signed short)(*(pc+1)));
+					return pc+2; /* skip myself */
+				}
+			}
+		} else
+			op++;
+	} while(--op_len > 0);
+	return NULL;
+}
+
+/* Set a breakpoint to simulate a single step from the current PC.  */
+void h8300_enable_trace(struct task_struct *child)
+{
+	unsigned short *nextpc = getnextpc(child,(unsigned short *)h8300_get_reg(child, PT_PC));
+	if (!nextpc)	/* instruction not decoded: nowhere to plant the breakpoint */
+		return;
+	child->thread.breakinfo.addr = nextpc;
+	child->thread.breakinfo.inst = *nextpc;	/* saved for h8300_disable_trace() */
+	*nextpc = BREAKINST;
+}
+
+asmlinkage void trace_trap(unsigned long bp)
+{
+	if ((unsigned long)current->thread.breakinfo.addr == bp) {	/* bp is our planted breakpoint */
+		h8300_disable_trace(current);
+		force_sig(SIGTRAP,current);
+	} else	/* bp does not match the recorded breakpoint address */
+	        force_sig(SIGILL,current);
+}
+
diff --git a/arch/h8300/platform/h8s/Makefile b/arch/h8300/platform/h8s/Makefile
index 7cf522c11de5..0847b15d4256 100644
--- a/arch/h8300/platform/h8s/Makefile
+++ b/arch/h8300/platform/h8s/Makefile
@@ -4,4 +4,4 @@
 # Reuse any files we can from the H8S
 #
 
-obj-y := entry.o ints_h8s.o
+obj-y := entry.o ints_h8s.o ptrace_h8s.o
diff --git a/arch/h8300/platform/h8s/ptrace_h8s.c b/arch/h8300/platform/h8s/ptrace_h8s.c
new file mode 100644
index 000000000000..dc04954048a9
--- /dev/null
+++ b/arch/h8300/platform/h8s/ptrace_h8s.c
@@ -0,0 +1,84 @@
+/*
+ *  linux/arch/h8300/platform/h8s/ptrace_h8s.c
+ *    CPU-dependent ptrace helper functions
+ *
+ *  Yoshinori Sato <ysato@users.sourceforge.jp>
+ *
+ * This file is subject to the terms and conditions of the GNU General
+ * Public License.  See the file COPYING in the main directory of
+ * this archive for more details.
+ */
+
+#include <linux/linkage.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <asm/ptrace.h>
+
+#define CCR_MASK  0x6f	/* ccr bits h8300_put_reg() takes from the caller; the others are preserved */
+#define EXR_TRACE 0x80	/* bit set/cleared in the saved halfword by h8300_enable_trace()/h8300_disable_trace() */
+
+/* Mapping from PT_xxx to the stack offset at which the register is
+   saved.  usp has no stack-slot and is handled separately in
+   h8300_get_reg/h8300_put_reg; exr is stored at index PT_EXR. */
+static const int h8300_register_offset[] = {
+	PT_REG(er1), PT_REG(er2), PT_REG(er3), PT_REG(er4),
+	PT_REG(er5), PT_REG(er6), PT_REG(er0), PT_REG(orig_er0),
+	PT_REG(ccr), PT_REG(pc), [PT_EXR] = PT_REG(exr)
+};
+
+/* read register REGNO of TASK (regno indexes h8300_register_offset[] unchecked) */
+long h8300_get_reg(struct task_struct *task, int regno)
+{
+	switch (regno) {
+	case PT_USP:	/* kept in thread_struct, not in the saved frame */
+		return task->thread.usp + sizeof(long)*2 + 2;
+	case PT_CCR:
+	case PT_EXR:	/* ccr and exr are read as 16-bit values */
+	    return *(unsigned short *)(task->thread.esp0 + h8300_register_offset[regno]);
+	default:
+	    return *(unsigned long *)(task->thread.esp0 + h8300_register_offset[regno]);
+	}
+}
+
+/* Write DATA to register REGNO of TASK; only the CCR_MASK bits of ccr change, exr is read-only. */
+int h8300_put_reg(struct task_struct *task, int regno, unsigned long data)
+{
+	unsigned short oldccr;
+	switch (regno) {
+	case PT_USP:
+		task->thread.usp = data - sizeof(long)*2 - 2;
+		break;	/* usp has no stack slot: must not fall into PT_CCR */
+	case PT_CCR:
+		oldccr = *(unsigned short *)(task->thread.esp0 + h8300_register_offset[regno]);
+		oldccr &= ~CCR_MASK;
+		data = (data & CCR_MASK) | oldccr;	/* keep the protected bits */
+		*(unsigned short *)(task->thread.esp0 + h8300_register_offset[regno]) = data;
+		break;
+	case PT_EXR:
+		/* modifying exr is not supported */
+		return -EIO;
+	default:
+		*(unsigned long *)(task->thread.esp0 + h8300_register_offset[regno]) = data;
+		break;
+	}
+	return 0;
+}
+
+/* disable singlestep: clear EXR_TRACE in the halfword at esp0 + h8300_register_offset[PT_EXR] */
+void h8300_disable_trace(struct task_struct *child)
+{
+	*(unsigned short *)(child->thread.esp0 + h8300_register_offset[PT_EXR]) &= ~EXR_TRACE;	/* NOTE(review): PT_EXR must be a valid table index - confirm */
+}
+
+/* enable singlestep: set EXR_TRACE in the halfword at esp0 + h8300_register_offset[PT_EXR] */
+void h8300_enable_trace(struct task_struct *child)
+{
+	*(unsigned short *)(child->thread.esp0 + h8300_register_offset[PT_EXR]) |= EXR_TRACE;	/* NOTE(review): PT_EXR must be a valid table index - confirm */
+}
+
+asmlinkage void trace_trap(unsigned long bp)
+{
+	(void)bp;	/* unused here: no breakpoint address is compared */
+	force_sig(SIGTRAP,current);	/* every trace trap is reported as SIGTRAP */
+}
+
diff --git a/include/asm-h8300/processor.h b/include/asm-h8300/processor.h
index 7154a6a9d9c3..31494109a13e 100644
--- a/include/asm-h8300/processor.h
+++ b/include/asm-h8300/processor.h
@@ -51,16 +51,19 @@ extern inline void wrusp(unsigned long usp) {
 #define MCA_bus 0
 
 struct thread_struct {
-	unsigned long ksp;		/* kernel stack pointer */
-	unsigned long usp;		/* user stack pointer */
-	unsigned long ccr;		/* saved status register */
-	unsigned long esp0;             /* points to SR of stack frame */
-	unsigned long debugreg[8];      /* debug info */
+	unsigned long  ksp;		/* kernel stack pointer */
+	unsigned long  usp;		/* user stack pointer */
+	unsigned long  ccr;		/* saved status register */
+	unsigned long  esp0;            /* points to SR of stack frame */
+	struct {
+		unsigned short *addr;
+		unsigned short inst;
+	} breakinfo;
 };
 
 #define INIT_THREAD  { \
 	sizeof(init_stack) + (unsigned long) init_stack, 0, \
-	PS_S, \
+	PS_S,  0, {(unsigned short *)-1, 0}, \
 }
 
 /*
diff --git a/include/asm-h8300/ptrace.h b/include/asm-h8300/ptrace.h
index abf25cf74dc7..5effc8d14b0d 100644
--- a/include/asm-h8300/ptrace.h
+++ b/include/asm-h8300/ptrace.h
@@ -14,6 +14,7 @@
 #define PT_CCR	   8
 #define PT_PC	   9
 #define PT_USP	   10
+#define PT_EXR     12
 
 /* this struct defines the way the registers are stored on the
    stack during a system call. */
@@ -44,6 +45,16 @@ struct pt_regs {
 #define PS_S  (0x10)
 #endif
 
+#if defined(__H8300H__)
+#define H8300_REGS_NO 11
+#endif
+#if defined(__H8300S__)
+#define H8300_REGS_NO 12
+#endif
+
+/* Find the stack offset for a register, relative to thread.esp0. */
+#define PT_REG(reg)	((long)&((struct pt_regs *)0)->reg)
+
 #define user_mode(regs) (!((regs)->ccr & PS_S))
 #define instruction_pointer(regs) ((regs)->pc)
 extern void show_regs(struct pt_regs *);
-- 
cgit v1.2.3


From 092ed9514b86fb028067a38f3dd44d484d91a202 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:32:42 -0700
Subject: [PATCH] H8/300 support update (3/3) - others

From: Yoshinori Sato <ysato@users.sourceforge.jp>

- use new serial driver (drivers/serial/sh-sci.[ch])
- typo fix
- add message level
---
 arch/h8300/Kconfig                 | 74 +++-----------------------------------
 arch/h8300/Kconfig.ide             |  2 +-
 arch/h8300/kernel/setup.c          | 48 +++++++------------------
 arch/h8300/platform/h8s/ints_h8s.c |  5 +--
 include/asm-h8300/regs306x.h       |  4 +--
 5 files changed, 24 insertions(+), 109 deletions(-)

(limited to 'include')

diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig
index 73538bc9f99b..556863890a72 100644
--- a/arch/h8300/Kconfig
+++ b/arch/h8300/Kconfig
@@ -57,17 +57,17 @@ config H8300H_GENERIC
 config H8300H_AKI3068NET
 	bool "AE-3068/69"
 	help
-	  AKI-H8/3068F / AKI-H8/3069F Flashmicom LAN Board Suppot
+	  AKI-H8/3068F / AKI-H8/3069F Flashmicom LAN Board Support
 	  More Information. (Japanese Only)
 	  <http://akizukidensi.com/catalog/h8.html>
-	  AE-3068/69 Evalution Board Support
+	  AE-3068/69 Evaluation Board Support
 	  More Information.
 	  <http://www.microtronique.com/ae3069lan.htm>
 
 config H8300H_H8MAX
 	bool "H8MAX"
 	help
-	  H8MAX Evalution Board Suooprt
+	  H8MAX Evaluation Board Support
 	  More Information. (Japanese Only)
 	  <http://strawberry-linux.com/h8/index.html>
 
@@ -81,7 +81,7 @@ config H8300H_SIM
 config H8S_EDOSK2674
 	bool "EDOSK-2674"
 	help
-	  Renesas EDOSK-2674R Evalution Board Support
+	  Renesas EDOSK-2674 Evaluation Board Support
 	  More Information.
 	  <http://www.azpower.com/H8-uClinux/index.html>
  	  <http://www.eu.renesas.com/tools/edk/support/edosk2674.html>
@@ -240,70 +240,6 @@ config HW_CONSOLE
 	depends on VT && !S390 && !UM
 	default y
 
-config SERIAL
-	tristate "Serial (8250, 16450, 16550 or compatible) support"
-	---help---
-	  This selects whether you want to include the driver for the standard
-	  serial ports.  The standard answer is Y.  People who might say N
-	  here are those that are setting up dedicated Ethernet WWW/FTP
-	  servers, or users that have one of the various bus mice instead of a
-	  serial mouse and don't intend to use their machine's standard serial
-	  port for anything.  (Note that the Cyclades and Stallion multi
-	  serial port drivers do not need this driver built in for them to
-	  work.)
-
-	  To compile this driver as a module, choose M here: the
-	  module will be called serial.
-	  [WARNING: Do not compile this driver as a module if you are using
-	  non-standard serial ports, since the configuration information will
-	  be lost when the driver is unloaded.  This limitation may be lifted
-	  in the future.]
-
-	  BTW1: If you have a mouseman serial mouse which is not recognized by
-	  the X window system, try running gpm first.
-
-	  BTW2: If you intend to use a software modem (also called Winmodem)
-	  under Linux, forget it.  These modems are crippled and require
-	  proprietary drivers which are only available under Windows.
-
-	  Most people will say Y or M here, so that they can use serial mice,
-	  modems and similar devices connecting to the standard serial ports.
-
-config SH_SCI
-	tristate "Serial (SCI, SCIF) support"
-	help
-	  Selecting this option will allow the Linux kernel to transfer data
-	  over SCI (Serial Communication Interface) and/or SCIF (Serial
-	  Communication Interface with FIFO) which are built into the Hitachi
-	  SuperH processor.  The option provides 1 to 3 (depending
-	  on the CPU model) standard Linux tty devices, /dev/ttySC[012]; one
-	  of these is normally used as the system console.
-
-	  If in doubt, press "y".
-
-config SERIAL_CONSOLE
-	bool "Support for console on serial port"
-	depends on SERIAL=y || SH_SCI=y
-	---help---
-	  If you say Y here, it will be possible to use a serial port as the
-	  system console (the system console is the device which receives all
-	  kernel messages and warnings and which allows logins in single user
-	  mode). This could be useful if some terminal or printer is connected
-	  to that serial port.
-
-	  Even if you say Y here, the currently visible virtual console
-	  (/dev/tty0) will still be used as the system console by default, but
-	  you can alter that using a kernel command line option such as
-	  "console=ttyS1". (Try "man bootparam" or see the documentation of
-	  your boot loader (lilo or loadlin) about how to pass options to the
-	  kernel at boot time.)
-
-	  If you don't have a VGA card installed and you say Y here, the
-	  kernel will automatically use the first serial line, /dev/ttyS0, as
-	  system console.
-
-	  If unsure, say N.
-
 comment "Unix98 PTY support"
 
 config UNIX98_PTYS
@@ -396,7 +332,7 @@ config SYSCALL_PRINT
 
 config GDB_DEBUG
    	bool "Use gdb stub"
-	depends on (!H8300H_SIM && H8S_SIM)
+	depends on (!H8300H_SIM && !H8S_SIM)
 	help
 	  gdb stub exception support
 
diff --git a/arch/h8300/Kconfig.ide b/arch/h8300/Kconfig.ide
index 70694986b760..3b9d58c169a8 100644
--- a/arch/h8300/Kconfig.ide
+++ b/arch/h8300/Kconfig.ide
@@ -14,7 +14,7 @@ config H8300_IDE_ALT
 	help
 	  IDE alternate registers address
 
-config H8300_IDE_IRQNO
+config H8300_IDE_IRQ
 	int "IDE IRQ no"
 	depends on IDE
 	help
diff --git a/arch/h8300/kernel/setup.c b/arch/h8300/kernel/setup.c
index a027a0ac607b..7bab2e30d711 100644
--- a/arch/h8300/kernel/setup.c
+++ b/arch/h8300/kernel/setup.c
@@ -138,27 +138,23 @@ void __init setup_arch(char **cmdline_p)
 	register_console((struct console *)&gdb_console);
 #endif
 
-	printk("\r\n\nuClinux " CPU "\n");
-	printk("Target Hardware: %s\n",_target_name);
-	printk("Flat model support (C) 1998,1999 Kenneth Albanowski, D. Jeff Dionne\n");
-	printk("H8/300 series support by Yoshinori Sato <ysato@users.sourceforge.jp>\n");
+	printk(KERN_INFO "\r\n\nuClinux " CPU "\n");
+	printk(KERN_INFO "Target Hardware: %s\n",_target_name);
+	printk(KERN_INFO "Flat model support (C) 1998,1999 Kenneth Albanowski, D. Jeff Dionne\n");
+	printk(KERN_INFO "H8/300 series support by Yoshinori Sato <ysato@users.sourceforge.jp>\n");
 
 #ifdef DEBUG
-	printk("KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x "
+	printk(KERN_DEBUG "KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x "
 		"BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext,
 		(int) &_sdata, (int) &_edata,
 		(int) &_sbss, (int) &_ebss);
-	printk("KERNEL -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x "
+	printk(KERN_DEBUG "KERNEL -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x "
 		"STACK=0x%06x-0x%06x\n",
 	       (int) &_ebss, (int) memory_start,
 		(int) memory_start, (int) memory_end,
 		(int) memory_end, (int) &_ramend);
 #endif
 
-#ifdef CONFIG_BLK_DEV_BLKMEM
-	ROOT_DEV = MKDEV(BLKMEM_MAJOR,0);
-#endif
-
 #ifdef CONFIG_DEFAULT_CMDLINE
 	/* set from default command line */
 	if (*command_line == '\0')
@@ -171,7 +167,7 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef DEBUG
 	if (strlen(*cmdline_p)) 
-		printk("Command line: '%s'\n", *cmdline_p);
+		printk(KERN_DEBUG "Command line: '%s'\n", *cmdline_p);
 #endif
 
 	/*
@@ -195,30 +191,10 @@ void __init setup_arch(char **cmdline_p)
 	paging_init();
 	h8300_gpio_init();
 #ifdef DEBUG
-	printk("Done setup_arch\n");
+	printk(KERN_DEBUG "Done setup_arch\n");
 #endif
 }
 
-int get_cpuinfo(char * buffer)
-{
-    char *cpu;
-    u_long clockfreq;
-
-    cpu = CPU;
-
-    clockfreq = CONFIG_CPU_CLOCK;
-
-    return(sprintf(buffer, "CPU:\t\t%s\n"
-		   "Clock:\t%lu.%1luMHz\n"
-		   "BogoMips:\t%lu.%02lu\n"
-		   "Calibration:\t%lu loops\n",
-		   cpu,
-		   clockfreq/100,clockfreq%100,
-		   (loops_per_jiffy*HZ)/500000,((loops_per_jiffy*HZ)/5000)%100,
-		   (loops_per_jiffy*HZ)));
-
-}
-
 /*
  *	Get CPU information for use by the procfs.
  */
@@ -226,17 +202,19 @@ int get_cpuinfo(char * buffer)
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
     char *cpu;
+    int mode;
     u_long clockfreq;
 
     cpu = CPU;
+    mode = *(volatile unsigned char *)MDCR & 0x07;
 
     clockfreq = CONFIG_CPU_CLOCK;
 
-    seq_printf(m,  "CPU:\t\t%s\n"
-		   "Clock:\t%lu.%1luMHz\n"
+    seq_printf(m,  "CPU:\t\t%s (mode:%d)\n"
+		   "Clock:\t\t%lu.%1luMHz\n"
 		   "BogoMips:\t%lu.%02lu\n"
 		   "Calibration:\t%lu loops\n",
-		   cpu,
+	           cpu,mode,
 		   clockfreq/100,clockfreq%100,
 		   (loops_per_jiffy*HZ)/500000,((loops_per_jiffy*HZ)/5000)%100,
 		   (loops_per_jiffy*HZ));
diff --git a/arch/h8300/platform/h8s/ints_h8s.c b/arch/h8300/platform/h8s/ints_h8s.c
index dd4398ec950d..61d14597728f 100644
--- a/arch/h8300/platform/h8s/ints_h8s.c
+++ b/arch/h8300/platform/h8s/ints_h8s.c
@@ -63,7 +63,8 @@ const static struct irq_pins irq_assign_table1[16]={
 	{H8300_GPIO_P2,H8300_GPIO_B6},{H8300_GPIO_P2,H8300_GPIO_B7},
 };
 
-#define IRQ_GPIO_MAP(irqbit,port,bit)				  \
+/* IRQ to GPIO pinno transrate */
+#define IRQ_GPIO_MAP(irqbit,irq,port,bit)			  \
 do {								  \
 	if (*(volatile unsigned short *)ITSR & irqbit) {	  \
 		port = irq_assign_table1[irq - EXT_IRQ0].port_no; \
@@ -79,7 +80,7 @@ int h8300_enable_irq_pin(unsigned int irq)
 	if (irq >= EXT_IRQ0 && irq <= EXT_IRQ15) {
 		unsigned short ptn = 1 << (irq - EXT_IRQ0);
 		unsigned int port_no,bit_no;
-		IRQ_GPIO_MAP(ptn,port_no,bit_no);
+		IRQ_GPIO_MAP(ptn,irq,port_no,bit_no);
 		if (H8300_GPIO_RESERVE(port_no, bit_no) == 0)
 			return -EBUSY;                   /* pin already use */
 		H8300_GPIO_DDR(port_no, bit_no, H8300_GPIO_INPUT);
diff --git a/include/asm-h8300/regs306x.h b/include/asm-h8300/regs306x.h
index b8ff8d1f56d7..027dd633fa25 100644
--- a/include/asm-h8300/regs306x.h
+++ b/include/asm-h8300/regs306x.h
@@ -125,8 +125,8 @@
 #define RDR2  0xFFFFC5
 #define SCMR2 0xFFFFC6
 
-#define MDCR   0xFEE000
-#define SYSCR  0xFEE001
+#define MDCR   0xFEE011
+#define SYSCR  0xFEE012
 #define DIVCR  0xFEE01B
 #define MSTCRH 0xFEE01C
 #define MSTCRL 0xFEE01D
-- 
cgit v1.2.3


From 8d94c528b039ab11e66a0f0ae2137a75974c1f10 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:32:54 -0700
Subject: [PATCH] H8/300 support update

From: Yoshinori Sato <ysato@users.sourceforge.jp>

- fix any error/warning
- fix {request,free}_irq interrupt control
- add dump_stack
- fix show_trace_task
- fix typo
---
 arch/h8300/kernel/ints.c           | 23 +++++++++++++++--------
 arch/h8300/kernel/traps.c          | 37 +++++++++++++++----------------------
 arch/h8300/platform/h8s/entry.S    |  2 +-
 arch/h8300/platform/h8s/ints_h8s.c |  8 +++++---
 drivers/serial/sh-sci.c            |  8 ++++----
 include/asm-h8300/io.h             |  8 ++++----
 6 files changed, 44 insertions(+), 42 deletions(-)

(limited to 'include')

diff --git a/arch/h8300/kernel/ints.c b/arch/h8300/kernel/ints.c
index 4a70da2b1e34..7246e29ea01d 100644
--- a/arch/h8300/kernel/ints.c
+++ b/arch/h8300/kernel/ints.c
@@ -93,7 +93,7 @@ void __init init_IRQ(void)
 	if (ramvec == NULL)
 		panic("interrupt vector serup failed.");
 	else
-		printk("virtual vector at 0x%08lx\n",(unsigned long)ramvec);
+		printk(KERN_INFO "virtual vector at 0x%08lx\n",(unsigned long)ramvec);
 
 	/* create redirect table */
 	ramvec_p = ramvec;
@@ -118,11 +118,11 @@ void __init init_IRQ(void)
 	ramvec_p = ramvec;
 	for (i = 0; i < NR_IRQS; i++) {
 		if ((i % 8) == 0)
-			printk("\n%p: ",ramvec_p);
-		printk("%p ",*ramvec_p);
+			printk(KERN_DEBUG "\n%p: ",ramvec_p);
+		printk(KERN_DEBUG "%p ",*ramvec_p);
 		ramvec_p++;
 	}
-	printk("\n");
+	printk(KERN_DEBUG "\n");
 #endif
 #endif
 }
@@ -133,9 +133,10 @@ int request_irq(unsigned int irq,
 {
 	irq_handler_t *irq_handle;
 	if (irq < 0 || irq >= NR_IRQS) {
-		printk("Incorrect IRQ %d from %s\n", irq, devname);
+		printk(KERN_ERR "Incorrect IRQ %d from %s\n", irq, devname);
 		return -EINVAL;
 	}
+
 	if (irq_list[irq] || (h8300_enable_irq_pin(irq) == -EBUSY))
 		return -EBUSY;
 
@@ -156,6 +157,11 @@ int request_irq(unsigned int irq,
 	irq_handle->dev_id  = dev_id;
 	irq_handle->devname = devname;
 	irq_list[irq] = irq_handle;
+
+	if (irq_handle->flags & SA_SAMPLE_RANDOM)
+		rand_initialize_irq(irq);
+
+	enable_irq(irq);
 	return 0;
 }
 
@@ -163,12 +169,13 @@ EXPORT_SYMBOL(request_irq);
 
 void free_irq(unsigned int irq, void *dev_id)
 {
-	if (irq >= NR_IRQS) {
+	if (irq >= NR_IRQS)
 		return;
-	}
+
 	if (!irq_list[irq] || irq_list[irq]->dev_id != dev_id)
-		printk("Removing probably wrong IRQ %d from %s\n",
+		printk(KERN_WARNING "Removing probably wrong IRQ %d from %s\n",
 		       irq, irq_list[irq]->devname);
+	disable_irq(irq);
 	h8300_disable_irq_pin(irq);
 	if (((unsigned long)irq_list[irq] & 0x80000000) == 0) {
 		kfree(irq_list[irq]);
diff --git a/arch/h8300/kernel/traps.c b/arch/h8300/kernel/traps.c
index 4cc723eb3651..300e3279ca5a 100644
--- a/arch/h8300/kernel/traps.c
+++ b/arch/h8300/kernel/traps.c
@@ -16,9 +16,10 @@
 
 #include <linux/types.h>
 #include <linux/sched.h>
-#include <linux/kernel_stat.h>
+#include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/module.h>
 
 #include <asm/system.h>
 #include <asm/irq.h>
@@ -41,7 +42,7 @@ void __init trap_init (void)
 
 asmlinkage void set_esp0 (unsigned long ssp)
 {
-  current->thread.esp0 = ssp;
+	current->thread.esp0 = ssp;
 }
 
 /*
@@ -55,14 +56,6 @@ static void dump(struct pt_regs *fp)
 	int		i;
 
 	printk("\nCURRENT PROCESS:\n\n");
-#if 0
-{
-	extern int	swt_lastjiffies, swt_reference;
-	printk("WATCHDOG: jiffies=%d lastjiffies=%d [%d] reference=%d\n",
-		jiffies, swt_lastjiffies, (swt_lastjiffies - jiffies),
-		swt_reference);
-}
-#endif
 	printk("COMM=%s PID=%d\n", current->comm, current->pid);
 	if (current->mm) {
 		printk("TEXT=%08x-%08x DATA=%08x-%08x BSS=%08x-%08x\n",
@@ -77,12 +70,7 @@ static void dump(struct pt_regs *fp)
 			(int) PAGE_SIZE+(unsigned long)current);
 	}
 
-	printk("PC: %08lx\n", (long)fp->pc);
-	printk("CCR: %02x   SP: %08lx\n", fp->ccr, (long) fp);
-	printk("ER0: %08lx  ER1: %08lx   ER2: %08lx   ER3: %08lx\n",
-		fp->er0, fp->er1, fp->er2, fp->er3);
-	printk("ER4: %08lx  ER5: %08lx   ER6: %08lx\n",
-		fp->er4, fp->er5, fp->er6);
+	show_regs(fp);
 	printk("\nCODE:");
 	tp = ((unsigned char *) fp->pc) - 0x20;
 	for (sp = (unsigned long *) tp, i = 0; (i < 0x40);  i += 4) {
@@ -106,12 +94,6 @@ static void dump(struct pt_regs *fp)
 	printk("\n\n");
 }
 
-void show_trace_task(struct task_struct *tsk)
-{
-	/* DAVIDM: we can do better, need a proper stack dump */
-	printk("STACK ksp=0x%lx, usp=0x%lx\n", tsk->thread.ksp, tsk->thread.usp);
-}
-
 void die_if_kernel (char *str, struct pt_regs *fp, int nr)
 {
 	extern int console_loglevel;
@@ -174,3 +156,14 @@ void show_stack(struct task_struct *task, unsigned long *esp)
 	printk("\n");
 }
 
+void show_trace_task(struct task_struct *tsk)
+{
+	show_stack(tsk,(unsigned long *)tsk->thread.esp0);
+}
+
+void dump_stack(void)
+{
+	show_stack(NULL,NULL);
+}
+
+EXPORT_SYMBOL(dump_stack);
diff --git a/arch/h8300/platform/h8s/entry.S b/arch/h8300/platform/h8s/entry.S
index ddb04b46d423..ab73e30cd8ff 100644
--- a/arch/h8300/platform/h8s/entry.S
+++ b/arch/h8300/platform/h8s/entry.S
@@ -166,7 +166,7 @@ SYMBOL_NAME_LABEL(interrupt_entry)
 	mov.l	sp,er1
 	subs	#4,er1				/* adjust ret_pc */
 	jsr	@SYMBOL_NAME(process_int)
-	mov.l	@SYMBOL_NAME(irq_stat)+CPU_SOFTIRQ_PENDING,er0
+	mov.l	@SYMBOL_NAME(irq_stat)+CPUSTAT_SOFTIRQ_PENDING,er0
 	beq	1f
 	jsr	@SYMBOL_NAME(do_softirq)
 1:
diff --git a/arch/h8300/platform/h8s/ints_h8s.c b/arch/h8300/platform/h8s/ints_h8s.c
index 61d14597728f..f53de493e3e8 100644
--- a/arch/h8300/platform/h8s/ints_h8s.c
+++ b/arch/h8300/platform/h8s/ints_h8s.c
@@ -9,6 +9,7 @@
 #include <linux/config.h>
 #include <linux/init.h>
 #include <linux/errno.h>
+#include <linux/kernel.h>
 
 #include <asm/ptrace.h>
 #include <asm/traps.h>
@@ -80,12 +81,13 @@ int h8300_enable_irq_pin(unsigned int irq)
 	if (irq >= EXT_IRQ0 && irq <= EXT_IRQ15) {
 		unsigned short ptn = 1 << (irq - EXT_IRQ0);
 		unsigned int port_no,bit_no;
-		IRQ_GPIO_MAP(ptn,irq,port_no,bit_no);
+		IRQ_GPIO_MAP(ptn, irq, port_no, bit_no);
 		if (H8300_GPIO_RESERVE(port_no, bit_no) == 0)
 			return -EBUSY;                   /* pin already use */
 		H8300_GPIO_DDR(port_no, bit_no, H8300_GPIO_INPUT);
 		*(volatile unsigned short *)ISR &= ~ptn; /* ISR clear */
-	}		
+	}
+
 	return 0;
 }
 
@@ -97,7 +99,7 @@ void h8300_disable_irq_pin(unsigned int irq)
 		unsigned short port_no,bit_no;
 		*(volatile unsigned short *)ISR &= ~ptn;
 		*(volatile unsigned short *)IER &= ~ptn;
-		IRQ_GPIO_MAP(ptn,port_no,bit_no);
+		IRQ_GPIO_MAP(ptn, irq, port_no, bit_no);
 		H8300_GPIO_FREE(port_no, bit_no);
 	}
 }
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index ea827607de39..15fb5ddf783a 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -297,10 +297,10 @@ static void kgdb_break_interrupt(int irq, void *ptr, struct pt_regs *regs)
 #if defined(__H8300S__)
 enum { sci_disable, sci_enable };
 
-static void h8300_sci_enable(struct sci_port* port, unsigned int ctrl)
+static void h8300_sci_enable(struct uart_port* port, unsigned int ctrl)
 {
 	volatile unsigned char *mstpcrl=(volatile unsigned char *)MSTPCRL;
-	int ch = (port->base  - SMR0) >> 3;
+	int ch = (port->mapbase  - SMR0) >> 3;
 	unsigned char mask = 1 << (ch+1);
 
 	if (ctrl == sci_disable) {
@@ -1276,7 +1276,7 @@ static struct sci_port sci_ports[SCI_NPORTS] = {
 			.line		= 1,
 		},
 		.type		= PORT_SCI,
-		.irqs		= H8S_IRQS1,
+		.irqs		= H8S_SCI_IRQS1,
 		.init_pins	= sci_init_pins_sci,
 	},
 	{
@@ -1290,7 +1290,7 @@ static struct sci_port sci_ports[SCI_NPORTS] = {
 			.line		= 2,
 		},
 		.type		= PORT_SCI,
-		.irqs		= H8S_IRQS2,
+		.irqs		= H8S_SCI_IRQS2,
 		.init_pins	= sci_init_pins_sci,
 	},
 #else
diff --git a/include/asm-h8300/io.h b/include/asm-h8300/io.h
index 2962f9030413..662ca869856a 100644
--- a/include/asm-h8300/io.h
+++ b/include/asm-h8300/io.h
@@ -9,7 +9,7 @@
 #if defined(CONFIG_H83007) || defined(CONFIG_H83068)
 #include <asm/regs306x.h>
 #elif defined(CONFIG_H8S2678)
-#include <asm/regs2678.h>
+#include <asm/regs267x.h>
 #else
 #error UNKNOWN CPU TYPE
 #endif
@@ -73,7 +73,7 @@ static inline unsigned int _swapl(volatile unsigned long v)
 
 static inline int h8300_buswidth(unsigned int addr)
 {
-	return (*(volatile unsigned char *)ABWCR & (1 << (addr >> 21) & 7)) == 0;
+	return (*(volatile unsigned char *)ABWCR & (1 << ((addr >> 21) & 7))) == 0;
 }
 
 static inline void io_outsb(unsigned int addr, void *buf, int len)
@@ -145,10 +145,10 @@ static inline void io_insl(unsigned int addr, void *buf, int len)
 #define memcpy_fromio(a,b,c)	memcpy((a),(void *)(b),(c))
 #define memcpy_toio(a,b,c)	memcpy((void *)(a),(b),(c))
 
-#define inb(addr)    ((h8300_buswidth(addr))?readb(addr ^ 1) & 0xff:readb(addr))
+#define inb(addr)    ((h8300_buswidth(addr))?readb((addr) ^ 1) & 0xff:readb(addr))
 #define inw(addr)    _swapw(readw(addr))
 #define inl(addr)    _swapl(readl(addr))
-#define outb(x,addr) ((void)((h8300_buswidth(addr) && (addr & 1))?writew(x,addr):writeb(x,addr)))
+#define outb(x,addr) ((void)((h8300_buswidth(addr) && ((addr) & 1))?writew(x,addr):writeb(x,addr)))
 #define outw(x,addr) ((void) writew(_swapw(x),addr))
 #define outl(x,addr) ((void) writel(_swapl(x),addr))
 
-- 
cgit v1.2.3


From 0e568881178ff0e0aceeafdb51f9fecab39e1923 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:33:21 -0700
Subject: [PATCH] fix posix-timers to have proper per-process scope

From: Roland McGrath <roland@redhat.com>

The posix-timers implementation associates timers with the creating thread
and destroys timers when their creator thread dies.  POSIX clearly
specifies that these timers are per-process, and a timer should not be torn
down when the thread that created it exits.  I hope there won't be any
controversy on what the correct semantics are here, since POSIX is clear
and the Linux feature is called "posix-timers".

The attached program built with NPTL -lrt -lpthread demonstrates the bug.
The program is correct by POSIX, but fails on Linux.  Note that until
just the other day, NPTL had a trivial bug that always disabled its use of
kernel timer syscalls (check strace for lack of timer_create/SYS_259).  So
unless you have built your own NPTL libs very recently, you probably won't
see the kernel calls actually used by this program.

Also attached is my patch to fix this.  It (you guessed it) moves the
posix_timers field from task_struct to signal_struct.  Access is now
governed by the siglock instead of the task lock.  exit_itimers is called
from __exit_signal, i.e.  only on the death of the last thread in the
group, rather than from do_exit for every thread.  Timers' it_process
fields store the group leader's pointer, which won't die.  For the case of
SIGEV_THREAD_ID, I hold a ref on the task_struct for it_process to stay
robust in case the target thread dies; the ref is released and the dangling
pointer cleared when the timer fires and the target thread is dead.  (This
should only come up in a buggy user program, so no one cares exactly how the
kernel handles that case.  But I think what I did is robust and sensible.)

/* Test for bogus per-thread deletion of timers.  */

#include <stdio.h>
#include <error.h>
#include <time.h>
#include <signal.h>
#include <stdint.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <unistd.h>
#include <pthread.h>

/*
 * Thread entry point: creating timers in another thread should work too.
 *
 * ARG is a struct sigevent whose sigev_value.sival_ptr points at the
 * timer_t slot to fill in.  Returns that slot pointer on success, or
 * NULL (after reporting via perror) if timer_create fails.
 */
static void *do_timer_create(void *arg)
{
	struct sigevent *const ev = arg;
	timer_t *const id_slot = ev->sigev_value.sival_ptr;

	if (timer_create(CLOCK_REALTIME, ev, id_slot) >= 0)
		return id_slot;

	perror("timer_create");
	return NULL;
}

/*
 * Up to 100 times: create a timer from a short-lived helper thread, wait
 * for that thread to exit, then arm and delete the timer from the main
 * thread.  Failures of timer_settime/timer_delete are reported with
 * perror; the program itself always exits 0.
 */
int main(void)
{
	timer_t timerId;
	struct sigevent sigev;
	struct itimerspec itval;
	int round;

	/* Signal-based notification; sival_ptr doubles as the timer id slot. */
	sigev.sigev_notify = SIGEV_SIGNAL;
	sigev.sigev_signo = SIGALRM;
	sigev.sigev_value.sival_ptr = (void *)&timerId;

	/* First expiry after 2 seconds, then every 2 seconds. */
	itval.it_value.tv_sec = 2;
	itval.it_value.tv_nsec = 0;
	itval.it_interval.tv_sec = 2;
	itval.it_interval.tv_nsec = 0;

	for (round = 0; round < 100; round++) {
		pthread_t helper;
		void *created;
		int err;

		printf("cnt = %d\n", round);

		err = pthread_create(&helper, NULL, &do_timer_create, &sigev);
		if (err) {
			error(0, err, "pthread_create");
			continue;
		}

		err = pthread_join(helper, &created);
		if (err) {
			error(0, err, "pthread_join");
			continue;
		}

		/* The helper has already reported the timer_create failure. */
		if (created == NULL)
			continue;

		/* The creating thread is gone; the timer must still be usable. */
		if (timer_settime(timerId, 0, &itval, NULL) < 0)
			perror("timer_settime");

		if (timer_delete(timerId) < 0)
			perror("timer_delete");
	}

	return 0;
}
---
 fs/exec.c                 |  1 -
 include/linux/init_task.h |  2 +-
 include/linux/sched.h     |  6 ++-
 kernel/exit.c             |  1 -
 kernel/fork.c             |  2 +-
 kernel/posix-timers.c     | 97 ++++++++++++++++++++++++++++++++++-------------
 kernel/signal.c           |  2 +-
 7 files changed, 78 insertions(+), 33 deletions(-)

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index 26e3392b6369..5fb9f8f7c38f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -856,7 +856,6 @@ int flush_old_exec(struct linux_binprm * bprm)
 			
 	flush_signal_handlers(current, 0);
 	flush_old_files(current->files);
-	exit_itimers(current);
 
 	return 0;
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 5c4843a08917..29189706ea57 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -49,6 +49,7 @@
 	.shared_pending	= { 				\
 		.list = LIST_HEAD_INIT(sig.shared_pending.list),	\
 		.signal =  {{0}}}, \
+	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
 }
 
 #define INIT_SIGHAND(sighand) {	\
@@ -107,7 +108,6 @@ extern struct group_info init_groups;
 		.list = LIST_HEAD_INIT(tsk.pending.list),		\
 		.signal = {{0}}},					\
 	.blocked	= {{0}},					\
-	.posix_timers	 = LIST_HEAD_INIT(tsk.posix_timers),		\
 	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
 	.switch_lock	= SPIN_LOCK_UNLOCKED,				\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b72c38420d71..17bbedd6bb3d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -270,6 +270,9 @@ struct signal_struct {
 	/* thread group stop support, overloads group_exit_code too */
 	int			group_stop_count;
 
+	/* POSIX.1b Interval Timers */
+	struct list_head posix_timers;
+
 	/* job control IDs */
 	pid_t pgrp;
 	pid_t tty_old_pgrp;
@@ -433,7 +436,6 @@ struct task_struct {
 	unsigned long it_real_value, it_prof_value, it_virt_value;
 	unsigned long it_real_incr, it_prof_incr, it_virt_incr;
 	struct timer_list real_timer;
-	struct list_head posix_timers; /* POSIX.1b Interval Timers */
 	unsigned long utime, stime, cutime, cstime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; /* context switch counts */
 	u64 start_time;
@@ -728,7 +730,7 @@ extern void exit_signal(struct task_struct *);
 extern void __exit_signal(struct task_struct *);
 extern void exit_sighand(struct task_struct *);
 extern void __exit_sighand(struct task_struct *);
-extern void exit_itimers(struct task_struct *);
+extern void exit_itimers(struct signal_struct *);
 
 extern NORET_TYPE void do_group_exit(int);
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 8157dbc037d6..0ec66729ead8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -776,7 +776,6 @@ asmlinkage NORET_TYPE void do_exit(long code)
 	__exit_files(tsk);
 	__exit_fs(tsk);
 	exit_namespace(tsk);
-	exit_itimers(tsk);
 	exit_thread();
 
 	if (tsk->signal->leader)
diff --git a/kernel/fork.c b/kernel/fork.c
index 6035db6957f8..b4cbfd04847b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -815,6 +815,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	sig->group_stop_count = 0;
 	sig->curr_target = NULL;
 	init_sigpending(&sig->shared_pending);
+	INIT_LIST_HEAD(&sig->posix_timers);
 
 	sig->tty = current->signal->tty;
 	sig->pgrp = process_group(current);
@@ -932,7 +933,6 @@ struct task_struct *copy_process(unsigned long clone_flags,
 
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
-	INIT_LIST_HEAD(&p->posix_timers);
 	init_waitqueue_head(&p->wait_chldexit);
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 082693e383cf..3de4d0ae9d26 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -317,12 +317,21 @@ static void timer_notify_task(struct k_itimer *timr)
 	if (timr->it_incr)
 		timr->sigq->info.si_sys_private = ++timr->it_requeue_pending;
 
-	if (timr->it_sigev_notify & SIGEV_THREAD_ID )
+	if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
+		if (unlikely(timr->it_process->flags & PF_EXITING)) {
+			timr->it_sigev_notify = SIGEV_SIGNAL;
+			put_task_struct(timr->it_process);
+			timr->it_process = timr->it_process->group_leader;
+			goto group;
+		}
 		ret = send_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
-	else
+	}
+	else {
+	group:
 		ret = send_group_sigqueue(timr->it_sigev_signo, timr->sigq,
 			timr->it_process);
+	}
 	if (ret) {
 		/*
 		 * signal was not sent because of sig_ignor
@@ -352,7 +361,7 @@ static void posix_timer_fn(unsigned long __data)
 
 static inline struct task_struct * good_sigevent(sigevent_t * event)
 {
-	struct task_struct *rtn = current;
+	struct task_struct *rtn = current->group_leader;
 
 	if ((event->sigev_notify & SIGEV_THREAD_ID ) &&
 		(!(rtn = find_task_by_pid(event->sigev_notify_thread_id)) ||
@@ -395,11 +404,15 @@ static struct k_itimer * alloc_posix_timer(void)
 static void release_posix_timer(struct k_itimer *tmr)
 {
 	if (tmr->it_id != -1) {
-		spin_lock_irq(&idr_lock);
+		unsigned long flags;
+		spin_lock_irqsave(&idr_lock, flags);
 		idr_remove(&posix_timers_id, tmr->it_id);
-		spin_unlock_irq(&idr_lock);
+		spin_unlock_irqrestore(&idr_lock, flags);
 	}
 	sigqueue_free(tmr->sigq);
+	if (unlikely(tmr->it_process) &&
+	    tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+		put_task_struct(tmr->it_process);
 	kmem_cache_free(posix_timers_cache, tmr);
 }
 
@@ -414,6 +427,7 @@ sys_timer_create(clockid_t which_clock,
 	struct k_itimer *new_timer = NULL;
 	timer_t new_timer_id;
 	struct task_struct *process = 0;
+	unsigned long flags;
 	sigevent_t event;
 
 	if ((unsigned) which_clock >= MAX_CLOCKS ||
@@ -458,7 +472,7 @@ sys_timer_create(clockid_t which_clock,
 			 * We may be setting up this process for another
 			 * thread.  It may be exiting.  To catch this
 			 * case the we check the PF_EXITING flag.  If
-			 * the flag is not set, the task_lock will catch
+			 * the flag is not set, the siglock will catch
 			 * him before it is too late (in exit_itimers).
 			 *
 			 * The exec case is a bit more invloved but easy
@@ -469,13 +483,14 @@ sys_timer_create(clockid_t which_clock,
 			 * for us to die which means we can finish this
 			 * linkage with our last gasp. I.e. no code :)
 			 */
-			task_lock(process);
+			spin_lock_irqsave(&process->sighand->siglock, flags);
 			if (!(process->flags & PF_EXITING)) {
 				list_add(&new_timer->list,
-					 &process->posix_timers);
-				task_unlock(process);
+					 &process->signal->posix_timers);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
+				get_task_struct(process);
 			} else {
-				task_unlock(process);
+				spin_unlock_irqrestore(&process->sighand->siglock, flags);
 				process = 0;
 			}
 		}
@@ -491,10 +506,10 @@ sys_timer_create(clockid_t which_clock,
 		new_timer->it_sigev_notify = SIGEV_SIGNAL;
 		new_timer->it_sigev_signo = SIGALRM;
 		new_timer->it_sigev_value.sival_int = new_timer->it_id;
-		process = current;
-		task_lock(process);
-		list_add(&new_timer->list, &process->posix_timers);
-		task_unlock(process);
+		process = current->group_leader;
+		spin_lock_irqsave(&process->sighand->siglock, flags);
+		list_add(&new_timer->list, &process->signal->posix_timers);
+		spin_unlock_irqrestore(&process->sighand->siglock, flags);
 	}
 
 	new_timer->it_clock = which_clock;
@@ -925,14 +940,18 @@ retry_delete:
 #else
 	p_timer_del(&posix_clocks[timer->it_clock], timer);
 #endif
-	task_lock(timer->it_process);
+	spin_lock(&current->sighand->siglock);
 	list_del(&timer->list);
-	task_unlock(timer->it_process);
+	spin_unlock(&current->sighand->siglock);
 	/*
 	 * This keeps any tasks waiting on the spin lock from thinking
 	 * they got something (see the lock code above).
 	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
 	timer->it_process = NULL;
+	}
 	unlock_timer(timer, flags);
 	release_posix_timer(timer);
 	return 0;
@@ -942,24 +961,50 @@ retry_delete:
  */
 static inline void itimer_delete(struct k_itimer *timer)
 {
-	if (sys_timer_delete(timer->it_id))
-		BUG();
+	unsigned long flags;
+
+#ifdef CONFIG_SMP
+	int error;
+retry_delete:
+#endif
+	spin_lock_irqsave(&timer->it_lock, flags);
+
+#ifdef CONFIG_SMP
+	error = p_timer_del(&posix_clocks[timer->it_clock], timer);
+
+	if (error == TIMER_RETRY) {
+		unlock_timer(timer, flags);
+		goto retry_delete;
+	}
+#else
+	p_timer_del(&posix_clocks[timer->it_clock], timer);
+#endif
+	list_del(&timer->list);
+	/*
+	 * This keeps any tasks waiting on the spin lock from thinking
+	 * they got something (see the lock code above).
+	 */
+	if (timer->it_process) {
+		if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
+			put_task_struct(timer->it_process);
+		timer->it_process = NULL;
+	}
+	unlock_timer(timer, flags);
+	release_posix_timer(timer);
 }
+
 /*
- * This is exported to exit and exec
+ * This is called by __exit_signal, only when there are no more
+ * references to the shared signal_struct.
  */
-void exit_itimers(struct task_struct *tsk)
+void exit_itimers(struct signal_struct *sig)
 {
 	struct k_itimer *tmr;
 
-	task_lock(tsk);
-	while (!list_empty(&tsk->posix_timers)) {
-		tmr = list_entry(tsk->posix_timers.next, struct k_itimer, list);
-		task_unlock(tsk);
+	while (!list_empty(&sig->posix_timers)) {
+		tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
 		itimer_delete(tmr);
-		task_lock(tsk);
 	}
-	task_unlock(tsk);
 }
 
 /*
diff --git a/kernel/signal.c b/kernel/signal.c
index 7a4b479a6f45..c69671600bef 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -352,6 +352,7 @@ void __exit_signal(struct task_struct *tsk)
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 		tsk->signal = NULL;
+		exit_itimers(sig);
 		spin_unlock(&sighand->siglock);
 		flush_sigqueue(&sig->shared_pending);
 		kmem_cache_free(signal_cachep, sig);
@@ -2555,4 +2556,3 @@ void __init signals_init(void)
 	if (!sigqueue_cachep)
 		panic("signals_init(): cannot create sigqueue SLAB cache");
 }
-
-- 
cgit v1.2.3


From d54f0b47beed8ad9cf98ede8a76d106f7d878c7e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:33:35 -0700
Subject: [PATCH] v850: use volatile qualifier on v850 test-n-bitop asm
 statements

From: <miles@mcspd15.ucom.lsi.nec.co.jp> (Miles Bader)

Otherwise the compiler can delete them (this is one of those "how on earth
did it ever work before" moments).
---
 include/asm-v850/bitops.h | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

(limited to 'include')

diff --git a/include/asm-v850/bitops.h b/include/asm-v850/bitops.h
index 3cee89697086..c837ea08c611 100644
--- a/include/asm-v850/bitops.h
+++ b/include/asm-v850/bitops.h
@@ -1,8 +1,8 @@
 /*
  * include/asm-v850/bitops.h -- Bit operations
  *
- *  Copyright (C) 2001,02,03  NEC Electronics Corporation
- *  Copyright (C) 2001,02,03  Miles Bader <miles@gnu.org>
+ *  Copyright (C) 2001,02,03,04  NEC Electronics Corporation
+ *  Copyright (C) 2001,02,03,04  Miles Bader <miles@gnu.org>
  *  Copyright (C) 1992  Linus Torvalds.
  *
  * This file is subject to the terms and conditions of the GNU General
@@ -84,24 +84,26 @@ extern __inline__ unsigned long ffz (unsigned long word)
 #define change_bit __change_bit
 
 
-#define __const_tns_bit_op(op, nr, addr)				\
-  ({ int __tns_res;							\
-     __asm__ ("tst1 (%1 - 0x123), %2; setf nz, %0; " op " (%1 - 0x123), %2" \
-	      : "=&r" (__tns_res)					\
-	      : "g" (((nr) & 0x7) + 0x123),				\
-		"m" (*((char *)(addr) + ((nr) >> 3)))			\
-	      : "memory");						\
-     __tns_res;							\
+#define __const_tns_bit_op(op, nr, addr)				      \
+  ({ int __tns_res;							      \
+     __asm__ __volatile__ (						      \
+	     "tst1 (%1 - 0x123), %2; setf nz, %0; " op " (%1 - 0x123), %2"    \
+	     : "=&r" (__tns_res)					      \
+	     : "g" (((nr) & 0x7) + 0x123),				      \
+	       "m" (*((char *)(addr) + ((nr) >> 3)))			      \
+	     : "memory");						      \
+     __tns_res;								      \
   })
-#define __var_tns_bit_op(op, nr, addr)					\
-  ({ int __nr = (nr);							\
-     int __tns_res;							\
-     __asm__ ("tst1 %1, [%2]; setf nz, %0; " op " %1, [%2]"		\
-	      : "=&r" (__tns_res)					\
-	      : "r" (__nr & 0x7),					\
-		"r" ((char *)(addr) + (__nr >> 3))			\
-	      : "memory");						\
-     __tns_res;							\
+#define __var_tns_bit_op(op, nr, addr)					      \
+  ({ int __nr = (nr);							      \
+     int __tns_res;							      \
+     __asm__ __volatile__ (						      \
+	     "tst1 %1, [%2]; setf nz, %0; " op " %1, [%2]"		      \
+	      : "=&r" (__tns_res)					      \
+	      : "r" (__nr & 0x7),					      \
+		"r" ((char *)(addr) + (__nr >> 3))			      \
+	      : "memory");						      \
+     __tns_res;								      \
   })
 #define __tns_bit_op(op, nr, addr)					\
   ((__builtin_constant_p (nr) && (unsigned)(nr) <= 0x7FFFF)		\
-- 
cgit v1.2.3


From ba5ff4c5edef262f74b1f96b83f09af8be4a20a3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:33:47 -0700
Subject: [PATCH] v850: make v850 dma-mapping.h header work when !CONFIG_PCI

From: <miles@mcspd15.ucom.lsi.nec.co.jp> (Miles Bader)

Is this something that should be done in <asm-generic/dma-mapping.h>?
---
 include/asm-v850/dma-mapping.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'include')

diff --git a/include/asm-v850/dma-mapping.h b/include/asm-v850/dma-mapping.h
index e7e16901f686..c63fb50ec9ef 100644
--- a/include/asm-v850/dma-mapping.h
+++ b/include/asm-v850/dma-mapping.h
@@ -1 +1,12 @@
+#ifndef __V850_DMA_MAPPING_H__
+#define __V850_DMA_MAPPING_H__
+
+#include <linux/config.h>
+
+#ifdef CONFIG_PCI
 #include <asm-generic/dma-mapping.h>
+#else
+#include <asm-generic/dma-mapping-broken.h>
+#endif
+
+#endif /* __V850_DMA_MAPPING_H__ */
-- 
cgit v1.2.3


From 7400bc459f574c60ff62b139bf97b1abf5386f4c Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:34:01 -0700
Subject: [PATCH] m68knommu: create dma-mapping.h

From: <gerg@snapgear.com>

Create a dma-mapping.h for m68knommu architecture.
---
 include/asm-m68knommu/dma-mapping.h | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'include')

diff --git a/include/asm-m68knommu/dma-mapping.h b/include/asm-m68knommu/dma-mapping.h
index e7e16901f686..a6c42ba48da6 100644
--- a/include/asm-m68knommu/dma-mapping.h
+++ b/include/asm-m68knommu/dma-mapping.h
@@ -1 +1,10 @@
+#ifndef _M68KNOMMU_DMA_MAPPING_H
+#define _M68KNOMMU_DMA_MAPPING_H
+
+#include <linux/config.h>
+
+#ifdef CONFIG_PCI
 #include <asm-generic/dma-mapping.h>
+#endif
+
+#endif  /* _M68KNOMMU_DMA_MAPPING_H */
-- 
cgit v1.2.3


From 3549c6242ae5fcffa4470457577a57fec2f83181 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:41:09 -0700
Subject: [PATCH] 68knommu: add support for 64MHz clock for ColdFire boards

From: <gerg@snapgear.com>

Add support for boards that have a 64MHz clock to the common ColdFire header.
---
 include/asm-m68knommu/coldfire.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/asm-m68knommu/coldfire.h b/include/asm-m68knommu/coldfire.h
index 628c93704142..8d8861b930f6 100644
--- a/include/asm-m68knommu/coldfire.h
+++ b/include/asm-m68knommu/coldfire.h
@@ -54,6 +54,8 @@
 #define	MCF_CLK		54000000
 #elif defined(CONFIG_CLOCK_60MHz)
 #define	MCF_CLK		60000000
+#elif defined(CONFIG_CLOCK_64MHz)
+#define	MCF_CLK		64000000
 #elif defined(CONFIG_CLOCK_66MHz)
 #define	MCF_CLK		66000000
 #elif defined(CONFIG_CLOCK_70MHz)
-- 
cgit v1.2.3


From c8b976af1af10de3d92968bf7d4bd5415e8a3778 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:41:36 -0700
Subject: [PATCH] hugetlb consolidation

From: William Lee Irwin III <wli@holomorphy.com>

The following patch consolidates redundant code in various hugetlb
implementations.  I took the liberty of renaming a few things, since the
code was all moved anyway, and it has the benefit of helping to catch
missed conversions and/or consolidations.
---
 arch/i386/mm/hugetlbpage.c    | 264 +-----------------------------------------
 arch/ia64/mm/hugetlbpage.c    | 251 +--------------------------------------
 arch/ppc64/mm/hugetlbpage.c   | 258 +----------------------------------------
 arch/sh/mm/hugetlbpage.c      | 258 +----------------------------------------
 arch/sparc64/mm/hugetlbpage.c | 259 +----------------------------------------
 fs/hugetlbfs/inode.c          |   2 +-
 include/linux/hugetlb.h       |   7 +-
 kernel/sysctl.c               |   6 +-
 mm/Makefile                   |   1 +
 mm/hugetlb.c                  | 245 +++++++++++++++++++++++++++++++++++++++
 10 files changed, 263 insertions(+), 1288 deletions(-)
 create mode 100644 mm/hugetlb.c

(limited to 'include')

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 7224ddcb6a11..a702f96373af 100644
--- a/arch/i386/mm/hugetlbpage.c
+++ b/arch/i386/mm/hugetlbpage.c
@@ -20,68 +20,6 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static long    htlbpagemem;
-int     htlbpage_max;
-static long    htlbzone_pages;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES && !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next, struct page, lru);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-				HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-static void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	int i;
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
-	return page;
-}
-
 static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -276,26 +214,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 }
 #endif
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end)
 {
@@ -319,16 +237,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-void
-zap_hugepage_range(struct vm_area_struct *vma,
-		unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -360,7 +268,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -380,173 +288,3 @@ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
-
-static void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-static int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	/* all lowmem is on node 0 */
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, lru);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-static int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(ctl_table *table, int write,
-		struct file *file, void *buffer, size_t *length)
-{
-	if (!cpu_has_pse)
-		return -ENODEV;
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	if (!cpu_has_pse)
-		return -ENODEV;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index 3dec8e2f4056..8b5b1cac3a1c 100644
--- a/arch/ia64/mm/hugetlbpage.c
+++ b/arch/ia64/mm/hugetlbpage.c
@@ -22,69 +22,7 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-static long	htlbpagemem;
-int		htlbpage_max;
-static long	htlbzone_pages;
-unsigned int	hpage_shift=HPAGE_SHIFT_DEFAULT;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next, struct page, lru);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	int i;
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
-	return page;
-}
+unsigned int hpage_shift=HPAGE_SHIFT_DEFAULT;
 
 static pte_t *
 huge_pte_alloc (struct mm_struct *mm, unsigned long addr)
@@ -244,26 +182,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int wri
 	return NULL;
 }
 
-void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 /*
  * Same as generic free_pgtables(), except constant PGDIR_* and pgd_offset
  * are hugetlb region specific.
@@ -339,14 +257,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsig
 	flush_tlb_range(vma, start, end);
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -378,7 +288,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -422,106 +332,6 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, u
 		addr = ALIGN(vmm->vm_end, HPAGE_SIZE);
 	}
 }
-void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, lru);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-int set_hugetlb_mem_size(int count)
-{
-	int  lcount;
-	struct page *page ;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(ctl_table *table, int write, struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
 
 static int __init hugetlb_setup_sz(char *str)
 {
@@ -551,60 +361,3 @@ static int __init hugetlb_setup_sz(char *str)
 	return 1;
 }
 __setup("hugepagesz=", hugetlb_setup_sz);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-__initcall(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	if (size > (htlbpagemem << HPAGE_SHIFT))
-		return 0;
-	return 1;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-static struct page *hugetlb_nopage(struct vm_area_struct * area, unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage =	hugetlb_nopage,
-};
diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c
index a7b2c63c700f..e81eeec9b009 100644
--- a/arch/ppc64/mm/hugetlbpage.c
+++ b/arch/ppc64/mm/hugetlbpage.c
@@ -29,65 +29,6 @@
 
 #include <linux/sysctl.h>
 
-int htlbpage_max;
-
-/* This lock protects the two counters and list below */
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static int htlbpage_free; /* = 0 */
-static int htlbpage_total; /* = 0 */
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		&hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-/* XXX make this a sysctl */
-unsigned long largepage_roundrobin = 1;
-
-static struct page *dequeue_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page = NULL;
-	int i;
-
-	if (!largepage_roundrobin)
-		nid = numa_node_id();
-
-	for (i = 0; i < numnodes; i++) {
-		if (!list_empty(&hugepage_freelists[nid]))
-			break;
-		nid = (nid + 1) % numnodes;
-	}
-
-	if (!list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next, struct page, lru);
-		list_del(&page->lru);
-	}
-
-	if (largepage_roundrobin)
-		nid = (nid + 1) % numnodes;
-
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	if (!page)
-		return NULL;
-
-	nid = page_zone(page)->zone_pgdat->node_id;
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
 /* HugePTE layout:
  *
  * 31 30 ... 15 14 13 12 10 9  8  7   6    5    4    3    2    1    0
@@ -119,7 +60,6 @@ typedef struct {unsigned int val;} hugepte_t;
 #define hugepte_none(x)	(!(hugepte_val(x) & _HUGEPAGE_PFN))
 
 
-static void free_huge_page(struct page *page);
 static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
 				hugepte_t pte, int local);
 
@@ -146,27 +86,6 @@ static inline void set_hugepte(hugepte_t *ptep, hugepte_t pte)
 		       hugepte_val(pte) & ~_HUGEPAGE_HPTEFLAGS);
 }
 
-static struct page *alloc_hugetlb_page(void)
-{
-	int i;
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-
-	htlbpage_free--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_highpage(&page[i]);
-	return page;
-}
-
 static hugepte_t *hugepte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -448,26 +367,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return page;
 }
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpage_free++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end)
 {
@@ -510,16 +409,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	mm->rss -= (end - start) >> PAGE_SHIFT;
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -554,7 +443,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -876,148 +765,3 @@ static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
 
 	ppc_md.hpte_invalidate(slot, va, 1, local);
 }
-
-static void split_and_free_hugepage(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbpage_total--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (!(cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE))
-		return 0;
-	
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbpage_total;
-
-	if (lcount == 0)
-		return htlbpage_total;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpage_free++;
-			htlbpage_total++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return htlbpage_total;
-	}
-	/* Shrink the memory size. */
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		split_and_free_hugepage(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return htlbpage_total;
-}
-
-int hugetlb_sysctl_handler(ctl_table *table, int write,
-		struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	if (cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE) {
-		for (i = 0; i < MAX_NUMNODES; ++i)
-			INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-		for (i = 0; i < htlbpage_max; ++i) {
-			page = alloc_fresh_huge_page();
-			if (!page)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			spin_unlock(&htlbpage_lock);
-		}
-		htlbpage_max = htlbpage_free = htlbpage_total = i;
-		printk(KERN_INFO "Total HugeTLB memory allocated, %d\n",
-		       htlbpage_free);
-	} else {
-		htlbpage_max = 0;
-		printk(KERN_INFO "CPU does not support HugeTLB\n");
-	}
-
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5d\n"
-			"HugePages_Free:  %5d\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbpage_total,
-			htlbpage_free,
-			HPAGE_SIZE/1024);
-}
-
-/* This is advisory only, so we can get away with accesing
- * htlbpage_free without taking the lock. */
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpage_free;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbpage_total * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 6f72d865e8d2..751a7d1a666d 100644
--- a/arch/sh/mm/hugetlbpage.c
+++ b/arch/sh/mm/hugetlbpage.c
@@ -24,68 +24,6 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-static long	htlbpagemem;
-int		htlbpage_max;
-static long	htlbzone_pages;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next,
-				  struct page, list);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-static void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	memset(page_address(page), 0, HPAGE_SIZE);
-	return page;
-}
-
 static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -250,25 +188,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-	BUG_ON(page->mapping);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end)
 {
@@ -297,16 +216,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -338,7 +247,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -358,168 +267,3 @@ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
-
-static void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-static int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	/* all lowmem is on node 0 */
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, list);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-static int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				   unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 771ec3757d73..b4e6dfa0833a 100644
--- a/arch/sparc64/mm/hugetlbpage.c
+++ b/arch/sparc64/mm/hugetlbpage.c
@@ -21,68 +21,6 @@
 #include <asm/tlbflush.h>
 #include <asm/cacheflush.h>
 
-static long	htlbpagemem;
-int		htlbpage_max;
-static long	htlbzone_pages;
-
-static struct list_head hugepage_freelists[MAX_NUMNODES];
-static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
-
-static void enqueue_huge_page(struct page *page)
-{
-	list_add(&page->lru,
-		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
-}
-
-static struct page *dequeue_huge_page(void)
-{
-	int nid = numa_node_id();
-	struct page *page = NULL;
-
-	if (list_empty(&hugepage_freelists[nid])) {
-		for (nid = 0; nid < MAX_NUMNODES; ++nid)
-			if (!list_empty(&hugepage_freelists[nid]))
-				break;
-	}
-	if (nid >= 0 && nid < MAX_NUMNODES &&
-	    !list_empty(&hugepage_freelists[nid])) {
-		page = list_entry(hugepage_freelists[nid].next,
-				  struct page, lru);
-		list_del(&page->lru);
-	}
-	return page;
-}
-
-static struct page *alloc_fresh_huge_page(void)
-{
-	static int nid = 0;
-	struct page *page;
-	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
-					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % numnodes;
-	return page;
-}
-
-static void free_huge_page(struct page *page);
-
-static struct page *alloc_hugetlb_page(void)
-{
-	struct page *page;
-
-	spin_lock(&htlbpage_lock);
-	page = dequeue_huge_page();
-	if (!page) {
-		spin_unlock(&htlbpage_lock);
-		return NULL;
-	}
-	htlbpagemem--;
-	spin_unlock(&htlbpage_lock);
-	set_page_count(page, 1);
-	page->lru.prev = (void *)free_huge_page;
-	memset(page_address(page), 0, HPAGE_SIZE);
-	return page;
-}
-
 static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
@@ -247,26 +185,6 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&htlbpage_lock);
-	enqueue_huge_page(page);
-	htlbpagemem++;
-	spin_unlock(&htlbpage_lock);
-}
-
-void huge_page_release(struct page *page)
-{
-	if (!put_page_testzero(page))
-		return;
-
-	free_huge_page(page);
-}
-
 void unmap_hugepage_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end)
 {
@@ -295,16 +213,6 @@ void unmap_hugepage_range(struct vm_area_struct *vma,
 	flush_tlb_range(vma, start, end);
 }
 
-void zap_hugepage_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long length)
-{
-	struct mm_struct *mm = vma->vm_mm;
-
-	spin_lock(&mm->page_table_lock);
-	unmap_hugepage_range(vma, start, start + length);
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 {
 	struct mm_struct *mm = current->mm;
@@ -336,7 +244,7 @@ int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
 				ret = -ENOMEM;
 				goto out;
 			}
-			page = alloc_hugetlb_page();
+			page = alloc_huge_page();
 			if (!page) {
 				hugetlb_put_quota(mapping);
 				ret = -ENOMEM;
@@ -356,168 +264,3 @@ out:
 	spin_unlock(&mm->page_table_lock);
 	return ret;
 }
-
-static void update_and_free_page(struct page *page)
-{
-	int j;
-	struct page *map;
-
-	map = page;
-	htlbzone_pages--;
-	for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
-		map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1<< PG_writeback);
-		set_page_count(map, 0);
-		map++;
-	}
-	set_page_count(page, 1);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
-static int try_to_free_low(int count)
-{
-	struct list_head *p;
-	struct page *page, *map;
-
-	map = NULL;
-	spin_lock(&htlbpage_lock);
-	/* all lowmem is on node 0 */
-	list_for_each(p, &hugepage_freelists[0]) {
-		if (map) {
-			list_del(&map->lru);
-			update_and_free_page(map);
-			htlbpagemem--;
-			map = NULL;
-			if (++count == 0)
-				break;
-		}
-		page = list_entry(p, struct page, lru);
-		if (!PageHighMem(page))
-			map = page;
-	}
-	if (map) {
-		list_del(&map->lru);
-		update_and_free_page(map);
-		htlbpagemem--;
-		count++;
-	}
-	spin_unlock(&htlbpage_lock);
-	return count;
-}
-
-static int set_hugetlb_mem_size(int count)
-{
-	int lcount;
-	struct page *page;
-
-	if (count < 0)
-		lcount = count;
-	else
-		lcount = count - htlbzone_pages;
-
-	if (lcount == 0)
-		return (int)htlbzone_pages;
-	if (lcount > 0) {	/* Increase the mem size. */
-		while (lcount--) {
-			page = alloc_fresh_huge_page();
-			if (page == NULL)
-				break;
-			spin_lock(&htlbpage_lock);
-			enqueue_huge_page(page);
-			htlbpagemem++;
-			htlbzone_pages++;
-			spin_unlock(&htlbpage_lock);
-		}
-		return (int) htlbzone_pages;
-	}
-	/* Shrink the memory size. */
-	lcount = try_to_free_low(lcount);
-	while (lcount++) {
-		page = alloc_hugetlb_page();
-		if (page == NULL)
-			break;
-		spin_lock(&htlbpage_lock);
-		update_and_free_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	return (int) htlbzone_pages;
-}
-
-int hugetlb_sysctl_handler(struct ctl_table *table, int write,
-			   struct file *file, void *buffer, size_t *length)
-{
-	proc_dointvec(table, write, file, buffer, length);
-	htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
-	return 0;
-}
-
-static int __init hugetlb_setup(char *s)
-{
-	if (sscanf(s, "%d", &htlbpage_max) <= 0)
-		htlbpage_max = 0;
-	return 1;
-}
-__setup("hugepages=", hugetlb_setup);
-
-static int __init hugetlb_init(void)
-{
-	int i;
-	struct page *page;
-
-	for (i = 0; i < MAX_NUMNODES; ++i)
-		INIT_LIST_HEAD(&hugepage_freelists[i]);
-
-	for (i = 0; i < htlbpage_max; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
-			break;
-		spin_lock(&htlbpage_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&htlbpage_lock);
-	}
-	htlbpage_max = htlbpagemem = htlbzone_pages = i;
-	printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
-	return 0;
-}
-module_init(hugetlb_init);
-
-int hugetlb_report_meminfo(char *buf)
-{
-	return sprintf(buf,
-			"HugePages_Total: %5lu\n"
-			"HugePages_Free:  %5lu\n"
-			"Hugepagesize:    %5lu kB\n",
-			htlbzone_pages,
-			htlbpagemem,
-			HPAGE_SIZE/1024);
-}
-
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
-}
-
-/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
-unsigned long hugetlb_total_pages(void)
-{
-	return htlbzone_pages * (HPAGE_SIZE / PAGE_SIZE);
-}
-EXPORT_SYMBOL(hugetlb_total_pages);
-
-/*
- * We cannot handle pagefaults against hugetlb pages at all.  They cause
- * handle_mm_fault() to try to instantiate regular-sized pages in the
- * hugegpage VMA.  do_page_fault() is supposed to trap this, so BUG is we get
- * this far.
- */
-static struct page *hugetlb_nopage(struct vm_area_struct *vma,
-				   unsigned long address, int *unused)
-{
-	BUG();
-	return NULL;
-}
-
-struct vm_operations_struct hugetlb_vm_ops = {
-	.nopage = hugetlb_nopage,
-};
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c60d937b202e..5e37a271dd2e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -573,7 +573,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig)
 			unsigned long long size = memparse(value, &rest);
 			if (*rest == '%') {
 				size <<= HPAGE_SHIFT;
-				size *= htlbpage_max;
+				size *= max_huge_pages;
 				do_div(size, 100);
 				rest++;
 			}
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index da3fc826a0de..b0e98cfe15f9 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -28,8 +28,11 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 				pmd_t *pmd, int write);
 int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
 int pmd_huge(pmd_t pmd);
+struct page *alloc_huge_page(void);
+void free_huge_page(struct page *);
 
-extern int htlbpage_max;
+extern unsigned long max_huge_pages;
+extern const unsigned long hugetlb_zero, hugetlb_infinity;
 
 static inline void
 mark_mm_hugetlb(struct mm_struct *mm, struct vm_area_struct *vma)
@@ -78,6 +81,8 @@ static inline unsigned long hugetlb_total_pages(void)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(addr, len)	0
 #define hugetlb_free_pgtables(tlb, prev, start, end) do { } while (0)
+#define alloc_huge_page()			({ NULL; })
+#define free_huge_page(p)			({ (void)(p); BUG(); })
 
 #ifndef HPAGE_MASK
 #define HPAGE_MASK	0		/* Keep the compiler happy */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 05ea59ae4276..69e9123cdd0f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -710,10 +710,12 @@ static ctl_table vm_table[] = {
 	 {
 		.ctl_name	= VM_HUGETLB_PAGES,
 		.procname	= "nr_hugepages",
-		.data		= &htlbpage_max,
-		.maxlen		= sizeof(int),
+		.data		= &max_huge_pages,
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= &hugetlb_sysctl_handler,
+		.extra1		= (void *)&hugetlb_zero,
+		.extra2		= (void *)&hugetlb_infinity,
 	 },
 #endif
 	{
diff --git a/mm/Makefile b/mm/Makefile
index c66aba5886f8..5f3baecd85a7 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -12,3 +12,4 @@ obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   slab.o swap.o truncate.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
new file mode 100644
index 000000000000..cb72a40c38b6
--- /dev/null
+++ b/mm/hugetlb.c
@@ -0,0 +1,245 @@
+/*
+ * Generic hugetlb support.
+ * (C) William Irwin, April 2004
+ */
+#include <linux/gfp.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/sysctl.h>
+
+const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
+static unsigned long nr_huge_pages, free_huge_pages;
+unsigned long max_huge_pages;
+static struct list_head hugepage_freelists[MAX_NUMNODES];
+static spinlock_t hugetlb_lock = SPIN_LOCK_UNLOCKED;
+
+static void enqueue_huge_page(struct page *page)
+{
+	list_add(&page->lru,
+		 &hugepage_freelists[page_zone(page)->zone_pgdat->node_id]);
+}
+
+static struct page *dequeue_huge_page(void)
+{
+	int nid = numa_node_id();
+	struct page *page = NULL;
+
+	if (list_empty(&hugepage_freelists[nid])) {
+		for (nid = 0; nid < MAX_NUMNODES; ++nid)
+			if (!list_empty(&hugepage_freelists[nid]))
+				break;
+	}
+	if (nid >= 0 && nid < MAX_NUMNODES &&
+	    !list_empty(&hugepage_freelists[nid])) {
+		page = list_entry(hugepage_freelists[nid].next,
+				  struct page, lru);
+		list_del(&page->lru);
+	}
+	return page;
+}
+
+static struct page *alloc_fresh_huge_page(void)
+{
+	static int nid = 0;
+	struct page *page;
+	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP,
+					HUGETLB_PAGE_ORDER);
+	nid = (nid + 1) % numnodes;
+	return page;
+}
+
+void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	free_huge_pages++;
+	spin_unlock(&hugetlb_lock);
+}
+
+struct page *alloc_huge_page(void)
+{
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	page = dequeue_huge_page();
+	if (!page) {
+		spin_unlock(&hugetlb_lock);
+		return NULL;
+	}
+	free_huge_pages--;
+	spin_unlock(&hugetlb_lock);
+	set_page_count(page, 1);
+	page->lru.prev = (void *)free_huge_page;
+	memset(page_address(page), 0, HPAGE_SIZE);
+	return page;
+}
+
+void huge_page_release(struct page *page)
+{
+	if (!put_page_testzero(page))
+		return;
+
+	free_huge_page(page);
+}
+
+static int __init hugetlb_init(void)
+{
+	unsigned long i;
+	struct page *page;
+
+	for (i = 0; i < MAX_NUMNODES; ++i)
+		INIT_LIST_HEAD(&hugepage_freelists[i]);
+
+	for (i = 0; i < max_huge_pages; ++i) {
+		page = alloc_fresh_huge_page();
+		if (!page)
+			break;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		spin_unlock(&hugetlb_lock);
+	}
+	max_huge_pages = free_huge_pages = nr_huge_pages = i;
+	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
+	return 0;
+}
+module_init(hugetlb_init);
+
+static int __init hugetlb_setup(char *s)
+{
+	if (sscanf(s, "%lu", &max_huge_pages) <= 0)
+		max_huge_pages = 0;
+	return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static void update_and_free_page(struct page *page)
+{
+	int i;
+	nr_huge_pages--;
+	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
+		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+				1 << PG_private | 1<< PG_writeback);
+		set_page_count(&page[i], 0);
+	}
+	set_page_count(page, 1);
+	__free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
+#ifdef CONFIG_HIGHMEM
+static int try_to_free_low(unsigned long count)
+{
+	int i;
+	for (i = 0; i < MAX_NUMNODES; ++i) {
+		struct page *page;
+		list_for_each_entry(page, &hugepage_freelists[i], lru) {
+			if (PageHighMem(page))
+				continue;
+			list_del(&page->lru);
+			update_and_free_page(page);
+			--free_huge_pages;
+			if (!--count)
+				return 0;
+		}
+	}
+	return count;
+}
+#else
+static inline int try_to_free_low(unsigned long count)
+{
+	return count;
+}
+#endif
+
+static unsigned long set_max_huge_pages(unsigned long count)
+{
+	while (count > nr_huge_pages) {
+		struct page *page = alloc_fresh_huge_page();
+		if (!page)
+			return nr_huge_pages;
+		spin_lock(&hugetlb_lock);
+		enqueue_huge_page(page);
+		free_huge_pages++;
+		nr_huge_pages++;
+		spin_unlock(&hugetlb_lock);
+	}
+	if (count >= nr_huge_pages)
+		return nr_huge_pages;
+
+	spin_lock(&hugetlb_lock);
+	for (count = try_to_free_low(count); count < nr_huge_pages; --free_huge_pages) {
+		struct page *page = dequeue_huge_page();
+		if (!page)
+			break;
+		update_and_free_page(page);
+	}
+	spin_unlock(&hugetlb_lock);
+	return nr_huge_pages;
+}
+
+#ifdef CONFIG_SYSCTL
+int hugetlb_sysctl_handler(struct ctl_table *table, int write,
+			   struct file *file, void *buffer, size_t *length)
+{
+	proc_doulongvec_minmax(table, write, file, buffer, length);
+	max_huge_pages = set_max_huge_pages(max_huge_pages);
+	return 0;
+}
+#endif /* CONFIG_SYSCTL */
+
+int hugetlb_report_meminfo(char *buf)
+{
+	return sprintf(buf,
+			"HugePages_Total: %5lu\n"
+			"HugePages_Free:  %5lu\n"
+			"Hugepagesize:    %5lu kB\n",
+			nr_huge_pages,
+			free_huge_pages,
+			HPAGE_SIZE/1024);
+}
+
+int is_hugepage_mem_enough(size_t size)
+{
+	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
+}
+
+/* Return the number of pages we physically have, in PAGE_SIZE units. */
+unsigned long hugetlb_total_pages(void)
+{
+	return nr_huge_pages * (HPAGE_SIZE / PAGE_SIZE);
+}
+EXPORT_SYMBOL(hugetlb_total_pages);
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all.  They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugepage VMA.  do_page_fault() is supposed to trap this, so BUG if we get
+ * this far.
+ */
+static struct page *hugetlb_nopage(struct vm_area_struct *vma,
+				unsigned long address, int *unused)
+{
+	BUG();
+	return NULL;
+}
+
+struct vm_operations_struct hugetlb_vm_ops = {
+	.nopage = hugetlb_nopage,
+};
+
+void zap_hugepage_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long length)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	spin_lock(&mm->page_table_lock);
+	unmap_hugepage_range(vma, start, start + length);
+	spin_unlock(&mm->page_table_lock);
+}
-- 
cgit v1.2.3


From 74216ef57713a9cb85aa0e1bb240b766d77a1351 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sun, 11 Apr 2004 23:41:49 -0700
Subject: [PATCH] s390: core s390

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

s390 core changes:
 - Fix _raw_spin_trylock for 64 bit.
 - Add clarification to s390 debug documentation.
---
 Documentation/s390/s390dbf.txt |  6 +++---
 include/asm-s390/spinlock.h    | 10 ++++------
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/Documentation/s390/s390dbf.txt b/Documentation/s390/s390dbf.txt
index e7048364a9f9..692cc26cfc74 100644
--- a/Documentation/s390/s390dbf.txt
+++ b/Documentation/s390/s390dbf.txt
@@ -66,9 +66,9 @@ a view can be inspected simply by reading the corresponding proc file.
 All debug logs have an an actual debug level (range from 0 to 6).
 The default level is 3. Event and Exception functions have a 'level'
 parameter. Only debug entries with a level that is lower or equal
-than the actual level are written to the log. This means that high 
-priority log entries should have a low level value whereas low priority
-entries should have a high one. 
+than the actual level are written to the log. This means, when
+writing events, high priority log entries should have a low level
+value whereas low priority entries should have a high one.
 The actual debug level can be changed with the help of the proc-filesystem 
 through writing a number string "x" to the 'level' proc file which is
 provided for every debug log. Debugging can be switched off completely
diff --git a/include/asm-s390/spinlock.h b/include/asm-s390/spinlock.h
index 12da8b06104b..4c3b6aadbefd 100644
--- a/include/asm-s390/spinlock.h
+++ b/include/asm-s390/spinlock.h
@@ -70,11 +70,9 @@ extern inline void _raw_spin_lock(spinlock_t *lp)
 
 extern inline int _raw_spin_trylock(spinlock_t *lp)
 {
-#ifndef __s390x__
-	unsigned long result, reg;
-#else /* __s390x__ */
-	unsigned int result, reg;
-#endif /* __s390x__ */
+	unsigned long reg;
+	unsigned int result;
+
 	__asm__ __volatile("    basr  %1,0\n"
 			   "0:  cs    %0,%1,0(%3)"
 			   : "=d" (result), "=&d" (reg), "=m" (lp->lock)
@@ -226,7 +224,7 @@ extern inline int _raw_write_trylock(rwlock_t *rw)
 			     "0: csg %0,%1,0(%3)\n"
 #endif /* __s390x__ */
 			     : "=d" (result), "=&d" (reg), "=m" (rw->lock)
-			     : "a" (&rw->lock), "m" (rw->lock), "0" (0)
+			     : "a" (&rw->lock), "m" (rw->lock), "0" (0UL)
 			     : "cc", "memory" );
 	return result == 0;
 }
-- 
cgit v1.2.3


From fa7bb531f6f268a5836a36e689b27c5652dcba05 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:14:58 -0700
Subject: [PATCH] s390: rewritten qeth driver

From: Martin Schwidefsky <schwidefsky@de.ibm.com>

The rewritten qeth network driver.
---
 drivers/s390/net/Kconfig     |     2 +-
 drivers/s390/net/Makefile    |     6 +-
 drivers/s390/net/qeth.c      | 10872 -----------------------------------------
 drivers/s390/net/qeth.h      |  1809 +++----
 drivers/s390/net/qeth_fs.h   |   156 +
 drivers/s390/net/qeth_main.c |  6820 ++++++++++++++++++++++++++
 drivers/s390/net/qeth_mpc.c  |    22 +-
 drivers/s390/net/qeth_mpc.h  |   768 +--
 drivers/s390/net/qeth_proc.c |   468 ++
 drivers/s390/net/qeth_sys.c  |  1479 ++++++
 include/asm-s390/qeth.h      |    60 +
 11 files changed, 10146 insertions(+), 12316 deletions(-)
 delete mode 100644 drivers/s390/net/qeth.c
 create mode 100644 drivers/s390/net/qeth_fs.h
 create mode 100644 drivers/s390/net/qeth_main.c
 create mode 100644 drivers/s390/net/qeth_proc.c
 create mode 100644 drivers/s390/net/qeth_sys.c
 create mode 100644 include/asm-s390/qeth.h

(limited to 'include')

diff --git a/drivers/s390/net/Kconfig b/drivers/s390/net/Kconfig
index ab38a0631795..7899560dbfa4 100644
--- a/drivers/s390/net/Kconfig
+++ b/drivers/s390/net/Kconfig
@@ -63,7 +63,7 @@ config QETH
 	  <http://www10.software.ibm.com/developerworks/opensource/linux390>
 
 	  To compile this driver as a module, choose M here: the
-	  module will be called qeth.
+	  module will be called qeth.ko.
 
 
 comment "Gigabit Ethernet default settings"
diff --git a/drivers/s390/net/Makefile b/drivers/s390/net/Makefile
index e2fbc362dfba..8befcd6ea6e4 100644
--- a/drivers/s390/net/Makefile
+++ b/drivers/s390/net/Makefile
@@ -9,6 +9,6 @@ obj-$(CONFIG_NETIUCV) += netiucv.o fsm.o
 obj-$(CONFIG_SMSGIUCV) += smsgiucv.o
 obj-$(CONFIG_CTC) += ctc.o fsm.o cu3088.o
 obj-$(CONFIG_LCS) += lcs.o cu3088.o
-qeth_mod-objs := qeth.o qeth_mpc.o
-obj-$(CONFIG_QETH) += qeth_mod.o
-
+qeth-y := qeth_main.o qeth_mpc.o qeth_sys.o
+qeth-$(CONFIG_PROC_FS) += qeth_proc.o
+obj-$(CONFIG_QETH) += qeth.o
diff --git a/drivers/s390/net/qeth.c b/drivers/s390/net/qeth.c
deleted file mode 100644
index ebef0a9c91cf..000000000000
--- a/drivers/s390/net/qeth.c
+++ /dev/null
@@ -1,10872 +0,0 @@
-/*
- *
- * linux/drivers/s390/net/qeth.c ($Revision: 1.177 $)
- *
- * Linux on zSeries OSA Express and HiperSockets support
- *
- * Copyright 2000,2003 IBM Corporation
- *
- * Author(s): Utz Bacher <utz.bacher@de.ibm.com>
- *            Cornelia Huck <cohuck@de.ibm.com> (2.5 integration,
- *                                               numerous bugfixes)
- *            Frank Pavlic <pavlic@de.ibm.com>  (query/purge ARP, SNMP, fixes)
- *            Andreas Herrmann <aherrman@de.ibm.com> (bugfixes)
- *            Thomas Spatzier <tspat@de.ibm.com> (bugfixes)
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-/*
- * The driver supports in general all QDIO driven network devices on the
- * Hydra card.
- *
- * For all devices, three channels must be available to the driver. One
- * channel is the read channel, one is the write channel and the third
- * one is the channel used to control QDIO.
- *
- * There are several stages from the channel recognition to the running
- * network device:
- * - The channels are scanned and ordered due to the parameters (see
- *   MODULE_PARM_DESC)
- * - The card is hardsetup: this means, that the communication channels
- *   are prepared
- * - The card is softsetup: this means, that commands are issued
- *   to activate the network parameters
- * - After that, data can flow through the card (transported by QDIO)
- *
- *IPA Takeover:
- * /proc/qeth_ipa_takeover provides the possibility to add and remove
- * certain ranges of IP addresses to the driver. As soon as these
- * addresses have to be set by the driver, the driver uses the OSA
- * Address Takeover mechanism.
- * reading out of the proc-file displays the registered addresses;
- * writing into it changes the information. Only one command at one
- * time must be written into the file. Subsequent commands are ignored.
- * The following commands are available:
- * inv4
- * inv6
- * add4 <ADDR>/<mask bits>[:<interface>]
- * add6 <ADDR>/<mask bits>[:<interface>]
- * del4 <ADDR>/<mask bits>[:<interface>]
- * del6 <ADDR>/<mask bits>[:<interface>]
- * inv4 and inv6 toggle the IPA takeover behaviour for all interfaces:
- * when inv4 was input once, all addresses specified with add4 are not
- * set using the takeover mechanism, but all other IPv4 addresses are set so.
- *
- * add# adds an address range, del# deletes an address range. # corresponds
- * to the IP version (4 or 6).
- * <ADDR> is a 8 or 32byte hexadecimal view of the IP address.
- * <mask bits> specifies the number of bits which are set in the network mask.
- * <interface> is optional and specifies the interface name to which the
- * address range is bound.
- * E. g.
- *   add4 C0a80100/24
- * activates all addresses in the 192.168.10 subnet for address takeover.
- * Note, that the address is not taken over before an according ifconfig
- * is executed.
- *
- *VIPA:
- * add_vipa4 <ADDR>:<interface>
- * add_vipa6 <ADDR>:<interface>
- * del_vipa4 <ADDR>:<interface>
- * del_vipa6 <ADDR>:<interface>
- *
- * the specified address is set/unset as VIPA on the specified interface.
- * use the src_vipa package to exploit this out of arbitrary applications.
- *
- *Proxy ARP:
- *
- * add_rxip4 <ADDR>:<interface>
- * add_rxip6 <ADDR>:<interface>
- * del_rxip4 <ADDR>:<interface>
- * del_rxip6 <ADDR>:<interface>
- *
- * the specified address is set/unset as "do not fail a gratuitous ARP"
- * on the specified interface. this can be used to act as a proxy ARP.
- */
-
-static void volatile
-qeth_eyecatcher(void)
-{
-	return;
-}
-
-#undef DEBUG
-
-#include <linux/config.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/mm.h>
-
-#include <asm/io.h>
-#include <asm/ebcdic.h>
-#include <linux/ctype.h>
-#include <asm/semaphore.h>
-#include <asm/timex.h>
-#include <linux/if.h>
-#include <linux/if_arp.h>
-#include <linux/ip.h>
-#include <linux/inetdevice.h>
-#include <linux/netdevice.h>
-#include <linux/sched.h>
-#include <linux/workqueue.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/interrupt.h>
-#include <linux/tcp.h>
-#include <linux/icmp.h>
-#include <linux/skbuff.h>
-#ifdef CONFIG_PROC_FS
-#include <linux/proc_fs.h>
-#endif /* CONFIG_PROC_FS */
-#include <net/route.h>
-#include <net/arp.h>
-#include <linux/in.h>
-#include <linux/igmp.h>
-#include <net/ip.h>
-#include <asm/uaccess.h>
-#include <linux/init.h>
-#include <net/ipv6.h>
-#include <linux/in6.h>
-#include <net/if_inet6.h>
-#include <net/addrconf.h>
-#include <linux/if_tr.h>
-#include <linux/trdevice.h>
-#include <linux/etherdevice.h>
-#include <linux/reboot.h>
-
-#include <linux/if_vlan.h>
-
-#include <asm/ccwdev.h>
-#include <asm/ccwgroup.h>
-#include <asm/debug.h>
-
-#include "qeth_mpc.h"
-#include "qeth.h"
-
-/****************** MODULE PARAMETER VARIABLES ********************/
-static int qeth_sparebufs = 0;
-module_param(qeth_sparebufs, int, 0);
-MODULE_PARM_DESC(qeth_sparebufs, "the number of pre-allocated spare buffers "
-		 "reserved for low memory situations");
-
-/****************** MODULE STUFF **********************************/
-#define VERSION_QETH_C "$Revision: 1.177 $"
-static const char *version = "qeth S/390 OSA-Express driver ("
-    VERSION_QETH_C "/" VERSION_QETH_H "/" VERSION_QETH_MPC_H
-    QETH_VERSION_IPV6 QETH_VERSION_VLAN ")";
-
-MODULE_AUTHOR("Utz Bacher <utz.bacher@de.ibm.com>");
-MODULE_DESCRIPTION("Linux on zSeries OSA Express and HiperSockets support\n" \
-		   "Copyright 2000,2003 IBM Corporation\n");
-MODULE_LICENSE("GPL");
-
-/******************** HERE WE GO ***********************************/
-
-#define PROCFILE_SLEEP_SEM_MAX_VALUE 0
-#define PROCFILE_IOCTL_SEM_MAX_VALUE 3
-static struct semaphore qeth_procfile_ioctl_lock;
-static struct semaphore qeth_procfile_ioctl_sem;
-static struct qeth_card *firstcard = NULL;
-
-static struct sparebufs sparebufs[MAX_SPARE_BUFFERS];
-static int sparebuffer_count;
-
-static unsigned int known_devices[][10] = QETH_MODELLIST_ARRAY;
-
-static spinlock_t setup_lock = SPIN_LOCK_UNLOCKED;
-static rwlock_t list_lock = RW_LOCK_UNLOCKED;
-
-static debug_info_t *qeth_dbf_setup = NULL;
-static debug_info_t *qeth_dbf_data = NULL;
-static debug_info_t *qeth_dbf_misc = NULL;
-static debug_info_t *qeth_dbf_control = NULL;
-static debug_info_t *qeth_dbf_trace = NULL;
-static debug_info_t *qeth_dbf_sense = NULL;
-static debug_info_t *qeth_dbf_qerr = NULL;
-
-static int proc_file_registration;
-#ifdef QETH_PERFORMANCE_STATS
-static int proc_perf_file_registration;
-#define NOW qeth_get_micros()
-#endif /* QETH_PERFORMANCE_STATS */
-static int proc_ipato_file_registration;
-
-static int ipato_inv4 = 0, ipato_inv6 = 0;
-static struct ipato_entry *ipato_entries = NULL;
-static spinlock_t ipato_list_lock = SPIN_LOCK_UNLOCKED;
-
-struct tempinfo{
-	char *data;
-	int len;
-};
-
-/* thought I could get along without forward declarations...
- * just lazyness here */
-static int qeth_reinit_thread(void *);
-static inline void qeth_schedule_recovery(struct qeth_card *card);
-
-static inline int
-QETH_IP_VERSION(struct sk_buff *skb)
-{
-	switch (skb->protocol) {
-	case ETH_P_IPV6:
-		return 6;
-	case ETH_P_IP:
-		return 4;
-	default:
-		return 0;
-	}
-}
-
-/* not a macro, as one of the arguments is atomic_read */
-static inline int
-qeth_min(int a, int b)
-{
-	if (a < b)
-		return a;
-	else
-		return b;
-}
-
-static inline unsigned int
-qeth_get_millis(void)
-{
-	return (int) (get_clock() >> 22);   /* time>>12 is microseconds, we
-					       divide it by 1024 */
-}
-
-#ifdef QETH_PERFORMANCE_STATS
-static inline unsigned int
-qeth_get_micros(void)
-{
-	return (int) (get_clock() >> 12);
-}
-#endif /* QETH_PERFORMANCE_STATS */
-
-static void
-qeth_delay_millis(unsigned long msecs)
-{
-	unsigned int start;
-
-	start = qeth_get_millis();
-	while (qeth_get_millis() - start < msecs) ;
-}
-
-static void
-qeth_wait_nonbusy(unsigned int timeout)
-{
-	unsigned int start;
-	char dbf_text[15];
-
-	sprintf(dbf_text, "wtnb%4x", timeout);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-
-	start = qeth_get_millis();
-	for (;;) {
-		set_task_state(current, TASK_INTERRUPTIBLE);
-		if (qeth_get_millis() - start > timeout) {
-			goto out;
-		}
-		schedule_timeout(((start + timeout -
-				   qeth_get_millis()) >> 10) * HZ);
-	}
-out:
-	set_task_state(current, TASK_RUNNING);
-}
-
-static void
-qeth_get_mac_for_ipm(__u32 ipm, char *mac, struct net_device *dev)
-{
-	if (dev->type == ARPHRD_IEEE802_TR)
-		ip_tr_mc_map(ipm, mac);
-	else
-		ip_eth_mc_map(ipm, mac);
-}
-
-#define atomic_swap(a,b) xchg((int*)a.counter,b)
-
-static int inline
-my_spin_lock_nonbusy(struct qeth_card *card, spinlock_t * lock)
-{
-	for (;;) {
-		if (card) {
-			if (atomic_read(&card->shutdown_phase))
-				return -1;
-		}
-		if (spin_trylock(lock))
-			return 0;
-		qeth_wait_nonbusy(QETH_IDLE_WAIT_TIME);
-	}
-}
-
-static int inline
-my_down_trylock_nonbusy(struct qeth_card *card, struct semaphore  *sema)
-{
-	for (;;) {
-		if (card) {
-			if (atomic_read(&card->shutdown_phase))
-				return -1;
-		}
-		if (down_trylock(sema))
-			return 0;
-		qeth_wait_nonbusy(QETH_IDLE_WAIT_TIME);
-	}
-}
-
-
-#ifdef CONFIG_ARCH_S390X
-#define QETH_GET_ADDR(x) ((__u32)(unsigned long)x)
-#else /* CONFIG_ARCH_S390X */
-#define QETH_GET_ADDR(x) ((__u32)x)
-#endif /* CONFIG_ARCH_S390X */
-
-static int
-qeth_does_card_exist(struct qeth_card *card)
-{
-	struct qeth_card *c = firstcard;
-	int rc = 0;
-
-	read_lock(&list_lock);
-	while (c) {
-		if (c == card) {
-			rc = 1;
-			break;
-		}
-		c = c->next;
-	}
-	read_unlock(&list_lock);
-	return rc;
-}
-
-static int
-qeth_getxdigit(char c)
-{
-	if ((c >= '0') && (c <= '9'))
-		return c - '0';
-	if ((c >= 'a') && (c <= 'f'))
-		return c + 10 - 'a';
-	if ((c >= 'A') && (c <= 'F'))
-		return c + 10 - 'A';
-	return -1;
-}
-
-static struct qeth_card *
-qeth_get_card_by_name(char *name)
-{
-	struct qeth_card *card;
-
-	read_lock(&list_lock);
-	card = firstcard;
-	while (card) {
-		if (!strncmp(name, card->dev_name, DEV_NAME_LEN))
-			break;
-		card = card->next;
-	}
-	read_unlock(&list_lock);
-
-	return card;
-}
-
-static void
-qeth_convert_addr_to_text(int version, __u8 * addr, char *text)
-{
-	if (version == 4) {
-		sprintf(text, "%02x%02x%02x%02x",
-			addr[0], addr[1], addr[2], addr[3]);
-	} else {
-		sprintf(text, "%02x%02x%02x%02x%02x%02x%02x%02x"
-			"%02x%02x%02x%02x%02x%02x%02x%02x",
-			addr[0], addr[1], addr[2], addr[3],
-			addr[4], addr[5], addr[6], addr[7],
-			addr[8], addr[9], addr[10], addr[11],
-			addr[12], addr[13], addr[14], addr[15]);
-	}
-}
-
-static int
-qeth_convert_text_to_addr(int version, char *text, __u8 * addr)
-{
-	int olen = (version == 4) ? 4 : 16;
-
-	while (olen--) {
-		if ((!isxdigit(*text)) || (!isxdigit(*(text + 1))))
-			return -EINVAL;
-		*addr =
-		    (qeth_getxdigit(*text) << 4) + qeth_getxdigit(*(text + 1));
-		addr++;
-		text += 2;
-	}
-	return 0;
-}
-
-static void
-qeth_add_ipato_entry(int version, __u8 * addr, int mask_bits, char *dev_name)
-{
-	struct ipato_entry *entry, *e;
-	int len = (version == 4) ? 4 : 16;
-
-	entry =
-	    (struct ipato_entry *) kmalloc(sizeof (struct ipato_entry),
-					   GFP_KERNEL);
-	if (!entry) {
-		PRINT_ERR("not enough memory for ipato allocation\n");
-		return;
-	}
-	entry->version = version;
-	memcpy(entry->addr, addr, len);
-	if (dev_name) {
-		strncpy(entry->dev_name, dev_name, DEV_NAME_LEN);
-		if (qeth_get_card_by_name(dev_name)->options.ena_ipat !=
-		    ENABLE_TAKEOVER)
-			PRINT_WARN("IP takeover is not enabled on %s! "
-				   "Ignoring line\n", dev_name);
-	} else
-		memset(entry->dev_name, 0, DEV_NAME_LEN);
-	entry->mask_bits = mask_bits;
-	entry->next = NULL;
-
-	spin_lock(&ipato_list_lock);
-	if (ipato_entries) {
-		e = ipato_entries;
-		while (e) {
-			if ((e->version == version) &&
-			    (e->mask_bits == mask_bits) &&
-			    (((dev_name) && !strncmp(e->dev_name, dev_name,
-						     DEV_NAME_LEN)) ||
-			     (!dev_name)) && (!memcmp(e->addr, addr, len))) {
-				PRINT_INFO("ipato to be added does already "
-					   "exist\n");
-				kfree(entry);
-				goto out;
-			}
-			if (e->next)
-				e = e->next;
-			else
-				break;
-		}
-		e->next = entry;
-	} else
-		ipato_entries = entry;
-      out:
-	spin_unlock(&ipato_list_lock);
-}
-
-static void
-qeth_del_ipato_entry(int version, __u8 * addr, int mask_bits, char *dev_name)
-{
-	struct ipato_entry *e, *e_before;
-	int len = (version == 4) ? 4 : 16;
-	int found = 0;
-
-	spin_lock(&ipato_list_lock);
-	e = ipato_entries;
-	if ((e->version == version) &&
-	    (e->mask_bits == mask_bits) && (!memcmp(e->addr, addr, len))) {
-		ipato_entries = e->next;
-		kfree(e);
-	} else
-		while (e) {
-			e_before = e;
-			e = e->next;
-			if (!e)
-				break;
-			if ((e->version == version) &&
-			    (e->mask_bits == mask_bits) &&
-			    (((dev_name) && !strncmp(e->dev_name, dev_name,
-						     DEV_NAME_LEN)) ||
-			     (!dev_name)) && (!memcmp(e->addr, addr, len))) {
-				e_before->next = e->next;
-				kfree(e);
-				found = 1;
-				break;
-			}
-		}
-	if (!found)
-		PRINT_INFO("ipato to be deleted does not exist\n");
-	spin_unlock(&ipato_list_lock);
-}
-
-static void
-qeth_convert_addr_to_bits(__u8 * addr, char *bits, int len)
-{
-	int i, j;
-	__u8 octet;
-
-	for (i = 0; i < len; i++) {
-		octet = addr[i];
-		for (j = 7; j >= 0; j--) {
-			bits[i * 8 + j] = (octet & 1) ? 1 : 0;
-			octet >>= 1;
-		}
-	}
-}
-
-static int
-qeth_is_ipa_covered_by_ipato_entries(int version, __u8 * addr,
-				     struct qeth_card *card)
-{
-	char *memarea, *addr_bits, *entry_bits;
-	int len = (version == 4) ? 4 : 16;
-	int invert = (version == 4) ? ipato_inv4 : ipato_inv6;
-	int result = 0;
-	struct ipato_entry *e;
-
-	if (card->options.ena_ipat != ENABLE_TAKEOVER) {
-		return 0;
-	}
-
-	memarea = kmalloc(256, GFP_KERNEL);
-	if (!memarea) {
-		PRINT_ERR("not enough memory to check out whether to "
-			  "use ipato\n");
-		return 0;
-	}
-	addr_bits = memarea;
-	entry_bits = memarea + 128;
-	qeth_convert_addr_to_bits(addr, addr_bits, len);
-	e = ipato_entries;
-	while (e) {
-		qeth_convert_addr_to_bits(e->addr, entry_bits, len);
-		if ((!memcmp(addr_bits, entry_bits,
-			     __min(len * 8, e->mask_bits))) &&
-		    ((e->dev_name[0] &&
-		      (!strncmp(e->dev_name, card->dev_name, DEV_NAME_LEN))) ||
-		     (!e->dev_name[0]))) {
-			result = 1;
-			break;
-		}
-		e = e->next;
-	}
-
-	kfree(memarea);
-	if (invert)
-		return !result;
-	else
-		return result;
-}
-
-static void
-qeth_set_dev_flag_running(struct qeth_card *card)
-{
-	if (card) {
-		card->dev->flags |= IFF_RUNNING;
-	}
-}
-
-static void
-qeth_set_dev_flag_norunning(struct qeth_card *card)
-{
-	if (card) {
-		card->dev->flags &= ~IFF_RUNNING;
-	}
-}
-
-static void
-qeth_restore_dev_flag_state(struct qeth_card *card)
-{
-	if (card) {
-		if (card->saved_dev_flags & IFF_RUNNING)
-			card->dev->flags |= IFF_RUNNING;
-		else
-			card->dev->flags &= ~IFF_RUNNING;
-	}
-}
-
-static void
-qeth_save_dev_flag_state(struct qeth_card *card)
-{
-	if (card) {
-		card->saved_dev_flags = card->dev->flags & IFF_RUNNING;
-	}
-}
-
-static int
-qeth_open(struct net_device *dev)
-{
-	struct qeth_card *card;
-
-	card = (struct qeth_card *) dev->priv;
-	QETH_DBF_CARD2(0, trace, "open", card);
-	QETH_DBF_CARD2(0, setup, "open", card);
-
-	qeth_save_dev_flag_state(card);
-
-	netif_start_queue(dev);
-	atomic_set(&((struct qeth_card *) dev->priv)->is_open, 1);
-
-	return 0;
-}
-
-static int
-qeth_set_config(struct net_device *dev, struct ifmap *map)
-{
-	struct qeth_card *card;
-
-	card = (struct qeth_card *)dev->priv;
-	QETH_DBF_CARD3(0, trace, "nscf", card);
-
-	return -EOPNOTSUPP;
-}
-
-static int
-qeth_is_multicast_skb_at_all(struct sk_buff *skb, int version)
-{
-	int i;
-	struct qeth_card *card;
-
-	i = RTN_UNSPEC;
-	card = (struct qeth_card *)skb->dev->priv;
-	if (skb->dst && skb->dst->neighbour) {
-		i = skb->dst->neighbour->type;
-		return ((i == RTN_BROADCAST) ||
-			(i == RTN_MULTICAST) || (i == RTN_ANYCAST)) ? i : 0;
-	}
-	/* ok, we've to try it somehow else */
-	if (version == 4) {
-		return ((skb->nh.raw[16] & 0xf0) == 0xe0) ? RTN_MULTICAST : 0;
-	} else if (version == 6) {
-		return (skb->nh.raw[24] == 0xff) ? RTN_MULTICAST : 0;
-	}
-	if (!memcmp(skb->nh.raw, skb->dev->broadcast, 6)) {
-		i = RTN_BROADCAST;
-	} else {
-		__u16 hdr_mac;
-
-	        hdr_mac = *((__u16*)skb->nh.raw);
-	        /* tr multicast? */
-	        switch (card->link_type) {
-	        case QETH_MPC_LINK_TYPE_HSTR:
-	        case QETH_MPC_LINK_TYPE_LANE_TR:
-	        	if ((hdr_mac == QETH_TR_MAC_NC) ||
-			    (hdr_mac == QETH_TR_MAC_C))
-				i = RTN_MULTICAST;
-			break;
-	        /* eth or so multicast? */
-                default:
-                      	if ((hdr_mac == QETH_ETH_MAC_V4) ||
-			    (hdr_mac == QETH_ETH_MAC_V6))
-			        i = RTN_MULTICAST;
-	        }
-        }
-	return ((i == RTN_BROADCAST)||
-	        (i == RTN_MULTICAST)||
-	        (i == RTN_ANYCAST)) ? i : 0;
-}
-
-static int
-qeth_get_prioqueue(struct qeth_card *card, struct sk_buff *skb,
-		   int multicast, int version)
-{
-	if (!version && (card->type == QETH_CARD_TYPE_OSAE))
-		return QETH_DEFAULT_QUEUE;
-	switch (card->no_queues) {
-	case 1:
-		return 0;
-	case 4:
-		if (card->is_multicast_different) {
-			if (multicast) {
-				return card->is_multicast_different &
-				    (card->no_queues - 1);
-			} else {
-				return 0;
-			}
-		}
-		if (card->options.do_prio_queueing) {
-			if (version == 4) {
-				if (card->options.do_prio_queueing ==
-				    PRIO_QUEUEING_TOS) {
-					if (skb->nh.iph->tos &
-					    IP_TOS_NOTIMPORTANT) {
-						return 3;
-					}
-					if (skb->nh.iph->tos & IP_TOS_LOWDELAY) {
-						return 0;
-					}
-					if (skb->nh.iph->tos &
-					    IP_TOS_HIGHTHROUGHPUT) {
-						return 1;
-					}
-					if (skb->nh.iph->tos &
-					    IP_TOS_HIGHRELIABILITY) {
-						return 2;
-					}
-					return QETH_DEFAULT_QUEUE;
-				}
-				if (card->options.do_prio_queueing ==
-				    PRIO_QUEUEING_PREC) {
-					return 3 - (skb->nh.iph->tos >> 6);
-				}
-			} else if (version == 6) {
-				/********************
-				 ********************
-				 *TODO: IPv6!!!
-				 ********************/
-			}
-			return card->options.default_queue;
-		} else
-			return card->options.default_queue;
-	default:
-		return 0;
-	}
-}
-
-static void
-qeth_wakeup(struct qeth_card *card)
-{
-	QETH_DBF_CARD5(0, trace, "wkup", card);
-
-	atomic_set(&card->data_has_arrived, 1);
-	wake_up(&card->wait_q);
-}
-
-static int
-qeth_check_idx_response(unsigned char *buffer)
-{
-	if (!buffer)
-		return 0;
-	if ((buffer[2] & 0xc0) == 0xc0) {
-		return -EIO;
-	}
-	return 0;
-}
-
-static int
-qeth_get_cards_problem(struct ccw_device *cdev, unsigned char *buffer,
-		       int dstat, int cstat, int rqparam,
-		       char *irb, char *sense)
-{
-	char dbf_text[15];
-	int problem = 0;
-	struct qeth_card *card;
-
-	card = CARD_FROM_CDEV(cdev);
-
-	if (atomic_read(&card->shutdown_phase))
-		return 0;
-	if (dstat & DEV_STAT_UNIT_CHECK) {
-		if (sense[SENSE_RESETTING_EVENT_BYTE] &
-		    SENSE_RESETTING_EVENT_FLAG) {
-			QETH_DBF_CARD1(0, trace, "REVN", card);
-			problem = PROBLEM_RESETTING_EVENT_INDICATOR;
-			goto out;
-		}
-		if (sense[SENSE_COMMAND_REJECT_BYTE] &
-		    SENSE_COMMAND_REJECT_FLAG) {
-			QETH_DBF_CARD1(0, trace, "CREJ", card);
-			problem = PROBLEM_COMMAND_REJECT;
-			goto out;
-		}
-		if ((sense[2] == 0xaf) && (sense[3] == 0xfe)) {
-			QETH_DBF_CARD1(0, trace, "AFFE", card);
-			problem = PROBLEM_AFFE;
-			goto out;
-		}
-		if ((!sense[0]) && (!sense[1]) && (!sense[2]) && (!sense[3])) {
-			QETH_DBF_CARD1(0, trace, "ZSNS", card);
-			problem = PROBLEM_ZERO_SENSE_DATA;
-			goto out;
-		}
-		QETH_DBF_CARD1(0, trace, "GCHK", card);
-		problem = PROBLEM_GENERAL_CHECK;
-		goto out;
-	}
-	if (cstat & (SCHN_STAT_CHN_CTRL_CHK | SCHN_STAT_INTF_CTRL_CHK |
-		     SCHN_STAT_CHN_DATA_CHK | SCHN_STAT_CHAIN_CHECK |
-		     SCHN_STAT_PROT_CHECK | SCHN_STAT_PROG_CHECK)) {
-		QETH_DBF_TEXT1(0, trace, "GCHK");
-		QETH_DBF_TEXT1(0, trace, cdev->dev.bus_id);
-		QETH_DBF_HEX1(0, misc, irb, __max(QETH_DBF_MISC_LEN, 64));
-		PRINT_WARN("check on device %s, dstat=x%x, cstat=x%x, "
-			   "rqparam=x%x\n",
-			   cdev->dev.bus_id, dstat, cstat, rqparam);
-		HEXDUMP16(WARN, "irb: ", irb);
-		HEXDUMP16(WARN, "irb: ", ((char *) irb) + 32);
-		problem = PROBLEM_GENERAL_CHECK;
-		goto out;
-	}
-	if (qeth_check_idx_response(buffer)) {
-		PRINT_WARN("received an IDX TERMINATE on device %s "
-			   "with cause code 0x%02x%s\n",
-			   CARD_BUS_ID(card), buffer[4],
-			   (buffer[4] ==
-			    0x22) ? " -- try another portname" : "");
-		QETH_DBF_CARD1(0, trace, "RTRM", card);
-		problem = PROBLEM_RECEIVED_IDX_TERMINATE;
-		goto out;
-	}
-	if (IS_IPA(buffer) && !IS_IPA_REPLY(buffer)) {
-		if (*(PDU_ENCAPSULATION(buffer)) == IPA_CMD_STOPLAN) {
-			atomic_set(&card->is_startlaned, 0);
-			/* we don't do a  netif_stop_queue(card->dev);
-			   we better discard all packets --
-			   the outage could take longer */
-			PRINT_WARN("Link failure on %s (CHPID 0x%X) -- "
-				   "there is a network problem or someone "
-				   "pulled the cable or disabled the port."
-				   "Discarding outgoing packets.\n",
-				   card->dev_name, card->chpid);
-			QETH_DBF_CARD1(0, trace, "CBOT", card);
-			qeth_set_dev_flag_norunning(card);
-			problem = 0;
-			goto out;
-		}
-		if (*(PDU_ENCAPSULATION(buffer)) == IPA_CMD_STARTLAN) {
-			if (!atomic_read(&card->is_startlaned)) {
-				atomic_set(&card->is_startlaned, 1);
-				problem = PROBLEM_CARD_HAS_STARTLANED;
-			}
-			goto out;
-		}
-		if (*(PDU_ENCAPSULATION(buffer)) == IPA_CMD_REGISTER_LOCAL_ADDR)
-			QETH_DBF_CARD3(0, trace, "irla", card);
-		if (*(PDU_ENCAPSULATION(buffer)) == 
-		    IPA_CMD_UNREGISTER_LOCAL_ADDR)
-			QETH_DBF_CARD3(0, trace, "irla", card);
-		PRINT_WARN("probably a problem on %s: received data is IPA, "
-			   "but not a reply: command=0x%x\n", card->dev_name,
-			   *(PDU_ENCAPSULATION(buffer) + 1));
-		QETH_DBF_CARD1(0, trace, "INRP", card);
-		goto out;
-	}
-	/* no probs */
-out:
-	if (problem) {
-		QETH_DBF_CARD3(0, trace, "gcpr", card);
-		sprintf(dbf_text, "%2x%2x%4x", dstat, cstat, problem);
-		QETH_DBF_TEXT3(0, trace, dbf_text);
-		sprintf(dbf_text, "%8x", rqparam);
-		QETH_DBF_TEXT3(0, trace, dbf_text);
-		if (buffer)
-			QETH_DBF_HEX3(0, trace, &buffer, sizeof (void *));
-		QETH_DBF_HEX3(0, trace, &irb, sizeof (void *));
-		QETH_DBF_HEX3(0, trace, &sense, sizeof (void *));
-	}
-	atomic_set(&card->problem, problem);
-	return problem;
-}
-
-static void
-qeth_issue_next_read(struct qeth_card *card)
-{
-	int result, result2;
-	char dbf_text[15];
-
-	QETH_DBF_CARD5(0, trace, "isnr", card);
-
-	/* set up next read ccw */
-	memcpy(&card->dma_stuff->read_ccw, READ_CCW, sizeof (struct ccw1));
-	card->dma_stuff->read_ccw.count = QETH_BUFSIZE;
-	/* recbuf is not yet used by read channel program */
-	card->dma_stuff->read_ccw.cda = QETH_GET_ADDR(card->dma_stuff->recbuf);
-
-	/* 
-	 * we don't spin_lock_irqsave(get_ccwdev_lock(CARD_RDEV(card)),flags), as
-	 * we are only called in the interrupt handler
-	 */
-	result = ccw_device_start(CARD_RDEV(card), &card->dma_stuff->read_ccw,
-				  MPC_SETUP_STATE, 0, 0);
-	if (result) {
-		qeth_delay_millis(QETH_WAIT_BEFORE_2ND_DOIO);
-		result2 =
-		    ccw_device_start(CARD_RDEV(card), &card->dma_stuff->read_ccw,
-				     MPC_SETUP_STATE, 0, 0);
-		PRINT_WARN("read handler on device %s, read: ccw_device_start "
-			   "returned %i, next try returns %i\n",
-			   CARD_BUS_ID(card), result, result2);
-		QETH_DBF_CARD1(0, trace, "IsNR", card);
-		sprintf(dbf_text, "%04x%04x", (__s16) result, (__s16) result2);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-	}
-}
-
-static int
-qeth_is_to_recover(struct qeth_card *card, int problem)
-{
-	switch (problem) {
-	case PROBLEM_CARD_HAS_STARTLANED:
-		return 1;
-	case PROBLEM_RECEIVED_IDX_TERMINATE:
-		if (atomic_read(&card->in_recovery)) {
-			return 1;
-		} else {
-			qeth_set_dev_flag_norunning(card);
-			return 0;
-		}
-	case PROBLEM_ACTIVATE_CHECK_CONDITION:
-		return 1;
-	case PROBLEM_RESETTING_EVENT_INDICATOR:
-		return 1;
-	case PROBLEM_COMMAND_REJECT:
-		return 0;
-	case PROBLEM_ZERO_SENSE_DATA:
-		return 0;
-	case PROBLEM_GENERAL_CHECK:
-		return 1;
-	case PROBLEM_BAD_SIGA_RESULT:
-		return 1;
-	case PROBLEM_USER_TRIGGERED_RECOVERY:
-		return 1;
-	case PROBLEM_AFFE:
-		return 1;
-	case PROBLEM_MACHINE_CHECK:
-		return 1;
-	case PROBLEM_TX_TIMEOUT:
-		return 1;
-	}
-	return 0;
-}
-
-static int
-qeth_get_spare_buf(void)
-{
-	int i = 0;
-	char dbf_text[15];
-
-	while (i < sparebuffer_count) {
-		if (!atomic_compare_and_swap(SPAREBUF_FREE, SPAREBUF_USED,
-					     &sparebufs[i].status)) {
-			sprintf(dbf_text, "gtspb%3x", i);
-			QETH_DBF_TEXT4(0, trace, dbf_text);
-			return i;
-		}
-		i++;
-	}
-	QETH_DBF_TEXT3(0, trace, "nospbuf");
-
-	return -1;
-}
-
-static void
-qeth_put_spare_buf(int no)
-{
-	char dbf_text[15];
-
-	sprintf(dbf_text, "ptspb%3x", no);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-	atomic_set(&sparebufs[no].status, SPAREBUF_FREE);
-}
-
-static inline void
-qeth_put_buffer_pool_entry(struct qeth_card *card, int entry_no)
-{
-	if (entry_no & SPAREBUF_MASK)
-		qeth_put_spare_buf(entry_no & (~SPAREBUF_MASK));
-	else
-		card->inbound_buffer_pool_entry_used[entry_no] = BUFFER_UNUSED;
-}
-
-static inline int
-qeth_get_empty_buffer_pool_entry(struct qeth_card *card)
-{
-	int i;
-	int max_buffers = card->options.inbound_buffer_count;
-
-	for (i = 0; i < max_buffers; i++) {
-		if (xchg((int *) &card->inbound_buffer_pool_entry_used[i],
-			 BUFFER_USED) == BUFFER_UNUSED)
-			return i;
-	}
-	return -1;
-}
-
-static inline void
-qeth_clear_input_buffer(struct qeth_card *card, int bufno)
-{
-	struct qdio_buffer *buffer;
-	int i;
-	int elements, el_m_1;
-	char dbf_text[15];
-
-	QETH_DBF_CARD6(0, trace, "clib", card);
-	sprintf(dbf_text, "bufno%3x", bufno);
-	QETH_DBF_TEXT6(0, trace, dbf_text);
-
-	buffer = &card->inbound_qdio_buffers[bufno];
-	elements = BUFFER_MAX_ELEMENTS;
-	el_m_1 = elements - 1;
-
-	for (i = 0; i < elements; i++) {
-		if (i == el_m_1)
-			buffer->element[i].flags = SBAL_FLAGS_LAST_ENTRY;
-		else
-			buffer->element[i].flags = 0;
-
-		buffer->element[i].length = PAGE_SIZE;
-		buffer->element[i].addr = INBOUND_BUFFER_POS(card, bufno, i);
-	}
-}
-
-static void
-qeth_queue_input_buffer(struct qeth_card *card, int bufno,
-			unsigned int under_int)
-{
-	int count = 0, start = 0, stop = 0, pos;
-	int result;
-	int cnt1, cnt2 = 0;
-	int wrapped = 0;
-	int i;
-	int requeue_counter;
-	char dbf_text[15];
-	int no;
-
-	QETH_DBF_CARD5(0, trace, "qibf", card);
-	sprintf(dbf_text, "%4x%4x", under_int, bufno);
-	QETH_DBF_TEXT5(0, trace, dbf_text);
-	atomic_inc(&card->requeue_counter);
-	if (atomic_read(&card->requeue_counter) <= QETH_REQUEUE_THRESHOLD)
-		return;
-
-	if (!spin_trylock(&card->requeue_input_lock)) {
-		QETH_DBF_CARD5(0, trace, "qibl", card);
-		return;
-	}
-	requeue_counter = atomic_read(&card->requeue_counter);
-	pos = atomic_read(&card->requeue_position);
-	
-	start = pos;
-	/* 
-	 * omit the situation with 128 simultaneously
-	 * enqueued buffers, as then we can't benefit from PCI
-	 * avoidance anymore -- therefore we let count not grow as
-	 * big as requeue_counter
-	 */
-	while ((!atomic_read(&card->inbound_buffer_refcnt[pos])) &&
-	       (count < requeue_counter - 1)) {
-		no = qeth_get_empty_buffer_pool_entry(card);
-		if (no == -1) {
-			if (count)
-				break;
-			no = qeth_get_spare_buf();
-			if (no == -1) {
-				PRINT_ERR("%s: no more input buffers "
-					  "available! Inbound traffic could "
-					  "be lost! Try increasing the bufcnt "
-					  "parameter\n",
-					  card->dev_name);
-				QETH_DBF_CARD2(1, trace, "QINB", card);
-				goto out;
-			}
-			card->inbound_buffer_entry_no[pos] =
-				no | SPAREBUF_MASK;
-		}
-		card->inbound_buffer_entry_no[pos] = no;
-		atomic_set(&card->inbound_buffer_refcnt[pos], 1);
-		count++;
-		if (pos >= QDIO_MAX_BUFFERS_PER_Q - 1) {
-			pos = 0;
-			wrapped = 1;
-		} else
-			pos++;
-	}
-	/* stop points to the position after the last element */
-	stop = pos;
-
-	QETH_DBF_CARD3(0, trace, "qibi", card);
-	sprintf(dbf_text, "%4x", requeue_counter);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-	sprintf(dbf_text, "%4x%4x", start, stop);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-	
-	if (wrapped) {
-		cnt1 = QDIO_MAX_BUFFERS_PER_Q - start;
-		cnt2 = stop;
-	} else {
-		cnt1 = count;
-		/* cnt2 is already set to 0 */
-	}
-	
-	atomic_sub(count, &card->requeue_counter);
-	/* 
-	 * this is the only place where card->requeue_position is
-	 * written to, so that's ok (as it is in a lock)
-	 */
-	atomic_set(&card->requeue_position,
-		   (atomic_read(&card->requeue_position) + count)
-		   & (QDIO_MAX_BUFFERS_PER_Q - 1));
-	
-	if (cnt1) {
-		for (i = start; i < start + cnt1; i++) {
-			qeth_clear_input_buffer(card, i);
-		}
-		result = do_QDIO(CARD_DDEV(card),
-				 QDIO_FLAG_SYNC_INPUT | under_int,
-				 0, start, cnt1, NULL);
-		if (result) {
-			PRINT_WARN("qeth_queue_input_buffer's "
-				   "do_QDIO returnd %i (device %s)\n",
-				   result, CARD_DDEV_ID(card));
-			QETH_DBF_CARD1(0, trace, "QIDQ", card);
-			sprintf(dbf_text, "%4x%4x", result, requeue_counter);
-			QETH_DBF_TEXT1(0, trace, dbf_text);
-			sprintf(dbf_text, "%4x%4x", start, cnt1);
-			QETH_DBF_TEXT1(1, trace, dbf_text);
-		}
-	}
-	if (cnt2) {
-		for (i = 0; i < cnt2; i++) {
-			qeth_clear_input_buffer(card, i);
-		}
-		result = do_QDIO(CARD_DDEV(card),
-				 QDIO_FLAG_SYNC_INPUT | under_int, 0,
-				 0, cnt2, NULL);
-		if (result) {
-			PRINT_WARN("qeth_queue_input_buffer's "
-				   "do_QDIO returnd %i (device %s)\n",
-				   result, CARD_DDEV_ID(card));
-			QETH_DBF_CARD1(0, trace, "QIDQ", card);
-			sprintf(dbf_text, "%4x%4x", result, requeue_counter);
-			QETH_DBF_TEXT1(0, trace, dbf_text);
-			sprintf(dbf_text, "%4x%4x", 0, cnt2);
-			QETH_DBF_TEXT1(1, trace, dbf_text);
-		}
-	}
-out:
-	spin_unlock(&card->requeue_input_lock);
-
-}
-
-static inline struct sk_buff *
-qeth_get_skb(unsigned int len)
-{
-	struct sk_buff *skb;
-
-#ifdef QETH_VLAN
-	skb = dev_alloc_skb(len + VLAN_HLEN);
-	if (skb)
-		skb_reserve(skb, VLAN_HLEN);
-#else /* QETH_VLAN */
-	skb = dev_alloc_skb(len);
-#endif /* QETH_VLAN */
-	return skb;
-}
-
-static inline struct sk_buff *
-qeth_get_next_skb(struct qeth_card *card,
-		  int *element_ptr, int *pos_in_el_ptr,
-		  void **hdr_ptr, struct qdio_buffer *buffer)
-{
-	int length;
-	char *data_ptr;
-	int step, len_togo, element, pos_in_el;
-	int curr_len;
-	int max_elements;
-	struct sk_buff *skb;
-	char dbf_text[15];
-
-	max_elements = BUFFER_MAX_ELEMENTS;
-
-#define SBALE_LEN(x) ((x>=max_elements)?0:(buffer->element[x].length))
-#define SBALE_ADDR(x) (buffer->element[x].addr)
-
-	element = *element_ptr;
-
-	if (element >= max_elements) {
-		PRINT_WARN("device %s: error in interpreting buffer (data "
-			   "too long), %i elements.\n",
-			   CARD_BUS_ID(card), element);
-		QETH_DBF_CARD0(0, trace, "IEDL", card);
-		sprintf(dbf_text, "%4x%4x", *element_ptr, *pos_in_el_ptr);
-		QETH_DBF_TEXT0(1, trace, dbf_text);
-		QETH_DBF_HEX0(0, misc, buffer, QETH_DBF_MISC_LEN);
-		QETH_DBF_HEX0(0, misc, buffer + QETH_DBF_MISC_LEN,
-			      QETH_DBF_MISC_LEN);
-		return NULL;
-	}
-
-	pos_in_el = *pos_in_el_ptr;
-
-	curr_len = SBALE_LEN(element);
-	if (curr_len > PAGE_SIZE) {
-		PRINT_WARN("device %s: bad element length in element %i: "
-			   "0x%x\n", CARD_BUS_ID(card), element, curr_len);
-		QETH_DBF_CARD0(0, trace, "BELN", card);
-		sprintf(dbf_text, "%4x", curr_len);
-		QETH_DBF_TEXT0(0, trace, dbf_text);
-		sprintf(dbf_text, "%4x%4x", *element_ptr, *pos_in_el_ptr);
-		QETH_DBF_TEXT0(1, trace, dbf_text);
-		QETH_DBF_HEX0(0, misc, buffer, QETH_DBF_MISC_LEN);
-		QETH_DBF_HEX0(0, misc, buffer + QETH_DBF_MISC_LEN,
-			      QETH_DBF_MISC_LEN);
-		return NULL;
-	}
-	/* header fits in current element? */
-	if (curr_len < pos_in_el + QETH_HEADER_SIZE) {
-		if (!pos_in_el) {
-			QETH_DBF_CARD6(0, trace, "gnmh", card);
-			return NULL;	/* no more data in buffer */
-		}
-		/* set hdr to next element */
-		element++;
-		pos_in_el = 0;
-		curr_len = SBALE_LEN(element);
-		/* does it fit in there? */
-		if (curr_len < QETH_HEADER_SIZE) {
-			QETH_DBF_CARD6(0, trace, "gdnf", card);
-			return NULL;
-		}
-	}
-
-	*hdr_ptr = SBALE_ADDR(element) + pos_in_el;
-
-	length = *(__u16 *) ((char *) (*hdr_ptr) + QETH_HEADER_LEN_POS);
-
-	QETH_DBF_CARD6(0, trace, "gdHd", card);
-	QETH_DBF_HEX6(0, trace, hdr_ptr, sizeof (void *));
-
-	pos_in_el += QETH_HEADER_SIZE;
-	if (curr_len <= pos_in_el) {
-		/* switch to next element for data */
-		pos_in_el = 0;
-		element++;
-		curr_len = SBALE_LEN(element);
-		if (!curr_len) {
-			PRINT_WARN("device %s: inb. buffer with more headers "
-				   "than data areas (%i elements).\n",
-				   CARD_BUS_ID(card), element);
-			QETH_DBF_CARD0(0, trace, "IEMH", card);
-			sprintf(dbf_text, "%2x%2x%4x", element, *element_ptr,
-				*pos_in_el_ptr);
-			QETH_DBF_TEXT0(1, trace, dbf_text);
-			QETH_DBF_HEX0(0, misc, buffer, QETH_DBF_MISC_LEN);
-			QETH_DBF_HEX0(0, misc, buffer + QETH_DBF_MISC_LEN,
-				      QETH_DBF_MISC_LEN);
-			return NULL;
-		}
-	}
-
-	data_ptr = SBALE_ADDR(element) + pos_in_el;
-
-	if (card->options.fake_ll == FAKE_LL) {
-		skb = qeth_get_skb(length + QETH_FAKE_LL_LEN);
-		if (!skb)
-			goto nomem;
-		skb_pull(skb, QETH_FAKE_LL_LEN);
-	} else {
-		skb = qeth_get_skb(length);
-		if (!skb)
-			goto nomem;
-	}
-
-	QETH_DBF_HEX6(0, trace, &data_ptr, sizeof (void *));
-	QETH_DBF_HEX6(0, trace, &skb, sizeof (void *));
-
-	len_togo = length;
-	while (1) {
-		step = qeth_min(len_togo, curr_len - pos_in_el);
-		if (!step) {
-			PRINT_WARN("device %s: unexpected end of buffer, "
-				   "length of element %i is 0. Discarding "
-				   "packet.\n",
-				   CARD_BUS_ID(card), element);
-			QETH_DBF_CARD0(0, trace, "IEUE", card);
-			sprintf(dbf_text, "%2x%2x%4x", element, *element_ptr,
-				*pos_in_el_ptr);
-			QETH_DBF_TEXT0(0, trace, dbf_text);
-			sprintf(dbf_text, "%4x%4x", len_togo, step);
-			QETH_DBF_TEXT0(0, trace, dbf_text);
-			sprintf(dbf_text, "%4x%4x", curr_len, pos_in_el);
-			QETH_DBF_TEXT0(1, trace, dbf_text);
-			QETH_DBF_HEX0(0, misc, buffer, QETH_DBF_MISC_LEN);
-			QETH_DBF_HEX0(0, misc, buffer + QETH_DBF_MISC_LEN,
-				      QETH_DBF_MISC_LEN);
-			dev_kfree_skb_irq(skb);
-			return NULL;
-		}
-		memcpy(skb_put(skb, step), data_ptr, step);
-		len_togo -= step;
-		if (len_togo) {
-			pos_in_el = 0;
-			element++;
-			curr_len = SBALE_LEN(element);
-			data_ptr = SBALE_ADDR(element);
-		} else {
-#ifdef QETH_INBOUND_PACKING_1_PACKET_PER_SBALE
-			element++;
-			/* we don't need to calculate curr_len */
-			pos_in_el = 0;
-#else /* QETH_INBOUND_PACKING_1_PACKET_PER_SBALE */
-			pos_in_el += step;
-#endif /* QETH_INBOUND_PACKING_1_PACKET_PER_SBALE */
-			break;
-		}
-	}
-
-	sprintf(dbf_text, "%4x%4x", element, pos_in_el);
-	QETH_DBF_TEXT6(0, trace, dbf_text);
-
-	*element_ptr = element;
-	*pos_in_el_ptr = pos_in_el;
-
-	return skb;
-
-nomem:
-	if (net_ratelimit()) {
-		PRINT_WARN("no memory for packet from %s\n", card->dev_name);
-	}
-	QETH_DBF_CARD0(0, trace, "NOMM", card);
-	return NULL;
-}
-
-static inline void
-__qeth_rebuild_skb_fake_ll(struct qeth_card *card, struct sk_buff *skb,
-			   void *hdr_ptr)
-{
-	skb->mac.raw = skb->data - QETH_FAKE_LL_LEN;
-	switch (skb->pkt_type) {
-	case PACKET_MULTICAST:
-		switch (skb->protocol) {
-#ifdef QETH_IPV6
-		case __constant_htons(ETH_P_IPV6):
-			ndisc_mc_map((struct in6_addr *)
-				     skb->data + QETH_FAKE_LL_V6_ADDR_POS,
-				     skb->mac.raw + QETH_FAKE_LL_DEST_MAC_POS,
-				     card->dev, 0);
-				break;
-#endif /* QETH_IPV6 */
-		case __constant_htons(ETH_P_IP):
-			qeth_get_mac_for_ipm(*(__u32*)
-					     skb->data + QETH_FAKE_LL_V4_ADDR_POS,
-					     skb->mac.raw + QETH_FAKE_LL_DEST_MAC_POS,
-					     card->dev);
-			break;
-		default:
-			memcpy(skb->mac.raw + QETH_FAKE_LL_DEST_MAC_POS,
-			       card->dev->dev_addr, QETH_FAKE_LL_ADDR_LEN);
-		}
-		break;
-	case PACKET_BROADCAST:
-		memset(skb->mac.raw + QETH_FAKE_LL_DEST_MAC_POS,
-		       0xff, QETH_FAKE_LL_ADDR_LEN);
-		break;
-	default:
-		memcpy(skb->mac.raw + QETH_FAKE_LL_DEST_MAC_POS,
-		       card->dev->dev_addr, QETH_FAKE_LL_ADDR_LEN);
-	}
-
-	if (*(__u8 *) (hdr_ptr + 11) & QETH_EXT_HEADER_SRC_MAC_ADDRESS) {
-		memcpy(skb->mac.raw + QETH_FAKE_LL_SRC_MAC_POS,
-		       hdr_ptr + QETH_FAKE_LL_SRC_MAC_POS_IN_QDIO_HDR,
-		       QETH_FAKE_LL_ADDR_LEN);
-	} else {
-		/* clear source MAC for security reasons */
-		memset(skb->mac.raw + QETH_FAKE_LL_SRC_MAC_POS,
-		       0, QETH_FAKE_LL_ADDR_LEN);
-	}
-	memcpy(skb->mac.raw + QETH_FAKE_LL_PROT_POS,
-	       &skb->protocol, QETH_FAKE_LL_PROT_LEN);
-
-}
-
-static inline void
-__qeth_rebuild_skb_vlan(struct qeth_card *card, struct sk_buff *skb,
-			void *hdr_ptr)
-{
-#ifdef QETH_VLAN
-	__u16 *vlan_tag;
-
-	if (*(__u8 *) (hdr_ptr + 11) & QETH_EXT_HEADER_VLAN_FRAME) {
-
-		vlan_tag = (__u16 *) skb_push(skb, VLAN_HLEN);
-		/*
-		  if (*(__u8*)(hdr_ptr+11) & 
-		  QETH_EXT_HEADER_INCLUDE_VLAN_TAG) {
-		  *vlan_tag = *(__u16*)(hdr_ptr+28);
-		  *(vlan_tag+1)= *(__u16*)(hdr_ptr+30);
-		  } else {
-		*/
-		*vlan_tag = *(__u16 *) (hdr_ptr + 12);
-		*(vlan_tag + 1) = skb->protocol;
-		/*
-		  }
-		*/
-		skb->protocol = __constant_htons(ETH_P_8021Q);
-	}
-#endif
-}
-
-static inline void
-__qeth_rebuild_skb(struct qeth_card *card, struct sk_buff *skb, void *hdr_ptr)
-{
-	char dbf_text[15];
-	int version;
-	unsigned short cast_type;
-	
-	version = ((*(__u16 *) (hdr_ptr)) & (QETH_HEADER_IPV6)) ? 6 : 4;
-	skb->protocol = htons((version == 4) ? ETH_P_IP : 
-			      (version == 6) ? ETH_P_IPV6 : ETH_P_ALL);
-	cast_type = (*(__u16 *) (hdr_ptr)) & (QETH_CAST_FLAGS);
-	switch (cast_type) {
-	case QETH_CAST_UNICAST:
-		skb->pkt_type = PACKET_HOST;
-		break;
-	case QETH_CAST_MULTICAST:
-		skb->pkt_type = PACKET_MULTICAST;
-		break;
-	case QETH_CAST_BROADCAST:
-		skb->pkt_type = PACKET_BROADCAST;
-		break;
-	case QETH_CAST_ANYCAST:
-	case QETH_CAST_NOCAST:
-		QETH_DBF_CARD2(0, trace, "ribf", card);
-		sprintf(dbf_text, "castan%2x", cast_type);
-		QETH_DBF_TEXT2(1, trace, dbf_text);
-		skb->pkt_type = PACKET_HOST;
-		break;
-	default:
-		PRINT_WARN("adapter is using an unknown casting value "
-			   "of 0x%x. Using unicasting instead.\n",
-			   cast_type);
-		skb->pkt_type = PACKET_HOST;
-		QETH_DBF_CARD2(0, trace, "ribf", card);
-		sprintf(dbf_text, "castun%2x", cast_type);
-		QETH_DBF_TEXT2(1, trace, dbf_text);
-	}
-
-	if (card->options.fake_ll == FAKE_LL)
-		__qeth_rebuild_skb_fake_ll(card, skb, hdr_ptr);
-	else
-		skb->mac.raw = skb->data;
-
-	skb->ip_summed = card->options.checksum_type;
-	if (card->options.checksum_type == HW_CHECKSUMMING) {
-		/* do we have a checksummed packet? */
-
-		/* 
-		 * we only check for TCP/UDP checksums when the pseudo
-		 * header was also checked successfully -- for the
-		 * rest of the packets, it's not clear, whether the
-		 * upper layer csum is alright. And they shouldn't
-		 * occur too often anyway in real life 
-		 */
-
-		if ((*(__u8*)(hdr_ptr+11) & (QETH_EXT_HEADER_CSUM_HDR_REQ |
-					     QETH_EXT_HEADER_CSUM_TRANSP_REQ)) ==
-		    (QETH_EXT_HEADER_CSUM_HDR_REQ |
-		     QETH_EXT_HEADER_CSUM_TRANSP_REQ)) {
-#if 0
-			/* csum does not need to be set inbound anyway */
-			
-			/* 
-			 * vlan is not an issue here, it's still in
-			 * the QDIO header, not pushed in the skb yet
-			 */
-			int ip_len = (skb->data[0] & 0x0f) << 2;
-
-			if (*(__u8 *) (hdr_ptr + 11) &
-			    QETH_EXT_HEADER_CSUM_TRANSP_FRAME_TYPE) {
-				/* get the UDP checksum */
-				skb->csum = *(__u16 *)
-					(&skb->data[ip_len + 
-						    QETH_UDP_CSUM_OFFSET]);
-			} else {
-				/* get the TCP checksum */
-				skb->csum = *(__u16 *)
-					(&skb->data[ip_len +
-						    QETH_TCP_CSUM_OFFSET]);
-			}
-#endif /* 0 */
-			skb->ip_summed=CHECKSUM_UNNECESSARY;
-		} else {
-			/* make the stack check it */
-			skb->ip_summed = SW_CHECKSUMMING;
-		}
-	} else
-		skb->ip_summed=card->options.checksum_type;
-
-	__qeth_rebuild_skb_vlan(card, skb, hdr_ptr);
-}
-
-static void
-qeth_read_in_buffer(struct qeth_card *card, int buffer_no)
-{
-	struct sk_buff *skb;
-	void *hdr_ptr;
-	int element = 0, pos_in_el = 0;
-	struct qdio_buffer *buffer;
-	int i;
-	int max_elements;
-	char dbf_text[15];
-	struct net_device *dev;
-
-	dev = card->dev;
-	max_elements = BUFFER_MAX_ELEMENTS;
-
-	buffer = &card->inbound_qdio_buffers[buffer_no];
-
-	/* inform about errors */
-	if (buffer->element[15].flags & 0xff) {
-		PRINT_WARN("on device %s: incoming SBALF 15 on buffer "
-			   "0x%x are 0x%x\n",
-			   CARD_BUS_ID(card), buffer_no,
-			   buffer->element[15].flags & 0xff);
-		sprintf(dbf_text, "SF%s%2x%2x",
-			CARD_BUS_ID(card), buffer_no,
-			buffer->element[15].flags & 0xff);
-		QETH_DBF_HEX1(1, trace, dbf_text, QETH_DBF_TRACE_LEN);
-	}
-
-	for (i = 0; i < max_elements - 1; i++) {
-		if (buffer->element[i].flags & SBAL_FLAGS_LAST_ENTRY) {
-			buffer->element[i + 1].length = 0;
-			break;
-		}
-	}
-#ifdef QETH_PERFORMANCE_STATS
-	card->perf_stats.bufs_rec++;
-#endif /* QETH_PERFORMANCE_STATS */
-
-	sprintf(dbf_text, "ribX%s", CARD_BUS_ID(card));
-	dbf_text[3] = buffer_no;
-	QETH_DBF_HEX6(0, trace, dbf_text, QETH_DBF_TRACE_LEN);
-
-	while ((skb = qeth_get_next_skb(card, &element, &pos_in_el,
-					&hdr_ptr, buffer))) {
-
-#ifdef QETH_PERFORMANCE_STATS
-		card->perf_stats.skbs_rec++;
-#endif /* QETH_PERFORMANCE_STATS */
-
-		if (skb) {
-			skb->dev = dev;
-
-#ifdef QETH_IPV6
-			if ((*(__u16 *) (hdr_ptr)) & (QETH_HEADER_PASSTHRU))
-				skb->protocol = card->type_trans(skb, dev);
-			else
-#endif /* QETH_IPV6 */
-				__qeth_rebuild_skb(card, skb, hdr_ptr);
-
-#ifdef QETH_PERFORMANCE_STATS
-			card->perf_stats.inbound_time +=
-			    NOW - card->perf_stats.inbound_start_time;
-			card->perf_stats.inbound_cnt++;
-#endif /* QETH_PERFORMANCE_STATS */
-
-			QETH_DBF_CARD6(0, trace, "rxpk", card);
-
-			netif_rx(skb);
-			dev->last_rx = jiffies;
-			card->stats->rx_packets++;
-			card->stats->rx_bytes += skb->len;
-		} else {
-			PRINT_WARN("%s: dropped packet, no buffers "
-				   "available.\n", card->dev_name);
-			QETH_DBF_CARD2(1, trace, "DROP", card);
-			card->stats->rx_dropped++;
-		}
-	}
-	atomic_set(&card->inbound_buffer_refcnt[buffer_no], 0);
-	qeth_put_buffer_pool_entry(card,
-				   card->inbound_buffer_entry_no[buffer_no]);
-}
-
-static inline void
-__qeth_fill_header_add_vlan(struct qeth_hdr *hdr, struct sk_buff *skb,
-			    int version)
-{
-#ifdef QETH_VLAN
-	struct qeth_card *card;
-
-	/* 
-	 * before we're going to overwrite this location with next hop ip.
-	 * v6 uses passthrough, v4 sets the tag in the QDIO header.
-	 */
-	card = (struct qeth_card *) skb->dev->priv;
-	if ((card->vlangrp != NULL) && vlan_tx_tag_present(skb)) {
-		hdr->ext_flags = (version == 4) ? QETH_EXT_HEADER_VLAN_FRAME :
-			QETH_EXT_HEADER_INCLUDE_VLAN_TAG;
-		hdr->vlan_id = vlan_tx_tag_get(skb);
-	}
-#endif
-}
-
-static inline __u8
-__qeth_get_flags_v4(int multicast)
-{
-	if (multicast == RTN_MULTICAST)
-		return QETH_CAST_MULTICAST;
-	if (multicast == RTN_BROADCAST)
-		return QETH_CAST_BROADCAST;
-	return QETH_CAST_UNICAST;
-}
-
-static inline __u8
-__qeth_get_flags_v6(int multicast)
-{
-	if (multicast == RTN_MULTICAST)
-		return QETH_CAST_MULTICAST |
-			QETH_HEADER_PASSTHRU | QETH_HEADER_IPV6;
-	if (multicast == RTN_ANYCAST)
-		return QETH_CAST_ANYCAST |
-			QETH_HEADER_PASSTHRU | QETH_HEADER_IPV6;
-	if (multicast == RTN_BROADCAST)
-		return QETH_CAST_BROADCAST |
-			QETH_HEADER_PASSTHRU | QETH_HEADER_IPV6;
-	return QETH_CAST_UNICAST |
-#ifdef QETH_IPV6
-		QETH_HEADER_PASSTHRU |
-#endif /* QETH_IPV6 */
-		QETH_HEADER_IPV6;
-}
-
-static inline void
-qeth_fill_header(struct qeth_hdr *hdr, struct sk_buff *skb,
-		 int version, int multicast)
-{
-	char dbf_text[15];
-
-	hdr->id = 1;
-	hdr->ext_flags = 0;
-
-	__qeth_fill_header_add_vlan(hdr, skb, version);
-
-	hdr->length = skb->len - QETH_HEADER_SIZE;	/* as skb->len includes
-							   the header now */
-
-	/* yes, I know this is doubled code, but a small little bit
-	   faster maybe */
-	if (version == 4) {	/* IPv4 */
-		hdr->flags = __qeth_get_flags_v4(multicast);
-		*((__u32 *) (&hdr->dest_addr[0])) = 0;
-		*((__u32 *) (&hdr->dest_addr[4])) = 0;
-		*((__u32 *) (&hdr->dest_addr[8])) = 0;
-		if ((skb->dst) && (skb->dst->neighbour)) {
-			*((__u32 *) (&hdr->dest_addr[12])) =
-			    *((__u32 *) skb->dst->neighbour->primary_key);
-		} else {
-			/* fill in destination address used in ip header */
-			*((__u32 *) (&hdr->dest_addr[12])) = skb->nh.iph->daddr;
-		}
-	} else if (version == 6) {	/* IPv6 or passthru */
-		hdr->flags = __qeth_get_flags_v6(multicast);
-		if ((skb->dst) && (skb->dst->neighbour)) {
-			memcpy(hdr->dest_addr,
-			       skb->dst->neighbour->primary_key, 16);
-		} else {
-			/* fill in destination address used in ip header */
-			memcpy(hdr->dest_addr, &skb->nh.ipv6h->daddr, 16);
-		}
-	} else {		/* passthrough */
-		if (!memcmp(skb->data + QETH_HEADER_SIZE,
-			    skb->dev->broadcast, 6)) {   /* broadcast? */
-			hdr->flags = QETH_CAST_BROADCAST | QETH_HEADER_PASSTHRU;
-		} else {
- 			hdr->flags = (multicast == RTN_MULTICAST) ?
- 				QETH_CAST_MULTICAST | QETH_HEADER_PASSTHRU :
- 				QETH_CAST_UNICAST | QETH_HEADER_PASSTHRU;
-		}
-	}
-	sprintf(dbf_text, "filhdr%2x", version);
-	QETH_DBF_TEXT6(0, trace, dbf_text);
-	sprintf(dbf_text, "%2x", multicast);
-	QETH_DBF_TEXT6(0, trace, dbf_text);
-	QETH_DBF_HEX6(0, trace, &skb, sizeof (void *));
-	QETH_DBF_HEX6(0, trace, &skb->data, sizeof (void *));
-	QETH_DBF_HEX6(0, misc, hdr, __max(QETH_HEADER_SIZE, QETH_DBF_MISC_LEN));
-	QETH_DBF_HEX6(0, data, skb->data,
-		      __max(QETH_DBF_DATA_LEN, QETH_DBF_DATA_LEN));
-}
-
-static inline int
-qeth_fill_buffer(struct qdio_buffer *buffer, char *dataptr,
-		 int length, int element)
-{
-	int length_here;
-	int first_lap = 1;
-	char dbf_text[15];
-	int first_element = element;
-
-	while (length > 0) {
-		/* length_here is the remaining amount of data in this page */
-		length_here =
-		    PAGE_SIZE - ((unsigned long) dataptr & (PAGE_SIZE - 1));
-		if (length < length_here)
-			length_here = length;
-
-		buffer->element[element].addr = dataptr;
-		buffer->element[element].length = length_here;
-		length -= length_here;
-		if (!length) {
-			if (first_lap) {
-				buffer->element[element].flags = 0;
-			} else {
-				buffer->element[element].flags =
-				    SBAL_FLAGS_LAST_FRAG;
-			}
-		} else {
-			if (first_lap) {
-				buffer->element[element].flags =
-				    SBAL_FLAGS_FIRST_FRAG;
-			} else {
-				buffer->element[element].flags =
-				    SBAL_FLAGS_MIDDLE_FRAG;
-			}
-		}
-		dataptr = dataptr + length_here;
-		element++;
-		if (element > QDIO_MAX_ELEMENTS_PER_BUFFER) {
-			PRINT_ERR("qeth_fill_buffer: IP packet too big!\n");
-			QETH_DBF_TEXT1(0, trace, "IPpktobg");
-			QETH_DBF_HEX1(1, trace, &dataptr, sizeof (void *));
-			buffer->element[first_element].length = 0;
-			break;
-		}
-		first_lap = 0;
-	}
-	sprintf(dbf_text, "filbuf%2x", element);
-	QETH_DBF_TEXT6(0, trace, dbf_text);
-	QETH_DBF_HEX3(0, misc, buffer, QETH_DBF_MISC_LEN);
-	QETH_DBF_HEX3(0, misc, buffer + QETH_DBF_MISC_LEN, QETH_DBF_MISC_LEN);
-
-	return element;
-}
-
-static inline void
-qeth_flush_packed_packets(struct qeth_card *card, int queue, int under_int)
-{
-	struct qdio_buffer *buffer;
-	int result;
-	int position;
-	int position_for_do_qdio;
-	char dbf_text[15];
-	int last_pci;
-
-	position = card->outbound_first_free_buffer[queue];
-	/* can happen, when in the time between deciding to pack and sending
-	   the next packet the lower mark was reached: */
-	if (!card->outbound_ringbuffer[queue]->ringbuf_element[position].
-	    next_element_to_fill)
-		return;
-
-	buffer = &card->outbound_ringbuffer[queue]->buffer[position];
-	buffer->element[card->outbound_ringbuffer[queue]->
-			ringbuf_element[position].
-			next_element_to_fill - 1].flags |=
-	    SBAL_FLAGS_LAST_ENTRY;
-
-	card->dev->trans_start = jiffies;
-
-#ifdef QETH_PERFORMANCE_STATS
-	if (card->outbound_buffer_send_state[queue][position] ==
-	    SEND_STATE_DONT_PACK) {
-		card->perf_stats.bufs_sent_dont_pack++;
-	} else if (card->outbound_buffer_send_state[queue][position] ==
-		   SEND_STATE_PACK) {
-		card->perf_stats.bufs_sent_pack++;
-	}
-	card->perf_stats.bufs_sent++;
-#endif /* QETH_PERFORMANCE_STATS */
-
-	position_for_do_qdio = position;
-
-	position = (position + 1) & (QDIO_MAX_BUFFERS_PER_Q - 1);
-	card->outbound_first_free_buffer[queue] = position;
-
-	card->outbound_bytes_in_buffer[queue] = 0;
-	/* we can override that, as we have at most 127 buffers enqueued */
-	card->outbound_ringbuffer[queue]->ringbuf_element[position].
-	    next_element_to_fill = 0;
-
-	atomic_inc(&card->outbound_used_buffers[queue]);
-
-	QETH_DBF_CARD5(0, trace, "flsp", card);
-	sprintf(dbf_text, "%4x%2x%2x", position_for_do_qdio, under_int, queue);
-	QETH_DBF_TEXT5(0, trace, dbf_text);
-	QETH_DBF_HEX5(0, misc, buffer, QETH_DBF_MISC_LEN);
-	QETH_DBF_HEX5(0, misc, buffer + QETH_DBF_MISC_LEN, QETH_DBF_MISC_LEN);
-
-	/* 
-	 * we always set the outbound pci flag, don't care, whether the
-	 * adapter honors it or not
-	 */
-	switch (card->send_state[queue]) {
-	case SEND_STATE_DONT_PACK:
-		if (atomic_read(&card->outbound_used_buffers[queue])
-		    < HIGH_WATERMARK_PACK - WATERMARK_FUZZ)
-			break;
-		/* set the PCI bit */
-		card->outbound_ringbuffer[queue]->
-		    buffer[position_for_do_qdio].element[0].flags |= 0x40;
-		atomic_set(&card->last_pci_pos[queue], position_for_do_qdio);
-		break;
-	case SEND_STATE_PACK:
-		last_pci = atomic_read(&card->last_pci_pos[queue]);
-		if (position_for_do_qdio < last_pci)
-			last_pci -= QDIO_MAX_BUFFERS_PER_Q;
-		/* so:
-		 * last_pci is the position of the last pci we've set
-		 * position_for_do_qdio is the position we will send out now
-		 * outbound_used_buffers is the number of buffers used (means
-		 *   all buffers hydra has, inclusive position_for_do_qdio)
-		 *
-		 * we have to request a pci, if we have got the buffer of the
-		 * last_pci position back.
-		 *
-		 * position_for_do_qdio-outbound_used_buffers is the newest
-		 *   buffer that we got back from hydra
-		 *
-		 * if this is greater or equal than the last_pci position,
-		 * we should request a pci, as no pci request is
-		 * outstanding anymore
-		 */
-		if (position_for_do_qdio -
-		    atomic_read(&card->outbound_used_buffers[queue]) >=
-		    last_pci) {
-			/* set the PCI bit */
-			card->outbound_ringbuffer[queue]->
-			    buffer[position_for_do_qdio].
-			    element[0].flags |= 0x40;
-			atomic_set(&card->last_pci_pos[queue],
-				   position_for_do_qdio);
-		}
-	}
-
-	/* 
-	 * this has to be at the end, otherwise a buffer could be flushed
-	 * twice (see comment in qeth_do_send_packet)
-	 */
-	result = do_QDIO(CARD_DDEV(card), QDIO_FLAG_SYNC_OUTPUT | under_int, queue,
-			 position_for_do_qdio, 1, NULL);
-
-	if (result) {
-		PRINT_WARN("Outbound do_QDIO returned %i "
-			   "(device %s)\n", result, CARD_DDEV_ID(card));
-		QETH_DBF_CARD5(0, trace, "FLSP", card);
-		sprintf(dbf_text, "odoQ%4x", result);
-		QETH_DBF_TEXT5(0, trace, dbf_text);
-		sprintf(dbf_text, "%4x%2x%2x", position_for_do_qdio,
-			under_int, queue);
-		QETH_DBF_TEXT5(0, trace, dbf_text);
-		QETH_DBF_HEX5(0, misc, buffer, QETH_DBF_MISC_LEN);
-		QETH_DBF_HEX5(0, misc, buffer + QETH_DBF_MISC_LEN,
-			      QETH_DBF_MISC_LEN);
-	}
-}
-
-#define ERROR_NONE 0
-#define ERROR_RETRY 1
-#define ERROR_LINK_FAILURE 2
-#define ERROR_KICK_THAT_PUPPY 3
-static inline int
-qeth_determine_send_error(int cc, int qdio_error, int sbalf15)
-{
-	char dbf_text[15];
-
-	switch (cc & 3) {
-	case 0:
-		if (qdio_error)
-			return ERROR_LINK_FAILURE;
-		return ERROR_NONE;
-	case 2:
-		if (cc & QDIO_SIGA_ERROR_B_BIT_SET) {
-			QETH_DBF_TEXT3(0, trace, "sigacc2b");
-			return ERROR_KICK_THAT_PUPPY;
-		}
-		if (qeth_sbalf15_in_retrieable_range(sbalf15))
-			return ERROR_RETRY;
-		return ERROR_LINK_FAILURE;
-		/* look at qdio_error and sbalf 15 */
-	case 1:
-		PRINT_WARN("siga returned cc 1! cc=0x%x, "
-			   "qdio_error=0x%x, sbalf15=0x%x\n",
-			   cc, qdio_error, sbalf15);
-
-		QETH_DBF_TEXT3(0, trace, "siga-cc1");
-		QETH_DBF_TEXT2(0, qerr, "siga-cc1");
-		sprintf(dbf_text, "%1x%2x%2x", cc, qdio_error, sbalf15);
-		QETH_DBF_TEXT3(0, trace, dbf_text);
-		QETH_DBF_TEXT2(0, qerr, dbf_text);
-		return ERROR_LINK_FAILURE;
-	case 3:
-		QETH_DBF_TEXT3(0, trace, "siga-cc3");
-		return ERROR_KICK_THAT_PUPPY;
-	}
-	return ERROR_LINK_FAILURE;	/* should never happen */
-}
-
-static inline void
-qeth_free_buffer(struct qeth_card *card, int queue, int bufno,
-		 int qdio_error, int siga_error)
-{
-	struct sk_buff *skb;
-	int error;
-	int retries;
-	int sbalf15;
-	char dbf_text[15];
-	struct qdio_buffer *buffer;
-
-	switch (card->outbound_buffer_send_state[queue][bufno]) {
-	case SEND_STATE_DONT_PACK:	/* fallthrough */
-	case SEND_STATE_PACK:
-		QETH_DBF_CARD5(0, trace, "frbf", card);
-		sprintf(dbf_text, "%2x%2x%4x", queue, bufno,
-			card->outbound_buffer_send_state[queue][bufno]);
-		QETH_DBF_TEXT5(0, trace, dbf_text);
-
-		buffer = &card->outbound_ringbuffer[queue]->buffer[bufno];
-		sbalf15 = buffer->element[15].flags & 0xff;
-		error =
-		    qeth_determine_send_error(siga_error, qdio_error, sbalf15);
-		if (error == ERROR_KICK_THAT_PUPPY) {
-			sprintf(dbf_text, "KP%s%2x",
-				CARD_BUS_ID(card), queue);
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-			QETH_DBF_TEXT2(0, qerr, dbf_text);
-			QETH_DBF_TEXT2(1, setup, dbf_text);
-			sprintf(dbf_text, "%2x%2x%2x%2x", bufno,
-				siga_error, qdio_error, sbalf15);
-			QETH_DBF_TEXT2(1, trace, dbf_text);
-			QETH_DBF_TEXT2(1, qerr, dbf_text);
-			PRINT_ERR("Outbound queue x%x on device %s (%s); "
-				  "errs: siga: x%x, qdio: x%x, flags15: "
-				  "x%x. The device will be taken down.\n",
-				  queue, CARD_BUS_ID(card), card->dev_name,
-				  siga_error, qdio_error, sbalf15);
-			netif_stop_queue(card->dev);
-			qeth_set_dev_flag_norunning(card);
-			atomic_set(&card->problem, PROBLEM_BAD_SIGA_RESULT);
-			qeth_schedule_recovery(card);
-		} else if (error == ERROR_RETRY) {
-			/* analyze, how many retries we did so far */
-			retries = card->send_retries[queue][bufno];
-
-			sprintf(dbf_text, "Rt%s%2x",
-				CARD_BUS_ID(card), queue);
-			QETH_DBF_TEXT4(0, trace, dbf_text);
-			sprintf(dbf_text, "b%2x:%2x%2x", bufno,
-				sbalf15, retries);
-			QETH_DBF_TEXT4(0, trace, dbf_text);
-
-			if (++retries > SEND_RETRIES_ALLOWED) {
-				error = ERROR_LINK_FAILURE;
-				QETH_DBF_TEXT4(1, trace, "ndegelnd");
-			}
-			/* else error stays RETRY for the switch statemnet */
-		} else if (error == ERROR_LINK_FAILURE) {
-			/* we don't want to log failures resulting from
-			 * too many retries */
-			QETH_DBF_CARD3(1, trace, "Fail", card);
-			QETH_DBF_HEX3(0, misc, buffer, QETH_DBF_MISC_LEN);
-			QETH_DBF_HEX3(0, misc, buffer + QETH_DBF_MISC_LEN,
-				      QETH_DBF_MISC_LEN);
-		}
-
-		while ((skb = skb_dequeue(&card->outbound_ringbuffer[queue]->
-					  ringbuf_element[bufno].skb_list))) {
-			switch (error) {
-			case ERROR_NONE:
-				atomic_dec(&skb->users);
-				dev_kfree_skb_irq(skb);
-				break;
-			case ERROR_RETRY:
-				QETH_DBF_TEXT3(0, qerr, "RETRY!!!");
-				QETH_DBF_TEXT4(0, trace, "RETRY!!!");
-				atomic_dec(&skb->users);
-				/* retry packet async (quickly) ... */
-				atomic_dec(&skb->users);
-				dev_kfree_skb_irq(skb);
-				break;
-			case ERROR_LINK_FAILURE:
-			case ERROR_KICK_THAT_PUPPY:
-				QETH_DBF_TEXT4(0, trace, "endeglnd");
-				atomic_dec(&skb->users);
-				dev_kfree_skb_irq(skb);
-				break;
-			}
-		}
-		break;
-	default:
-		PRINT_WARN("oops... wrong send_state on %s. "
-			   "shouldn't happen "
-			   "(line %i). q=%i, bufno=x%x, state=%i\n",
-			   card->dev_name, __LINE__, queue, bufno,
-			   card->outbound_buffer_send_state[queue][bufno]);
-		QETH_DBF_CARD0(1, trace, "UPSf", card);
-		QETH_DBF_CARD0(1, qerr, "UPSf", card);
-		sprintf(dbf_text, "%2x%2x%4x", queue, bufno,
-			card->outbound_buffer_send_state[queue][bufno]);
-		QETH_DBF_TEXT0(1, trace, dbf_text);
-		QETH_DBF_TEXT0(1, qerr, dbf_text);
-	}
-	card->outbound_buffer_send_state[queue][bufno] = SEND_STATE_INACTIVE;
-	card->send_retries[queue][bufno] = 0;
-}
-
-static inline void
-qeth_free_all_skbs(struct qeth_card *card)
-{
-	int q, b;
-
-	for (q = 0; q < card->no_queues; q++)
-		for (b = 0; b < QDIO_MAX_BUFFERS_PER_Q; b++)
-			if (card->outbound_buffer_send_state[q][b] !=
-			    SEND_STATE_INACTIVE)
-				qeth_free_buffer(card, q, b, 0, 0);
-}
-
-static inline void
-qeth_flush_buffer(struct qeth_card *card, int queue, int under_int)
-{
-	char dbf_text[15];
-	QETH_DBF_CARD5(0, trace, "flsb", card);
-	sprintf(dbf_text, "%2x%2x%2x", queue, under_int,
-		card->outbound_buffer_send_state[queue]
-		[card->outbound_first_free_buffer[queue]]);
-	QETH_DBF_TEXT5(0, trace, dbf_text);
-
-	switch (card->outbound_buffer_send_state[queue]
-		[card->outbound_first_free_buffer[queue]]) {
-	case SEND_STATE_DONT_PACK:
-		break;
-	case SEND_STATE_PACK:
-		qeth_flush_packed_packets(card, queue, under_int);
-		break;
-	default:
-		break;
-	}
-}
-
-#ifdef QETH_VLAN
-static inline void
-qeth_insert_ipv6_vlan_tag(struct sk_buff *__skb)
-{
-
-	/* Move the mac addresses to the beginning of the new header.
-	 * We are using three memcpys instead of one memmove to save cycles.
-	 */
-#define TMP_CPYSIZE 4
-	__u16 *tag;
-	tag = (__u16 *) skb_push(__skb, VLAN_HLEN);
-	memcpy(__skb->data, __skb->data + TMP_CPYSIZE, TMP_CPYSIZE);
-	memcpy(__skb->data + TMP_CPYSIZE,
-	       __skb->data + (2 * TMP_CPYSIZE), TMP_CPYSIZE);
-	memcpy(__skb->data + (2 * TMP_CPYSIZE),
-	       __skb->data + (3 * TMP_CPYSIZE), TMP_CPYSIZE);
-	tag = (__u16 *) (__skb->data + (3 * TMP_CPYSIZE));
-
-	/*first two bytes  = ETH_P_8021Q (0x8100)
-	 *second two bytes = VLANID
-	 */
-
-	*tag = __constant_htons(ETH_P_8021Q);
-	*(tag + 1) = vlan_tx_tag_get(__skb);
-	*(tag + 1) = htons(*(tag + 1));
-#undef TMP_CPYSIZE
-}
-#endif
-
-static inline void
-__qeth_add_vlan_tag(struct qeth_card *card, struct sk_buff *skb, int version)
-{
-#ifdef QETH_VLAN
-	if ((card->vlangrp != NULL) &&
-	    vlan_tx_tag_present(skb) && (version == 6)) {
-		qeth_insert_ipv6_vlan_tag(skb);
-	}
-#endif
-}
-
-static inline void
-qeth_send_packet_fast(struct qeth_card *card, struct sk_buff *skb,
-		      struct net_device *dev,
-		      int queue, int version, int multicast)
-{
-	struct qeth_ringbuffer_element *mybuffer;
-	int position;
-	struct qeth_hdr *hdr;
-	char *dataptr;
-	char dbf_text[15];
-	struct sk_buff *nskb;
-
-	position = card->outbound_first_free_buffer[queue];
-
-	card->outbound_buffer_send_state[queue][position] =
-	    SEND_STATE_DONT_PACK;
-
-	mybuffer = &card->outbound_ringbuffer[queue]->ringbuf_element[position];
-	if (skb_headroom(skb) < QETH_HEADER_SIZE) {
-		if ((version) && (!card->realloc_message)) {
-			card->realloc_message = 1;
-			PRINT_WARN("%s: not enough headroom in skb. "
- 				   "Increasing the "
- 				   "add_hhlen parameter by %i may help.\n",
-				   card->dev_name,
-				   QETH_HEADER_SIZE - skb_headroom(skb));
-		}
-		PRINT_STUPID("%s: not enough headroom in skb (missing: %i)\n",
-			     card->dev_name,
-			     QETH_HEADER_SIZE - skb_headroom(skb));
-		QETH_DBF_CARD3(0, trace, "NHRf", card);
-		sprintf(dbf_text, "%2x%2x%2x%2x", skb_headroom(skb),
-			version, multicast, queue);
-		QETH_DBF_TEXT3(0, trace, dbf_text);
-		QETH_DBF_HEX3(0, trace, &skb->head, sizeof (void *));
-		QETH_DBF_HEX3(0, trace, &skb->data, sizeof (void *));
-		nskb = skb_realloc_headroom(skb, QETH_HEADER_SIZE);
-		if (!nskb) {
-			PRINT_WARN("%s: could not realloc headroom\n",
-				   card->dev_name);
-			QETH_DBF_CARD2(0, trace, "CNRf", card);
-			dev_kfree_skb_irq(skb);
-			return;
-		}
-		dev_kfree_skb_irq(skb);
-		skb = nskb;
-	}
-	__qeth_add_vlan_tag(card, skb, version);
-	hdr = (struct qeth_hdr *) (skb_push(skb, QETH_HEADER_SIZE));
-	/* 
-	 * sanity check, the Linux memory allocation scheme should
-	 * never present us cases like this one (the 32bytes header plus
-	 * the first 40 bytes of the paket cross a 4k boundary)
-	 */
-	dataptr = (char *) hdr;
-	if ((((unsigned long) dataptr) & (~(PAGE_SIZE - 1))) !=
-	    (((unsigned long) dataptr + QETH_HEADER_SIZE +
-	      QETH_IP_HEADER_SIZE) & (~(PAGE_SIZE - 1)))) {
-		PRINT_ERR("%s: packet misaligned -- the first %i bytes "
-			  "are not in the same page. Discarding packet!\n",
-			  card->dev_name,
-			  QETH_HEADER_SIZE + QETH_IP_HEADER_SIZE);
-		PRINT_ERR("head=%p, data=%p\n", skb->head, skb->data);
-		QETH_DBF_CARD1(0, trace, "PMAf", card);
-		sprintf(dbf_text, "%2x%2x%2x%2x", skb_headroom(skb),
-			version, multicast, queue);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_HEX1(0, trace, &skb->head, sizeof (void *));
-		QETH_DBF_HEX1(1, trace, &skb->data, sizeof (void *));
-		dev_kfree_skb_irq(skb);
-		return;
-	}
-
-	atomic_inc(&skb->users);
-	skb_queue_tail(&mybuffer->skb_list, skb);
-	qeth_fill_header(hdr, skb, version, multicast);
-	/* we need to write to next_element_to_fill as
-	   qeth_flush_packed_packets checks it */
-	card->outbound_ringbuffer[queue]->ringbuf_element[position].
-	    next_element_to_fill =
-	    qeth_fill_buffer(&card->outbound_ringbuffer[queue]->
-			     buffer[position], (char *) hdr, skb->len, 0);
-
-#ifdef QETH_PERFORMANCE_STATS
-	card->perf_stats.skbs_sent_dont_pack++;
-#endif /* QETH_PERFORMANCE_STATS */
-
-	qeth_flush_packed_packets(card, queue, 0);
-}
-
-/* no checks, if all elements are used, as then we would not be here (at most
-   127 buffers are enqueued) */
-static inline void
-qeth_send_packet_packed(struct qeth_card *card, struct sk_buff *skb,
-			struct net_device *dev,
-			int queue, int version, int multicast)
-{
-	struct qeth_ringbuffer_element *mybuffer;
-	int elements_needed;
-	int element_to_fill;
-	int buffer_no;
-	int length;
-	char *dataptr;
-	struct qeth_hdr *hdr;
-	char dbf_text[15];
-	struct sk_buff *nskb;
-
-	/* sanity check, dev->hard_header_len should prevent this */
-	if (skb_headroom(skb) < QETH_HEADER_SIZE) {
-		if ((version) && (!card->realloc_message)) {
-			card->realloc_message = 1;
-			PRINT_WARN("%s: not enough headroom in skb. "
-				   "Try increasing the "
-				   "add_hhlen parameter by %i.\n",
-				   card->dev_name,
-				   QETH_HEADER_SIZE - skb_headroom(skb));
-		}
-		PRINT_STUPID("%s: not enough headroom in skb (missing: %i)\n",
-			     card->dev_name,
-			     QETH_HEADER_SIZE - skb_headroom(skb));
-		QETH_DBF_CARD3(0, trace, "NHRp", card);
-		sprintf(dbf_text, "%2x%2x%2x%2x", skb_headroom(skb),
-			version, multicast, queue);
-		QETH_DBF_TEXT3(0, trace, dbf_text);
-		QETH_DBF_HEX3(0, trace, &skb->head, sizeof (void *));
-		QETH_DBF_HEX3(0, trace, &skb->data, sizeof (void *));
-		nskb = skb_realloc_headroom(skb, QETH_HEADER_SIZE);
-		if (!nskb) {
-			PRINT_WARN("%s: could not realloc headroom\n",
-				   card->dev_name);
-			QETH_DBF_CARD2(0, trace, "CNRp", card);
-			dev_kfree_skb_irq(skb);
-			return;
-		}
-		dev_kfree_skb_irq(skb);
-		skb = nskb;
-	}
-	__qeth_add_vlan_tag(card, skb, version);
-	hdr = (struct qeth_hdr *) (skb_push(skb, QETH_HEADER_SIZE));
-
-	length = skb->len;
-
-	/* 
-	 * sanity check, the Linux memory allocation scheme should
-	 * never present us cases like this one (the 32bytes header plus
-	 * the first 40 bytes of the paket cross a 4k boundary)
-	 */
-	dataptr = (char *) hdr;
-	if ((((unsigned long) dataptr) & (~(PAGE_SIZE - 1))) !=
-	    (((unsigned long) dataptr + QETH_HEADER_SIZE +
-	      QETH_IP_HEADER_SIZE) & (~(PAGE_SIZE - 1)))) {
-		PRINT_ERR("%s: packet misaligned -- the first %i bytes "
-			  "are not in the same page. Discarding packet!\n",
-			  card->dev_name,
-			  QETH_HEADER_SIZE + QETH_IP_HEADER_SIZE);
-		QETH_DBF_CARD1(0, trace, "PMAp", card);
-		sprintf(dbf_text, "%2x%2x%2x%2x", skb_headroom(skb),
-			version, multicast, queue);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_HEX1(0, trace, &skb->head, sizeof (void *));
-		QETH_DBF_HEX1(1, trace, &skb->data, sizeof (void *));
-		dev_kfree_skb_irq(skb);
-		return;
-	}
-
-	buffer_no = card->outbound_first_free_buffer[queue];
-
-	element_to_fill = card->outbound_ringbuffer[queue]->
-	    ringbuf_element[buffer_no].next_element_to_fill;
-
-	elements_needed = 1 + (((((unsigned long) dataptr) & (PAGE_SIZE - 1)) +
-				length) >> PAGE_SHIFT);
-	if ((elements_needed > (QDIO_MAX_ELEMENTS_PER_BUFFER - element_to_fill))
-	    ||
-	    ((elements_needed ==
-	      (QDIO_MAX_ELEMENTS_PER_BUFFER - element_to_fill))
-	     && ((element_to_fill >> PAGE_SHIFT) ==
-		 card->outbound_bytes_in_buffer[queue]))) {
-		qeth_flush_packed_packets(card, queue, 0);
-		element_to_fill = 0;
-		card->outbound_bytes_in_buffer[queue] = 0;
-		buffer_no = (buffer_no + 1) & (QDIO_MAX_BUFFERS_PER_Q - 1);
-	}
-
-	if (!element_to_fill)
-		card->outbound_buffer_send_state[queue][buffer_no]
-		    = SEND_STATE_PACK;
-
-#ifdef QETH_PERFORMANCE_STATS
-	card->perf_stats.skbs_sent_pack++;
-#endif /* QETH_PERFORMANCE_STATS */
-
-	mybuffer =
-	    &card->outbound_ringbuffer[queue]->ringbuf_element[buffer_no];
-	atomic_inc(&skb->users);
-	skb_queue_tail(&mybuffer->skb_list, skb);
-	qeth_fill_header(hdr, skb, version, multicast);
-	card->outbound_bytes_in_buffer[queue] += length + QETH_HEADER_SIZE;
-	card->outbound_ringbuffer[queue]->ringbuf_element[buffer_no].
-	    next_element_to_fill =
-	    qeth_fill_buffer(&card->outbound_ringbuffer[queue]->
-			     buffer[buffer_no],
-			     dataptr, length, element_to_fill);
-}
-
-static void
-qeth_alloc_spare_bufs(void)
-{
-	int i;
-	int dont_alloc_more = 0;
-	char dbf_text[15];
-
-	sparebuffer_count = 0;
-	for (i = 0; i < qeth_sparebufs; i++) {
-		if (!dont_alloc_more) {
-			sparebufs[i].buf = (char *)
-			    kmalloc(DEFAULT_BUFFER_SIZE, GFP_KERNEL);
-			if (sparebufs[i].buf)
-				sparebuffer_count++;
-			else
-				dont_alloc_more = 1;
-		}
-		atomic_set(&sparebufs[i].status, (dont_alloc_more) ?
-			   SPAREBUF_UNAVAIL : SPAREBUF_FREE);
-	}
-	sprintf(dbf_text, "alspb%3x", sparebuffer_count);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	PRINT_INFO("allocated %i spare buffers\n", sparebuffer_count);
-}
-
-static void
-qeth_free_all_spare_bufs(void)
-{
-	int i;
-
-	QETH_DBF_TEXT2(0, trace, "frealspb");
-
-	for (i = 0; i < qeth_sparebufs; i++)
-		if (atomic_read(&sparebufs[i].status) != SPAREBUF_UNAVAIL) {
-			kfree(sparebufs[i].buf);
-			atomic_set(&sparebufs[i].status, SPAREBUF_UNAVAIL);
-		}
-}
-
-static inline void
-__qeth_dump_packet_info(struct qeth_card *card, int version, int multicast,
-			int queue)
-{
-	char dbf_text[15];
-
-	QETH_DBF_CARD6(0, trace, "dsp:", card);
-	sprintf(dbf_text, "%c %c%4x",
-		(version == 4) ? '4' : ((version == 6) ? '6' : '0'),
-		(multicast) ? 'm' : '_', queue);
-	QETH_DBF_TEXT6(0, trace, dbf_text);
-	sprintf(dbf_text, "%4x%4x",
-		card->outbound_first_free_buffer[queue],
-		atomic_read(&card->outbound_used_buffers[queue]));
-	QETH_DBF_TEXT6(0, trace, dbf_text);
-	if (qeth_sbal_packing_on_card(card->type)) {
-		switch (card->send_state[queue]) {
-		case SEND_STATE_DONT_PACK:
-			QETH_DBF_TEXT6(0, trace, "usngfast");
-			break;
-		case SEND_STATE_PACK:
-			QETH_DBF_TEXT6(0, trace, "usngpack");
-			break;
-		}
-	} else {
-		QETH_DBF_TEXT6(0, trace, "usngfast");
-	}
-}
-
-static inline void
-__qeth_switch_state_if_needed(struct qeth_card *card, int queue)
-{
-	if (atomic_read(&card->outbound_used_buffers[queue])
-	    >= HIGH_WATERMARK_PACK) {
-		card->send_state[queue] = SEND_STATE_PACK;
-		QETH_DBF_CARD3(0, trace, "stchup", card);
-#ifdef QETH_PERFORMANCE_STATS
-		card->perf_stats.sc_dp_p++;
-#endif /* QETH_PERFORMANCE_STATS */
-	}
-}
-
-static inline int
-qeth_do_send_packet(struct qeth_card *card, struct sk_buff *skb,
-		    struct net_device *dev)
-{
-	int queue, result = 0;
-	int multicast, version;
-
-	version = QETH_IP_VERSION(skb);
-	multicast = qeth_is_multicast_skb_at_all(skb, version);
-	queue = qeth_get_prioqueue(card, skb, multicast, version);
-
-	__qeth_dump_packet_info(card, version, multicast, queue);
-
-	if (atomic_read(&card->outbound_used_buffers[queue])
-	    >= QDIO_MAX_BUFFERS_PER_Q - 1) {
-		QETH_DBF_CARD2(1, trace, "cdbs", card);
-		netif_stop_queue(dev);
-		return -EBUSY;
-	}
-
-	/* 
-	 * we are not called under int, so we just spin
-	 * happens around once a second under heavy traffic. takes a little
-	 * bit less than 10usec in avg. on a z900
-	 */
-	if (atomic_compare_and_swap(QETH_LOCK_UNLOCKED, QETH_LOCK_NORMAL,
-				    &card->outbound_ringbuffer_lock[queue])) {
-		QETH_DBF_CARD2(0, trace, "SPIN", card);
-		while (atomic_compare_and_swap
-		       (QETH_LOCK_UNLOCKED, QETH_LOCK_NORMAL,
-			&card->outbound_ringbuffer_lock[queue])) ;
-		QETH_DBF_CARD2(0, trace, "spin", card);
-	}
-#ifdef QETH_PERFORMANCE_STATS
-	card->perf_stats.skbs_sent++;
-#endif /* QETH_PERFORMANCE_STATS */
-
-	if (qeth_sbal_packing_on_card(card->type)) {
-		switch (card->send_state[queue]) {
-		case SEND_STATE_DONT_PACK:
-			qeth_send_packet_fast(card, skb, dev, queue,
-					      version, multicast);
-			__qeth_switch_state_if_needed(card, queue);
-			break;
-		case SEND_STATE_PACK:
-			qeth_send_packet_packed(card, skb, dev, queue,
-						version, multicast);
-			break;
-		default:
-			result = -EBUSY;
-			QETH_DBF_CARD0(1, trace, "UPSs", card);
-			PRINT_ALL("oops... shouldn't happen (line %i:%i).\n",
-				  __LINE__, card->send_state[queue]);
-		}
-	} else {
-		qeth_send_packet_fast(card, skb, dev, queue,
-				      version, multicast);
-	}
-
-	/* ATOMIC: (NORMAL->UNLOCKED, FLUSH->NORMAL) */
-	while (atomic_dec_return(&card->outbound_ringbuffer_lock[queue])) {
-		qeth_flush_buffer(card, queue, 0);
-		card->send_state[queue] = SEND_STATE_DONT_PACK;
-	}
-#ifdef QETH_PERFORMANCE_STATS
-	card->perf_stats.outbound_time +=
-	    NOW - card->perf_stats.outbound_start_time;
-	card->perf_stats.outbound_cnt++;
-#endif /* QETH_PERFORMANCE_STATS */
-
-	card->stats->tx_packets++;
-	card->stats->tx_bytes += skb->len;
-
-	return result;
-}
-
-static int
-qeth_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
-{
-	struct qeth_card *card;
-	int result;
-
-	card = (struct qeth_card *) (dev->priv);
-
-	if (skb == NULL)
-		return 0;
-
-	QETH_DBF_HEX4(0, data, skb->data, __max(QETH_DBF_DATA_LEN, skb->len));
-
-	netif_stop_queue(dev);
-
-	if (!card) {
-		QETH_DBF_TEXT2(0, trace, "XMNSNOCD");
-		dev_kfree_skb_irq(skb);
-		return 0;
-	}
-#ifdef QETH_PERFORMANCE_STATS
-	card->perf_stats.outbound_start_time = NOW;
-#endif /* QETH_PERFORMANCE_STATS */
-
-	if (!atomic_read(&card->is_startlaned)) {
-		card->stats->tx_carrier_errors++;
-		QETH_DBF_CARD2(0, trace, "XMNS", card);
-		dev_kfree_skb_irq(skb);
-		return 0;
-	}
-
-	result = qeth_do_send_packet(card, skb, dev);
-
-	if (!result)
-		netif_wake_queue(card->dev);
-
-	return result;
-}
-
-static struct net_device_stats *
-qeth_get_stats(struct net_device *dev)
-{
-	struct qeth_card *card;
-
-	card = (struct qeth_card *) (dev->priv);
-
-	QETH_DBF_CARD3(0, trace, "gtst", card);
-
-	return card->stats;
-}
-
-static int
-qeth_change_mtu(struct net_device *dev, int new_mtu)
-{
-	struct qeth_card *card;
-	char dbf_text[15];
-
-	card = (struct qeth_card *) (dev->priv);
-
-	QETH_DBF_CARD2(0, trace, "mtu", card);
-	sprintf(dbf_text, "%8x", new_mtu);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	if (new_mtu < 64)
-		return -EINVAL;
-	if (new_mtu > 65535)
-		return -EINVAL;
-	if ((!qeth_is_supported(IPA_IP_FRAGMENTATION)) &&
-	    (!qeth_mtu_is_valid(card, new_mtu)))
-		return -EINVAL;
-	dev->mtu = new_mtu;
-	return 0;
-}
-
-static void
-qeth_start_softsetup_thread(struct qeth_card *card)
-{
-	if (!atomic_read(&card->shutdown_phase)) {
-		QETH_DBF_CARD2(0, trace, "stss", card);
-		up(&card->softsetup_thread_sem);
-	}
-}
-
-static int
-qeth_sleepon(struct qeth_card *card, int timeout)
-{
-	char dbf_text[15];
-
-	QETH_DBF_CARD5(0, trace, "slpn", card);
-	sprintf(dbf_text, "%08x", timeout);
-	QETH_DBF_TEXT5(0, trace, dbf_text);
-
-	wait_event_interruptible_timeout(card->wait_q,
-					 atomic_read(&card->data_has_arrived),
-					 timeout * HZ);
-	if (atomic_read(&card->data_has_arrived)) {
-		atomic_set(&card->data_has_arrived, 0);
-		return 0;
-	}
-	return -ETIME;
-}
-
-static void
-qeth_wakeup_ioctl(struct qeth_card *card)
-{
-
-	QETH_DBF_CARD5(0, trace, "wkup", card);
-
-	atomic_set(&card->ioctl_data_has_arrived, 1);
-	wake_up(&card->ioctl_wait_q);
-}
-
-static int
-qeth_sleepon_ioctl(struct qeth_card *card, int timeout)
-{
-	char dbf_text[15];
-
-	QETH_DBF_CARD5(0, trace, "ioctlslpn", card);
-	sprintf(dbf_text, "%08x", timeout);
-	QETH_DBF_TEXT5(0, trace, dbf_text);
-
-	wait_event_interruptible_timeout(card->ioctl_wait_q,
-					 atomic_read(&card->
-						     ioctl_data_has_arrived),
-					 timeout * HZ);
-	if (atomic_read(&card->ioctl_data_has_arrived)) {
-		atomic_set(&card->ioctl_data_has_arrived, 0);
-		return 0;
-	}
-	return -ETIME;
-}
-
-/*SNMP IOCTL on Procfile */
-
-static void
-qeth_wakeup_procfile(void)
-{
-	QETH_DBF_TEXT5(0, trace, "procwkup");
-	/* is this if statement correct? */
-	if (atomic_read(&qeth_procfile_ioctl_sem.count) <=
-	    PROCFILE_SLEEP_SEM_MAX_VALUE)
-		up(&qeth_procfile_ioctl_sem);
-}
-
-static int
-qeth_sleepon_procfile(void)
-{
-	QETH_DBF_TEXT5(0, trace, "procslp");
-	if (down_interruptible(&qeth_procfile_ioctl_sem)) {
-		return -ERESTARTSYS;
-	}
-	return 0;
-}
-
-/* SNMP END */
-
-static char *
-qeth_send_control_data(struct qeth_card *card, unsigned char *buffer,
-		       int len, unsigned long intparam)
-{
-	unsigned long flags;
-	int result, result2;
-	char dbf_text[15];
-	unsigned char *rec_buf;
-	int setip = (intparam & IPA_SETIP_FLAG) ? 1 : 0;
-
-again:
-	if (atomic_read(&card->shutdown_phase) == QETH_REMOVE_CARD_QUICK)
-		return NULL;
-	if (atomic_read(&card->escape_softsetup))
-		return NULL;
-
-	/* we lock very early to synchronize access to seqnos */
-	if (atomic_swap(&card->write_busy, 1)) {
-		qeth_wait_nonbusy(QETH_IDLE_WAIT_TIME);
-		QETH_DBF_CARD2(0, trace, "LSCD", card);
-		goto again;
-	}
-	memcpy(card->dma_stuff->sendbuf, card->send_buf, QETH_BUFSIZE);
-
-	memcpy(QETH_TRANSPORT_HEADER_SEQ_NO(buffer),
-	       &card->seqno.trans_hdr, QETH_SEQ_NO_LENGTH);
-	card->seqno.trans_hdr++;
-
-	memcpy(QETH_PDU_HEADER_SEQ_NO(buffer),
-	       &card->seqno.pdu_hdr, QETH_SEQ_NO_LENGTH);
-	card->seqno.pdu_hdr++;
-	memcpy(QETH_PDU_HEADER_ACK_SEQ_NO(buffer),
-	       &card->seqno.pdu_hdr_ack, QETH_SEQ_NO_LENGTH);
-
-	/* there is noone doing this except sleep and this function */
-	atomic_set(&card->data_has_arrived, 0);
-
-	memcpy(&card->dma_stuff->write_ccw, WRITE_CCW, sizeof (struct ccw1));
-	card->dma_stuff->write_ccw.count = len;
-	card->dma_stuff->write_ccw.cda =
-	    QETH_GET_ADDR(card->dma_stuff->sendbuf);
-
-	QETH_DBF_CARD2(0, trace, "scdw", card);
-	sprintf(dbf_text, "%8x", len);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-	QETH_DBF_HEX4(0, trace, &intparam, QETH_DBF_TRACE_LEN);
-	QETH_DBF_HEX2(0, control, buffer, QETH_DBF_CONTROL_LEN);
-
-	spin_lock_irqsave(get_ccwdev_lock(CARD_WDEV(card)), flags);
-	result = ccw_device_start(CARD_WDEV(card), &card->dma_stuff->write_ccw,
-				  intparam, 0, 0);
-	if (result) {
-		qeth_delay_millis(QETH_WAIT_BEFORE_2ND_DOIO);
-		result2 = ccw_device_start(CARD_WDEV(card),
-					   &card->dma_stuff->write_ccw,
-					   intparam, 0, 0);
-		if (result2 != -ENODEV)
-			PRINT_WARN("qeth_send_control_data: do_IO "
-				   "returned %i, next try returns %i\n",
-				   result, result2);
-		result = result2;
-	}
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_WDEV(card)), flags);
-
-	if (result) {
-		QETH_DBF_TEXT2(0, trace, "scd:doio");
-		sprintf(dbf_text, "%4x", (__s16) result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		/* re-enable qeth_send_control_data again */
-		atomic_set(&card->write_busy,0);
-		return NULL;
-	}
-
-	if (intparam == IPA_IOCTL_STATE) {
-		if (qeth_sleepon_ioctl(card, QETH_IPA_TIMEOUT)) {
-			QETH_DBF_TEXT2(0, trace, "scd:ioctime");
-			/* re-enable qeth_send_control_data again */
-			atomic_set(&card->write_busy, 0);
-			return NULL;
-		}
-		rec_buf = card->dma_stuff->recbuf;
-		QETH_DBF_CARD2(0, trace, "scro", card);
-	} else {
-		if (qeth_sleepon(card, (setip) ? QETH_IPA_TIMEOUT :
-				 QETH_MPC_TIMEOUT)) {
-			QETH_DBF_TEXT2(0, trace, "scd:time");
-			/* re-enable qeth_send_control_data again */
-			atomic_set(&card->write_busy, 0);
-			return NULL;
-		}
-		rec_buf = card->ipa_buf;
-		QETH_DBF_CARD2(0, trace, "scri", card);
-	}
-	QETH_DBF_HEX2(0, control, rec_buf, QETH_DBF_CONTROL_LEN);
-
-	memcpy(&card->seqno.pdu_hdr_ack,
-	       QETH_PDU_HEADER_SEQ_NO(rec_buf), QETH_SEQ_NO_LENGTH);
-
-	return rec_buf;
-}
-
-static int
-qeth_send_ipa_cmd(struct qeth_card *card, struct ipa_cmd *cmd, int update_cmd,
-		  int ipatype)
-{
-	unsigned char *buffer;
-	struct ipa_cmd *reply;
-	int ipa_cmd;
-	int result;
-
-	/* don't muck around with ipv6 if there's no use to do so */
-	if ((cmd->prot_version == 6) && (!qeth_is_supported(IPA_IPv6)))
-		return 0;
-
-	ipa_cmd = cmd->command;
-
-	memcpy(card->send_buf, IPA_PDU_HEADER, IPA_PDU_HEADER_SIZE);
-
-	memcpy(QETH_IPA_CMD_DEST_ADDR(card->send_buf),
-	       &card->token.ulp_connection_r, QETH_MPC_TOKEN_LENGTH);
-
-	memcpy(card->send_buf + IPA_PDU_HEADER_SIZE,
-	       cmd, sizeof (struct ipa_cmd));
-
-	buffer = qeth_send_control_data(card, card->send_buf,
-					IPA_PDU_HEADER_SIZE +
-					sizeof (struct ipa_cmd), ipatype);
-
-	if (!buffer) {
-		if (atomic_read(&card->escape_softsetup))
-			return 0;
-		else
-			return -1;
-	}
-	reply = (struct ipa_cmd *) PDU_ENCAPSULATION(buffer);
-	if ((update_cmd) && (reply))
-		memcpy(cmd, reply, sizeof (struct ipa_cmd));
-	result = reply->return_code;
-
-	/* some special sausages: */
-	if ((ipa_cmd == IPA_CMD_SETASSPARMS) && (result == 0)) {
-		result = reply->data.setassparms.return_code;
-		if ((reply->data.setassparms.assist_no==IPA_INBOUND_CHECKSUM) &&
-		    (reply->data.setassparms.command_code == IPA_CMD_ASS_START))
-			card->csum_enable_mask =
-				reply->data.setassparms.data.flags_32bit;
-	}
-	if ((ipa_cmd == IPA_CMD_SETADAPTERPARMS) && (result == 0)) {
-		result = reply->data.setadapterparms.return_code;
-	}
-
-	return result;
-}
-
-static void
-qeth_fill_ipa_cmd(struct qeth_card *card, struct ipa_cmd *cmd,
-		  __u8 command, int ip_vers)
-{
-	memset(cmd, 0, sizeof (struct ipa_cmd));
-	cmd->command = command;
-	cmd->initiator = INITIATOR_HOST;
-	cmd->seq_no = card->seqno.ipa++;
-	cmd->adapter_type = qeth_get_adapter_type_for_ipa(card->link_type);
-	cmd->rel_adapter_no = (__u8) card->options.portno;
-	cmd->prim_version_no = 1;
-	cmd->param_count = 1;
-	cmd->prot_version = ip_vers;
-	cmd->ipa_supported = 0;
-	cmd->ipa_enabled = 0;
-}
-
-static int
-qeth_send_startstoplan(struct qeth_card *card, __u8 ipacmd, __u16 ip_vers)
-{
-	struct ipa_cmd cmd;
-	int result;
-
-	qeth_fill_ipa_cmd(card, &cmd, ipacmd, 0);
-	cmd.param_count = 0;
-	cmd.prot_version = ip_vers;
-	cmd.ipa_supported = 0;
-	cmd.ipa_enabled = 0;
-
-	result = qeth_send_ipa_cmd(card, &cmd, 0, IPA_CMD_STATE);
-	return result;
-}
-
-static int
-qeth_send_startlan(struct qeth_card *card, __u16 ip_vers)
-{
-	int result;
-	char dbf_text[15];
-
-	QETH_DBF_CARD4(0, trace, "stln", card);
-
-	result = qeth_send_startstoplan(card, IPA_CMD_STARTLAN, ip_vers);
-	if (!result)
-		atomic_set(&card->is_startlaned, 1);
-
-	if (result) {
-		QETH_DBF_CARD2(0, trace, "STRTLNFL", card);
-		sprintf(dbf_text, "%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-	}
-
-	return result;
-}
-
-static int
-qeth_send_stoplan(struct qeth_card *card)
-{
-#ifdef QETH_SEND_STOPLAN_ON_SHUTDOWN
-	int result;
-	char dbf_text[15];
-
-	atomic_set(&card->is_startlaned, 0);
-
-	QETH_DBF_CARD4(0, trace, "spln", card);
-
-	result = qeth_send_startstoplan(card, IPA_CMD_STOPLAN, 4);
-
-	if (result) {
-		QETH_DBF_CARD2(0, trace, "STPLNFLD", card);
-		sprintf(dbf_text, "%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-	}
-
-	return result;
-#else /* QETH_SEND_STOPLAN_ON_SHUTDOWN */
-	return 0;
-#endif /* QETH_SEND_STOPLAN_ON_SHUTDOWN */
-}
-
-static int
-qeth_send_qipassist(struct qeth_card *card, short ip_vers)
-{
-	struct ipa_cmd cmd;
-	int result;
-
-	qeth_fill_ipa_cmd(card, &cmd, IPA_CMD_QIPASSIST, ip_vers);
-
-	result = qeth_send_ipa_cmd(card, &cmd, 1, IPA_CMD_STATE);
-
-	if (!result) {
-		if (ip_vers == 4) {
-			card->ipa_supported = cmd.ipa_supported;
-			card->ipa_enabled = cmd.ipa_enabled;
-		} else {
-			card->ipa6_supported = cmd.ipa_supported;
-			card->ipa6_enabled = cmd.ipa_enabled;
-		}
-	}
-
-	return result;
-}
-
-/* QUERY ARP FUNCTIONS */
-
-static int
-qeth_send_ipa_arpcmd(struct qeth_card *card, struct arp_cmd *cmd,
-		     int update_cmd, int ipatype, __u32 req_size)
-{
-	unsigned char *buffer;
-	int ipa_cmd;
-	int result;
-	__u16 s1, s2;
-
-	/* don't muck around with ipv6 if there's no use to do so */
-	if ((cmd->prot_version == 6) && (!qeth_is_supported(IPA_IPv6)))
-		return 0;
-	result = 0;
-	ipa_cmd = cmd->command;
-
-	memcpy(card->send_buf, IPA_PDU_HEADER, IPA_PDU_HEADER_SIZE);
-	memcpy(QETH_IPA_CMD_DEST_ADDR(card->send_buf),
-	       &card->token.ulp_connection_r, QETH_MPC_TOKEN_LENGTH);
-	memcpy(card->send_buf + IPA_PDU_HEADER_SIZE,
-	       cmd, sizeof (struct arp_cmd));
-
-	if (req_size) {
-		/* adjust sizes for big requests */
-		s1 = (__u32) IPA_PDU_HEADER_SIZE + SNMP_BASE_CMDLENGTH +
-		    req_size;
-		s2 = (__u32) SNMP_BASE_CMDLENGTH + req_size;
-		memcpy(QETH_IPA_PDU_LEN_TOTAL(card->send_buf), &s1, 2);
-		memcpy(QETH_IPA_PDU_LEN_PDU1(card->send_buf), &s2, 2);
-		memcpy(QETH_IPA_PDU_LEN_PDU2(card->send_buf), &s2, 2);
-		memcpy(QETH_IPA_PDU_LEN_PDU3(card->send_buf), &s2, 2);
-	}
-
-	buffer = qeth_send_control_data(card, card->send_buf,
-					IPA_PDU_HEADER_SIZE +
-					sizeof (struct arp_cmd), ipatype);
-	if (!buffer)
-		result = -ENODATA;
-	else
-		result = card->ioctl_returncode;
-	return result;
-}
-
-static int
-qeth_ioctl_handle_snmp_data(struct qeth_card *card, struct arp_cmd *reply)
-{
-	__u16 data_len;
-
-#define SNMP_HEADER_SIZE_WITH_TOKEN 36
-
- 	data_len = *((__u16*)QETH_IPA_PDU_LEN_PDU1(card->dma_stuff->recbuf));
-	if (reply->data.setadapterparms.frame_seq_no == 1)
-		data_len = data_len - 
-			(__u16)((char*)reply->data.setadapterparms.data.
-				snmp_subcommand.snmp_data - (char*)reply); 
-	else
- 		data_len = data_len - 
- 		 	(__u16)((char*)&reply->data.setadapterparms.data.
-				snmp_subcommand.snmp_request - (char*)reply);
-
-	if (reply->data.setadapterparms.frame_seq_no == 1) {
-
-		if (card->ioctl_buffersize <= (SNMP_HEADER_SIZE_WITH_TOKEN +
-					       reply->data.setadapterparms.
-					       frames_used_total *
-					       ARP_DATA_SIZE)) {
-
-			card->ioctl_returncode = ARP_RETURNCODE_ERROR;
-			reply->data.setadapterparms.data.snmp_subcommand.
-			    snmp_returncode = -ENOMEM;
-		} else {
-			card->ioctl_returncode = ARP_RETURNCODE_SUCCESS;
-			card->number_of_entries = 0;
- 			memcpy(((char *)card->ioctl_data_buffer),
-			       reply->data.setadapterparms.snmp_token,
-			       SNMP_HEADER_SIZE_WITH_TOKEN);
- 			card->ioctl_buffer_pointer = card->ioctl_data_buffer+
-				SNMP_HEADER_SIZE_WITH_TOKEN;
-		}
-	}
-
-	if (card->ioctl_returncode != ARP_RETURNCODE_ERROR &&
-	    reply->data.setadapterparms.frame_seq_no <=
-	    reply->data.setadapterparms.frames_used_total) {
-
-		if (reply->data.setadapterparms.return_code ==
-		    IPA_REPLY_SUCCESS) {
-
- 			if (reply->data.setadapterparms.frame_seq_no == 1)
- 				memcpy(card->ioctl_buffer_pointer,
-				       reply->data.setadapterparms.data.
-				       snmp_subcommand.snmp_data, data_len);
-			else
- 				memcpy(card->ioctl_buffer_pointer,
-				       (char*)&reply->data.setadapterparms.
-				       data.snmp_subcommand.snmp_request,
-				       data_len);
-
-			card->ioctl_buffer_pointer =
-			    card->ioctl_buffer_pointer + data_len;
-			card->ioctl_returncode = ARP_RETURNCODE_SUCCESS;
-
-			if (reply->data.setadapterparms.frame_seq_no ==
-			    reply->data.setadapterparms.frames_used_total) {
-				card->ioctl_returncode =
-				    ARP_RETURNCODE_LASTREPLY;
-			}
-		} else {
-			card->ioctl_returncode = ARP_RETURNCODE_ERROR;
-			memset(card->ioctl_data_buffer, 0,
-			       card->ioctl_buffersize);
-			reply->data.setadapterparms.data.snmp_subcommand.
-			    snmp_returncode =
-			    reply->data.setadapterparms.return_code;
-		}
-	}
-#undef  SNMP_HEADER_SIZE_WITH_TOKEN
-
-	return card->ioctl_returncode;
-}
-
-static int
-qeth_ioctl_handle_arp_data(struct qeth_card *card, struct arp_cmd *reply)
-{
-
-	if (reply->data.setassparms.seq_no == 1) {
-		if (card->ioctl_buffersize <=
-		    (sizeof (__u16) + sizeof (int) +
-		     reply->data.setassparms.number_of_replies *
-		     ARP_DATA_SIZE)) {
-
-			card->ioctl_returncode = ARP_RETURNCODE_ERROR;
-
-		} else {
-			card->ioctl_returncode = ARP_RETURNCODE_SUCCESS;
-			card->number_of_entries = 0;
-			card->ioctl_buffer_pointer =
-			    card->ioctl_data_buffer + sizeof (__u16) +
-			    sizeof (int);
-		}
-	}
-
-	if (card->ioctl_returncode != ARP_RETURNCODE_ERROR &&
-	    reply->data.setassparms.seq_no <=
-	    reply->data.setassparms.number_of_replies) {
-
-		if (reply->data.setassparms.return_code == IPA_REPLY_SUCCESS) {
-
-			card->number_of_entries = card->number_of_entries +
-			    reply->data.setassparms.
-			    data.queryarp_data.number_of_entries;
-			memcpy(card->ioctl_buffer_pointer,
-			       reply->data.setassparms.data.queryarp_data.
-			       arp_data, ARP_DATA_SIZE);
-			card->ioctl_buffer_pointer = card->
-			    ioctl_buffer_pointer + ARP_DATA_SIZE;
-			card->ioctl_returncode = ARP_RETURNCODE_SUCCESS;
-			if (reply->data.setassparms.seq_no ==
-			    reply->data.setassparms.number_of_replies) {
-				memcpy(card->ioctl_data_buffer,
-				       &reply->data.setassparms.data.
-				       queryarp_data.osa_setbitmask,
-				       sizeof (__u16));
-				card->ioctl_returncode =
-				    ARP_RETURNCODE_LASTREPLY;
-			}
-		} else {
-
-			card->ioctl_returncode = ARP_RETURNCODE_ERROR;
-			memset(card->ioctl_data_buffer, 0,
-			       card->ioctl_buffersize);
-		}
-	}
-	return card->ioctl_returncode;
-}
-
-static int
-qeth_look_for_arp_data(struct qeth_card *card)
-{
-	struct arp_cmd *reply;
-	int result;
-
-	reply = (struct arp_cmd *) PDU_ENCAPSULATION(card->dma_stuff->recbuf);
-
-	if ((reply->command == IPA_CMD_SETASSPARMS) &&
-	    (reply->data.setassparms.assist_no == IPA_ARP_PROCESSING) &&
-	    (reply->data.setassparms.command_code ==
-	     IPA_CMD_ASS_ARP_FLUSH_CACHE)) {
-		result = ARP_FLUSH;
-	} else if ((reply->command == IPA_CMD_SETASSPARMS) &&
-		   (reply->data.setassparms.assist_no == IPA_ARP_PROCESSING) &&
-		   (reply->data.setassparms.command_code ==
-		    IPA_CMD_ASS_ARP_QUERY_INFO) &&
-		   (card->ioctl_returncode == ARP_RETURNCODE_SUCCESS)) {
-
-		result = qeth_ioctl_handle_arp_data(card, reply);
-
-	} else if ((reply->command == IPA_CMD_SETADAPTERPARMS) &&
-		   (reply->data.setadapterparms.command_code ==
-		    IPA_SETADP_SET_SNMP_CONTROL) &&
-		   (card->ioctl_returncode == ARP_RETURNCODE_SUCCESS)) {
-
-		result = qeth_ioctl_handle_snmp_data(card, reply);
-	} else
-		result = ARP_RETURNCODE_NOARPDATA;
-
-	return result;
-}
-
-static int
-qeth_queryarp(struct qeth_card *card, struct ifreq *req, int version,
-	      __u32 assist_no, __u16 command_code, char *c_data, __u16 len)
-{
-	int data_size;
-	struct arp_cmd *cmd;
-	int result;
-
-	cmd = (struct arp_cmd *) kmalloc(sizeof (struct arp_cmd), GFP_KERNEL);
-	if (!cmd) {
-		return IPA_REPLY_FAILED;
-	}
-
-	memcpy(&data_size, c_data, sizeof (int));
-
-	qeth_fill_ipa_cmd(card, (struct ipa_cmd *) cmd, IPA_CMD_SETASSPARMS,
-			  version);
-
-	cmd->data.setassparms.assist_no = assist_no;
-	cmd->data.setassparms.length = 8 + len;
-	cmd->data.setassparms.command_code = command_code;
-	cmd->data.setassparms.return_code = 0;
-	cmd->data.setassparms.seq_no = 0;
-
-	card->ioctl_buffersize = data_size;
-	card->ioctl_data_buffer = (char *) vmalloc(data_size);
-	if (!card->ioctl_data_buffer) {
-		kfree(cmd);
-		return IPA_REPLY_FAILED;
-	}
-
-	card->ioctl_returncode = ARP_RETURNCODE_SUCCESS;
-
-	result = qeth_send_ipa_arpcmd(card, cmd, 1, IPA_IOCTL_STATE, 0);
-
-	if ((result == ARP_RETURNCODE_ERROR) || (result == -ENODATA)) {
-		result = IPA_REPLY_FAILED;
-	} else {
-		result = IPA_REPLY_SUCCESS;
-		memcpy(((char *) (card->ioctl_data_buffer)) + sizeof (__u16),
-		       &(card->number_of_entries), sizeof (int));
-		if (copy_to_user(req->ifr_ifru.ifru_data,
-			     	card->ioctl_data_buffer, data_size))
-				result = -EFAULT;
-	}
-	card->ioctl_buffer_pointer = NULL;
-	vfree(card->ioctl_data_buffer);
-	kfree(cmd);
-	card->number_of_entries = 0;
-	card->ioctl_buffersize = 0;
-
-	return result;
-}
-
-static int
-snmp_set_setadapterparms_command(struct qeth_card *card,
-				 struct arp_cmd *cmd, struct ifreq *req,
-				 char *data, __u16 len,
-				 __u16 command_code, int req_size)
-{
-	__u32 data_size;
-
-	memcpy(&data_size, data, sizeof (__u32));
-
-	card->ioctl_buffersize = data_size;
-	card->ioctl_data_buffer = (char *) vmalloc(data_size);
-	if (!card->ioctl_data_buffer) {
-		return -ENOMEM;
-	}
-	card->ioctl_returncode = ARP_RETURNCODE_SUCCESS;
-
-	memcpy(cmd->data.setadapterparms.snmp_token,
-	       data + SNMP_REQUEST_DATA_OFFSET, req_size);
-
-	cmd->data.setadapterparms.cmdlength = SNMP_SETADP_CMDLENGTH + req_size;
-	cmd->data.setadapterparms.command_code = command_code;
-	cmd->data.setadapterparms.frames_used_total = 1;
-	cmd->data.setadapterparms.frame_seq_no = 1;
-
-	return 0;
-}
-static int
-qeth_send_snmp_control(struct qeth_card *card, struct ifreq *req,
-		       __u32 command, __u16 command_code,
-		       char *c_data, __u16 len)
-{
-	struct arp_cmd *cmd;
-	__u32 result, req_size;
-
-	cmd = (struct arp_cmd *) kmalloc(sizeof (struct arp_cmd), GFP_KERNEL);
-	if (!cmd) {
-		return IPA_REPLY_FAILED;
-	}
-
-	qeth_fill_ipa_cmd(card, (struct ipa_cmd *) cmd, command, 4);
-
-	memcpy(&req_size, ((char *) c_data) + sizeof (__u32), sizeof (__u32));
-
-	if (snmp_set_setadapterparms_command(card, cmd, req, c_data,
-					     len, command_code, req_size)) {
-		kfree(cmd);
-		return IPA_REPLY_FAILED;
-	}
-
-	result = qeth_send_ipa_arpcmd(card, cmd, 1, IPA_IOCTL_STATE, req_size);
-
-	if (result == -ENODATA) {
-		result = IPA_REPLY_FAILED;
-		goto snmp_out;
-	}
-	if (result == ARP_RETURNCODE_ERROR) {
-		result = IPA_REPLY_FAILED;
-		if (copy_to_user(req->ifr_ifru.ifru_data + 
-			     SNMP_REQUEST_DATA_OFFSET, card->ioctl_data_buffer,
-			     card->ioctl_buffersize))
-			result = -EFAULT;
-	} else {
-		result = IPA_REPLY_SUCCESS;
-		if (copy_to_user(req->ifr_ifru.ifru_data +
-				 SNMP_REQUEST_DATA_OFFSET, card->ioctl_data_buffer,
-				 card->ioctl_buffersize))
-			result = -EFAULT;
-	}
-snmp_out:
-	card->number_of_entries = 0;
-	card->ioctl_buffersize = 0;
-	card->ioctl_buffer_pointer = NULL;
-	vfree(card->ioctl_data_buffer);
-	kfree(cmd);
-
-	return result;
-}
-
-static int
-qeth_send_setassparms(struct qeth_card *card, int version, __u32 assist_no,
-		      __u16 command_code, long data, __u16 len)
-{
-	struct ipa_cmd cmd;
-	int result;
-
-	qeth_fill_ipa_cmd(card, &cmd, IPA_CMD_SETASSPARMS, version);
-
-	cmd.data.setassparms.assist_no = assist_no;
-	cmd.data.setassparms.length = 8 + len;
-	cmd.data.setassparms.command_code = command_code;
-	cmd.data.setassparms.return_code = 0;
-	cmd.data.setassparms.seq_no = 0;
-
-	if (len <= sizeof (__u32))
-		cmd.data.setassparms.data.flags_32bit = (__u32) data;
-	else if (len > sizeof (__u32))
-		memcpy(&cmd.data.setassparms.data, (void *) data,
-		       qeth_min(len, PAGE_SIZE));
-	if (command_code != IPA_CMD_ASS_START) {
-		result = qeth_send_ipa_cmd(card, &cmd, 0,
-					   ((assist_no == IPA_ARP_PROCESSING) &&
-					    (command_code !=
-					     IPA_CMD_ASS_ARP_FLUSH_CACHE)) ?
-					   IPA_IOCTL_STATE : IPA_CMD_STATE);
-
-	} else
-		result = qeth_send_ipa_cmd(card, &cmd, 0, IPA_CMD_STATE);
-
-	return result;
-}
-
-static int
-qeth_send_setadapterparms_query(struct qeth_card *card)
-{
-	struct ipa_cmd cmd;
-	int result;
-
-	qeth_fill_ipa_cmd(card, &cmd, IPA_CMD_SETADAPTERPARMS,
-			  IPA_SETADAPTERPARMS_IP_VERSION);
-	cmd.data.setadapterparms.cmdlength = sizeof (struct ipa_setadp_cmd);
-	cmd.data.setadapterparms.command_code =
-	    IPA_SETADP_QUERY_COMMANDS_SUPPORTED;
-	cmd.data.setadapterparms.frames_used_total = 1;
-	cmd.data.setadapterparms.frame_seq_no = 1;
-	result = qeth_send_ipa_cmd(card, &cmd, 1, IPA_CMD_STATE);
-
-	if (cmd.data.setadapterparms.data.query_cmds_supp.lan_type & 0x7f)
-		card->link_type = cmd.data.setadapterparms.data.
-		    query_cmds_supp.lan_type;
-
-	card->adp_supported =
-	    cmd.data.setadapterparms.data.query_cmds_supp.supported_cmds;
-
-	return result;
-}
-
-static int
-qeth_send_setadapterparms_mode(struct qeth_card *card, __u32 command,
-			       __u32 mode)
-{
-
-	struct ipa_cmd cmd;
-	int result;
-
-	qeth_fill_ipa_cmd(card, &cmd, IPA_CMD_SETADAPTERPARMS,
-			  IPA_SETADAPTERPARMS_IP_VERSION);
-	cmd.data.setadapterparms.cmdlength = sizeof (struct ipa_setadp_cmd);
-	cmd.data.setadapterparms.command_code = command;
-	cmd.data.setadapterparms.frames_used_total = 1;
-	cmd.data.setadapterparms.frame_seq_no = 1;
-	cmd.data.setadapterparms.data.mode = mode;
-	result = qeth_send_ipa_cmd(card, &cmd, 0, IPA_CMD_STATE);
-
-	return result;
-}
-
-static int
-qeth_send_setadapterparms_change_addr(struct qeth_card *card,
-				      __u32 command,
-				      __u32 subcmd, __u8 * mac_addr,
-				      int addr_len)
-{
-	struct ipa_cmd cmd;
-	int result;
-
-	qeth_fill_ipa_cmd(card, &cmd, IPA_CMD_SETADAPTERPARMS,
-			  IPA_SETADAPTERPARMS_IP_VERSION);
-	cmd.data.setadapterparms.cmdlength = sizeof (struct ipa_setadp_cmd);
-	cmd.data.setadapterparms.command_code = command;
-	cmd.data.setadapterparms.frames_used_total = 1;
-	cmd.data.setadapterparms.frame_seq_no = 1;
-	cmd.data.setadapterparms.data.change_addr.cmd = subcmd;
-	cmd.data.setadapterparms.data.change_addr.addr_size = addr_len;
-	memcpy(&cmd.data.setadapterparms.data.change_addr.addr,
-	       mac_addr, addr_len);
-
-	result = qeth_send_ipa_cmd(card, &cmd, 1, IPA_CMD_STATE);
-
-	memcpy(mac_addr, &cmd.data.setadapterparms.data.change_addr.addr,
-	       addr_len);
-
-	return result;
-}
-
-static int
-qeth_send_setassparms_simple_with_data(struct qeth_card *card,
-				       __u32 assist_no,
-				       __u16 command_code, long data)
-{
-	return qeth_send_setassparms(card, 4, assist_no, command_code, data, 4);
-}
-
-static int
-qeth_send_setassparms_simple_without_data(struct qeth_card *card,
-					  __u32 assist_no, __u16 command_code)
-{
-	return qeth_send_setassparms(card, 4, assist_no, command_code, 0, 0);
-}
-
-static int
-qeth_send_setassparms_simple_without_data6(struct qeth_card *card,
-					   __u32 assist_no, __u16 command_code)
-{
-	return qeth_send_setassparms(card, 6, assist_no, command_code, 0, 0);
-}
-
-static int
-qeth_send_setdelip(struct qeth_card *card, __u8 * ip, __u8 * netmask,
-		   int ipacmd, short ip_vers, unsigned int flags)
-{
-	struct ipa_cmd cmd;
-	int ip_len = (ip_vers == 6) ? 16 : 4;
-
-	qeth_fill_ipa_cmd(card, &cmd, ipacmd, ip_vers);
-
-	if (ip_vers == 6) {
-		memcpy(&cmd.data.setdelip6.ip, ip, ip_len);
-		memcpy(&cmd.data.setdelip6.netmask, netmask, ip_len);
-		cmd.data.setdelip6.flags = flags;
-	} else {
-		memcpy(&cmd.data.setdelip4.ip, ip, ip_len);
-		memcpy(&cmd.data.setdelip4.netmask, netmask, ip_len);
-		cmd.data.setdelip4.flags = flags;
-	}
-
-	return qeth_send_ipa_cmd(card, &cmd, 0, IPA_CMD_STATE |
-				 ((ipacmd ==
-				   IPA_CMD_SETIP) ? IPA_SETIP_FLAG : 0));
-}
-
-static int
-qeth_send_setdelipm(struct qeth_card *card, __u8 * ip, __u8 * mac,
-		    int ipacmd, short ip_vers)
-{
-	struct ipa_cmd cmd;
-	int ip_len = (ip_vers == 6) ? 16 : 4;
-
-	qeth_fill_ipa_cmd(card, &cmd, ipacmd, ip_vers);
-	memcpy(&cmd.data.setdelipm.mac, mac, 6);
-	if (ip_vers == 6) {
-		memcpy(&cmd.data.setdelipm.ip6, ip, ip_len);
-	} else {
-		memcpy(&cmd.data.setdelipm.ip4_6, ip, ip_len);
-	}
-
-	return qeth_send_ipa_cmd(card, &cmd, 0, IPA_CMD_STATE |
-				 ((ipacmd ==
-				   IPA_CMD_SETIPM) ? IPA_SETIP_FLAG : 0));
-}
-
-#define PRINT_SETIP_ERROR(x) \
-	if (result) \
-		PRINT_ERR("setip%c: return code 0x%x (%s)\n",x,result, \
-			  (result==0xe002)?"invalid mtu size": \
-	       		  (result==0xe005)?"duplicate ip address": \
-	       		  (result==0xe0a5)?"duplicate ip address": \
-       			  (result==0xe006)?"ip table full": \
-			  (result==0xe008)?"startlan not received": \
-			  (result==0xe009)?"setip already received": \
-			  (result==0xe00a)?"dup network ip address": \
-			  (result==0xe00b)?"mblk no free main task entry": \
-			  (result==0xe00d)?"invalid ip version": \
-			  (result==0xe00e)?"unsupported arp assist cmd": \
-			  (result==0xe00f)?"arp assist not enabled": \
-			  (result==0xe080)?"startlan disabled": \
-			  (result==0xf012)?"unicast IP address invalid": \
-			  (result==0xf013)?"multicast router limit reached": \
-			  (result==0xf014)?"stop assist not supported": \
-			  (result==0xf015)?"multicast assist not set": \
-			  (result==0xf080)?"VM: startlan disabled": \
-			  (result==-1)?"IPA communication timeout": \
-			  "unknown return code")
-
-static inline int
-qeth_send_setip(struct qeth_card *card, __u8 * ip,
-		__u8 * netmask, short ip_vers, int use_retries)
-{
-	int result;
-	int retries;
-	char dbf_text[15];
-	int takeover = 0;
-
-	retries = (use_retries) ? QETH_SETIP_RETRIES : 1;
-	if (qeth_is_ipa_covered_by_ipato_entries(ip_vers, ip, card)) {
-		QETH_DBF_CARD2(0, trace, "ipto", card);
-		if (ip_vers == 4) {
-			*((__u32 *) (&dbf_text[0])) = *((__u32 *) ip);
-			*((__u32 *) (&dbf_text[4])) = *((__u32 *) netmask);
-			QETH_DBF_HEX2(0, trace, dbf_text, QETH_DBF_TRACE_LEN);
-		} else {
-			QETH_DBF_HEX2(0, trace, ip, QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX2(0, trace, ip + QETH_DBF_TRACE_LEN,
-				      QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX2(0, trace, netmask, QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX2(0, trace, netmask + QETH_DBF_TRACE_LEN,
-				      QETH_DBF_TRACE_LEN);
-		}
-		takeover = 1;
-	} else {
-	}
-retry:
-	result = qeth_send_setdelip(card, ip, netmask, IPA_CMD_SETIP, ip_vers,
-				    (takeover) ? IPA_SETIP_TAKEOVER_FLAGS :
-				    IPA_SETIP_FLAGS);
-	PRINT_SETIP_ERROR(' ');
-
-	if (result) {
-		QETH_DBF_CARD2(0, trace, "SETIPFLD", card);
-		sprintf(dbf_text, "%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-	}
-
-	if (((result == -1) || (result == 0xe080) ||(result==0xf080)) &&
-	    (retries--)) {
-		QETH_DBF_CARD2(0, trace, "sipr", card);
-		if (ip_vers == 4) {
-			*((__u32 *) (&dbf_text[0])) = *((__u32 *) ip);
-			*((__u32 *) (&dbf_text[4])) = *((__u32 *) netmask);
-			QETH_DBF_HEX2(0, trace, dbf_text, QETH_DBF_TRACE_LEN);
-		} else {
-			QETH_DBF_HEX2(0, trace, ip, QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX2(0, trace, ip + QETH_DBF_TRACE_LEN,
-				      QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX2(0, trace, netmask, QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX2(0, trace, netmask + QETH_DBF_TRACE_LEN,
-				      QETH_DBF_TRACE_LEN);
-		}
-		PRINT_WARN("trying again...\n");
-		goto retry;
-	}
-
-	return result;
-}
-
-static inline int
-qeth_send_delip(struct qeth_card *card, __u8 * ip,
-		__u8 * netmask, short ip_vers)
-{
-	return qeth_send_setdelip(card, ip, netmask, IPA_CMD_DELIP, ip_vers,
-				  IPA_DELIP_FLAGS);
-}
-
-static inline int
-qeth_send_setipm(struct qeth_card *card, __u8 * ip,
-		 __u8 * mac, short ip_vers, int use_retries)
-{
-	int result;
-	int retries;
-	char dbf_text[15];
-
-	retries = (use_retries) ? QETH_SETIP_RETRIES : 1;
-	if (qeth_is_ipa_covered_by_ipato_entries(ip_vers, ip, card)) {
-		QETH_DBF_CARD2(0, trace, "imto", card);
-		if (ip_vers == 4) {
-			*((__u32 *) (&dbf_text[0])) = *((__u32 *) ip);
-			QETH_DBF_HEX2(0, trace, dbf_text, QETH_DBF_TRACE_LEN);
-		} else {
-			QETH_DBF_HEX2(0, trace, ip, QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX2(0, trace, ip + QETH_DBF_TRACE_LEN,
-				      QETH_DBF_TRACE_LEN);
-		}
-	}
-
-retry:
-	result = qeth_send_setdelipm(card, ip, mac, IPA_CMD_SETIPM, ip_vers);
-	PRINT_SETIP_ERROR('m');
-
-	if (result) {
-		QETH_DBF_CARD2(0, trace, "SETIMFLD", card);
-		sprintf(dbf_text, "%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-	}
-
-	if ((result == -1) && (retries--)) {
-		QETH_DBF_CARD2(0, trace, "simr", card);
-		if (ip_vers == 4) {
-			sprintf(dbf_text, "%08x", *((__u32 *) ip));
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-		} else {
-			QETH_DBF_HEX2(0, trace, ip, QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX2(0, trace, ip + QETH_DBF_TRACE_LEN,
-				      QETH_DBF_TRACE_LEN);
-		}
-		QETH_DBF_HEX2(0, trace, mac, OSA_ADDR_LEN);
-		PRINT_WARN("trying again...\n");
-		goto retry;
-	}
-
-	return result;
-}
-
-static inline int
-qeth_send_delipm(struct qeth_card *card, __u8 * ip, __u8 * mac, short ip_vers)
-{
-	return qeth_send_setdelipm(card, ip, mac, IPA_CMD_DELIPM, ip_vers);
-}
-
-static int
-qeth_add_vipa_entry(struct qeth_card *card, int version, __u8 * addr, int flag)
-{
-	struct qeth_vipa_entry *entry, *e;
-	int result = 0;
-
-	entry =
-	    (struct qeth_vipa_entry *) kmalloc(sizeof (struct qeth_vipa_entry),
-					       GFP_KERNEL);
-	if (!entry) {
-		PRINT_ERR("not enough memory for vipa handling\n");
-		return -ENOMEM;
-	}
-	entry->version = version;
-	entry->flag = flag;
-	memcpy(entry->ip, addr, 16);
-	entry->state = VIPA_2_B_ADDED;
-
-	write_lock(&card->vipa_list_lock);
-	e = card->vipa_list;
-	while (e) {
-		if (e->version != version)
-			goto next;
-		if (memcmp(e->ip, addr, (version == 4) ? 4 : 16))
-			goto next;
-		if (flag == IPA_SETIP_VIPA_FLAGS) {
-			PRINT_ERR("vipa already set\n");
-		} else {
-			PRINT_ERR("rxip already set\n");
-		}
-		kfree(entry);
-		result = -EALREADY;
-		goto out;
-	next:
-		e = e->next;
-	}
-	entry->next = card->vipa_list;
-	card->vipa_list = entry;
-out:
-	write_unlock(&card->vipa_list_lock);
-	return result;
-}
-
-static int
-qeth_del_vipa_entry(struct qeth_card *card, int version, __u8 * addr, int flag)
-{
-	struct qeth_vipa_entry *e;
-	int result = 0;
-
-	write_lock(&card->vipa_list_lock);
-	e = card->vipa_list;
-	while (e) {
-		if (e->version != version)
-			goto next;
-		if (e->flag != flag)
-			goto next;
-		if (memcmp(e->ip, addr, (version == 4) ? 4 : 16))
-			goto next;
-		e->state = VIPA_2_B_REMOVED;
-		goto out;
-	next:
-		e = e->next;
-	}
-	if (flag == IPA_SETIP_VIPA_FLAGS) {
-		PRINT_ERR("vipa not found\n");
-	} else {
-		PRINT_ERR("rxip not found\n");
-	}
-	result = -ENOENT;
-out:
-	write_unlock(&card->vipa_list_lock);
-	return result;
-}
-
-static void
-qeth_set_vipas(struct qeth_card *card, int set_only)
-{
-	struct qeth_vipa_entry *e, *le = NULL, *ne;	/* ne stands for new entry,
-							   le is last entry */
-	char dbf_text[15];
-	int result;
-	__u8 netmask[16] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-	};
-	struct qeth_vipa_entry *priv_add_list = NULL;
-	struct qeth_vipa_entry *priv_del_list = NULL;
-
-	write_lock(&card->vipa_list_lock);
-	e = card->vipa_list;
-	while (e) {
-		switch (e->state) {
-		case VIPA_2_B_ADDED:
-			if (!set_only)
-				break;
-			if (!atomic_read(&card->is_open))
-				break;
-			/* we don't want to hold the lock for a long time...
-			 * so we clone the entry */
-			ne = (struct qeth_vipa_entry *)
-			    kmalloc(sizeof (struct qeth_vipa_entry),
-				    GFP_ATOMIC);
-			if (ne) {
-				ne->version = e->version;
-				ne->flag = e->flag;
-				memcpy(ne->ip, e->ip, 16);
-				ne->next = priv_add_list;
-				priv_add_list = ne;
-
-				e->state = VIPA_ESTABLISHED;
-			} else {
-				PRINT_ERR("not enough for internal vipa "
-					  "handling... trying to set "
-					  "vipa next time.\n");
-				qeth_start_softsetup_thread(card);
-			}
-			break;
-		case VIPA_2_B_REMOVED:
-			if (set_only)
-				break;
-			if (le)
-				le->next = e->next;
-			else
-				card->vipa_list = e->next;
-			ne = e->next;
-			e->next = priv_del_list;
-			priv_del_list = e;
-			e = ne;
-			continue;
-		case VIPA_ESTABLISHED:
-			if (atomic_read(&card->is_open))
-				break;
-			/* we don't want to hold the lock for a long time...
-			 * so we clone the entry */
-			ne = (struct qeth_vipa_entry *)
-			    kmalloc(sizeof (struct qeth_vipa_entry),
-				    GFP_KERNEL);
-			if (ne) {
-				ne->version = e->version;
-				ne->flag = e->flag;
-				memcpy(ne->ip, e->ip, 16);
-				ne->next = priv_del_list;
-				priv_del_list = ne;
-
-				e->state = VIPA_2_B_ADDED;
-			} else {
-				PRINT_ERR("not enough for internal vipa "
-					  "handling... VIPA/RXIP remains set "
-					  "although device is stopped.\n");
-				qeth_start_softsetup_thread(card);
-			}
-			break;
-		default:
-			break;
-		}
-		le = e;
-		e = e->next;
-	}
-	write_unlock(&card->vipa_list_lock);
-
-	while (priv_add_list) {
-		result = qeth_send_setdelip(card, priv_add_list->ip, netmask,
-					    IPA_CMD_SETIP,
-					    priv_add_list->version,
-					    priv_add_list->flag);
-		PRINT_SETIP_ERROR('s');
-
-		if (result) {
-			QETH_DBF_CARD2(0, trace, "SETSVFLD", card);
-			sprintf(dbf_text, "%4x", result);
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-			if (priv_add_list->version == 4) {
-				PRINT_ERR("going to leave vipa/rxip x%08x"
-					  "unset...\n",
-					  *((__u32 *) & priv_add_list->ip[0]));
-				sprintf(dbf_text, "%08x",
-					*((__u32 *) & priv_add_list->ip[0]));
-				QETH_DBF_TEXT2(0, trace, dbf_text);
-			} else {
-				PRINT_ERR("going to leave vipa/rxip "
-					  "%08x%08x%08x%08x unset...\n",
-					  *((__u32 *) & priv_add_list->ip[0]),
-					  *((__u32 *) & priv_add_list->ip[4]),
-					  *((__u32 *) & priv_add_list->ip[8]),
-					  *((__u32 *) & priv_add_list->ip[12]));
-				QETH_DBF_HEX2(0, trace, &priv_add_list->ip[0],
-					      QETH_DBF_TRACE_LEN);
-				QETH_DBF_HEX2(0, trace, &priv_add_list->ip[8],
-					      QETH_DBF_TRACE_LEN);
-			}
-		}
-		e = priv_add_list;
-		priv_add_list = priv_add_list->next;
-		kfree(e);
-	}
-
-	while (priv_del_list) {
-		result = qeth_send_setdelip(card, priv_del_list->ip, netmask,
-					    IPA_CMD_DELIP,
-					    priv_del_list->version,
-					    priv_del_list->flag);
-		if (result) {
-			QETH_DBF_CARD2(0, trace, "DELSVFLD", card);
-			sprintf(dbf_text, "%4x", result);
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-			if (priv_del_list->version == 4) {
-				PRINT_ERR("could not delete vipa/rxip "
-					  "%08x...\n",
-					  *((__u32 *) & priv_del_list->ip[0]));
-				sprintf(dbf_text, "%08x",
-					*((__u32 *) & priv_del_list->ip[0]));
-				QETH_DBF_TEXT2(0, trace, dbf_text);
-			} else {
-				PRINT_ERR("could not delete vipa/rxip "
-					  "%08x%08x%08x%08x...\n",
-					  *((__u32 *) & priv_del_list->ip[0]),
-					  *((__u32 *) & priv_del_list->ip[4]),
-					  *((__u32 *) & priv_del_list->ip[8]),
-					  *((__u32 *) & priv_del_list->ip[12]));
-				QETH_DBF_HEX2(0, trace, &priv_del_list->ip[0],
-					      QETH_DBF_TRACE_LEN);
-				QETH_DBF_HEX2(0, trace, &priv_del_list->ip[8],
-					      QETH_DBF_TRACE_LEN);
-			}
-/* in case of problems, it's better if we just display a message and
- * don't requeue the entry back...
-			write_lock(&card->vipa_list_lock);
-			e=card->vipa_list;
-			card->vipa_list=priv_del_list;
-			priv_del_list=priv_del_list->next;
-			card->vipa_list->next=e;
-			card->vipa_list->state=VIPA_ESTABLISHED;
-			write_unlock(&card->vipa_list_lock);
-			continue;
-*/
-		}
-		e = priv_del_list;
-		priv_del_list = priv_del_list->next;
-		kfree(e);
-	}
-}
-
-static void
-qeth_refresh_vipa_states(struct qeth_card *card)
-{
-	struct qeth_vipa_entry *e;
-
-	write_lock(&card->vipa_list_lock);
-	e = card->vipa_list;
-	while (e) {
-		if (e->state == VIPA_ESTABLISHED)
-			e->state = VIPA_2_B_ADDED;
-		e = e->next;
-	}
-	write_unlock(&card->vipa_list_lock);
-}
-
-static inline int
-qeth_send_setrtg(struct qeth_card *card, int routing_type, short ip_vers)
-{
-	struct ipa_cmd cmd;
-
-	qeth_fill_ipa_cmd(card, &cmd, IPA_CMD_SETRTG, ip_vers);
-	/* strip off RESET_ROUTING_FLAG */
-	cmd.data.setrtg.type = (routing_type) & (ROUTER_MASK);
-
-	return qeth_send_ipa_cmd(card, &cmd, 0, IPA_CMD_STATE);
-}
-
-static int
-qeth_is_ipa_in_list(struct in_ifaddr *ip, struct in_ifaddr *list)
-{
-	while (list) {
-		if (ip->ifa_address == list->ifa_address)
-			return 1;
-		list = list->ifa_next;
-	}
-	return 0;
-}
-
-#ifdef QETH_IPV6
-static int
-qeth_is_ipa_in_list6(struct inet6_ifaddr *ip, struct inet6_ifaddr *list)
-{
-	while (list) {
-		if (!memcmp(&ip->addr.s6_addr, &list->addr.s6_addr, 16))
-			return 1;
-		list = list->if_next;
-	}
-	return 0;
-}
-
-static int
-qeth_add_ifa6_to_list(struct inet6_ifaddr **list, struct inet6_ifaddr *ifa)
-{
-	struct inet6_ifaddr *i;
-
-	if (*list == NULL) {
-		*list = ifa;
-	} else {
-		if (qeth_is_ipa_in_list6(ifa, *list))
-			return -EALREADY;
-		i = *list;
-		while (i->if_next) {
-			i = i->if_next;
-		}
-		i->if_next = ifa;
-	}
-	ifa->if_next = NULL;
-	return 0;
-}
-#endif /* QETH_IPV6 */
-
-static int
-qeth_add_ifa_to_list(struct in_ifaddr **list, struct in_ifaddr *ifa)
-{
-	struct in_ifaddr *i;
-
-	if (*list == NULL) {
-		*list = ifa;
-	} else {
-		if (qeth_is_ipa_in_list(ifa, *list))
-			return -EALREADY;
-		i = *list;
-		while (i->ifa_next) {
-			i = i->ifa_next;
-		}
-		i->ifa_next = ifa;
-	}
-	ifa->ifa_next = NULL;
-	return 0;
-}
-
-static void
-__qeth_setips_ipv6(struct qeth_card *card, int use_setip_retries)
-{
-#ifdef QETH_IPV6
-	int result;
-	char dbf_text[15];
-	struct inet6_ifaddr *addr6;
-	__u8 netmask[16];
-
-#define FILL_NETMASK(len) { \
-	int i,j; \
-	for (i=0;i<16;i++) { \
-		j=(len)-(i*8); \
-		netmask[i]=(__u8)(0xFF00>>j); \
-	} \
-}
-	/* here we go with IPv6 */
-	addr6 = card->ip_current_state.ip6_ifa;
-	while (addr6) {
-		if (qeth_is_ipa_in_list6(addr6, card->ip_new_state.ip6_ifa)) {
-			addr6 = addr6->if_next;
-			continue;
-		}
-		QETH_DBF_TEXT3(0, trace, "setipdl6");
-		QETH_DBF_HEX3(0, trace, &addr6->addr.s6_addr,
-			      QETH_DBF_TRACE_LEN);
-		QETH_DBF_HEX3(0, trace,
-			      ((char *) (&addr6->addr.s6_addr)) +
-			      QETH_DBF_TRACE_LEN, QETH_DBF_TRACE_LEN);
-		sprintf(dbf_text, "nmsk%4u", addr6->prefix_len);
-		QETH_DBF_TEXT3(0, trace, dbf_text);
-		FILL_NETMASK(addr6->prefix_len);
-		result = qeth_send_delip(card,
-					 (__u8 *) & addr6->addr.s6_addr,
-					 (__u8 *) & netmask, 6);
-		if (result) {
-			PRINT_ERR("was not able to delete ip "
-				  "%04x:%04x:%04x:%04x:%04x:%04x:"
-				  "%04x:%04x/%u on device %s "
-				  "(result: 0x%x), "
-				  "trying to continue\n",
-				  addr6->addr.s6_addr16[0],
-				  addr6->addr.s6_addr16[1],
-				  addr6->addr.s6_addr16[2],
-				  addr6->addr.s6_addr16[3],
-				  addr6->addr.s6_addr16[4],
-				  addr6->addr.s6_addr16[5],
-				  addr6->addr.s6_addr16[6],
-				  addr6->addr.s6_addr16[7],
-				  addr6->prefix_len,
-				  CARD_BUS_ID(card), result);
- 			sprintf(dbf_text, "std6%4x", result);
- 			QETH_DBF_TEXT3(0, trace, dbf_text);
-		}
-		addr6 = addr6->if_next;
-	}
-
-	addr6 = card->ip_new_state.ip6_ifa;
-	while (addr6) {
-		if (qeth_is_ipa_in_list6(addr6,
-					  card->ip_current_state.ip6_ifa)) {
-			addr6 = addr6->if_next;
-			continue;
-		}
-		QETH_DBF_TEXT3(0, trace, "setipst6");
-		QETH_DBF_HEX3(0, trace, &addr6->addr.s6_addr,
-			      QETH_DBF_TRACE_LEN);
-		QETH_DBF_HEX3(0, trace,
-			      ((char *) (&addr6->addr.s6_addr)) +
-			      QETH_DBF_TRACE_LEN, QETH_DBF_TRACE_LEN);
-		sprintf(dbf_text, "nmsk%4u", addr6->prefix_len);
-		QETH_DBF_TEXT3(0, trace, dbf_text);
-		FILL_NETMASK(addr6->prefix_len);
-		result = qeth_send_setip(card,
-					 (__u8 *) & addr6->addr.s6_addr,
-					 (__u8 *) & netmask, 6,
-					 use_setip_retries);
-		if (!result) {
-			addr6 = addr6->if_next;
-			continue;
-		}
-		PRINT_ERR("was not able to set ip "
-			  "%04x:%04x:%04x:%04x:%04x:%04x:"
-			  "%04x:%04x/%u on device %s "
-			  "(result: 0x%x), trying to continue\n",
-			  addr6->addr.s6_addr16[0],
-			  addr6->addr.s6_addr16[1],
-			  addr6->addr.s6_addr16[2],
-			  addr6->addr.s6_addr16[3],
-			  addr6->addr.s6_addr16[4],
-			  addr6->addr.s6_addr16[5],
-			  addr6->addr.s6_addr16[6],
-			  addr6->addr.s6_addr16[7],
-			  addr6->prefix_len,
-			  CARD_BUS_ID(card), result);
- 		sprintf(dbf_text, "sts6%4x", result);
- 		QETH_DBF_TEXT3(0, trace, dbf_text);
-		addr6 = addr6->if_next;
-	}
-#endif /* QETH_IPV6 */
-}
-
-static int
-qeth_setips(struct qeth_card *card, int use_setip_retries)
-{
-	struct in_ifaddr *addr;
-	int result;
-	char dbf_text[15];
-
-	QETH_DBF_CARD3(0, trace, "stip", card);
-
-	addr = card->ip_current_state.ip_ifa;
-	while (addr) {
-		if (!qeth_is_ipa_in_list(addr, card->ip_new_state.ip_ifa)) {
-			QETH_DBF_TEXT3(0, trace, "setipdel");
-			*((__u32 *) (&dbf_text[0])) =
-			    *((__u32 *) & addr->ifa_address);
-			*((__u32 *) (&dbf_text[4])) =
-			    *((__u32 *) & addr->ifa_mask);
-			QETH_DBF_HEX3(0, trace, dbf_text, QETH_DBF_TRACE_LEN);
-			result =
-			    qeth_send_delip(card, (__u8 *) & addr->ifa_address,
-					    (__u8 *) & addr->ifa_mask, 4);
-			if (result) {
-				PRINT_ERR("was not able to delete ip "
-					  "%08x/%08x on device %s "
-					  "(result: 0x%x), "
-					  "trying to continue\n",
-					  addr->ifa_address, addr->ifa_mask,
-					  CARD_BUS_ID(card), result);
- 				sprintf(dbf_text, "stdl%4x", result);
- 				QETH_DBF_TEXT3(0, trace, dbf_text);
-			}
-		}
-		addr = addr->ifa_next;
-	}
-
-	addr = card->ip_new_state.ip_ifa;
-	while (addr) {
-		if (qeth_is_ipa_in_list(addr, card->ip_current_state.ip_ifa)) {
-			addr = addr->ifa_next;
-			continue;
-		}
-		QETH_DBF_TEXT3(0, trace, "setipset");
-		*((__u32 *) (&dbf_text[0])) = *((__u32 *) & addr->ifa_address);
-		*((__u32 *) (&dbf_text[4])) = *((__u32 *) & addr->ifa_mask);
-		QETH_DBF_HEX3(0, trace, dbf_text, QETH_DBF_TRACE_LEN);
-		result = qeth_send_setip(card, (__u8 *) & addr->ifa_address,
-					 (__u8 *) & addr->ifa_mask, 4,
-					 use_setip_retries);
-		if (!result) {
-			addr = addr->ifa_next;
-			continue;
-		}
-		PRINT_ERR("was not able to set ip "
-			  "%08x/%08x on device %s, trying to continue\n",
-			  addr->ifa_address, addr->ifa_mask,
-			  CARD_BUS_ID(card));
- 		sprintf(dbf_text, "stst%4x", result);
- 		QETH_DBF_TEXT3(0, trace, dbf_text);
-		addr = addr->ifa_next;
-	}
-
-	__qeth_setips_ipv6(card, use_setip_retries);
-
-	return 0;
-}
-
-static int
-qeth_is_ipma_in_list(struct qeth_ipm_mac *ipma, struct qeth_ipm_mac *list)
-{
-	while (list) {
-		if ((!memcmp(ipma->ip, list->ip, 16)) &&
-		    (!memcmp(ipma->mac, list->mac, 6)))
-			return 1;
-		list = list->next;
-	}
-	return 0;
-}
-
-static void
-qeth_remove_mc_ifa_from_list(struct qeth_ipm_mac **list,
-			     struct qeth_ipm_mac *ipma)
-{
-	struct qeth_ipm_mac *i, *li = NULL;
-
-	if ((!(*list)) || (!ipma))
-		return;
-
-	if (*list == ipma) {
-		*list = ipma->next;
-	} else {
-		i = *list;
-		while (i) {
-			if (i == ipma) {
-				li->next = i->next;
-			} else {
-				li = i;
-			}
-			i = i->next;
-		}
-	}
-}
-
-static int
-qeth_add_mc_ifa_to_list(struct qeth_ipm_mac **list, struct qeth_ipm_mac *ipma)
-{
-	struct qeth_ipm_mac *i;
-
-	if (qeth_is_ipma_in_list(ipma, *list))
-		return -EALREADY;
-
-	if (*list == NULL) {
-		*list = ipma;
-	} else {
-		i = *list;
-		while (i->next) {
-			i = i->next;
-		}
-		i->next = ipma;
-	}
-	ipma->next = NULL;
-	return 0;
-}
-
-static void
-__qeth_setipms_ipv6(struct qeth_card *card, int use_setipm_retries)
-{
-#ifdef QETH_IPV6
-	struct qeth_ipm_mac *addr;
-	int result;
-	char dbf_text[15];
-
-	/* here we go with IPv6 */
-	addr = card->ip_mc_current_state.ipm6_ifa;
-	while (addr) {
-		if (!qeth_is_ipma_in_list(addr,
-					  card->ip_mc_new_state.ipm6_ifa)) {
-			QETH_DBF_TEXT3(0, trace, "setimdl6");
-			QETH_DBF_HEX3(0, trace, &addr->ip[0],
-				      QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX3(0, trace,
-				      (&addr->ip[0]) + QETH_DBF_TRACE_LEN,
-				      QETH_DBF_TRACE_LEN);
-			QETH_DBF_HEX3(0, trace, &addr->mac,
-				      QETH_DBF_TRACE_LEN);
-			result = qeth_send_delipm(card,
-						  (__u8 *) & addr->ip[0],
-						  (__u8 *) addr->mac, 6);
-			if (result) {
-				PRINT_ERR("was not able to delete "
-					  "multicast ip %04x:%04x:"
-					  "%04x:%04x:%04x:%04x:%04x:%04x/"
-					  "%02x%02x%02x%02x%02x%02x "
-					  "on device %s (result: 0x%x), "
-					  "trying to continue\n",
-					  *((__u16 *) & addr->ip[0]),
-					  *((__u16 *) & addr->ip[2]),
-					  *((__u16 *) & addr->ip[4]),
-					  *((__u16 *) & addr->ip[6]),
-					  *((__u16 *) & addr->ip[8]),
-					  *((__u16 *) & addr->ip[10]),
-					  *((__u16 *) & addr->ip[12]),
-					  *((__u16 *) & addr->ip[14]),
-					  addr->mac[0], addr->mac[1],
-					  addr->mac[2], addr->mac[3],
-					  addr->mac[4], addr->mac[5],
-					  CARD_BUS_ID(card), result);
- 				sprintf(dbf_text, "smd6%4x", result);
- 				QETH_DBF_TEXT3(0, trace, dbf_text);
-			}
-		}
-		addr = addr->next;
-	}
-
-	addr = card->ip_mc_new_state.ipm6_ifa;
-	while (addr) {
-		if (qeth_is_ipma_in_list(addr,
-					 card->ip_mc_current_state.ipm6_ifa)) {
-			qeth_remove_mc_ifa_from_list(
-					&card->ip_mc_new_state.ipm6_ifa,
-					addr);
-			addr = addr->next;
-			continue;
-		}
-		QETH_DBF_TEXT3(0, trace, "setimst6");
-		QETH_DBF_HEX3(0, trace, &addr->ip[0], QETH_DBF_TRACE_LEN);
-		QETH_DBF_HEX3(0, trace, (&addr->ip[0]) + QETH_DBF_TRACE_LEN,
-			      QETH_DBF_TRACE_LEN);
-		QETH_DBF_HEX3(0, trace, &addr->mac, QETH_DBF_TRACE_LEN);
-		result = qeth_send_setipm(card,
-					  (__u8 *) & addr->ip[0],
-					  (__u8 *) addr->mac, 6,
-					  use_setipm_retries);
-		if (result) {
-			PRINT_ERR("was not able to set "
-				  "multicast ip %04x:%04x:"
-				  "%04x:%04x:%04x:%04x:%04x:%04x/"
-				  "%02x%02x%02x%02x%02x%02x "
-				  "on device %s (result: 0x%x), "
-				  "trying to continue\n",
-				  *((__u16 *) & addr->ip[0]),
-				  *((__u16 *) & addr->ip[2]),
-				  *((__u16 *) & addr->ip[4]),
-				  *((__u16 *) & addr->ip[6]),
-				  *((__u16 *) & addr->ip[8]),
-				  *((__u16 *) & addr->ip[10]),
-				  *((__u16 *) & addr->ip[12]),
-				  *((__u16 *) & addr->ip[14]),
-				  addr->mac[0], addr->mac[1],
-				  addr->mac[2], addr->mac[3],
-				  addr->mac[4], addr->mac[5],
-				  CARD_BUS_ID(card), result);
- 			sprintf(dbf_text, "sms6%4x", result);
- 			QETH_DBF_TEXT3(0, trace, dbf_text);
-		} else {
-			qeth_remove_mc_ifa_from_list(
-					&card->ip_mc_new_state.ipm6_ifa,
-					addr);
-			qeth_add_mc_ifa_to_list(
-					&card->ip_mc_current_state.ipm6_ifa,
-					addr);
-		}
-		addr = addr->next;
-	}
-#endif /* QETH_IPV6 */
-}
-
-static int
-qeth_setipms(struct qeth_card *card, int use_setipm_retries)
-{
-	struct qeth_ipm_mac *addr;
-	int result;
-	char dbf_text[15];
-
-	QETH_DBF_CARD3(0, trace, "stim", card);
-
-	if (!qeth_is_supported(IPA_MULTICASTING))
-		return 0;
-	addr = card->ip_mc_current_state.ipm_ifa;
-	while (addr) {
-		if (!qeth_is_ipma_in_list(addr,
-					  card->ip_mc_new_state.ipm_ifa)) {
-			QETH_DBF_TEXT3(0, trace, "setimdel");
-			sprintf(dbf_text, "%08x", *((__u32 *) & addr->ip[0]));
-			QETH_DBF_TEXT3(0, trace, dbf_text);
-			*((__u32 *) (&dbf_text[0])) = *((__u32 *) & addr->mac);
-			*((__u32 *) (&dbf_text[4])) = 
-				*(((__u32 *) & addr->mac) + 1);
-			QETH_DBF_HEX3(0, trace, dbf_text,
-				      QETH_DBF_TRACE_LEN);
-			result = qeth_send_delipm(card,
-						  (__u8 *) & addr->ip[0],
-						  (__u8 *) addr->mac, 4);
-			if (result) {
-				PRINT_ERR("was not able to delete "
-					  "multicast ip %08x/"
-					  "%02x%02x%02x%02x%02x%02x "
-					  "on device %s "
-					  "(result: 0x%x), "
-					  "trying to continue\n",
-					  *((__u32 *) & addr->ip[0]),
-					  addr->mac[0], addr->mac[1],
-					  addr->mac[2], addr->mac[3],
-					  addr->mac[4], addr->mac[5],
-					  CARD_BUS_ID(card), result);
-				sprintf(dbf_text, "smdl%4x", result);
-				QETH_DBF_TEXT3(0, trace, dbf_text);
-			}
-		}
-		addr = addr->next;
-	}
-
-	addr = card->ip_mc_new_state.ipm_ifa;
-	while (addr) {
-		if (qeth_is_ipma_in_list(addr,
-					 card->ip_mc_current_state.ipm_ifa)) {
-			addr = addr->next;
-			continue;
-		}
-		QETH_DBF_TEXT3(0, trace, "setimset");
-		sprintf(dbf_text, "%08x", *((__u32 *) & addr->ip[0]));
-		QETH_DBF_TEXT3(0, trace, dbf_text);
-		*((__u32 *) (&dbf_text[0])) = *((__u32 *) & addr->mac);
-		*((__u32 *) (&dbf_text[4])) = *(((__u32 *) & addr->mac) + 1);
-		QETH_DBF_HEX3(0, trace, dbf_text, QETH_DBF_TRACE_LEN);
-		result = qeth_send_setipm(card, (__u8 *) & addr->ip[0],
-					  (__u8 *) addr->mac, 4,
-					  use_setipm_retries);
-		if (result) {
-			PRINT_ERR("was not able to set multicast ip %08x/"
-				  "%02x%02x%02x%02x%02x%02x "
-				  "on device %s (result: 0x%x), "
-				  "trying to continue\n",
-				  *((__u32 *) & addr->ip[0]),
-				  addr->mac[0], addr->mac[1],
-				  addr->mac[2], addr->mac[3],
-				  addr->mac[4], addr->mac[5],
-				  CARD_BUS_ID(card), result);
-			sprintf(dbf_text, "smst%4x", result);
-			QETH_DBF_TEXT3(0, trace, dbf_text);
-			qeth_remove_mc_ifa_from_list
-				(&card->ip_mc_current_state.ipm_ifa, addr);
-		}
-		addr = addr->next;
-	}
-	__qeth_setipms_ipv6(card, use_setipm_retries);
-	return 0;
-}
-
-static void
-qeth_clone_ifa(struct in_ifaddr *src, struct in_ifaddr *dest)
-{
-	memcpy(dest, src, sizeof (struct in_ifaddr));
-	dest->ifa_next = NULL;
-}
-
-#ifdef QETH_IPV6
-static void
-qeth_clone_ifa6(struct inet6_ifaddr *src, struct inet6_ifaddr *dest)
-{
-	memcpy(dest, src, sizeof (struct inet6_ifaddr));
-	dest->if_next = NULL;
-}
-#endif /* QETH_IPV6 */
-
-#define QETH_STANDARD_RETVALS \
-		ret_val=-EIO; \
-		if (result == -EFAULT) ret_val = -EFAULT; \
-                if (result==IPA_REPLY_SUCCESS) ret_val=0; \
-		if (result==IPA_REPLY_FAILED) ret_val=-EIO; \
-		if (result==IPA_REPLY_OPNOTSUPP) ret_val=-EOPNOTSUPP
-
-static int
-qeth_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
-{
-	char *data;
-	int result, i, ret_val;
-	int version = 4;
-	struct qeth_card *card;
-	char dbf_text[15];
-	char buff[100];
-
-	card = (struct qeth_card *) dev->priv;
-
-	PRINT_STUPID("CALL: qeth_do_ioctl called with cmd %i (=0x%x).\n", cmd,
-		     cmd);
-	QETH_DBF_CARD2(0, trace, "ioct", card);
-	sprintf(dbf_text, "cmd=%4x", cmd);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-	QETH_DBF_HEX2(0, trace, &rq, sizeof (void *));
-
-	if ((cmd < SIOCDEVPRIVATE) || (cmd > SIOCDEVPRIVATE + 5))
-		return -EOPNOTSUPP;
-	if (copy_from_user(buff, rq->ifr_ifru.ifru_data, sizeof (buff)))
-		return -EFAULT;
-	data = buff;
-
-	if ((!atomic_read(&card->is_registered)) ||
-	    (!atomic_read(&card->is_hardsetup)))
-		return -ENODEV;
-
-	if (atomic_read(&card->shutdown_phase))
-		return -ENODEV;
-
-	spin_lock(&card->ioctl_lock);
-
-	if (atomic_read(&card->shutdown_phase)) {
-		ret_val = -ENODEV;
-		goto out;
-	}
-	if ((!atomic_read(&card->is_registered)) ||
-	    (!atomic_read(&card->is_hardsetup))) {
-		ret_val = -ENODEV;
-		goto out;
-	}
-
-	switch (cmd) {
-	case SIOCDEVPRIVATE + 0:
-		if (!capable(CAP_NET_ADMIN)) {
-			ret_val = -EPERM;
-			break;
-		}
-		result =
-		    qeth_send_setassparms(card, version, IPA_ARP_PROCESSING,
-					  IPA_CMD_ASS_ARP_SET_NO_ENTRIES,
-					  rq->ifr_ifru.ifru_ivalue, 4);
-		QETH_STANDARD_RETVALS;
-		if (result == 3)
-			ret_val = -EINVAL;
-		break;
-	case SIOCDEVPRIVATE + 1:
-		if (!capable(CAP_NET_ADMIN)) {
-			ret_val = -EPERM;
-			break;
-		}
-		result = qeth_queryarp(card, rq, version, IPA_ARP_PROCESSING,
-				       IPA_CMD_ASS_ARP_QUERY_INFO, data, 4);
-
-		QETH_STANDARD_RETVALS;
-		break;
-	case SIOCDEVPRIVATE + 2:
-		if (!capable(CAP_NET_ADMIN)) {
-			ret_val = -EPERM;
-			break;
-		}
-		for (i = 12; i < 24; i++)
-			if (data[i])
-				version = 6;
-		result =
-		    qeth_send_setassparms(card, version, IPA_ARP_PROCESSING,
-					  IPA_CMD_ASS_ARP_ADD_ENTRY,
-					  (long) data, 56);
-		QETH_STANDARD_RETVALS;
-		break;
-	case SIOCDEVPRIVATE + 3:
-		if (!capable(CAP_NET_ADMIN)) {
-			ret_val = -EPERM;
-			break;
-		}
-		for (i = 4; i < 12; i++)
-			if (data[i])
-				version = 6;
-		result =
-		    qeth_send_setassparms(card, version, IPA_ARP_PROCESSING,
-					  IPA_CMD_ASS_ARP_REMOVE_ENTRY,
-					  (long) data, 16);
-		QETH_STANDARD_RETVALS;
-		break;
-	case SIOCDEVPRIVATE + 4:
-		if (!capable(CAP_NET_ADMIN)) {
-			ret_val = -EPERM;
-			break;
-		}
-		result =
-		    qeth_send_setassparms(card, version, IPA_ARP_PROCESSING,
-					  IPA_CMD_ASS_ARP_FLUSH_CACHE, 0, 0);
-		QETH_STANDARD_RETVALS;
-		break;
-	case SIOCDEVPRIVATE + 5:
-
-		result =
-		    qeth_send_snmp_control(card, rq, IPA_CMD_SETADAPTERPARMS,
-					   IPA_SETADP_SET_SNMP_CONTROL, data,
-					   4);
-		QETH_STANDARD_RETVALS;
-		break;
-
-	default:
-		ret_val = -EOPNOTSUPP;
-		goto out;
-	}
-out:
-	spin_unlock(&card->ioctl_lock);
-
-	sprintf(dbf_text, "ret=%4x", ret_val);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	return ret_val;
-}
-
-static void
-qeth_clear_ifamc_list(struct qeth_ipm_mac **ifa_list)
-{
-	struct qeth_ipm_mac *ifa;
-	while (*ifa_list) {
-		ifa = *ifa_list;
-		*ifa_list = ifa->next;
-		kfree(ifa);
-	}
-}
-
-#ifdef QETH_IPV6
-static void
-qeth_clear_ifa6_list(struct inet6_ifaddr **ifa_list)
-{
-	struct inet6_ifaddr *ifa;
-	while (*ifa_list) {
-		ifa = *ifa_list;
-		*ifa_list = ifa->if_next;
-		kfree(ifa);
-	}
-}
-
-static inline void
-__qeth_append_vlan_ipas_v6(struct qeth_card *card)
-{
-#ifdef QETH_VLAN
-	char dbf_text[15];
-	struct vlan_group *card_group;
-	int i;
-	int remove;
-	struct inet6_ifaddr *ifa, *ifanew;
-
-	/*
-	 * append all known VLAN IP Addresses corresponding to the real device
-	 * card->dev->ifindex
-	 */
-	QETH_DBF_TEXT4(0, trace, "to-vip6s");
-	if ((!qeth_is_supported(IPA_FULL_VLAN)) || (!atomic_read(&card->is_open)))
-		return;
-
-	card_group = (struct vlan_group *) card->vlangrp;
-	
-	if (!card_group)
-		return;
-	for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
-		if (!card_group->vlan_devices[i] ||
-		    !(card_group->vlan_devices[i]->flags & IFF_UP) ||
-		    !(struct inet6_dev *) card_group->vlan_devices[i]->ip6_ptr)
-			continue;
-		ifa = ((struct inet6_dev *)
-		       card_group->vlan_devices[i]->ip6_ptr)->addr_list;
-		
-		while (ifa) {
-			ifanew = kmalloc(sizeof(struct inet6_ifaddr),
-					 GFP_KERNEL);
-			if (!ifanew) {
-				PRINT_WARN("No memory for IP address "
-					   "handling. Some of the IPs "
-					   "will not be set on %s.\n",
-					   card->dev_name);
-				QETH_DBF_TEXT2(0, trace, "TOIPNMEM");
-			} else {
-				qeth_clone_ifa6(ifa, ifanew);
-				remove = qeth_add_ifa6_to_list
-					(&card->ip_new_state.ip6_ifa, ifanew);
-				QETH_DBF_HEX4(0, trace,
-					      &ifanew->addr.s6_addr,
-					      QETH_DBF_TRACE_LEN);
-				QETH_DBF_HEX4(0, trace,
-					      &ifanew->addr.s6_addr +
-					      QETH_DBF_TRACE_LEN,
-					      QETH_DBF_TRACE_LEN);
-				sprintf(dbf_text, "pref%4u", ifanew->prefix_len);
-				QETH_DBF_TEXT4(0, trace, dbf_text);
-				if (remove) {
-					kfree(ifanew);
-					QETH_DBF_TEXT4(0, trace, "alrdv6rm");
-				}
-			}
-			ifa = ifa->if_next;
-		}
-	}
-#endif
-}
-
-static inline void
-__qeth_append_vlan_ipas_v6_mc(struct qeth_card *card)
-{
-#ifdef QETH_VLAN
-	struct vlan_group *card_group;
-	int i;
-	int remove;
-	struct inet6_dev *in6_vdev;
-	char buf[MAX_ADDR_LEN];
-	struct qeth_ipm_mac *ipmanew;
-	struct ifmcaddr6 *im6;
-	
-	QETH_DBF_TEXT4(0, trace, "tovipm6s");
-	if (!qeth_is_supported(IPA_FULL_VLAN) || !atomic_read(&card->is_open))
-		return;
-
-	card_group = (struct vlan_group *) card->vlangrp;
-	if (!card_group)
-		return;
-
-	for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
-		if (!card_group->vlan_devices[i] ||
-		    !(card_group->vlan_devices[i]->flags & IFF_UP))
-			continue;
-		
-		in6_vdev = in6_dev_get(card_group->vlan_devices[i]);
-		if (!in6_vdev) {
-			QETH_DBF_CARD2(0, trace, "id26", card);
-			continue;
-		}
-
-		read_lock(&in6_vdev->lock);
-		for (im6 = in6_vdev->mc_list; im6; im6 = im6->next) {
-			ndisc_mc_map(&im6->mca_addr, buf,
-				     card_group->vlan_devices[i], 0);
-			ipmanew = (struct qeth_ipm_mac *)
-				kmalloc(sizeof(struct qeth_ipm_mac), GFP_KERNEL);
-			if (!ipmanew) {
-				PRINT_WARN("No memory for IPM address "
-					   "handling. Multicast IP "
-					   "%04x:%04x:%04x:%04x:%04x:"
-					   "%04x:%04x:%04x"
-					   "will not be set on %s.\n",
-					   im6->mca_addr.s6_addr16[0],
-					   im6->mca_addr.s6_addr16[1],
-					   im6->mca_addr.s6_addr16[2],
-					   im6->mca_addr.s6_addr16[3],
-					   im6->mca_addr.s6_addr16[4],
-					   im6->mca_addr.s6_addr16[5],
-					   im6->mca_addr.s6_addr16[6],
-					   im6->mca_addr.s6_addr16[7],
-					   card->dev_name);
-				QETH_DBF_TEXT2(0, trace, "TOIPMNMM");
-			} else {
-				memset(ipmanew, 0, sizeof(struct qeth_ipm_mac));
-				memcpy(ipmanew->mac, buf,OSA_ADDR_LEN);
-				memcpy(ipmanew->ip, im6->mca_addr.s6_addr, 16);
-				ipmanew->next = NULL;
-				remove = qeth_add_mc_ifa_to_list
-					(&card->ip_mc_new_state.ipm6_ifa,
-						 ipmanew);
-				QETH_DBF_HEX4(0, trace, &ipmanew->ip,
-					      QETH_DBF_TRACE_LEN);
-				QETH_DBF_HEX4(0, trace, &ipmanew->ip +
-					      QETH_DBF_TRACE_LEN,
-					      QETH_DBF_TRACE_LEN);
-				QETH_DBF_HEX4(0, trace, &ipmanew->mac,
-					      QETH_DBF_TRACE_LEN);
-				
-				if (remove) {
-					QETH_DBF_TEXT4(0, trace, "mlrdv6rm");
-					kfree(ipmanew);
-				}
-			}
-		}
-		read_unlock(&in6_vdev->lock);
-		in6_dev_put(in6_vdev);
-	}
-#endif
-}
-
-static struct inet6_dev *
-__qeth_get_mc_lock_v6(struct qeth_card *card)
-{
-	struct inet6_dev *in6_dev;
-
-	in6_dev = in6_dev_get(card->dev);
-
-	if (!in6_dev) {
-		QETH_DBF_CARD2(0, trace, "id16", card);
-		return ERR_PTR(-ENODEV);
-	}
-	read_lock(&in6_dev->lock);
-	return in6_dev;
-}
-
-static void
-__qeth_takeover_ip_ipms6_mc(struct qeth_card *card, struct inet6_dev *in6_dev)
-{
-	int remove;
-	struct qeth_ipm_mac *ipmanew;
-	struct ifmcaddr6 *im6;
-	char buf[MAX_ADDR_LEN];
-
-	QETH_DBF_TEXT4(0, trace, "to-ipm6s");
-	if (atomic_read(&card->is_open))
-		for (im6 = in6_dev->mc_list; im6; im6 = im6->next) {
-			ndisc_mc_map(&im6->mca_addr, buf, card->dev, 0);
-			ipmanew =
-			    (struct qeth_ipm_mac *)
-			    kmalloc(sizeof (struct qeth_ipm_mac), GFP_ATOMIC);
-			if (!ipmanew) {
-				PRINT_WARN("No memory for IPM address "
-					   "handling. Multicast IP "
-					   "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x"
-					   "will not be set on %s.\n",
-					   im6->mca_addr.s6_addr16[0],
-					   im6->mca_addr.s6_addr16[1],
-					   im6->mca_addr.s6_addr16[2],
-					   im6->mca_addr.s6_addr16[3],
-					   im6->mca_addr.s6_addr16[4],
-					   im6->mca_addr.s6_addr16[5],
-					   im6->mca_addr.s6_addr16[6],
-					   im6->mca_addr.s6_addr16[7],
-					   card->dev_name);
-				QETH_DBF_TEXT2(0, trace, "TOIPMNMM");
-			} else {
-				memset(ipmanew, 0,
-				       sizeof (struct qeth_ipm_mac));
-				memcpy(ipmanew->mac, buf, OSA_ADDR_LEN);
-				memcpy(ipmanew->ip, im6->mca_addr.s6_addr, 16);
-				ipmanew->next = NULL;
-				remove =
-				    qeth_add_mc_ifa_to_list(&card->
-							    ip_mc_new_state.
-							    ipm6_ifa, ipmanew);
-				QETH_DBF_HEX4(0, trace, &ipmanew->ip,
-					      QETH_DBF_TRACE_LEN);
-				QETH_DBF_HEX4(0, trace,
-					      &ipmanew->ip + QETH_DBF_TRACE_LEN,
-					      QETH_DBF_TRACE_LEN);
-				QETH_DBF_HEX4(0, trace, &ipmanew->mac,
-					      QETH_DBF_TRACE_LEN);
-				if (remove) {
-					QETH_DBF_TEXT4(0, trace, "mlrdy6rm");
-					kfree(ipmanew);
-				}
-			}
-		}
-	__qeth_append_vlan_ipas_v6_mc(card);
-
-	read_unlock(&in6_dev->lock);
-	in6_dev_put(in6_dev);
-}
-#endif /* QETH_IPV6 */
-
-static void
-qeth_takeover_ip_ipms6(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	struct inet6_ifaddr *ifa, *ifanew;
-	char dbf_text[15];
-	int remove;
-	struct inet6_dev *in6_dev;
-
-	QETH_DBF_CARD3(0, trace, "tip6", card);
-	/* unicast */
-	/* clear ip_current_state */
-	qeth_clear_ifa6_list(&card->ip_current_state.ip6_ifa);
-	/* take it over */
-	card->ip_current_state.ip6_ifa = card->ip_new_state.ip6_ifa;
-	card->ip_new_state.ip6_ifa = NULL;
-
-	in6_dev = __qeth_get_mc_lock_v6(card);
-	if (PTR_ERR(in6_dev) == -ENODEV)
-		return;
-	/* get new one, we try to have the same order as ifa_list in device
-	   structure, for what reason ever */
-	QETH_DBF_TEXT4(0, trace, "to-ip6s");
-	if ((atomic_read(&card->is_open)) && (card->dev->ip6_ptr) &&
-	    (((struct inet6_dev *) card->dev->ip6_ptr)->addr_list)) {
-		ifa = ((struct inet6_dev *) card->dev->ip6_ptr)->addr_list;
-
-		while (ifa) {
-			ifanew =
-			    kmalloc(sizeof (struct inet6_ifaddr), GFP_ATOMIC);
-			if (!ifanew) {
-				PRINT_WARN("No memory for IP address "
-					   "handling. Some of the IPs "
-					   "will not be set on %s.\n",
-					   card->dev_name);
-				QETH_DBF_TEXT2(0, trace, "TOIPNMEM");
-			} else {
-				qeth_clone_ifa6(ifa, ifanew);
-				remove =
-				    qeth_add_ifa6_to_list(&card->ip_new_state.
-							  ip6_ifa, ifanew);
-				QETH_DBF_HEX4(0, trace, &ifanew->addr.s6_addr,
-					      QETH_DBF_TRACE_LEN);
-				QETH_DBF_HEX4(0, trace,
-					      &ifanew->addr.s6_addr +
-					      QETH_DBF_TRACE_LEN,
-					      QETH_DBF_TRACE_LEN);
-				sprintf(dbf_text, "pref%4u",
-					ifanew->prefix_len);
-				QETH_DBF_TEXT4(0, trace, dbf_text);
-				if (remove) {
-					kfree(ifanew);
-					QETH_DBF_TEXT4(0, trace, "alrdy6rm");
-				}
-			}
-			ifa = ifa->if_next;
-		}
-	}
-
-	__qeth_append_vlan_ipas_v6(card);
-	
-	__qeth_takeover_ip_ipms6_mc(card, in6_dev);
-#endif /* QETH_IPV6 */
-}
-
-static void
-qeth_clear_ifa4_list(struct in_ifaddr **ifa_list)
-{
-	struct in_ifaddr *ifa;
-	while (*ifa_list) {
-		ifa = *ifa_list;
-		*ifa_list = ifa->ifa_next;
-		kfree(ifa);
-	}
-}
-
-static inline void
-__qeth_append_vlan_ipas_v4(struct qeth_card *card)
-{
-#ifdef QETH_VLAN
-	struct in_ifaddr *ifa, *ifanew;
-	char dbf_text[15];
-	struct vlan_group *card_group;
-	int i;
-	int remove;
-	struct in_device *vin4_dev;
-
-	/*
-	 * append all known VLAN IP Addresses corresponding to the real device
-	 * card->dev->ifindex
-	 */
-	QETH_DBF_TEXT4(0, trace, "to-vips");
-	if (!qeth_is_supported(IPA_FULL_VLAN) || !atomic_read(&card->is_open))
-		return;
-
-	card_group = (struct vlan_group *) card->vlangrp;
-	if (!card_group)
-		return;
-
-	for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
-		vin4_dev = in_dev_get(card->dev);
-		if (!vin4_dev) {
-			QETH_DBF_TEXT2(0, trace, "nodvhol2");
-			QETH_DBF_TEXT2(0, trace, card->dev_name);
-			continue;
-		}
-		read_lock(&vin4_dev->lock);
-
-		if ((card_group->vlan_devices[i]) &&
-		    (card_group->vlan_devices[i]->flags & IFF_UP)) {
-			ifa = ((struct in_device *)
-			       card_group->vlan_devices[i]->ip_ptr)->ifa_list;
-			while (ifa) {
-				ifanew = kmalloc(sizeof(struct in_ifaddr),
-						 GFP_KERNEL);
-				if (!ifanew) {
-					PRINT_WARN("No memory for IP address "
-						   "handling. Some of the IPs "
-						   "will not be set on %s.\n",
-						   card->dev_name);
-					QETH_DBF_TEXT2(0, trace, "TOIPNMEM");
-				} else {
-					qeth_clone_ifa(ifa, ifanew);
-					remove = qeth_add_ifa_to_list
-						(&card->ip_new_state.ip_ifa,
-						 ifanew);
-					*((__u32*) (&dbf_text[0])) =
-						*((__u32*) &ifanew->ifa_address);
-					*((__u32*) (&dbf_text[4])) =
-						*((__u32*) &ifanew->ifa_mask);
-					QETH_DBF_TEXT4(0, trace, dbf_text);
-					if (remove) {
-						kfree(ifanew);
-						QETH_DBF_TEXT4(0, trace,
-							       "alrdv4rm");
-					}
-				}
-				ifa = ifa->ifa_next;
-			}
-		}		
-
-		read_unlock(&vin4_dev->lock);
-		in_dev_put(vin4_dev);
-	}
-#endif /* QETH_VLAN */
-
-}
-
-static inline void
-__qeth_append_vlan_ipas_v4_mc(struct qeth_card *card)
-{
-#ifdef QETH_VLAN
-	char dbf_text[15];
-	int i;
-	int remove;
-	struct vlan_group *card_group;
-	struct in_device *vin4_dev;
-	struct qeth_ipm_mac *ipmanew;
-	struct ip_mc_list *im4;
-	char buf[MAX_ADDR_LEN];
-	__u32 maddr;
-	
-	QETH_DBF_TEXT4(0, trace, "to-vipms");
-	if (!qeth_is_supported(IPA_FULL_VLAN) || !atomic_read(&card->is_open))
-		return;
-
-	card_group = (struct vlan_group *) card->vlangrp;
-	if (!card_group)
-		return;
-
-	for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
-		if (!card_group->vlan_devices[i] ||
-		    !(card_group->vlan_devices[i]->flags & IFF_UP))
-			continue;
-
-		vin4_dev = in_dev_get(card_group->vlan_devices[i]);
-		if (!vin4_dev) {
-			QETH_DBF_TEXT2(0, trace, "novdhol3");
-			QETH_DBF_TEXT2(0, trace, card->dev_name);
-			QETH_DBF_TEXT2(0, trace,
-				       card_group->vlan_devices[i]->name);
-			continue;
-		}
-		read_lock(&vin4_dev->lock);
-		for (im4 = vin4_dev->mc_list; im4; im4 = im4->next) {
-			qeth_get_mac_for_ipm(im4->multiaddr, buf, vin4_dev->dev);
-			ipmanew = (struct qeth_ipm_mac *)
-				kmalloc(sizeof(struct qeth_ipm_mac), GFP_KERNEL);
-			if (!ipmanew) {
-				PRINT_WARN("No memory for IPM address "
-					   "handling. Multicast VLAN IP %08x"
-					   "will not be set on %s.\n",
-					   (__u32) im4->multiaddr,
-					   card->dev_name);
-				QETH_DBF_TEXT2(0, trace, "TOIPMNMM");
-			} else {
-				memset(ipmanew, 0, sizeof(struct qeth_ipm_mac));
-				memcpy(ipmanew->mac, buf, OSA_ADDR_LEN);
-				maddr = im4->multiaddr;
-				memcpy(&(ipmanew->ip[0]), &maddr, 4);
-				memset(&(ipmanew->ip[4]), 0xff, 12);
-				ipmanew->next = NULL;
-				remove = qeth_add_mc_ifa_to_list
-					(&card->ip_mc_new_state.ipm_ifa,
-					 ipmanew);
-				sprintf(dbf_text, "%08x",
-					*((__u32 *) &ipmanew->ip));
-				QETH_DBF_TEXT4(0, trace, dbf_text);
-				QETH_DBF_HEX4(0, trace, &ipmanew->mac,
-					      QETH_DBF_TRACE_LEN);
-				if (remove) {
-					QETH_DBF_TEXT4(0, trace, "mlrdv4rm");
-					kfree(ipmanew);
-				}
-			}
-		}
-		read_unlock(&vin4_dev->lock);
-		in_dev_put(vin4_dev);
-	}
-#endif /* QETH_VLAN */
-
-}
-
-static struct in_device *
-__qeth_get_mc_lock(struct qeth_card *card)
-{
-	struct in_device *in4_dev;
-
-	/* multicast */
-	/* clear ip_mc_current_state */
-	qeth_clear_ifamc_list(&card->ip_mc_current_state.ipm_ifa);
-	/* take it over */
-	card->ip_mc_current_state.ipm_ifa = card->ip_mc_new_state.ipm_ifa;
-	/* get new one, we try to have the same order as ifa_list in device
-	   structure, for what reason ever */
-	card->ip_mc_new_state.ipm_ifa = NULL;
-
-	in4_dev = in_dev_get(card->dev);
-	if (!in4_dev) {
-		QETH_DBF_TEXT2(0, trace, "nodvhol1");
-		QETH_DBF_TEXT2(0, trace, card->dev_name);
-		return ERR_PTR(-ENODEV);
-	}
-	read_lock(&in4_dev->lock);
-	return in4_dev;
-}
-
-static void
-__qeth_takeover_ip_ipms_mc(struct qeth_card *card, struct in_device *in4_dev)
-{
-	char dbf_text[15];
-	int remove;
-	struct qeth_ipm_mac *ipmanew;
-	struct ip_mc_list *im4;
-	char buf[MAX_ADDR_LEN];
-	__u32 maddr;
-
-	QETH_DBF_TEXT4(0, trace, "to-ipms");
-	if (atomic_read(&card->is_open))
-		for (im4 = in4_dev->mc_list; im4; im4 = im4->next) {
-			qeth_get_mac_for_ipm(im4->multiaddr, buf, in4_dev->dev);
-			ipmanew =
-			    (struct qeth_ipm_mac *)
-			    kmalloc(sizeof (struct qeth_ipm_mac), GFP_ATOMIC);
-			if (!ipmanew) {
-				PRINT_WARN("No memory for IPM address "
-					   "handling. Multicast IP %08x"
-					   "will not be set on %s.\n",
-					   (__u32) im4->multiaddr,
-					   card->dev_name);
-				QETH_DBF_TEXT2(0, trace, "TOIPMNMM");
-			} else {
-				memset(ipmanew, 0,
-				       sizeof (struct qeth_ipm_mac));
-				memcpy(ipmanew->mac, buf, OSA_ADDR_LEN);
-				maddr = im4->multiaddr;
-				memcpy(&(ipmanew->ip[0]), &maddr, 4);
-				memset(&(ipmanew->ip[4]), 0xff, 12);
-				ipmanew->next = NULL;
-				remove =
-				    qeth_add_mc_ifa_to_list(&card->
-							    ip_mc_new_state.
-							    ipm_ifa, ipmanew);
-				sprintf(dbf_text, "%08x",
-					*((__u32 *) & ipmanew->ip));
-				QETH_DBF_TEXT4(0, trace, dbf_text);
-				QETH_DBF_HEX4(0, trace, &ipmanew->mac,
-					      QETH_DBF_TRACE_LEN);
-				if (remove) {
-					QETH_DBF_TEXT4(0, trace, "mlrdy4rm");
-					kfree(ipmanew);
-				}
-			}
-		}
-	__qeth_append_vlan_ipas_v4(card);
-
-	read_unlock(&in4_dev->lock);
-	in_dev_put(in4_dev);
-
-}
-
-static void
-qeth_takeover_ip_ipms(struct qeth_card *card)
-{
-	struct in_ifaddr *ifa, *ifanew;
-	char dbf_text[15];
-	int remove;
-	struct in_device *in4_dev;
-
-	QETH_DBF_CARD3(0, trace, "tips", card);
-	/* unicast */
-	/* clear ip_current_state */
-	qeth_clear_ifa4_list(&card->ip_current_state.ip_ifa);
-	/* take it over */
-	card->ip_current_state.ip_ifa = card->ip_new_state.ip_ifa;
-	card->ip_new_state.ip_ifa = NULL;
-
-	in4_dev = __qeth_get_mc_lock(card);
-	if (PTR_ERR(in4_dev) == -ENODEV)
-		return;
-
-	/* get new one, we try to have the same order as ifa_list in device
-	   structure, for what reason ever */
-	QETH_DBF_TEXT4(0, trace, "to-ips");
-	if ((atomic_read(&card->is_open)) && (card->dev->ip_ptr) &&
-	    (((struct in_device *) card->dev->ip_ptr)->ifa_list)) {
-		ifa = ((struct in_device *) card->dev->ip_ptr)->ifa_list;
-
-		while (ifa) {
-			ifanew = kmalloc(sizeof (struct in_ifaddr), GFP_ATOMIC);
-			if (!ifanew) {
-				PRINT_WARN("No memory for IP address "
-					   "handling. Some of the IPs "
-					   "will not be set on %s.\n",
-					   card->dev_name);
-				QETH_DBF_TEXT2(0, trace, "TOIPNMEM");
-			} else {
-				qeth_clone_ifa(ifa, ifanew);
-				remove =
-				    qeth_add_ifa_to_list(&card->ip_new_state.
-							 ip_ifa, ifanew);
-				*((__u32 *) (&dbf_text[0])) =
-				    *((__u32 *) & ifanew->ifa_address);
-				*((__u32 *) (&dbf_text[4])) =
-				    *((__u32 *) & ifanew->ifa_mask);
-				QETH_DBF_TEXT4(0, trace, dbf_text);
-				if (remove) {
-					kfree(ifanew);
-					QETH_DBF_TEXT4(0, trace, "alrdy4rm");
-				}
-			}
-
-			ifa = ifa->ifa_next;
-		}
-	}
-	__qeth_append_vlan_ipas_v4(card);
-
-	__qeth_takeover_ip_ipms_mc(card, in4_dev);
-}
-
-static void
-qeth_get_unique_id(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	struct ipa_cmd cmd;
-	int result;
-	char dbf_text[15];
-
-	if (!qeth_is_supported(IPA_IPv6)) {
-		card->unique_id = UNIQUE_ID_IF_CREATE_ADDR_FAILED |
-		    UNIQUE_ID_NOT_BY_CARD;
-		return;
-	}
-	qeth_fill_ipa_cmd(card, &cmd, IPA_CMD_CREATE_ADDR, 6);
-
-	*((__u16 *) & cmd.data.create_destroy_addr.unique_id[6]) =
-	    card->unique_id;
-
-	result = qeth_send_ipa_cmd(card, &cmd, 1, IPA_CMD_STATE);
-
-	if (result) {
-		card->unique_id = UNIQUE_ID_IF_CREATE_ADDR_FAILED |
-		    UNIQUE_ID_NOT_BY_CARD;
-		PRINT_WARN("couldn't get a unique id from the card on device "
-			   "%s (result=x%x), using default id. ipv6 "
-			   "autoconfig on other lpars may lead to duplicate "
-			   "ip addresses. please use manually "
-			   "configured ones.\n",
-			   CARD_BUS_ID(card), result);
-		QETH_DBF_CARD2(0, trace, "unid fld", card);
-		sprintf(dbf_text, "%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-	} else {
-		card->unique_id =
-		    *((__u16 *) & cmd.data.create_destroy_addr.unique_id[6]);
-		QETH_DBF_CARD2(0, setup, "uniqueid", card);
-		sprintf(dbf_text, "%4x", card->unique_id);
-		QETH_DBF_TEXT2(0, setup, dbf_text);
-	}
-#else /* QETH_IPV6 */
-	card->unique_id =
-	    UNIQUE_ID_IF_CREATE_ADDR_FAILED | UNIQUE_ID_NOT_BY_CARD;
-#endif /* QETH_IPV6 */
-}
-
-static void
-qeth_put_unique_id(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	struct ipa_cmd cmd;
-	int result;
-	char dbf_text[15];
-
-	/* is also true, if ipv6 is not supported on the card */
-	if ((card->unique_id & UNIQUE_ID_NOT_BY_CARD) == UNIQUE_ID_NOT_BY_CARD)
-		return;
-
-	qeth_fill_ipa_cmd(card, &cmd, IPA_CMD_DESTROY_ADDR, 6);
-	*((__u16 *) & cmd.data.create_destroy_addr.unique_id[6]) =
-	    card->unique_id;
-	memcpy(&cmd.data.create_destroy_addr.unique_id[0], card->dev->dev_addr,
-	       OSA_ADDR_LEN);
-
-	result = qeth_send_ipa_cmd(card, &cmd, 1, IPA_CMD_STATE);
-
-	if (result) {
-		QETH_DBF_CARD2(0, trace, "unibkfld", card);
-		sprintf(dbf_text, "%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-	}
-#else /* QETH_IPV6 */
-	card->unique_id =
-	    UNIQUE_ID_IF_CREATE_ADDR_FAILED | UNIQUE_ID_NOT_BY_CARD;
-#endif /* QETH_IPV6 */
-}
-
-static inline void
-__qeth_setparms_hstr(struct qeth_card *card)
-{
-	char dbf_text[15];
-	int result;
-
-	if ((card->link_type != QETH_MPC_LINK_TYPE_HSTR) &&
-	    (card->link_type != QETH_MPC_LINK_TYPE_LANE_TR))
-		return;
-
-	QETH_DBF_CARD3(0, trace, "hstr", card);
-	
-	if (qeth_is_adp_supported(IPA_SETADP_SET_BROADCAST_MODE)) {
-		result = qeth_send_setadapterparms_mode
-			(card, IPA_SETADP_SET_BROADCAST_MODE,
-			 card->options.broadcast_mode);
-		if (result) {
-			PRINT_WARN("couldn't set broadcast mode on "
-				   "device %s: x%x\n",
-				   CARD_BUS_ID(card), result);
-			QETH_DBF_CARD1(0, trace, "STBRDCST", card);
-			sprintf(dbf_text, "%4x", result);
-			QETH_DBF_TEXT1(1, trace, dbf_text);
-		}
-	} else if (card->options.broadcast_mode) {
-		PRINT_WARN("set adapter parameters not available "
-			   "to set broadcast mode, using ALLRINGS "
-			   "on device %s:\n", CARD_BUS_ID(card));
-		QETH_DBF_CARD1(0, trace, "NOBC", card);
-	}
-	
-	if (qeth_is_adp_supported(IPA_SETADP_SET_BROADCAST_MODE)) {
-		result = qeth_send_setadapterparms_mode
-			(card, IPA_SETADP_ALTER_MAC_ADDRESS,
-			 card->options.macaddr_mode);
-		if (result) {
-			PRINT_WARN("couldn't set macaddr mode on "
-				   "device %s: x%x\n", CARD_BUS_ID(card),
-				   result);
-			QETH_DBF_CARD1(0, trace, "STMACMOD", card);
-			sprintf(dbf_text, "%4x", result);
-			QETH_DBF_TEXT1(1, trace, dbf_text);
-		}
-	} else if (card->options.macaddr_mode) {
-		PRINT_WARN("set adapter parameters not available "
-			   "to set macaddr mode, using NONCANONICAL "
-			   "on device %s:\n", CARD_BUS_ID(card));
-		QETH_DBF_CARD1(0, trace, "NOMA", card);
-	}
-}
-
-static void
-qeth_do_setadapterparms_stuff(struct qeth_card *card)
-{
-	int result;
-	char dbf_text[15];
-
-	if (!qeth_is_supported(IPA_SETADAPTERPARMS)) {
-		return;
-	}
-
-	QETH_DBF_CARD4(0, trace, "stap", card);
-
-	result = qeth_send_setadapterparms_query(card);
-
-	if (result) {
-		PRINT_WARN("couldn't set adapter parameters on device %s: "
-			   "x%x\n", CARD_BUS_ID(card), result);
-		QETH_DBF_CARD1(0, trace, "SETADPFL", card);
-		sprintf(dbf_text, "%4x", result);
-		QETH_DBF_TEXT1(1, trace, dbf_text);
-		return;
-	}
-
-	sprintf(dbf_text, "spap%4x", card->adp_supported);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	if (qeth_is_adp_supported(IPA_SETADP_ALTER_MAC_ADDRESS)) {
-		QETH_DBF_CARD3(0, trace, "rdmc", card);
-		QETH_DBF_CARD2(0, setup, "rdmc", card);
-
-		result = qeth_send_setadapterparms_change_addr(card,
-							       IPA_SETADP_ALTER_MAC_ADDRESS,
-							       CHANGE_ADDR_READ_MAC,
-							       card->dev->
-							       dev_addr,
-							       OSA_ADDR_LEN);
-		if (result) {
-			PRINT_WARN("couldn't get MAC address on "
-				   "device %s: x%x\n",
-				   CARD_BUS_ID(card), result);
-			QETH_DBF_CARD1(0, trace, "NOMACADD", card);
-			sprintf(dbf_text, "%4x", result);
-			QETH_DBF_TEXT1(1, trace, dbf_text);
-		} else {
-			QETH_DBF_HEX2(0, setup, card->dev->dev_addr,
-				      __max(OSA_ADDR_LEN, QETH_DBF_SETUP_LEN));
-			QETH_DBF_HEX3(0, trace, card->dev->dev_addr,
-				      __max(OSA_ADDR_LEN, QETH_DBF_TRACE_LEN));
-		}
-	}
-	__qeth_setparms_hstr(card);
-}
-
-static inline void
-__qeth_start_vlan_assist(struct qeth_card *card)
-{
-#ifdef QETH_VLAN
-	char dbf_text[15];
-	int result;
-
-	if (!qeth_is_supported(IPA_FULL_VLAN)) {
-		PRINT_WARN("VLAN not supported on %s\n",
-			   card->dev_name);
-		QETH_DBF_TEXT2(0, trace, "vlnotsup");
-		return;
-	}
-	result = qeth_send_setassparms_simple_without_data(card,
-							   IPA_VLAN_PRIO,
-							   IPA_CMD_ASS_START);
-	QETH_DBF_TEXT2(0, trace, "enavlan");
-	if (result) {
-		PRINT_WARN("Could not start vlan "
-			   "assist on %s: 0x%x, continuing\n",
-			   card->dev_name, result);
-		sprintf(dbf_text, "VLAN%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		return;
-	}
-	card->dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
-#endif /* QETH_VLAN */
-}
-
-static inline void
-__qeth_start_mc_assist(struct qeth_card *card)
-{
-	char dbf_text[15];
-	int result;
-
-	if (!qeth_is_supported(IPA_MULTICASTING)) {
-		PRINT_WARN("multicasting not supported on %s\n",
-			   card->dev_name);
-		QETH_DBF_TEXT2(0, trace, "mcnotsup");
-		return;
-	}
-	result = qeth_send_setassparms_simple_without_data(card,
-							   IPA_MULTICASTING,
-							   IPA_CMD_ASS_START);
-	QETH_DBF_TEXT2(0, trace, "enamcass");
-	if (result) {
-		PRINT_WARN("Could not start multicast "
-			   "assist on %s: 0x%x, continuing\n",
-			   card->dev_name, result);
-		sprintf(dbf_text, "MCAS%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		return;
-	}
-	card->dev->flags |= IFF_MULTICAST;
-}
-
-static int
-__qeth_softsetup_enable_ipv6(struct qeth_card *card, int do_a_startlan6)
-{
-	int result;
-	char dbf_text[15];
-
-	if (do_a_startlan6) {
-		QETH_DBF_TEXT2(0, trace, "startln6");
-		netif_stop_queue(card->dev);
-		result = qeth_send_startlan(card, 6);
-		if (result) {
-			sprintf(dbf_text, "stl6%4x", result);
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-			atomic_set(&card->is_softsetup, 0);
-			/* do not return an error */
-			if ((result == 0xe080) || (result == 0xf080))
-				result = 0;
-			return result;
-		}
-	}
-	netif_wake_queue(card->dev);
-
-	QETH_DBF_TEXT2(0, trace, "qipassi6");
-	result = qeth_send_qipassist(card, 6);
-	if (result) {
-		PRINT_WARN("couldn't send QIPASSIST6 on %s: 0x%x\n",
-			   card->dev_name, result);
-		sprintf(dbf_text, "QIP6%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		atomic_set(&card->is_softsetup, 0);
-		return result;
-	}
-		
-	sprintf(dbf_text, "%4x%4x", card->ipa6_supported, card->ipa6_enabled);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	QETH_DBF_TEXT2(0, trace, "enaipv46");
-	result = qeth_send_setassparms_simple_with_data(card, IPA_IPv6,
-							IPA_CMD_ASS_START, 3);
-	if (result) {
-		PRINT_WARN("Could not enable IPv4&6 assist "
-			   "on %s: 0x%x, continuing\n",
-			   card->dev_name, result);
-		sprintf(dbf_text, "I46A%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		/* go on */
-	}
-
-	QETH_DBF_TEXT2(0, trace, "enaipv6");
-	result = qeth_send_setassparms_simple_without_data6(card, IPA_IPv6,
-							    IPA_CMD_ASS_START);
-	if (result) {
-		PRINT_WARN("Could not start IPv6 assist "
-			   "on %s: 0x%x, continuing\n",
-			   card->dev_name, result);
-		sprintf(dbf_text, "I6AS%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		/* go on */
-	}
-
-	QETH_DBF_TEXT2(0, trace, "enapstr6");
-	result = qeth_send_setassparms_simple_without_data6(card, IPA_PASSTHRU,
-							    IPA_CMD_ASS_START);
-	if (result) {
-		PRINT_WARN("Could not enable passthrough "
-			   "on %s: 0x%x, continuing\n",
-			   card->dev_name, result);
-		sprintf(dbf_text, "PSTR%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		/* go on */
-	}
-	return 0;
-}
-
-static int
-__qeth_softsetup_start_assists(struct qeth_card *card)
-{
-	int result;
-	char dbf_text[15];
-	int do_a_startlan6 = 0;
-
-	if (atomic_read(&card->is_softsetup))
-		return 0;
-
-	atomic_set(&card->enable_routing_attempts4, QETH_ROUTING_ATTEMPTS);
-#ifdef QETH_IPV6
-	atomic_set(&card->enable_routing_attempts6, QETH_ROUTING_ATTEMPTS);
-#endif /* QETH_IPV6 */
-	if ((!atomic_read(&card->is_startlaned)) &&
-	    (atomic_read(&card->startlan_attempts))) {
-		atomic_dec(&card->startlan_attempts);
-		QETH_DBF_TEXT2(0, trace, "startlan");
-		netif_stop_queue(card->dev);
-		result = qeth_send_startlan(card, 4);
-		if (result) {
-			PRINT_WARN("couldn't send STARTLAN on %s "
-				   "(CHPID 0x%X): 0x%x (%s)\n",
-				   card->dev_name, card->chpid, result,
-				   (result == 0xe080) ?
-				   "startlan disabled (link "
-				   "failure -- please check the "
-				   "network, plug in the cable or "
-				   "enable the OSA port" :
-				   (result==0xf080) ?
-				   "startlan disabled (VM: LAN " \
-				   "is offline for functions " \
-				   "requiring LAN access.":
-				   "unknown return code");
-			sprintf(dbf_text, "stln%4x", result);
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-			atomic_set(&card->is_softsetup, 0);
-			atomic_set(&card->is_startlaned, 0);
-			/* do not return an error */
-			if ((result == 0xe080) || (result == 0xf080)) {
-				result = 0;
-			}
-			return result;
-		}
-		do_a_startlan6 = 1;
-	}
-	netif_wake_queue(card->dev);
-	
-	qeth_do_setadapterparms_stuff(card);
-	
-	if (!qeth_is_supported(IPA_ARP_PROCESSING)) {
-		PRINT_WARN("oops... ARP processing not supported "
-			   "on %s!\n", card->dev_name);
-		QETH_DBF_TEXT1(0, trace, "NOarpPRC");
-	} else {
-		QETH_DBF_TEXT2(0, trace, "enaARPpr");
-		result = qeth_send_setassparms_simple_without_data
-			(card, IPA_ARP_PROCESSING, IPA_CMD_ASS_START);
-		if (result) {
-			PRINT_WARN("Could not start ARP processing "
-				   "assist on %s: 0x%x\n",
-				   card->dev_name, result);
-			sprintf(dbf_text, "ARPp%4x", result);
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-			atomic_set(&card->is_softsetup, 0);
-			return result;
-		}
-	}
-	
-	if (qeth_is_supported(IPA_IP_FRAGMENTATION)) {
-		PRINT_INFO("IP fragmentation supported on "
-			   "%s... :-)\n", card->dev_name);
-		/* start it */
-		QETH_DBF_TEXT2(0, trace, "enaipfrg");
-		result = qeth_send_setassparms_simple_without_data
-			(card, IPA_IP_FRAGMENTATION, IPA_CMD_ASS_START);
-		if (result) {
-			PRINT_WARN("Could not start IP fragmenting "
-				   "assist on %s: 0x%x, continuing\n",
-				   card->dev_name, result);
-			sprintf(dbf_text, "IFRG%4x", result);
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-			/* go on */
-		}
-	}
-	if (card->options.fake_ll == FAKE_LL) {
-		if (qeth_is_supported(IPA_SOURCE_MAC_AVAIL)) {
-			/* start it */
-			QETH_DBF_TEXT2(0, trace, "enainsrc");
-			result = qeth_send_setassparms_simple_without_data
-				(card, IPA_SOURCE_MAC_AVAIL, IPA_CMD_ASS_START);
-			if (result) {
-				PRINT_WARN
-					("Could not start inbound source "
-					 "assist on %s: 0x%x, continuing\n",
-					 card->dev_name, result);
-				sprintf(dbf_text, "INSR%4x", result);
-				QETH_DBF_TEXT2(0, trace, dbf_text);
-				/* go on */
-			}
-		} else {
-			PRINT_INFO("Inbound source addresses not "
-				   "supported on %s\n", card->dev_name);
-		}
-	}
-	__qeth_start_vlan_assist(card);
-	__qeth_start_mc_assist(card);
-	
-	if (!qeth_is_supported(IPA_IPv6)) {
-		QETH_DBF_TEXT2(0, trace, "ipv6ntsp");
-		PRINT_WARN("IPv6 not supported on %s\n", card->dev_name);
-	} else {
-		result = __qeth_softsetup_enable_ipv6(card, do_a_startlan6);
-		if (result != 0)
-			return result;
-	}
-	
-	card->broadcast_capable = 0;
-	if (!qeth_is_supported(IPA_FILTERING)) {
-		QETH_DBF_TEXT2(0, trace, "filtntsp");
-		PRINT_WARN("Broadcasting not supported on %s\n",
-			   card->dev_name);
-	} else {
-		QETH_DBF_TEXT2(0, trace, "enafiltr");
-		result = qeth_send_setassparms_simple_without_data
-			(card, IPA_FILTERING, IPA_CMD_ASS_START);
-		if (result) {
-			PRINT_WARN("Could not enable broadcast "
-				   "filtering on %s: "
-				   "0x%x, continuing\n",
-				   card->dev_name, result);
-			sprintf(dbf_text, "FLT1%4x", result);
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-			goto go_on_filt;
-		}
-		result = qeth_send_setassparms_simple_with_data
-			(card, IPA_FILTERING, IPA_CMD_ASS_CONFIGURE, 1);
-		if (result) {
-			PRINT_WARN("Could not set up broadcast "
-				   "filtering on %s: "
-				   "0x%x, continuing\n",
-				   card->dev_name, result);
-			sprintf(dbf_text, "FLT2%4x", result);
-			QETH_DBF_TEXT2(0, trace, dbf_text);
-			goto go_on_filt;
-		}
-		card->dev->flags |= IFF_BROADCAST;
-		card->broadcast_capable = 1;
-	}
-go_on_filt:
-	if (card->options.checksum_type == HW_CHECKSUMMING) {
-		if (!qeth_is_supported(IPA_INBOUND_CHECKSUM)) {
-			PRINT_WARN("Inbound HW checksumming not "
-				   "supported on %s, continuing "
-				   "using inbound sw checksumming\n",
-				   card->dev_name);
-			QETH_DBF_TEXT2(0, trace, "ibckntsp");
-			card->options.checksum_type = SW_CHECKSUMMING;
-		} else {
-			QETH_DBF_TEXT2(0, trace, "ibcksupp");
-			result = qeth_send_setassparms_simple_without_data
-				(card, IPA_INBOUND_CHECKSUM,
-				 IPA_CMD_ASS_START);
-			if (result) {
-				PRINT_WARN("Could not start inbound "
-					   "checksumming on %s: 0x%x, "
-					   "continuing using "
-					   "inbound sw checksumming\n",
-					   card->dev_name, result);
-				sprintf(dbf_text, "SIBC%4x", result);
-				QETH_DBF_TEXT2(0, trace, dbf_text);
-				card->options.checksum_type = SW_CHECKSUMMING;
-				goto go_on_checksum;
-			}
-			result=qeth_send_setassparms_simple_with_data
-				(card,IPA_INBOUND_CHECKSUM,
-				 IPA_CMD_ASS_ENABLE, card->csum_enable_mask);
-			if (result) {
-				PRINT_WARN("Could not enable inbound " \
-					   "checksumming on %s: 0x%x, " \
-					   "continuing using " \
-					   "inbound sw checksumming\n",
-					   card->dev_name,result);
-				sprintf(dbf_text,"EIBC%4x",result);
-				QETH_DBF_TEXT2(0,trace,dbf_text);
-				card->options.checksum_type = SW_CHECKSUMMING;
-				goto go_on_checksum;
-
-			}
-		}
-	}
-go_on_checksum:	
-	atomic_set(&card->is_softsetup, 1);
-	return 0;
-}
-
-static inline void
-__qeth_softsetup_routingv4(struct qeth_card *card)
-{
-	int result;
-	char dbf_text[15];
-
-	if (!atomic_read(&card->enable_routing_attempts4))
-		return;
-
-	if (!card->options.routing_type4) {
-		atomic_set(&card->enable_routing_attempts4, 0);
-		atomic_set(&card->rt4fld, 0);
-		return;
-	}
-
-	sprintf(dbf_text, "strtg4%2x", card->options.routing_type4);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-	result = qeth_send_setrtg(card, card->options.routing_type4, 4);
-	if (!result) {	/* routing set correctly */
-		atomic_set(&card->enable_routing_attempts4, 0);
-		atomic_set(&card->rt4fld, 0);
-		return;
-	}
-	if (atomic_dec_return(&card->enable_routing_attempts4)) {
-		PRINT_WARN("couldn't set up v4 routing type "
-			   "on %s: 0x%x (%s).\nWill try "
-			   "next time again.\n",
-			   card->dev_name, result,
-			   ((result == 0xe010) || (result == 0xe008)) ?
-			   "primary already defined"
-			   : ((result == 0xe011) || (result == 0xe009)) ?
-			   "secondary already defined"
-			   : (result == 0xe012) ? "invalid indicator" :
-			   "unknown return code");
-		sprintf(dbf_text, "sRT4%4x", result);
-		atomic_set(&card->rt4fld, 1);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-	} else {
-		PRINT_WARN("couldn't set up v4 routing type "
-			   "on %s: 0x%x (%s).\nTrying to "
-			   "continue without routing.\n",
-			   card->dev_name, result,
-			   ((result == 0xe010) || (result == 0xe008)) ?
-			   "primary already defined"
-			   : ((result == 0xe011) || (result == 0xe009)) ?
-			   "secondary already defined"
-			   : (result == 0xe012) ? "invalid indicator" :
-			   "unknown return code");
-		sprintf(dbf_text, "SRT4%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		atomic_set(&card->rt4fld, 1);
-	}
-}
-
-static void
-__qeth_softsetup_routingv6(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	int result;
-	char dbf_text[15];
-
-	if (!atomic_read(&card->enable_routing_attempts6))
-		return;
-
-	if (!card->options.routing_type6 ||
-	    ((card->type == QETH_CARD_TYPE_OSAE) &&
-	    ((card->options.routing_type6&ROUTER_MASK) == MULTICAST_ROUTER) &&
-	    !qeth_is_supported6(IPA_OSA_MC_ROUTER_AVAIL))) {
-		atomic_set(&card->enable_routing_attempts6, 0);
-		atomic_set(&card->rt6fld, 0);
-		return;
-	}
-	sprintf(dbf_text, "strtg6%2x", card->options.routing_type6);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-	result = qeth_send_setrtg(card, card->options.routing_type6, 6);
-	if (!result) {	/* routing set correctly */
-		atomic_set(&card->enable_routing_attempts6, 0);
-		atomic_set(&card->rt6fld, 0);
-		return;
-	}
-	if (atomic_dec_return(&card->enable_routing_attempts6)) {
-		PRINT_WARN("couldn't set up v6 routing type "
-			   "on %s: 0x%x (%s).\nWill try "
-			   "next time again.\n",
-			   card->dev_name, result,
-			   ((result == 0xe010) || (result == 0xe008)) ?
-			   "primary already defined"
-			   : ((result == 0xe011) || (result == 0xe009)) ?
-			   "secondary already defined"
-			   : (result == 0xe012) ? "invalid indicator" :
-			   "unknown return code");
-		sprintf(dbf_text, "sRT6%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		atomic_set(&card->rt6fld, 1);
-	} else {
-		PRINT_WARN("couldn't set up v6 routing type "
-			   "on %s: 0x%x (%s).\nTrying to "
-			   "continue without routing.\n",
-			   card->dev_name, result,
-			   ((result == 0xe010) || (result == 0xe008)) ?
-			   "primary already defined"
-			   : ((result == 0xe011) || (result == 0xe009)) ?
-			   "secondary already defined"
-			   : (result == 0xe012) ? "invalid indicator" :
-			   "unknown return code");
-		sprintf(dbf_text, "SRT6%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		atomic_set(&card->rt6fld, 1);
-	}
-#endif /* QETH_IPV6 */
-}
-
-static int
-qeth_softsetup_card(struct qeth_card *card, int wait_for_lock)
-{
-	int result;
-	char dbf_text[15];
-	int use_setip_retries = 1;
-
-	if (wait_for_lock == QETH_WAIT_FOR_LOCK) {
-		down(&card->softsetup_sema);
-	} else if (wait_for_lock == QETH_DONT_WAIT_FOR_LOCK) {
-		if (!down_trylock(&card->softsetup_sema)) {
-			return -EAGAIN;
-		}
-	} else if (wait_for_lock == QETH_LOCK_ALREADY_HELD) {
-		use_setip_retries = 0;	/* we are in recovery and don't want
-					   to repeat setting ips on and on */
-	} else {
-		return -EINVAL;
-	}
-
-	qeth_save_dev_flag_state(card);
-
-	QETH_DBF_CARD1(0, trace, wait_for_lock?"sscw":"sscn", card);
-
-	result = __qeth_softsetup_start_assists(card);
-	if (result)
-		goto out;
-
-	__qeth_softsetup_routingv4(card);
-	__qeth_softsetup_routingv6(card);
-
-	QETH_DBF_TEXT2(0, trace, "delvipa");
-	qeth_set_vipas(card, 0);
-	QETH_DBF_TEXT2(0, trace, "toip/ms");
-	qeth_takeover_ip_ipms(card);
-	qeth_takeover_ip_ipms6(card);
-	QETH_DBF_TEXT2(0, trace, "setvipa");
-	qeth_set_vipas(card, 1);
-
-	result = qeth_setips(card, use_setip_retries);
-	if (result) {		/* by now, qeth_setips does not return errors */
-		PRINT_WARN("couldn't set up IPs on %s: 0x%x\n",
-			   card->dev_name, result);
-		sprintf(dbf_text, "SSIP%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		atomic_set(&card->is_softsetup, 0);
-		goto out;
-	}
-	result = qeth_setipms(card, use_setip_retries);
-	if (result) {		/* by now, qeth_setipms does not return errors */
-		PRINT_WARN("couldn't set up multicast IPs on %s: 0x%x\n",
-			   card->dev_name, result);
-		sprintf(dbf_text, "ssim%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		atomic_set(&card->is_softsetup, 0);
-		goto out;
-	}
-out:
-	if (!result) {
-		netif_wake_queue(card->dev);
-	}
-	if (wait_for_lock != QETH_LOCK_ALREADY_HELD)
-		up(&card->softsetup_sema);
-	return result;
-}
-
-static int
-qeth_softsetup_thread(void *param)
-{
-	char name[15];
-	struct qeth_card *card = (struct qeth_card *) param;
-
-	/* set a nice name ... */
-	sprintf(name, "qethsoftd%s", CARD_BUS_ID(card));
-	daemonize(name);
-
-	QETH_DBF_CARD2(0, trace, "ssth", card);
-
-	atomic_set(&card->softsetup_thread_is_running, 1);
-	for (;;) {
-		if (atomic_read(&card->shutdown_phase))
-			goto out;
-		down_interruptible(&card->softsetup_thread_sem);
-		QETH_DBF_CARD2(0, trace, "ssst", card);
-		if (atomic_read(&card->shutdown_phase))
-			goto out;
-		while (qeth_softsetup_card(card, QETH_DONT_WAIT_FOR_LOCK)
-		       == -EAGAIN) {
-			if (atomic_read(&card->shutdown_phase))
-				goto out;
-			qeth_wait_nonbusy(QETH_IDLE_WAIT_TIME);
-		}
-		QETH_DBF_CARD2(0, trace, "sssd", card);
-		netif_wake_queue(card->dev);
-	}
-out:
-	atomic_set(&card->softsetup_thread_is_running, 0);
-
-	QETH_DBF_CARD2(0, trace, "lsst", card);
-
-	return 0;
-}
-
-static void
-qeth_softsetup_thread_starter(void *data)
-{
-	struct qeth_card *card = (struct qeth_card *) data;
-
-	QETH_DBF_CARD4(0, trace, "ssts", card);
-	sema_init(&card->softsetup_thread_sem, 0);
-	kernel_thread(qeth_softsetup_thread, card, SIGCHLD);
-}
-
-static void
-qeth_start_reinit_thread(struct qeth_card *card)
-{
-	/* we allow max 2 reinit threads, one could be just about to
-	 * finish and the next would be waiting. another waiting
-	 * reinit_thread is not necessary. */
-	if (atomic_read(&card->reinit_counter) < 2) {
-		atomic_inc(&card->reinit_counter);
-		if (atomic_read(&card->shutdown_phase)) {
-			atomic_dec(&card->reinit_counter);
-			return;
-		}
-		QETH_DBF_CARD2(0, trace, "stri", card);
-		PRINT_STUPID("starting reinit-thread\n");
-		kernel_thread(qeth_reinit_thread, card, SIGCHLD);
-	}
-}
-
-static void
-qeth_recover(void *data)
-{
-	struct qeth_card *card;
-	int i;
-	char dbf_text[15];
-
-	card = (struct qeth_card *) data;
-
-	QETH_DBF_CARD2(0, trace, "recv", card);
-
-	if (atomic_compare_and_swap(0, 1, &card->in_recovery))
-		return;
-
-	i = atomic_read(&card->problem);
-
-	sprintf(dbf_text, "PROB%4x", i);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	if (i != PROBLEM_TX_TIMEOUT)
-		PRINT_WARN("recovery was scheduled on device %s (%s) with "
-			   "problem 0x%x\n",
-			   CARD_BUS_ID(card), card->dev_name, i);
-	switch (i) {
-	case PROBLEM_RECEIVED_IDX_TERMINATE:
-		if (atomic_read(&card->in_recovery))
-			atomic_set(&card->break_out, QETH_BREAKOUT_AGAIN);
-		break;
-	case PROBLEM_CARD_HAS_STARTLANED:
-		PRINT_WARN("You are lucky! Somebody either fixed the "
-			   "network problem, plugged the cable back in "
-			   "or enabled the OSA port on %s (CHPID 0x%X). "
-			   "The link has come up.\n",
-			   card->dev_name, card->chpid);
-		sprintf(dbf_text, "CBIN%4x", i);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		atomic_set(&card->is_softsetup, 0);
-		qeth_set_dev_flag_running(card);
-		atomic_set(&card->enable_routing_attempts4,
-			   QETH_ROUTING_ATTEMPTS);
-		qeth_clear_ifa4_list(&card->ip_new_state.ip_ifa);
-#ifdef QETH_IPV6
-		atomic_set(&card->enable_routing_attempts6,
-			   QETH_ROUTING_ATTEMPTS);
-		qeth_clear_ifa6_list(&card->ip_new_state.ip6_ifa);
-#endif /* QETH_IPV6 */
-		qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm_ifa);
-#ifdef QETH_IPV6
-		qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm6_ifa);
-#endif /* QETH_IPV6 */
-		qeth_refresh_vipa_states(card);
-		qeth_start_softsetup_thread(card);
-		atomic_set(&card->in_recovery, 0);
-		break;
-	case PROBLEM_RESETTING_EVENT_INDICATOR:
-		/* we do nothing here */
-		break;
-	case PROBLEM_ACTIVATE_CHECK_CONDITION:
-	case PROBLEM_GENERAL_CHECK:
-	case PROBLEM_USER_TRIGGERED_RECOVERY:
-	case PROBLEM_AFFE:
-	case PROBLEM_MACHINE_CHECK:
-	case PROBLEM_BAD_SIGA_RESULT:
-	case PROBLEM_TX_TIMEOUT:
-		qeth_start_reinit_thread(card);
-		break;
-	}
-}
-
-static inline void
-qeth_schedule_recovery(struct qeth_card *card)
-{
-	if (card) {
-		INIT_WORK(&card->tqueue, qeth_recover, card);
-		schedule_work(&card->tqueue);
-	} else {
-		QETH_DBF_TEXT2(1, trace, "scdnocrd");
-		PRINT_WARN("recovery requested to be scheduled "
-			   "with no card!\n");
-	}
-}
-
-static void
-qeth_qdio_input_handler(struct ccw_device *cdev, unsigned int status,
-			unsigned int qdio_error, unsigned int siga_error,
-			unsigned int queue,
-			int first_element, int count, unsigned long card_ptr)
-{
-	struct net_device *dev;
-	struct qeth_card *card;
-	int problem;
-	int sbalf15;
-	char dbf_text[15];
-
-	sprintf(dbf_text, "qibhn%s", cdev->dev.bus_id);
-	QETH_DBF_HEX6(0, trace, dbf_text, QETH_DBF_TRACE_LEN);
-
-	card = (struct qeth_card *) card_ptr;
-
-#ifdef QETH_PERFORMANCE_STATS
-	card->perf_stats.inbound_start_time = NOW;
-#endif /* QETH_PERFORMANCE_STATS */
-	dev = card->dev;
-
-	if (status & QDIO_STATUS_LOOK_FOR_ERROR) {
-		if (status & QDIO_STATUS_ACTIVATE_CHECK_CONDITION) {
-			problem = PROBLEM_ACTIVATE_CHECK_CONDITION;
-			atomic_set(&card->problem, problem);
-			QETH_DBF_TEXT1(0, trace, "IHACTQCK");
-			sprintf(dbf_text, "%4x%4x", first_element, count);
-			QETH_DBF_TEXT1(0, trace, dbf_text);
-			sprintf(dbf_text, "%4x%4x", queue, status);
-			QETH_DBF_TEXT1(0, trace, dbf_text);
-			QETH_DBF_CARD1(1, trace, "qscd", card);
-			qeth_schedule_recovery(card);
-			return;
-		}
-		sbalf15 = (card->inbound_qdio_buffers[(first_element + count - 1)
-						      & QDIO_MAX_BUFFERS_PER_Q].
-			   element[15].flags) && 0xff;
-		PRINT_STUPID("inbound qdio transfer error on device %s. "
-			     "qdio_error=0x%x (more than one: %c), "
-			     "siga_error=0x%x (more than one: %c), "
-			     "sbalf15=x%x, bufno=x%x\n", cdev->dev.bus_id,
-			     qdio_error,
-			     (status & QDIO_STATUS_MORE_THAN_ONE_QDIO_ERROR) ?
-			     'y' : 'n', siga_error,
-			     (status & QDIO_STATUS_MORE_THAN_ONE_SIGA_ERROR) ?
-			     'y' : 'n', sbalf15, first_element);
-		QETH_DBF_CARD1(0, trace, "IQTI", card);
-		QETH_DBF_CARD1(0, qerr, "IQTI", card);
-		sprintf(dbf_text, "%4x%4x", first_element, count);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_TEXT1(0, qerr, dbf_text);
-		sprintf(dbf_text, "%2x%4x%2x", queue, status, sbalf15);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_TEXT1(0, qerr, dbf_text);
-		sprintf(dbf_text, "%4x%4x", qdio_error, siga_error);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_TEXT1(0, qerr, dbf_text);
-		/* we inform about error more detailed in
-		 * qeth_read_in_buffer() */
-	}
-
-	for (;;) {
-		qeth_read_in_buffer(card, first_element);
-		qeth_queue_input_buffer(card, first_element,
-					QDIO_FLAG_UNDER_INTERRUPT);
-		count--;
-		if (count)
-			first_element = (first_element + 1) &
-			    (QDIO_MAX_BUFFERS_PER_Q - 1);
-		else
-			break;
-	}
-}
-
-static void
-__qeth_try_to_flush_packets(struct qeth_card *card, int last_pci_hit,
-			    unsigned int queue)
-{
-	int switch_state;
-
-	switch_state = (atomic_read(&card->outbound_used_buffers[queue]) <=
-			LOW_WATERMARK_PACK);
-	/* first_element is the last buffer that we got back from hydra */
-	if (!switch_state && !last_pci_hit)
-		return;
-	QETH_DBF_CARD3(0, trace, "stchcw", card);
-	if (atomic_swap(&card->outbound_ringbuffer_lock[queue], QETH_LOCK_FLUSH)
-	    == QETH_LOCK_UNLOCKED) {
-		/* 
-		 * we stop the queue as we try to not run onto the 
-		 * outbound_ringbuffer_lock -- this will not prevent it totally,
-		 * but reduce it. in high traffic situations, it saves around
-		 * 20us per second, hopefully this is amortized by calling 
-		 * netif_...
-		 */
-		netif_stop_queue(card->dev);
-		qeth_flush_packed_packets(card, queue,
-					  QDIO_FLAG_UNDER_INTERRUPT);
-		/* 
-		 * only switch state to non-packing, if the amount of used
-		 * buffers decreased
-		 */
-		if (switch_state)
-			card->send_state[queue] = SEND_STATE_DONT_PACK;
-		netif_wake_queue(card->dev);
-		atomic_set(&card->outbound_ringbuffer_lock[queue],
-			   QETH_LOCK_UNLOCKED);
-	}
-	/* 
-	 * if the lock was UNLOCKED, we flush ourselves, otherwise this is done
-	 * in do_send_packet when the lock is released
-	 */
-#ifdef QETH_PERFORMANCE_STATS
-	card->perf_stats.sc_p_dp++;
-#endif /* QETH_PERFORMANCE_STATS */
-}
-
-static void
-qeth_qdio_output_handler(struct ccw_device *cdev,
-			 unsigned int status,
-			 unsigned int qdio_error,
-			 unsigned int siga_error,
-			 unsigned int queue,
-			 int first_element, int count, unsigned long card_ptr)
-{
-	struct qeth_card *card;
-	int mycnt, problem, buffers_used;
-	int sbalf15;
-	char dbf_text[15];
-	int last_pci_hit = 0;
-	int last_pci;
-
-	sprintf(dbf_text, "qouthn%s", cdev->dev.bus_id);
-	QETH_DBF_HEX6(0, trace, dbf_text, QETH_DBF_TRACE_LEN);
-
-	mycnt = count;
-	card = (struct qeth_card *) card_ptr;
-
-	if (status & QDIO_STATUS_LOOK_FOR_ERROR) {
-		if (status & QDIO_STATUS_ACTIVATE_CHECK_CONDITION) {
-			problem = PROBLEM_ACTIVATE_CHECK_CONDITION;
-			atomic_set(&card->problem, problem);
-			QETH_DBF_TEXT1(0, trace, "OHACTQCK");
-			sprintf(dbf_text, "%4x%4x", first_element, count);
-			QETH_DBF_TEXT1(0, trace, dbf_text);
-			sprintf(dbf_text, "%4x%4x", queue, status);
-			QETH_DBF_TEXT1(0, trace, dbf_text);
-			QETH_DBF_CARD1(1, trace, "qscd", card);
-			qeth_schedule_recovery(card);
-			goto out;
-		}
-		sbalf15 = (card->outbound_ringbuffer[queue]->
-			   buffer[(first_element + count - 1) & QDIO_MAX_BUFFERS_PER_Q].
-			   element[15].flags) & 0xff;
-		PRINT_STUPID("outbound qdio transfer error on device %s, "
-			     "queue=%i. qdio_error=0x%x (more than one: %c),"
-			     " siga_error=0x%x (more than one: %c), "
-			     "sbalf15=x%x, bufno=x%x\n",
-			     cdev->dev.bus_id, queue, qdio_error, status &
-			     QDIO_STATUS_MORE_THAN_ONE_QDIO_ERROR ? 'y' : 'n',
-			     siga_error, status &
-			     QDIO_STATUS_MORE_THAN_ONE_SIGA_ERROR ? 'y' : 'n',
-			     sbalf15, first_element);
-		QETH_DBF_CARD1(0, trace, "IQTO", card);
-		QETH_DBF_CARD1(0, qerr, "IQTO", card);
-		sprintf(dbf_text, "%4x%4x", first_element, count);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_TEXT1(0, qerr, dbf_text);
-		sprintf(dbf_text, "%2x%4x%2x", queue, status, sbalf15);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_TEXT1(0, qerr, dbf_text);
-		sprintf(dbf_text, "%4x%4x", qdio_error, siga_error);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_TEXT1(0, qerr, dbf_text);
-		/* we maybe do recovery or dst_link_failures
-		 * in qeth_free_buffer */
-	}
-
-	if (mycnt) {
-		last_pci = atomic_read(&card->last_pci_pos[queue]);
-		for (;;) {
-			qeth_free_buffer(card, queue, first_element,
-					 qdio_error, siga_error);
-			if (first_element == last_pci)
-				last_pci_hit = 1;
-			mycnt--;
-			if (mycnt > 0)
-				first_element = (first_element + 1) &
-				    (QDIO_MAX_BUFFERS_PER_Q - 1);
-			else
-				break;
-		}
-	}
-
-	buffers_used = atomic_add_return(-count,
-					 &card->outbound_used_buffers[queue])
-		       + count;
-
-	switch (card->send_state[queue]) {
-	case SEND_STATE_PACK:
-		__qeth_try_to_flush_packets(card, last_pci_hit, queue);
-		break;
-	default:
-		break;
-	}
-
-	/* we don't have to start the queue, if it was started already */
-	if (buffers_used < QDIO_MAX_BUFFERS_PER_Q - 1)
-		return;
-
-out:
-	netif_wake_queue(card->dev);
-}
-
-static long
-__qeth_check_irb_error(struct ccw_device *cdev, struct irb *irb)
-{
-	if (!IS_ERR(irb))
-		return 0;
-
-	switch (PTR_ERR(irb)) {
-	case -EIO:
-		PRINT_WARN("i/o-error on device %s\n", cdev->dev.bus_id);
-		break;
-	case -ETIMEDOUT:
-		PRINT_WARN("timeout on device %s\n", cdev->dev.bus_id);
-		break;
-	default:
-		PRINT_WARN("unknown error %ld on device %s\n", PTR_ERR(irb),
-			   cdev->dev.bus_id);
-	}
-	return PTR_ERR(irb);
-}
-
-static void
-qeth_interrupt_handler_read(struct ccw_device *cdev, unsigned long intparm,
-			    struct irb *irb)
-{
-	int cstat, dstat;
-	int problem;
-	struct qeth_card *card;
-	int rqparam;
-	char dbf_text[15];
-	int result;
-
-	if (__qeth_check_irb_error(cdev, irb))
-		return;
-
-	cstat = irb->scsw.cstat;
-	dstat = irb->scsw.dstat;
-	rqparam = intparm;
-
-	sprintf(dbf_text, "rint%s", cdev->dev.bus_id);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-	sprintf(dbf_text, "%4x%4x", cstat, dstat);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-	sprintf(dbf_text, "%4x", rqparam);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-
-	card = CARD_FROM_CDEV(cdev);
-	if (!card)
-		return;
-
-	if (irb->scsw.fctl & (SCSW_FCTL_CLEAR_FUNC | SCSW_FCTL_HALT_FUNC)) {
-		atomic_set(&card->clear_succeeded0, 1);
-		wake_up(&card->wait_q);
-		return;
-	}
-
-	if (!rqparam) {
-		PRINT_STUPID("got unsolicited interrupt in read handler "
-			     "for %s\n", cdev->dev.bus_id);
-		return;
-	}
-
-	if ((dstat == 0) && (cstat == 0))
-		return;
-
-	if (irb->esw.esw0.erw.cons) {
-		PRINT_WARN("sense data available on read channel.\n");
-		HEXDUMP16(WARN, "irb: ", irb);
-		HEXDUMP16(WARN, "sense data: ", irb->ecw);
-		sprintf(dbf_text, "RSNS%s", cdev->dev.bus_id);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_HEX0(0, sense, irb, QETH_DBF_SENSE_LEN);
-	}
-
-	if (cstat != 0) {
-		PRINT_WARN("got nonzero-nonpci channel status in read_"
-			   "handler (device %s, devstat 0x%02x, schstat "
-			   "0x%02x, rqparam 0x%x)\n", cdev->dev.bus_id,
-			   dstat, cstat, rqparam);
-	}
-
-	problem = qeth_get_cards_problem(cdev, card->dma_stuff->recbuf,
-					 dstat, cstat, rqparam,
-					 (char *) irb, (char *) irb->ecw);
-
-	/* detect errors in dstat here */
-	if ((dstat & DEV_STAT_UNIT_EXCEP) || (dstat & DEV_STAT_UNIT_CHECK)) {
-		PRINT_WARN("unit check/exception in read_handler "
-			   "(device %s, devstat 0x%02x, schstat 0x%02x, "
-			   "rqparam 0x%x)\n",
-			   cdev->dev.bus_id, dstat, cstat, rqparam);
-
-		if (!atomic_read(&card->is_hardsetup)) {
-			if ((problem) && (qeth_is_to_recover(card, problem)))
-				atomic_set(&card->break_out,
-					   QETH_BREAKOUT_AGAIN);
-			else
-				atomic_set(&card->break_out,
-					   QETH_BREAKOUT_LEAVE);
-			goto wakeup_out;
-		} else
-			goto recover;
-	}
-
-	if (!(dstat & DEV_STAT_CHN_END)) {
-		PRINT_WARN("didn't get device end in read_handler "
-			   "(device %s, devstat 0x%02x, schstat 0x%02x, "
-			   "rqparam 0x%x)\n",
-			   cdev->dev.bus_id, dstat, cstat, rqparam);
-		goto wakeup_out;
-	}
-
-	if ((rqparam == IDX_ACTIVATE_WRITE_STATE) || (rqparam == NOP_STATE)) {
-		goto wakeup_out;
-	}
-
-	/* at this point, (maybe channel end and) device end has appeared */
-
-	/* we don't start the next read until we have examined the buffer. */
-	if ((rqparam != IDX_ACTIVATE_READ_STATE) &&
-	    (rqparam != IDX_ACTIVATE_WRITE_STATE))
-		qeth_issue_next_read(card);
-
-recover:
-	if (qeth_is_to_recover(card, problem)) {
-		QETH_DBF_CARD2(1, trace, "rscd", card);
-		qeth_schedule_recovery(card);
-		goto wakeup_out;
-	}
-
-	if (!IS_IPA(card->dma_stuff->recbuf) ||
-	    IS_IPA_REPLY(card->dma_stuff->recbuf)) {
-		/* setup or unknown data */
-		result = qeth_look_for_arp_data(card);
-		switch (result) {
-		case ARP_RETURNCODE_ERROR:
-		case ARP_RETURNCODE_LASTREPLY:
-			qeth_wakeup_ioctl(card);
-			return;
-		default:
-			break;
-		}
-	}
-
-wakeup_out:
-	memcpy(card->ipa_buf, card->dma_stuff->recbuf, QETH_BUFSIZE);
-	qeth_wakeup(card);
-}
-
-static void
-qeth_interrupt_handler_write(struct ccw_device *cdev, unsigned long intparm,
-			     struct irb *irb)
-{
-	int cstat, dstat, rqparam;
-	struct qeth_card *card;
-	int problem;
-	char dbf_text[15];
-
-	if (__qeth_check_irb_error(cdev, irb))
-		return;
-
-	cstat = irb->scsw.cstat;
-	dstat = irb->scsw.dstat;
-	rqparam = intparm;
-
-	sprintf(dbf_text, "wint%s", cdev->dev.bus_id);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-	sprintf(dbf_text, "%4x%4x", cstat, dstat);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-	sprintf(dbf_text, "%4x", rqparam);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-
-	card = CARD_FROM_CDEV(cdev);
-	if (!card)
-		return;
-
-	if (irb->scsw.fctl & (SCSW_FCTL_CLEAR_FUNC | SCSW_FCTL_HALT_FUNC)) {
-		atomic_set(&card->clear_succeeded1, 1);
-		wake_up(&card->wait_q);
-		goto out;
-	}
-
-	if (!rqparam) {
-		PRINT_STUPID("got unsolicited interrupt in write handler "
-			     "for %s\n", cdev->dev.bus_id);
-		return;
-	}
-
-	if ((dstat == 0) && (cstat == 0))
-		goto out;
-
-	if (irb->esw.esw0.erw.cons) {
-		PRINT_WARN("sense data available on write channel.\n");
-		HEXDUMP16(WARN, "irb: ", irb);
-		HEXDUMP16(WARN, "sense data: ", irb->ecw);
-		sprintf(dbf_text, "WSNS%s", cdev->dev.bus_id);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_HEX0(0, sense, irb, QETH_DBF_SENSE_LEN);
-	}
-
-	if (cstat != 0) {
-		PRINT_WARN("got nonzero channel status in write_handler "
-			   "(device %s, devstat 0x%02x, schstat 0x%02x, "
-			   "rqparam 0x%x)\n",
-			   cdev->dev.bus_id, dstat, cstat, rqparam);
-	}
-
-	problem = qeth_get_cards_problem(cdev, NULL,
-					 dstat, cstat, rqparam,
-					 (char *) irb, (char *) irb->ecw);
-
-	/* detect errors in dstat here */
-	if ((dstat & DEV_STAT_UNIT_EXCEP) || (dstat & DEV_STAT_UNIT_CHECK)) {
-		PRINT_WARN("unit check/exception in write_handler "
-			   "(device %s, devstat 0x%02x, schstat 0x%02x, "
-			   "rqparam 0x%x)\n",
-			   cdev->dev.bus_id, dstat, cstat, rqparam);
-		if (!atomic_read(&card->is_hardsetup)) {
-			if (problem == PROBLEM_RESETTING_EVENT_INDICATOR) {
-				atomic_set(&card->break_out,
-					   QETH_BREAKOUT_AGAIN);
-				qeth_wakeup(card);
-				goto out;
-			}
-			atomic_set(&card->break_out, QETH_BREAKOUT_LEAVE);
-			goto out;
-		} else
-			goto recover;
-	}
-
-	if (dstat == DEV_STAT_DEV_END)
-		goto out;
-
-	if (!(dstat & DEV_STAT_CHN_END)) {
-		PRINT_WARN("didn't get device end in write_handler "
-			   "(device %s, devstat 0x%02x, schstat 0x%02x, "
-			   "rqparam 0x%x)\n",
-			   cdev->dev.bus_id, dstat, cstat, rqparam);
-		goto out;
-	}
-
-recover:
-	if (qeth_is_to_recover(card, problem)) {
-		QETH_DBF_CARD2(1, trace, "wscd", card);
-		qeth_schedule_recovery(card);
-		goto out;
-	}
-
-	/* at this point, (maybe channel end and) device end has appeared */
-	if ((rqparam == IDX_ACTIVATE_READ_STATE) ||
-	    (rqparam == IDX_ACTIVATE_WRITE_STATE) || (rqparam == NOP_STATE)) {
-		qeth_wakeup(card);
-		goto out;
-	}
-
-	/* well, a write has been done successfully. */
-
-out:
-	/* all statuses are final statuses on the write channel */
-	atomic_set(&card->write_busy, 0);
-}
-
-static void
-qeth_interrupt_handler_qdio(struct ccw_device *cdev, unsigned long intparm,
-			    struct irb *irb)
-{
-	int cstat, dstat, rqparam;
-	char dbf_text[15];
-	struct qeth_card *card;
-
-	if (__qeth_check_irb_error(cdev, irb))
-		return;
-
-	cstat = irb->scsw.cstat;
-	dstat = irb->scsw.dstat;
-	rqparam = intparm;
-
-	sprintf(dbf_text, "qint%s", cdev->dev.bus_id);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-	sprintf(dbf_text, "%4x%4x", cstat, dstat);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-	sprintf(dbf_text, "%4x", rqparam);
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-
-	card = CARD_FROM_CDEV(cdev);
-	if (!card)
-		return;
-
-	if (irb->scsw.fctl & (SCSW_FCTL_CLEAR_FUNC | SCSW_FCTL_HALT_FUNC)) {
-		atomic_set(&card->clear_succeeded2, 1);
-		wake_up(&card->wait_q);
-		return;
-	}
-
-	if (!rqparam) {
-		PRINT_STUPID("got unsolicited interrupt in qdio handler, "
-			     "device%s\n", cdev->dev.bus_id);
-		return;
-	}
-
-	if ((dstat == 0) && (cstat == 0))
-		return;
-
-	if (irb->esw.esw0.erw.cons) {
-		PRINT_WARN("sense data available on qdio channel.\n");
-		HEXDUMP16(WARN, "irb: ", irb);
-		HEXDUMP16(WARN, "sense data: ", irb->ecw);
-		sprintf(dbf_text, "QSNS%s", cdev->dev.bus_id);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		QETH_DBF_HEX0(0, sense, irb, QETH_DBF_SENSE_LEN);
-	}
-
-	if (rqparam == NOP_STATE) {
-		qeth_wakeup(card);
-		return;
-	}
-
-	if (cstat != 0) {
-		sprintf(dbf_text, "qchk%s", cdev->dev.bus_id);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		sprintf(dbf_text, "%4x%4x", cstat, dstat);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		sprintf(dbf_text, "%4x", rqparam);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		PRINT_WARN("got nonzero channel status in qdio_handler "
-			   "(device %s, devstat 0x%02x, schstat 0x%02x)\n",
-			   cdev->dev.bus_id, dstat, cstat);
-	}
-
-	if (dstat & ~(DEV_STAT_CHN_END | DEV_STAT_DEV_END)) {
-		PRINT_WARN("got the following dstat on the qdio channel: "
-			   "device %s, dstat 0x%02x, cstat 0x%02x, "
-			   "rqparam=%i\n",
-			   cdev->dev.bus_id, dstat, cstat, rqparam);
-	}
-
-}
-
-static int
-qeth_register_netdev(struct qeth_card *card)
-{
-	int result;
-
-	QETH_DBF_CARD3(0, trace, "rgnd", card);
-
-	/* sysfs magic */
-	SET_NETDEV_DEV(card->dev, &card->gdev->dev);
-	result = register_netdev(card->dev);
-
-	return result;
-}
-
-static void
-qeth_unregister_netdev(struct qeth_card *card)
-{
-	QETH_DBF_CARD3(0, trace, "nrgn", card);
-
-	unregister_netdev(card->dev);
-}
-
-static int
-qeth_stop(struct net_device *dev)
-{
-	struct qeth_card *card;
-
-	card = (struct qeth_card *) dev->priv;
-	QETH_DBF_CARD2(0, trace, "stop", card);
-	QETH_DBF_CARD2(0, setup, "stop", card);
-
-	qeth_save_dev_flag_state(card);
-
-	netif_stop_queue(dev);
-	atomic_set(&card->is_open, 0);
-
-	return 0;
-}
-
-static void
-qeth_softshutdown(struct qeth_card *card)
-{
-	QETH_DBF_CARD3(0, trace, "ssht", card);
-
-	qeth_send_stoplan(card);
-}
-
-static void
-__qeth_clear_card_halt_clear(struct qeth_card *card, int halt)
-{
-	unsigned long flags0, flags1, flags2;
-	int ret0, ret1, ret2;
-
-	atomic_set(&card->clear_succeeded0, 0);
-	atomic_set(&card->clear_succeeded1, 0);
-	atomic_set(&card->clear_succeeded2, 0);
-	
-	spin_lock_irqsave(get_ccwdev_lock(CARD_RDEV(card)), flags0);
-	if (halt)
-		ret0 = ccw_device_halt(CARD_RDEV(card), CLEAR_STATE);
-	else
-		ret0 = ccw_device_clear(CARD_RDEV(card), CLEAR_STATE);
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_RDEV(card)), flags0);
-	
-	spin_lock_irqsave(get_ccwdev_lock(CARD_WDEV(card)), flags1);
-	if (halt)
-		ret1 = ccw_device_halt(CARD_WDEV(card), CLEAR_STATE);
-	else
-		ret1 = ccw_device_clear(CARD_WDEV(card), CLEAR_STATE);
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_WDEV(card)), flags1);
-	
-	spin_lock_irqsave(get_ccwdev_lock(CARD_DDEV(card)), flags2);
-	if (halt)
-		ret2 = ccw_device_halt(CARD_DDEV(card), CLEAR_STATE);
-	else
-		ret2 = ccw_device_clear(CARD_DDEV(card), CLEAR_STATE);
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_DDEV(card)), flags2);
-
-	/* The device owns us an interrupt. */
-	if ((ret0 == 0) && (atomic_read(&card->clear_succeeded0) == 0))
-		wait_event(card->wait_q,
-			   atomic_read(&card->clear_succeeded0) == 1);
-	if ((ret1 == 0) && (atomic_read(&card->clear_succeeded1) == 0))
-		wait_event(card->wait_q,
-			   atomic_read(&card->clear_succeeded1) == 1);
-	if ((ret2 == 0) && (atomic_read(&card->clear_succeeded2) == 0))
-		wait_event(card->wait_q,
-			   atomic_read(&card->clear_succeeded2) == 1);
-}
-
-static void
-qeth_clear_card(struct qeth_card *card, int qdio_clean, int use_halt)
-{
-	QETH_DBF_CARD3(0, trace, qdio_clean?"clrq":"clr", card);
-	QETH_DBF_CARD1(0, setup, qdio_clean?"clrq":"clr", card);
-
-	atomic_set(&card->write_busy, 0);
-	if (qdio_clean)
-		qdio_cleanup(CARD_DDEV(card),
-			     (card->type == QETH_CARD_TYPE_IQD) ?
-			     QDIO_FLAG_CLEANUP_USING_HALT :
-			     QDIO_FLAG_CLEANUP_USING_CLEAR);
-
-	if (use_halt)
-		__qeth_clear_card_halt_clear(card, 1);
-
-	__qeth_clear_card_halt_clear(card, 0);
-}
-
-static void
-qeth_free_card_stuff(struct qeth_card *card)
-{
-	int i, j;
-	struct qeth_vipa_entry *e, *e2;
-
-	if (!card)
-		return;
-
-	QETH_DBF_CARD3(0, trace, "freest", card);
-	QETH_DBF_CARD1(0, setup, "freest", card);
-
-	write_lock(&card->vipa_list_lock);
-	e = card->vipa_list;
-	while (e) {
-		e2 = e->next;
-		kfree(e);
-		e = e2;
-	}
-	write_unlock(&card->vipa_list_lock);
-
-	for (i = 0; i < card->options.inbound_buffer_count; i++) {
-		for (j = 0; j < BUFFER_MAX_ELEMENTS; j++) {
-			if (card->inbound_buffer_pool_entry[i][j]) {
-				kfree(card->inbound_buffer_pool_entry[i][j]);
-				card->inbound_buffer_pool_entry[i][j] = NULL;
-			}
-		}
-	}
-	for (i = 0; i < card->no_queues; i++)
-		if (card->outbound_ringbuffer[i])
-			vfree(card->outbound_ringbuffer[i]);
-
-	if (card->stats)
-		kfree(card->stats);
-	if (card->dma_stuff)
-		kfree(card->dma_stuff);
-	if (card->dev)
-		free_netdev(card->dev);
-
-}
-
-static void
-qeth_free_card(struct qeth_card *card)
-{
-
-	if (!card)
-		return;
-
-	QETH_DBF_CARD3(0, trace, "free", card);
-	QETH_DBF_CARD1(0, setup, "free", card);
-
-	vfree(card);		/* we checked against NULL already */
-}
-
-/* also locked from outside (setup_lock) */
-static void
-qeth_remove_card_from_list(struct qeth_card *card)
-{
-	struct qeth_card *cn;
-	unsigned long flags0, flags1, flags2;
-
-	write_lock(&list_lock);
-	if (!card) {
-		QETH_DBF_TEXT2(0, trace, "RMCWNOCD");
-		PRINT_WARN("qeth_remove_card_from_list call with no card!\n");
-		write_unlock(&list_lock);
-		return;
-	}
-
-	QETH_DBF_CARD3(0, trace, "rmcl", card);
-
-	/* check first, if card is in list */
-	if (!firstcard) {
-		QETH_DBF_TEXT2(0, trace, "NOCRDINL");
-		PRINT_WARN
-		    ("qeth_remove_card_from_list called on empty card list!!\n");
-		write_unlock(&list_lock);
-		return;
-	}
-
-	spin_lock_irqsave(get_ccwdev_lock(CARD_RDEV(card)), flags0);
-	spin_lock_irqsave(get_ccwdev_lock(CARD_WDEV(card)), flags1);
-	spin_lock_irqsave(get_ccwdev_lock(CARD_DDEV(card)), flags2);
-
-	if (firstcard == card)
-		firstcard = card->next;
-	else {
-		cn = firstcard;
-		while (cn->next) {
-			if (cn->next == card) {
-				cn->next = card->next;
-				card->next = NULL;
-				break;
-			}
-			cn = cn->next;
-		}
-	}
-
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_DDEV(card)), flags2);
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_WDEV(card)), flags1);
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_RDEV(card)), flags0);
-
-	write_unlock(&list_lock);
-
-}
-
-static void
-qeth_delete_all_ips(struct qeth_card *card)
-{
-	struct qeth_vipa_entry *e;
-
-	if (atomic_read(&card->is_softsetup)) {
-		qeth_clear_ifa4_list(&card->ip_new_state.ip_ifa);
-		qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm_ifa);
-
-#ifdef QETH_IPV6
-		qeth_clear_ifa6_list(&card->ip_new_state.ip6_ifa);
-		qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm6_ifa);
-#endif /* QETH_IPV6 */
-
-		write_lock(&card->vipa_list_lock);
-		e = card->vipa_list;
-		while (e) {
-			e->state = VIPA_2_B_REMOVED;
-			e = e->next;
-		}
-		write_unlock(&card->vipa_list_lock);
-		qeth_start_softsetup_thread(card);
-	}
-}
-
-static void
-qeth_remove_card(struct qeth_card *card, int method)
-{
-	if (!card)
-		return;
-
-	QETH_DBF_CARD2(0, trace, "rmcd", card);
-	QETH_DBF_CARD1(0, setup, "rmcd", card);
-
-	if (method == QETH_REMOVE_CARD_PROPER) {
-		atomic_set(&card->shutdown_phase, QETH_REMOVE_CARD_PROPER);
-		if (atomic_read(&card->is_open)) {
-			qeth_stop(card->dev);
-			qeth_wait_nonbusy(QETH_REMOVE_WAIT_TIME);
-		}
-		qeth_delete_all_ips(card);
-	} else {
-		atomic_set(&card->shutdown_phase, QETH_REMOVE_CARD_QUICK);
-	}
-	atomic_set(&card->write_busy, 0);
-
-	QETH_DBF_TEXT4(0, trace, "freeskbs");
-	qeth_free_all_skbs(card);
-
-	QETH_DBF_TEXT2(0, trace, "upthrsem");
-
-	up(&card->softsetup_thread_sem);
-	up(&card->reinit_thread_sem);
-	while ((atomic_read(&card->softsetup_thread_is_running)) ||
-	       (atomic_read(&card->reinit_counter))) {
-		qeth_wait_nonbusy(QETH_WAIT_FOR_THREAD_TIME);
-	}
-
-	if (method == QETH_REMOVE_CARD_PROPER) {
-		QETH_DBF_TEXT4(0, trace, "softshut");
-		qeth_softshutdown(card);
-		qeth_wait_nonbusy(QETH_REMOVE_WAIT_TIME);
-	}
-
-	atomic_set(&card->is_startlaned, 0);	/* paranoia, qeth_stop
-						   should prevent
-						   further calls of
-						   hard_start_xmit */
-
-	if (atomic_read(&card->is_registered)) {
-		QETH_DBF_TEXT2(0, trace, "unregdev");
-		qeth_unregister_netdev(card);
-		qeth_wait_nonbusy(QETH_REMOVE_WAIT_TIME);
-		atomic_set(&card->is_registered, 0);
-	}
-
-	qeth_put_unique_id(card);
-
-	QETH_DBF_TEXT2(0, trace, "clrcard");
-	if (atomic_read(&card->is_hardsetup)) {
-		PRINT_STUPID("clearing card %s\n", card->dev_name);
-		qeth_clear_card(card, 1, 0);
-	}
-
-	atomic_set(&card->is_hardsetup, 0);
-	atomic_set(&card->is_softsetup, 0);
-
-	QETH_DBF_TEXT2(0, trace, "cardrmvd");
-
-}
-
-static void
-qeth_set_multicast_list(struct net_device *dev)
-{
-	struct qeth_card *card = dev->priv;
-
-	QETH_DBF_CARD2(0, trace, "smcl", card);
-
-	qeth_start_softsetup_thread(card);
-}
-
-static int
-qeth_set_mac_address(struct net_device *dev, void *addr)
-{
-	struct qeth_card *card;
-
-	card = (struct qeth_card *) dev->priv;
-	QETH_DBF_CARD2(0, trace, "stmc", card);
-
-	return -EOPNOTSUPP;
-}
-
-static int
-qeth_neigh_setup(struct net_device *dev, struct neigh_parms *np)
-{
-	struct qeth_card *card;
-
-	card = (struct qeth_card *) dev->priv;
-	QETH_DBF_CARD2(0, trace, "ngst", card);
-
-	return 0;
-}
-
-static void
-qeth_generate_tokens(struct qeth_card *card)
-{
-	card->token.issuer_rm_w = 0x00010103UL;
-	card->token.cm_filter_w = 0x00010108UL;
-	card->token.cm_connection_w = 0x0001010aUL;
-	card->token.ulp_filter_w = 0x0001010bUL;
-	card->token.ulp_connection_w = 0x0001010dUL;
-}
-
-static int
-qeth_peer_func_level(int level)
-{
-	if ((level & 0xff) == 8)
-		return (level & 0xff) + 0x400;
-	if (((level >> 8) & 3) == 1)
-		return (level & 0xff) + 0x200;
-	return level;		/* hmmm... don't know what to do with that level. */
-}
-
-/* returns last four digits of bus_id */
-/* FIXME: device driver shouldn't be aware of bus_id format - but don't know
-   what else to use... (CH) */
-static inline __u16
-__raw_devno_from_bus_id(char *id)
-{
-	id += (strlen(id) - 4); 
-	return (__u16) simple_strtoul(id, &id, 16);
-}
-
-static int
-qeth_idx_activate_read(struct qeth_card *card)
-{
-	int result, result2;
-	__u16 temp;
-	unsigned long flags;
-	char dbf_text[15];
-
-	result = result2 = 0;
-
-	memcpy(&card->dma_stuff->write_ccw, WRITE_CCW, sizeof (struct ccw1));
-	card->dma_stuff->write_ccw.count = IDX_ACTIVATE_SIZE;
-	card->dma_stuff->write_ccw.cda =
-	    QETH_GET_ADDR(card->dma_stuff->sendbuf);
-
-	memcpy(card->dma_stuff->sendbuf, IDX_ACTIVATE_READ, IDX_ACTIVATE_SIZE);
-	memcpy(QETH_TRANSPORT_HEADER_SEQ_NO(card->dma_stuff->sendbuf),
-	       &card->seqno.trans_hdr, QETH_SEQ_NO_LENGTH);
-
-	memcpy(QETH_IDX_ACT_ISSUER_RM_TOKEN(card->dma_stuff->sendbuf),
-	       &card->token.issuer_rm_w, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_IDX_ACT_FUNC_LEVEL(card->dma_stuff->sendbuf),
-	       &card->func_level, 2);
-
-	temp = __raw_devno_from_bus_id(CARD_DDEV_ID(card));
-	memcpy(QETH_IDX_ACT_QDIO_DEV_CUA(card->dma_stuff->sendbuf), &temp, 2);
-	temp = (card->cula << 8) + card->unit_addr2;
-	memcpy(QETH_IDX_ACT_QDIO_DEV_REALADDR(card->dma_stuff->sendbuf),
-	       &temp, 2);
-
-	QETH_DBF_TEXT2(0, trace, "iarw");
-	QETH_DBF_TEXT2(0, trace, CARD_RDEV_ID(card));
-	QETH_DBF_HEX2(0, control, card->dma_stuff->sendbuf,
-		      QETH_DBF_CONTROL_LEN);
-
-	spin_lock_irqsave(get_ccwdev_lock(CARD_RDEV(card)), flags);
-	result = ccw_device_start(CARD_RDEV(card), &card->dma_stuff->write_ccw,
-				  IDX_ACTIVATE_WRITE_STATE, 0, 0);
-	if (result) {
-		qeth_delay_millis(QETH_WAIT_BEFORE_2ND_DOIO);
-		result2 = ccw_device_start(CARD_RDEV(card),
-					   &card->dma_stuff->write_ccw,
-					   IDX_ACTIVATE_WRITE_STATE, 0, 0);
-		sprintf(dbf_text, "IRW1%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		sprintf(dbf_text, "IRW2%4x", result2);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		PRINT_WARN("qeth_idx_activate_read (write): do_IO returned "
-			   "%i, next try returns %i\n", result, result2);
-	}
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_RDEV(card)), flags);
-
-	if (atomic_read(&card->break_out)) {
-		QETH_DBF_TEXT3(0, trace, "IARWBRKO");
-		return -EIO;
-	}
-
-	if (qeth_sleepon(card, QETH_MPC_TIMEOUT)) {
-		QETH_DBF_TEXT1(0, trace, "IRWT");
-		QETH_DBF_TEXT1(0, trace, CARD_RDEV_ID(card));
-		PRINT_ERR("IDX_ACTIVATE(wr) on read channel device %s: "
-			  "timeout\n", CARD_RDEV_ID(card));
-		return -EIO;
-	}
-
-/* start reading on read channel, card->read_ccw is not yet used */
-	memcpy(&card->dma_stuff->read_ccw, READ_CCW, sizeof (struct ccw1));
-	card->dma_stuff->read_ccw.count = QETH_BUFSIZE;
-	card->dma_stuff->read_ccw.cda = QETH_GET_ADDR(card->dma_stuff->recbuf);
-
-	spin_lock_irqsave(get_ccwdev_lock(CARD_RDEV(card)), flags);
-	result2 = 0;
-	result = ccw_device_start(CARD_RDEV(card), &card->dma_stuff->read_ccw,
-				  IDX_ACTIVATE_READ_STATE, 0, 0);
-	if (result) {
-		qeth_delay_millis(QETH_WAIT_BEFORE_2ND_DOIO);
-		result2 = ccw_device_start(CARD_RDEV(card),
-					   &card->dma_stuff->read_ccw,
-					   IDX_ACTIVATE_READ_STATE, 0, 0);
-		sprintf(dbf_text, "IRR1%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		sprintf(dbf_text, "IRR2%4x", result2);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		PRINT_WARN("qeth_idx_activate_read (read): do_IO "
-			   "returned %i, next try returns %i\n",
-			   result, result2);
-	}
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_RDEV(card)), flags);
-
-	if (result2) {
-		result = result2;
-		if (result)
-			return result;
-	}
-
-	if (qeth_sleepon(card, QETH_MPC_TIMEOUT)) {
-		QETH_DBF_TEXT1(0, trace, "IRRT");
-		QETH_DBF_TEXT1(0, trace, CARD_RDEV_ID(card));
-		PRINT_ERR("IDX_ACTIVATE(rd) on read channel device %s: "
-			  "timeout\n", CARD_RDEV_ID(card));
-		return -EIO;
-	}
-	QETH_DBF_TEXT2(0, trace, "iarr");
-	QETH_DBF_TEXT2(0, trace, CARD_RDEV_ID(card));
-	QETH_DBF_HEX2(0, control, card->dma_stuff->recbuf,
-		      QETH_DBF_CONTROL_LEN);
-
-	if (!(QETH_IS_IDX_ACT_POS_REPLY(card->dma_stuff->recbuf))) {
-		QETH_DBF_TEXT1(0, trace, "IRNR");
-		QETH_DBF_TEXT1(0, trace, CARD_RDEV_ID(card));
-		PRINT_ERR("IDX_ACTIVATE on read channel device %s: negative "
-			  "reply\n", CARD_RDEV_ID(card));
-		return -EIO;
-	}
-
-	card->portname_required =
-	    ((!QETH_IDX_NO_PORTNAME_REQUIRED(card->dma_stuff->recbuf)) &&
-	     (card->type == QETH_CARD_TYPE_OSAE));
-
-	/*
-	 * however, as the portname indication of OSA is wrong, we have to
-	 * do this:
-	 */
-	card->portname_required = (card->type == QETH_CARD_TYPE_OSAE);
-
-	memcpy(&temp, QETH_IDX_ACT_FUNC_LEVEL(card->dma_stuff->recbuf), 2);
-	if (temp != qeth_peer_func_level(card->func_level)) {
-		QETH_DBF_TEXT1(0, trace, "IRFL");
-		QETH_DBF_TEXT1(0, trace, CARD_RDEV_ID(card));
-		sprintf(dbf_text, "%4x%4x", card->func_level, temp);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		PRINT_WARN("IDX_ACTIVATE on read channel device %s: function "
-			   "level mismatch (sent: 0x%x, received: 0x%x)\n",
-			   CARD_RDEV_ID(card), card->func_level, temp);
-		result = -EIO;
-	}
-
-	memcpy(&card->token.issuer_rm_r,
-	       QETH_IDX_ACT_ISSUER_RM_TOKEN(card->dma_stuff->recbuf),
-	       QETH_MPC_TOKEN_LENGTH);
-
-	memcpy(&card->level[0],
-	       QETH_IDX_REPLY_LEVEL(card->dma_stuff->recbuf), QETH_MCL_LENGTH);
-
-	return result;
-}
-
-static int
-qeth_idx_activate_write(struct qeth_card *card)
-{
-	int result, result2;
-	__u16 temp;
-	unsigned long flags;
-	char dbf_text[15];
-
-	result = result2 = 0;
-
-	memcpy(&card->dma_stuff->write_ccw, WRITE_CCW, sizeof (struct ccw1));
-	card->dma_stuff->write_ccw.count = IDX_ACTIVATE_SIZE;
-	card->dma_stuff->write_ccw.cda =
-	    QETH_GET_ADDR(card->dma_stuff->sendbuf);
-
-	memcpy(card->dma_stuff->sendbuf, IDX_ACTIVATE_WRITE, IDX_ACTIVATE_SIZE);
-	memcpy(QETH_TRANSPORT_HEADER_SEQ_NO(card->dma_stuff->sendbuf),
-	       &card->seqno.trans_hdr, QETH_SEQ_NO_LENGTH);
-	card->seqno.trans_hdr++;
-
-	memcpy(QETH_IDX_ACT_ISSUER_RM_TOKEN(card->dma_stuff->sendbuf),
-	       &card->token.issuer_rm_w, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_IDX_ACT_FUNC_LEVEL(card->dma_stuff->sendbuf),
-	       &card->func_level, 2);
-
-	temp = __raw_devno_from_bus_id(CARD_DDEV_ID(card));
-	memcpy(QETH_IDX_ACT_QDIO_DEV_CUA(card->dma_stuff->sendbuf), &temp, 2);
-	temp = (card->cula << 8) + card->unit_addr2;
-	memcpy(QETH_IDX_ACT_QDIO_DEV_REALADDR(card->dma_stuff->sendbuf),
-	       &temp, 2);
-
-	QETH_DBF_TEXT2(0, trace, "iaww");
-	QETH_DBF_TEXT2(0, trace, CARD_WDEV_ID(card));
-	QETH_DBF_HEX2(0, control, card->dma_stuff->sendbuf,
-		      QETH_DBF_CONTROL_LEN);
-
-	spin_lock_irqsave(get_ccwdev_lock(CARD_WDEV(card)), flags);
-	result = ccw_device_start(CARD_WDEV(card), &card->dma_stuff->write_ccw,
-				  IDX_ACTIVATE_WRITE_STATE, 0, 0);
-	if (result) {
-		qeth_delay_millis(QETH_WAIT_BEFORE_2ND_DOIO);
-		result2 = ccw_device_start(CARD_WDEV(card),
-					   &card->dma_stuff->write_ccw,
-					   IDX_ACTIVATE_WRITE_STATE, 0, 0);
-		sprintf(dbf_text, "IWW1%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		sprintf(dbf_text, "IWW2%4x", result2);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		PRINT_WARN("qeth_idx_activate_write (write): do_IO "
-			   "returned %i, next try returns %i\n",
-			   result, result2);
-	}
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_WDEV(card)), flags);
-
-	if (atomic_read(&card->break_out)) {
-		QETH_DBF_TEXT3(0, trace, "IAWWBRKO");
-		return -EIO;
-	}
-
-	if (qeth_sleepon(card, QETH_MPC_TIMEOUT)) {
-		QETH_DBF_TEXT1(0, trace, "IWWT");
-		QETH_DBF_TEXT1(0, trace, CARD_WDEV_ID(card));
-		PRINT_ERR("IDX_ACTIVATE(wr) on write channel device %s: "
-			  "timeout\n", CARD_WDEV_ID(card));
-		return -EIO;
-	}
-
-	QETH_DBF_TEXT3(0, trace, "idxawrrd");
-	/* start one read on write channel */
-	memcpy(&card->dma_stuff->read_ccw, READ_CCW, sizeof (struct ccw1));
-	card->dma_stuff->read_ccw.count = QETH_BUFSIZE;
-	/* recbuf and card->read_ccw is not yet used by any other
-	   read channel program */
-	card->dma_stuff->read_ccw.cda = QETH_GET_ADDR(card->dma_stuff->recbuf);
-
-	spin_lock_irqsave(get_ccwdev_lock(CARD_WDEV(card)), flags);
-	result2 = 0;
-	result = ccw_device_start(CARD_WDEV(card), &card->dma_stuff->read_ccw,
-				  IDX_ACTIVATE_READ_STATE, 0, 0);
-	if (result) {
-		qeth_delay_millis(QETH_WAIT_BEFORE_2ND_DOIO);
-		result2 = ccw_device_start(CARD_WDEV(card),
-					   &card->dma_stuff->read_ccw,
-					   IDX_ACTIVATE_READ_STATE, 0, 0);
-		sprintf(dbf_text, "IWR1%4x", result);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		sprintf(dbf_text, "IWR2%4x", result2);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-		PRINT_WARN("qeth_idx_activate_write (read): do_IO returned "
-			   "%i, next try returns %i\n", result, result2);
-	}
-
-	spin_unlock_irqrestore(get_ccwdev_lock(CARD_WDEV(card)), flags);
-
-	if (result2) {
-		result = result2;
-		if (result)
-			return result;
-	}
-
-	if (qeth_sleepon(card, QETH_MPC_TIMEOUT)) {
-		QETH_DBF_TEXT1(0, trace, "IWRT");
-		QETH_DBF_TEXT1(0, trace, CARD_WDEV_ID(card));
-		PRINT_ERR("IDX_ACTIVATE(rd) on write channel device %s: "
-			  "timeout\n", CARD_WDEV_ID(card));
-		return -EIO;
-	}
-	QETH_DBF_TEXT2(0, trace, "iawr");
-	QETH_DBF_TEXT2(0, trace, CARD_WDEV_ID(card));
-	QETH_DBF_HEX2(0, control, card->dma_stuff->recbuf,
-		      QETH_DBF_CONTROL_LEN);
-
-	if (!(QETH_IS_IDX_ACT_POS_REPLY(card->dma_stuff->recbuf))) {
-		QETH_DBF_TEXT1(0, trace, "IWNR");
-		QETH_DBF_TEXT1(0, trace, CARD_WDEV_ID(card));
-		PRINT_ERR("IDX_ACTIVATE on write channel device %s: negative "
-			  "reply\n", CARD_WDEV_ID(card));
-		return -EIO;
-	}
-
-	memcpy(&temp, QETH_IDX_ACT_FUNC_LEVEL(card->dma_stuff->recbuf), 2);
-	if ((temp & ~0x0100) != qeth_peer_func_level(card->func_level)) {
-		QETH_DBF_TEXT1(0, trace, "IWFM");
-		QETH_DBF_TEXT1(0, trace, CARD_WDEV_ID(card));
-		sprintf(dbf_text, "%4x%4x", card->func_level, temp);
-		QETH_DBF_TEXT1(0, trace, dbf_text);
-		PRINT_WARN("IDX_ACTIVATE on write channel device %s: function "
-			   "level mismatch (sent: 0x%x, received: 0x%x)\n",
-			   CARD_WDEV_ID(card), card->func_level, temp);
-		result = -EIO;
-	}
-
-	return result;
-}
-
-static int
-qeth_cm_enable(struct qeth_card *card)
-{
-	unsigned char *buffer;
-	int result;
-	char dbf_text[15];
-
-	memcpy(card->send_buf, CM_ENABLE, CM_ENABLE_SIZE);
-
-	memcpy(QETH_CM_ENABLE_ISSUER_RM_TOKEN(card->send_buf),
-	       &card->token.issuer_rm_r, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_CM_ENABLE_FILTER_TOKEN(card->send_buf),
-	       &card->token.cm_filter_w, QETH_MPC_TOKEN_LENGTH);
-
-	buffer = qeth_send_control_data(card, card->send_buf,
-					CM_ENABLE_SIZE, MPC_SETUP_STATE);
-
-	if (!buffer) {
-		QETH_DBF_TEXT2(0, trace, "CME:NOBF");
-		return -EIO;
-	}
-
-	memcpy(&card->token.cm_filter_r,
-	       QETH_CM_ENABLE_RESP_FILTER_TOKEN(buffer), QETH_MPC_TOKEN_LENGTH);
-
-	result = qeth_check_idx_response(buffer);
-
-	sprintf(dbf_text, "cme=%4x", result);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-
-	return result;
-}
-
-static int
-qeth_cm_setup(struct qeth_card *card)
-{
-	unsigned char *buffer;
-	int result;
-	char dbf_text[15];
-
-	memcpy(card->send_buf, CM_SETUP, CM_SETUP_SIZE);
-
-	memcpy(QETH_CM_SETUP_DEST_ADDR(card->send_buf),
-	       &card->token.issuer_rm_r, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_CM_SETUP_CONNECTION_TOKEN(card->send_buf),
-	       &card->token.cm_connection_w, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_CM_SETUP_FILTER_TOKEN(card->send_buf),
-	       &card->token.cm_filter_r, QETH_MPC_TOKEN_LENGTH);
-
-	buffer = qeth_send_control_data(card, card->send_buf,
-					CM_SETUP_SIZE, MPC_SETUP_STATE);
-
-	if (!buffer) {
-		QETH_DBF_TEXT2(0, trace, "CMS:NOBF");
-		return -EIO;
-	}
-
-	memcpy(&card->token.cm_connection_r,
-	       QETH_CM_SETUP_RESP_DEST_ADDR(buffer), QETH_MPC_TOKEN_LENGTH);
-
-	result = qeth_check_idx_response(buffer);
-
-	sprintf(dbf_text, "cms=%4x", result);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-
-	return result;
-}
-
-/*
- * Send the ULP_ENABLE control command.
- *
- * Apart from storing the ULP filter token found in the response, this
- * sets max_mtu, initial_mtu and inbound_buffer_size (needed before
- * qeth_init_ringbuffers and qeth_init_dev) and records the link type
- * when the response is long enough to carry one.
- *
- * Returns -EIO when no response buffer is delivered, -EINVAL when the
- * frame size taken from the response maps to an MTU of 0, otherwise the
- * value of qeth_check_idx_response().
- */
-static int
-qeth_ulp_enable(struct qeth_card *card)
-{
-	char dbf_text[15];
-	unsigned char *reply;
-	__u16 framesize, mtu;
-	__u16 difinfo_len;
-	__u8 link_type;
-	int rc;
-
-	memcpy(card->send_buf, ULP_ENABLE, ULP_ENABLE_SIZE);
-	*(QETH_ULP_ENABLE_LINKNUM(card->send_buf)) =
-		(__u8) card->options.portno;
-	memcpy(QETH_ULP_ENABLE_DEST_ADDR(card->send_buf),
-	       &card->token.cm_connection_r, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_ULP_ENABLE_FILTER_TOKEN(card->send_buf),
-	       &card->token.ulp_filter_w, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_ULP_ENABLE_PORTNAME_AND_LL(card->send_buf),
-	       card->options.portname, 9);
-
-	reply = qeth_send_control_data(card, card->send_buf,
-				       ULP_ENABLE_SIZE, MPC_SETUP_STATE);
-	if (reply == NULL) {
-		QETH_DBF_TEXT2(0, trace, "ULE:NOBF");
-		return -EIO;
-	}
-
-	memcpy(&card->token.ulp_filter_r,
-	       QETH_ULP_ENABLE_RESP_FILTER_TOKEN(reply),
-	       QETH_MPC_TOKEN_LENGTH);
-
-	/* must be settled before qeth_init_ringbuffers and qeth_init_dev */
-	if (!qeth_get_mtu_out_of_mpc(card->type)) {
-		/* fixed per-card values */
-		card->initial_mtu = qeth_get_initial_mtu_for_card(card);
-		card->max_mtu = qeth_get_max_mtu_for_card(card->type);
-		card->inbound_buffer_size = DEFAULT_BUFFER_SIZE;
-	} else {
-		/* the card tells us its frame size in the response */
-		memcpy(&framesize, QETH_ULP_ENABLE_RESP_MAX_MTU(reply), 2);
-		mtu = qeth_get_mtu_outof_framesize(framesize);
-
-		QETH_DBF_CARD2(0, trace, "ule", card);
-		sprintf(dbf_text, "mtu=%4x", mtu);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-
-		if (mtu == 0)
-			return -EINVAL;
-
-		card->max_mtu = mtu;
-		card->initial_mtu = mtu;
-		card->inbound_buffer_size = mtu + 2 * PAGE_SIZE;
-	}
-
-	memcpy(&difinfo_len, QETH_ULP_ENABLE_RESP_DIFINFO_LEN(reply), 2);
-	if (difinfo_len < QETH_MPC_DIFINFO_LEN_INDICATES_LINK_TYPE) {
-		card->link_type = 0;
-	} else {
-		memcpy(&link_type, QETH_ULP_ENABLE_RESP_LINK_TYPE(reply), 1);
-		card->link_type = link_type;
-		sprintf(dbf_text, "link=%2x", link_type);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-	}
-
-	rc = qeth_check_idx_response(reply);
-	sprintf(dbf_text, "ule=%4x", rc);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-
-	return rc;
-}
-
-/*
- * Send the ULP_SETUP control command and store the ULP connection token
- * from the response.
- *
- * Returns -EIO when no response buffer is delivered, otherwise the
- * value of qeth_check_idx_response().
- */
-static int
-qeth_ulp_setup(struct qeth_card *card)
-{
-	char dbf_text[15];
-	unsigned char *reply;
-	__u16 addr;
-	int rc;
-
-	memcpy(card->send_buf, ULP_SETUP, ULP_SETUP_SIZE);
-	memcpy(QETH_ULP_SETUP_DEST_ADDR(card->send_buf),
-	       &card->token.cm_connection_r, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_ULP_SETUP_CONNECTION_TOKEN(card->send_buf),
-	       &card->token.ulp_connection_w, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_ULP_SETUP_FILTER_TOKEN(card->send_buf),
-	       &card->token.ulp_filter_r, QETH_MPC_TOKEN_LENGTH);
-
-	/* device number of the data device goes into the CUA field */
-	addr = __raw_devno_from_bus_id(CARD_DDEV_ID(card));
-	memcpy(QETH_ULP_SETUP_CUA(card->send_buf), &addr, 2);
-
-	/* cula in the high byte, unit_addr2 added on top */
-	addr = (card->cula << 8) + card->unit_addr2;
-	memcpy(QETH_ULP_SETUP_REAL_DEVADDR(card->send_buf), &addr, 2);
-
-	reply = qeth_send_control_data(card, card->send_buf,
-				       ULP_SETUP_SIZE, MPC_SETUP_STATE);
-	if (reply == NULL) {
-		QETH_DBF_TEXT2(0, trace, "ULS:NOBF");
-		return -EIO;
-	}
-
-	memcpy(&card->token.ulp_connection_r,
-	       QETH_ULP_SETUP_RESP_CONNECTION_TOKEN(reply),
-	       QETH_MPC_TOKEN_LENGTH);
-
-	rc = qeth_check_idx_response(reply);
-	sprintf(dbf_text, "uls=%4x", rc);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-
-	return rc;
-}
-
-/*
- * Build the qdio_initialize parameter block for this card and call
- * qdio_initialize().
- *
- * Three temporary vmalloc areas are used: the QIB parameter field and
- * the arrays with the physical addresses of the inbound and outbound
- * buffers.  All three are released again before returning.
- *
- * Returns -ENOMEM when one of the allocations fails, otherwise the
- * result of qdio_initialize().
- */
-static int
-qeth_qdio_establish(struct qeth_card *card)
-{
-	struct qdio_initialize init_data;
-	void **input_array, **output_array;
-	char *adapter_area;
-	char dbf_text[15];
-	int result;
-	int q, b;
-
-	adapter_area = vmalloc(QDIO_MAX_BUFFERS_PER_Q * sizeof(char));
-	if (!adapter_area)
-		return -ENOMEM;
-	memset(adapter_area, 0, QDIO_MAX_BUFFERS_PER_Q * sizeof(char));
-
-	/* "PCIT" in EBCDIC, followed by two thresholds and a timer value */
-	adapter_area[0] = _ascebc['P'];
-	adapter_area[1] = _ascebc['C'];
-	adapter_area[2] = _ascebc['I'];
-	adapter_area[3] = _ascebc['T'];
-	*((unsigned int *) (&adapter_area[4])) = PCI_THRESHOLD_A;
-	*((unsigned int *) (&adapter_area[8])) = PCI_THRESHOLD_B;
-	*((unsigned int *) (&adapter_area[12])) = PCI_TIMER_VALUE;
-
-	input_array = vmalloc(QDIO_MAX_BUFFERS_PER_Q * sizeof (void *));
-	if (!input_array)
-		goto out_free_adapter;
-	for (b = 0; b < QDIO_MAX_BUFFERS_PER_Q; b++)
-		input_array[b] =
-			(void *) virt_to_phys(&card->inbound_qdio_buffers[b]);
-
-	output_array = vmalloc(QDIO_MAX_BUFFERS_PER_Q * sizeof (void *) *
-			       card->no_queues);
-	if (!output_array)
-		goto out_free_input;
-	/* one run of QDIO_MAX_BUFFERS_PER_Q entries per outbound queue */
-	for (q = 0; q < card->no_queues; q++)
-		for (b = 0; b < QDIO_MAX_BUFFERS_PER_Q; b++)
-			output_array[q * QDIO_MAX_BUFFERS_PER_Q + b] =
-				(void *) virt_to_phys
-				(&card->outbound_ringbuffer[q]->buffer[b]);
-
-	init_data.cdev = CARD_DDEV(card);
-	init_data.q_format = qeth_get_q_format(card->type);
-	init_data.qib_param_field_format = 0;
-	init_data.qib_param_field = adapter_area;
-	init_data.input_slib_elements = NULL;
-	init_data.output_slib_elements = NULL;
-	init_data.min_input_threshold = card->options.polltime;
-	init_data.max_input_threshold = card->options.polltime;
-	init_data.min_output_threshold = QETH_MIN_OUTPUT_THRESHOLD;
-	init_data.max_output_threshold = QETH_MAX_OUTPUT_THRESHOLD;
-	init_data.no_input_qs = 1;
-	init_data.no_output_qs = card->no_queues;
-	init_data.input_handler = qeth_qdio_input_handler;
-	init_data.output_handler = qeth_qdio_output_handler;
-	init_data.int_parm = (unsigned long) card;
-	init_data.flags = QDIO_INBOUND_0COPY_SBALS |
-		QDIO_OUTBOUND_0COPY_SBALS | QDIO_USE_OUTBOUND_PCIS;
-	init_data.input_sbal_addr_array = input_array;
-	init_data.output_sbal_addr_array = output_array;
-
-	result = qdio_initialize(&init_data);
-
-	vfree(input_array);
-	vfree(output_array);
-	vfree(adapter_area);
-
-	sprintf(dbf_text, "qde=%4i", result);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-
-	return result;
-
-out_free_input:
-	vfree(input_array);
-out_free_adapter:
-	vfree(adapter_area);
-	return -ENOMEM;
-}
-
-/*
- * Activate QDIO on the card's data device, write the result to the
- * trace log and return it unchanged.
- */
-static int
-qeth_qdio_activate(struct qeth_card *card)
-{
-	char dbf_text[15];
-	int rc = qdio_activate(CARD_DDEV(card), 0);
-
-	sprintf(dbf_text, "qda=%4x", rc);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-
-	return rc;
-}
-
-/*
- * Send the DM_ACT control command.
- *
- * Returns -EIO when no response buffer is delivered, otherwise the
- * value of qeth_check_idx_response().
- */
-static int
-qeth_dm_act(struct qeth_card *card)
-{
-	char dbf_text[15];
-	unsigned char *reply;
-	int rc;
-
-	memcpy(card->send_buf, DM_ACT, DM_ACT_SIZE);
-	memcpy(QETH_DM_ACT_DEST_ADDR(card->send_buf),
-	       &card->token.cm_connection_r, QETH_MPC_TOKEN_LENGTH);
-	memcpy(QETH_DM_ACT_CONNECTION_TOKEN(card->send_buf),
-	       &card->token.ulp_connection_r, QETH_MPC_TOKEN_LENGTH);
-
-	reply = qeth_send_control_data(card, card->send_buf,
-				       DM_ACT_SIZE, MPC_SETUP_STATE);
-	if (reply == NULL) {
-		QETH_DBF_TEXT2(0, trace, "DMA:NOBF");
-		return -EIO;
-	}
-
-	rc = qeth_check_idx_response(reply);
-	sprintf(dbf_text, "dma=%4x", rc);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-
-	return rc;
-}
-
-/*
- * Check whether dev is one of the VLAN devices registered in the VLAN
- * group of card.
- *
- * Returns QETH_VERIFY_IS_VLAN_DEV when dev is found in the group and 0
- * otherwise (always 0 when QETH_VLAN is not defined).
- */
-static inline int
-__qeth_verify_dev_vlan(struct net_device *dev,struct qeth_card *card)
-{
-#ifdef QETH_VLAN
-	struct vlan_group *vlan_grp = (struct vlan_group *) card->vlangrp;
-	int i;
-
-	if (!vlan_grp)
-		return 0;
-
-	/* the first hit decides; no need to scan the rest of the array */
-	for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) {
-		if (vlan_grp->vlan_devices[i] == dev)
-			return QETH_VERIFY_IS_VLAN_DEV;
-	}
-	return 0;
-#else
-	return 0;
-#endif
-}
-
-#if defined(QETH_VLAN)||defined(QETH_IPV6)
-/*
- * Find out whether dev belongs to one of our cards.
- *
- * Walks the card list under the list_lock read lock and skips cards
- * that are in their shutdown phase.  Returns QETH_VERIFY_IS_REAL_DEV
- * when dev is a card's own net_device, otherwise the first non-zero
- * result of __qeth_verify_dev_vlan(), or 0 when no card matches.
- */
-static int
-qeth_verify_dev(struct net_device *dev)
-{
-	struct qeth_card *cur;
-	int verdict = 0;
-
-	read_lock(&list_lock);
-	for (cur = firstcard; cur != NULL; cur = cur->next) {
-		if (atomic_read(&cur->shutdown_phase))
-			continue;
-
-		if (dev == cur->dev)
-			verdict = QETH_VERIFY_IS_REAL_DEV;
-		else
-			verdict = __qeth_verify_dev_vlan(dev, cur);
-
-		if (verdict)
-			break;
-	}
-	read_unlock(&list_lock);
-
-	return verdict;
-}
-#endif /* defined(QETH_VLAN)||defined(QETH_IPV6) */
-
-/*
- * Returns 1 when card is on the global card list and not in its
- * shutdown phase, 0 otherwise.  The list is walked under the list_lock
- * read lock.
- */
-static int
-qeth_verify_card(struct qeth_card *card)
-{
-	struct qeth_card *cur;
-	int found = 0;
-
-	read_lock(&list_lock);
-	for (cur = firstcard; cur != NULL; cur = cur->next) {
-		if (card != cur)
-			continue;
-		if (atomic_read(&card->shutdown_phase))
-			continue;
-		found = 1;
-		break;
-	}
-	read_unlock(&list_lock);
-
-	return found;
-}
-
-/*
- * Map a net_device to its qeth_card.  With QETH_VLAN, a device that
- * qeth_verify_dev() reports as a VLAN device is resolved through the
- * real device underneath it; otherwise dev->priv is used directly.
- */
-static inline struct qeth_card *
-__qeth_get_card_from_dev(struct net_device *dev)
-{
-	struct net_device *owner = dev;
-
-#ifdef QETH_VLAN
-	if (qeth_verify_dev(dev) == QETH_VERIFY_IS_VLAN_DEV)
-		owner = VLAN_DEV_INFO(dev)->real_dev;
-#endif
-	return (struct qeth_card *) owner->priv;
-}
-
-#ifdef QETH_IPV6
-/* FIXME: don't put extern declarations in a .c file; use a header that
- * is shared with the definition instead! */
-extern struct neigh_table arp_tbl;
-/* constructor that qeth_arp_constructor() falls back to for devices
- * that are not ours */
-static int (*qeth_old_arp_constructor) (struct neighbour *);
-/* neigh_ops template in which every output hook is dev_queue_xmit */
-static struct neigh_ops arp_direct_ops_template = {
-	.family = AF_INET,
-	.destructor = NULL,
-	.solicit = NULL,
-	.error_report = NULL,
-	.output = dev_queue_xmit,
-	.connected_output = dev_queue_xmit,
-	.hh_output = dev_queue_xmit,
-	.queue_xmit = dev_queue_xmit
-};
-
-/*
- * FIXME:
- * Neighbour structures keep pointing to this structure even after our
- * lifetime, so it has to stay in memory and is effectively leaked.
- */
-static struct neigh_ops *arp_direct_ops;
-
-
-/*
- * Neighbour constructor.
- *
- * Devices that qeth_verify_dev() does not recognise are passed on to
- * the saved original constructor.  For our own devices the neighbour
- * is set to NUD_NOARP and wired to arp_direct_ops.
- *
- * Returns -EINVAL when the device has no in_device, the original
- * constructor's result for foreign devices, and 0 otherwise.
- */
-static int
-qeth_arp_constructor(struct neighbour *neigh)
-{
-	char dbf_text[15];
-	struct net_device *dev;
-	struct in_device *in_dev;
-
-	dev = neigh->dev;
-	in_dev = in_dev_get(dev);
-	if (!in_dev)
-		return -EINVAL;
-
-	QETH_DBF_TEXT4(0, trace, "arpconst");
-	if (!qeth_verify_dev(dev)) {
-		/* not one of ours: drop our reference and delegate */
-		in_dev_put(in_dev);
-		return qeth_old_arp_constructor(neigh);
-	}
-
-	neigh->type = inet_addr_type(*(u32 *) neigh->primary_key);
-	if (in_dev->arp_parms)
-		neigh->parms = in_dev->arp_parms;
-	in_dev_put(in_dev);
-
-	sprintf(dbf_text, "%08x", ntohl(*((__u32 *) (neigh->primary_key))));
-	QETH_DBF_TEXT4(0, trace, dbf_text);
-	QETH_DBF_HEX4(0, trace, &neigh, sizeof (void *));
-
-	neigh->nud_state = NUD_NOARP;
-	neigh->ops = arp_direct_ops;
-	neigh->output = neigh->ops->queue_xmit;
-
-	return 0;
-}
-
-/*
- * hard_header hook: look up the card behind dev and forward the call,
- * with all arguments unchanged, to the card's own hard_header routine.
- */
-static int
-qeth_hard_header(struct sk_buff *skb, struct net_device *dev,
-		 unsigned short type, void *daddr, void *saddr, unsigned len)
-{
-	QETH_DBF_TEXT5(0, trace, "hardhdr");
-
-	return __qeth_get_card_from_dev(dev)->hard_header(skb, dev, type,
-							   daddr, saddr, len);
-}
-
-/*
- * header_cache_update hook: forward the call to the routine stored in
- * the card found through dev->priv.
- *
- * NOTE(review): unlike qeth_hard_header() and qeth_rebuild_header()
- * this does not go through __qeth_get_card_from_dev() -- confirm that
- * this is intended for VLAN devices.
- */
-static void
-qeth_header_cache_update(struct hh_cache *hh,
-			 struct net_device *dev, unsigned char *haddr)
-{
-	struct qeth_card *card;
-
-	card = (struct qeth_card *) dev->priv;
-	QETH_DBF_TEXT5(0, trace, "hdrcheup");
-	/* a void function must not return an expression */
-	card->header_cache_update(hh, dev, haddr);
-}
-
-/*
- * rebuild_header hook.  Packets whose protocol is ETH_P_IP are
- * answered with 0 right away; everything else is handed to the
- * rebuild_header routine of the card behind skb->dev.
- */
-static int
-qeth_rebuild_header(struct sk_buff *skb)
-{
-	QETH_DBF_TEXT5(0, trace, "rebldhdr");
-
-	if (skb->protocol != __constant_htons(ETH_P_IP))
-		return __qeth_get_card_from_dev(skb->dev)->rebuild_header(skb);
-
-	return 0;
-}
-
-/*
- * Fill the 8 bytes at eui from the device's hardware address: the
- * first three and last three address bytes surround the 16-bit dev_id,
- * which is stored high byte first in eui[3] and eui[4].
- *
- * Only ARPHRD_ETHER, ARPHRD_FDDI and ARPHRD_IEEE802_TR devices with an
- * address length of ETH_ALEN are handled.  Returns 0 on success and -1
- * otherwise.
- */
-int
-qeth_ipv6_generate_eui64(u8 * eui, struct net_device *dev)
-{
-	if (dev->type != ARPHRD_ETHER &&
-	    dev->type != ARPHRD_FDDI &&
-	    dev->type != ARPHRD_IEEE802_TR)
-		return -1;
-
-	if (dev->addr_len != ETH_ALEN)
-		return -1;
-
-	memcpy(eui, dev->dev_addr, 3);
-	eui[3] = (dev->dev_id >> 8) & 0xff;
-	eui[4] = dev->dev_id & 0xff;
-	memcpy(eui + 5, dev->dev_addr + 3, 3);
-
-	return 0;
-}
-#endif /* QETH_IPV6 */
-
-/*
- * With QETH_IPV6: select the card's header helpers according to its
- * link type, take dev_id from the low 16 bits of unique_id, and
- * install the EUI-64 generator unless UNIQUE_ID_NOT_BY_CARD is set in
- * unique_id.  Without QETH_IPV6 this does nothing.
- */
-static void
-qeth_ipv6_init_card(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	card->hard_header = qeth_get_hard_header(card->link_type);
-	card->rebuild_header = qeth_get_rebuild_header(card->link_type);
-	card->hard_header_cache = qeth_get_hard_header_cache(card->link_type);
-	card->header_cache_update =
-		qeth_get_header_cache_update(card->link_type);
-	card->type_trans = qeth_get_type_trans(card->link_type);
-
-	card->dev->dev_id = card->unique_id & 0xffff;
-	if ((card->unique_id & UNIQUE_ID_NOT_BY_CARD) == 0)
-		card->dev->generate_eui64 = qeth_ipv6_generate_eui64;
-#endif /* QETH_IPV6 */
-}
-
-#ifdef QETH_VLAN
-/* Remember the VLAN group for this device's card, under vlan_lock. */
-static void
-qeth_vlan_rx_register(struct net_device *dev, struct vlan_group *grp)
-{
-	struct qeth_card *card = (struct qeth_card *) dev->priv;
-
-	spin_lock_irq(&card->vlan_lock);
-	card->vlangrp = grp;
-	spin_unlock_irq(&card->vlan_lock);
-}
-/*
- * Clear the VLAN device slot for vid in the card's VLAN group, if a
- * group is registered.  Done under vlan_lock.
- */
-static void
-qeth_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
-{
-	struct qeth_card *card = (struct qeth_card *) dev->priv;
-
-	spin_lock_irq(&card->vlan_lock);
-	if (card->vlangrp != NULL)
-		card->vlangrp->vlan_devices[vid] = NULL;
-	spin_unlock_irq(&card->vlan_lock);
-}
-#endif
-
-/*
- * Transmit timeout hook: count a tx error, flag PROBLEM_TX_TIMEOUT on
- * the card and schedule recovery.
- */
-static void
-qeth_tx_timeout(struct net_device *dev)
-{
-	struct qeth_card *card = (struct qeth_card *) dev->priv;
-
-	QETH_DBF_CARD2(1, trace, "XMTO", card);
-	card->stats->tx_errors++;
-	atomic_set(&card->problem, PROBLEM_TX_TIMEOUT);
-	qeth_schedule_recovery(card);
-}
-
-/*
- * Pick the rebuild_header hook for the net_device: qeth_rebuild_header
- * when QETH_IPV6 is set, the card type does not carry IFF_NOARP and a
- * rebuild routine exists for the link type; NULL in every other case.
- */
-static void*
-__qeth_rebuild_header_func(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	if (qeth_get_additional_dev_flags(card->type) & IFF_NOARP)
-		return NULL;
-	if (!qeth_get_rebuild_header(card->link_type))
-		return NULL;
-	return qeth_rebuild_header;
-#endif /* QETH_IPV6 */
-	return NULL;
-}
-
-/*
- * Pick the hard_header hook for the net_device: qeth_hard_header when
- * QETH_IPV6 is set, the card type does not carry IFF_NOARP and a
- * hard_header routine exists for the link type; NULL otherwise.
- */
-static void*
-__qeth_hard_header_func(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	if (qeth_get_additional_dev_flags(card->type) & IFF_NOARP)
-		return NULL;
-	if (!qeth_get_hard_header(card->link_type))
-		return NULL;
-	return qeth_hard_header;
-#endif /* QETH_IPV6 */
-	return NULL;
-}
-
-/*
- * Pick the header_cache_update hook for the net_device:
- * qeth_header_cache_update when QETH_IPV6 is set, the card type does
- * not carry IFF_NOARP and an update routine exists for the link type;
- * NULL otherwise.
- */
-static void*
-__qeth_header_cache_update_func(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	if (qeth_get_additional_dev_flags(card->type) & IFF_NOARP)
-		return NULL;
-	if (!qeth_get_header_cache_update(card->link_type))
-		return NULL;
-	return qeth_header_cache_update;
-#endif /* QETH_IPV6 */
-	return NULL;
-}
-
-/*
- * Pick the hard_header_cache hook for the net_device: whatever
- * qeth_get_hard_header_cache() yields for the link type when
- * QETH_IPV6 is set and the card type does not carry IFF_NOARP; NULL
- * otherwise.
- */
-static void*
-__qeth_hard_header_cache_func(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	if (qeth_get_additional_dev_flags(card->type) & IFF_NOARP)
-		return NULL;
-	return qeth_get_hard_header_cache(card->link_type);
-#endif /* QETH_IPV6 */
-	return NULL;
-}
-
-/*
- * net_device init hook: install the header helpers, device flags,
- * queue length, header length and MTU for the card behind dev, start
- * the queue and run the IPv6 specific card setup.  Always returns 0.
- */
-static int
-qeth_init_dev(struct net_device *dev)
-{
-	struct qeth_card *card = (struct qeth_card *) dev->priv;
-
-	QETH_DBF_CARD3(0, trace, "inid", card);
-
-	dev->rebuild_header = __qeth_rebuild_header_func(card);
-	dev->hard_header = __qeth_hard_header_func(card);
-	dev->header_cache_update = __qeth_header_cache_update_func(card);
-	dev->hard_header_cache = __qeth_hard_header_cache_func(card);
-	dev->hard_header_parse = NULL;
-
-	dev->flags |= qeth_get_additional_dev_flags(card->type);
-
-	/* broadcast is announced when it is faked or really available */
-	if ((card->options.fake_broadcast == FAKE_BROADCAST) ||
-	    (card->broadcast_capable))
-		dev->flags |= IFF_BROADCAST;
-
-	/*
-	 * qeth_send_qipassist(card,4) and qeth_get_unique_id(card) used
-	 * to be called here.  Both are now done in hardsetup_card every
-	 * time, so that hydra knows we are going to use the same id
-	 * again.
-	 */
-
-	dev->tx_queue_len = qeth_get_device_tx_q_len(card->type);
-	dev->hard_header_len =
-		qeth_get_hlen(card->link_type) + card->options.add_hhlen;
-	netif_start_queue(dev);
-
-	dev->mtu = card->initial_mtu;
-
-	qeth_ipv6_init_card(card);
-
-	return 0;
-}
-
-/*
- * Read the configuration data of the data device and extract chpid,
- * unit_addr2 and cula from it.  The card is marked as guest LAN when
- * bytes 0x10/0x11 of the data read "VM" in EBCDIC.
- *
- * Returns 0 on success or the error code of read_conf_data().
- *
- * NOTE(review): the length returned by read_conf_data() is not checked
- * before bytes up to offset 63 are read, and the buffer is not released
- * here -- confirm who owns it.
- */
-static int
-qeth_get_unitaddr(struct qeth_card *card)
-{
-	char dbf_text[15];
-	char *prcd;
-	int length;
-	int rc;
-
-	QETH_DBF_CARD3(0, trace, "gtua", card);
-
-	rc = read_conf_data(CARD_DDEV(card), (void **) &prcd, &length);
-	if (rc != 0) {
-		sprintf(dbf_text, "rcd%4x", rc);
-		QETH_DBF_TEXT3(0, trace, dbf_text);
-		PRINT_ERR("read_conf_data for device %s returned %i\n",
-			  CARD_DDEV_ID(card), rc);
-		return rc;
-	}
-
-	card->chpid = prcd[30];
-	card->unit_addr2 = prcd[31];
-	card->cula = prcd[63];
-	card->is_guest_lan = ((prcd[0x10] == _ascebc['V']) &&
-			      (prcd[0x11] == _ascebc['M']));
-
-	sprintf(dbf_text, "chpid:%02x", card->chpid);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	sprintf(dbf_text, "unad2:%02x", card->unit_addr2);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	sprintf(dbf_text, "cula:%02x", card->cula);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	return 0;
-}
-
-/*
- * Start a NOP channel program on the read, write and data device of
- * the card, one after the other, and wait for each to finish.
- *
- * Returns 0 when all three NOPs completed, the (second) error code of
- * ccw_device_start() when starting one failed twice, or -EIO when
- * qeth_sleepon() reports a timeout.  The first failure ends the
- * sequence.
- */
-static int
-qeth_send_nops(struct qeth_card *card)
-{
-	int result, result2;
-	unsigned long saveflags;
-
-	/* the same NOP ccw is reused for all three devices */
-	card->dma_stuff->write_ccw.cmd_code = CCW_NOP_CMD;
-	card->dma_stuff->write_ccw.flags = CCW_FLAG_SLI;
-	card->dma_stuff->write_ccw.count = CCW_NOP_COUNT;
-	card->dma_stuff->write_ccw.cda = (unsigned long) NULL;
-
-/*
- * Start the NOP on one device under that device's ccwdev lock.  A
- * failed start is retried once after QETH_WAIT_BEFORE_2ND_DOIO.  The
- * macro uses the locals card, result, result2 and saveflags and jumps
- * to the exit label on failure or timeout.
- * NOTE(review): qeth_delay_millis() runs with the lock held and
- * interrupts disabled -- confirm that it does not sleep.
- */
-#define DO_SEND_NOP(cdev) \
-do { \
-	QETH_DBF_TEXT3(0, trace, "snnp"); \
-	QETH_DBF_TEXT3(0, trace, cdev->dev.bus_id); \
-\
-	spin_lock_irqsave(get_ccwdev_lock(cdev),saveflags); \
-	ccw_device_set_options(cdev, 0); \
-        result=ccw_device_start(cdev,&card->dma_stuff->write_ccw, \
-				NOP_STATE,0,0); \
-        if (result) { \
-		qeth_delay_millis(QETH_WAIT_BEFORE_2ND_DOIO); \
-                result2=ccw_device_start(cdev,&card->dma_stuff->write_ccw, \
-					 NOP_STATE,0,0); \
-                PRINT_WARN("qeth_send_nops on device %s: do_IO returned %i, " \
-                           "next try returns %i\n", \
-                           cdev->dev.bus_id,result,result2); \
-		result=result2; \
-        } \
-        spin_unlock_irqrestore(get_ccwdev_lock(cdev),saveflags); \
-\
-	if (result) goto exit; \
-\
-        if (qeth_sleepon(card,QETH_NOP_TIMEOUT)) { \
-		QETH_DBF_TEXT2(0,trace,"snnp:tme"); \
-		result=-EIO; \
-		goto exit; \
-        } \
-} while (0)
-
-	DO_SEND_NOP(CARD_RDEV(card));
-	DO_SEND_NOP(CARD_WDEV(card));
-	DO_SEND_NOP(CARD_DDEV(card));
-
-exit:
-	return result;
-}
-
-/*
- * Reset the software state kept for a card: outbound queue and buffer
- * bookkeeping, the inbound buffer pool markers, the requeue state, the
- * sequence numbers and all IP address lists.  A NULL card is only
- * traced.
- */
-static void
-qeth_clear_card_structures(struct qeth_card *card)
-{
-	int q, b;
-
-	if (card == NULL) {
-		QETH_DBF_TEXT2(0, trace, "clrCRDnc");
-		return;
-	}
-
-	QETH_DBF_CARD3(0, trace, "clcs", card);
-
-	atomic_set(&card->is_startlaned, 0);
-
-	for (q = 0; q < QETH_MAX_QUEUES; q++) {
-		card->send_state[q] = SEND_STATE_DONT_PACK;
-		card->outbound_first_free_buffer[q] = 0;
-		atomic_set(&card->outbound_used_buffers[q], 0);
-		atomic_set(&card->outbound_ringbuffer_lock[q], 0);
-
-		for (b = 0; b < QDIO_MAX_BUFFERS_PER_Q; b++) {
-			card->outbound_buffer_send_state[q][b] =
-				SEND_STATE_DONT_PACK;
-			card->send_retries[q][b] = 0;
-
-			/* only the first no_queues ring buffers are touched */
-			if (q >= card->no_queues)
-				continue;
-
-			card->outbound_ringbuffer[q]->
-				ringbuf_element[b].next_element_to_fill = 0;
-			card->outbound_bytes_in_buffer[q] = 0;
-			skb_queue_head_init(&card->outbound_ringbuffer[q]->
-					    ringbuf_element[b].skb_list);
-		}
-	}
-
-	for (b = 0; b < card->options.inbound_buffer_count; b++)
-		xchg((int *) &card->inbound_buffer_pool_entry_used[b],
-		     BUFFER_UNUSED);
-
-	spin_lock_init(&card->requeue_input_lock);
-	atomic_set(&card->requeue_position, 0);
-	atomic_set(&card->requeue_counter, 0);
-
-	card->seqno.trans_hdr = 0;
-	card->seqno.pdu_hdr = 0;
-	card->seqno.pdu_hdr_ack = 0;
-	card->seqno.ipa = 0;
-
-	qeth_clear_ifa4_list(&card->ip_current_state.ip_ifa);
-	qeth_clear_ifa4_list(&card->ip_new_state.ip_ifa);
-	qeth_clear_ifamc_list(&card->ip_mc_current_state.ipm_ifa);
-	qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm_ifa);
-
-#ifdef QETH_IPV6
-	qeth_clear_ifa6_list(&card->ip_current_state.ip6_ifa);
-	qeth_clear_ifa6_list(&card->ip_new_state.ip6_ifa);
-	qeth_clear_ifamc_list(&card->ip_mc_current_state.ipm6_ifa);
-	qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm6_ifa);
-#endif /* QETH_IPV6 */
-}
-
-/*
- * Hand the inbound buffers to the card.  Every reference count is set
- * to 1 first; then, buffer by buffer, the count is dropped to 0 and
- * the buffer is queued, as long as its index is below the configured
- * inbound_buffer_count.  A qdio_synchronize() on the input side
- * finishes the job.
- */
-static void
-qeth_init_input_buffers(struct qeth_card *card)
-{
-	int idx;
-
-	/* slowly, slowly (we don't want to enqueue all buffers
-	 * at one time) */
-	for (idx = 0; idx < QDIO_MAX_BUFFERS_PER_Q; idx++)
-		atomic_set(&card->inbound_buffer_refcnt[idx], 1);
-
-	for (idx = 0; idx < QDIO_MAX_BUFFERS_PER_Q; idx++) {
-		atomic_set(&card->inbound_buffer_refcnt[idx], 0);
-		/* never queue more buffers than we actually have */
-		if (idx >= card->options.inbound_buffer_count)
-			continue;
-		qeth_queue_input_buffer(card, idx, 0);
-	}
-
-	qdio_synchronize(CARD_DDEV(card), QDIO_FLAG_SYNC_INPUT, 0);
-}
-
-/*
- * Initializes all the structures for a card.
- *
- * Runs the complete hardware bring-up sequence: clear the card and
- * send NOPs, read the unit address, generate tokens, IDX activate
- * read/write, CM enable/setup, ULP enable/setup, QDIO establish and
- * activate, and DM_ACT.  The sequence is repeated while a step sets
- * card->break_out to QETH_BREAKOUT_AGAIN, limited by
- * QETH_HARDSETUP_LAPS.
- *
- * @card:        card to set up
- * @in_recovery: non-zero when called from the recovery thread; qdio
- *               is then cleaned up on the first clear, quiesce waits
- *               are inserted, and on success the card structures and
- *               input buffers are re-initialized
- *
- * Returns 0 on success (also when the card is already hardsetup) or a
- * negative error code.  card->hardsetup_sema is held for the duration
- * of the setup.
- */
-static int
-qeth_hardsetup_card(struct qeth_card *card, int in_recovery)
-{
-	int result, q, breakout;
-	unsigned long flags;
-	int laps = QETH_HARDSETUP_LAPS;
-	int clear_laps;
-	int cleanup_qdio;
-	char dbf_text[15];
-	int i, r;
-
-	/* setup name and so on */
-	atomic_set(&card->shutdown_phase, 0);
-
-	if (atomic_read(&card->is_hardsetup)) {
-		QETH_DBF_CARD2(1, trace, "hscd", card);
-		PRINT_ALL("card is already hardsetup.\n");
-		return 0;
-	}
-
-	cleanup_qdio = in_recovery;	/* if we are in recovery, we clean
-					   the qdio stuff up */
-
-	down(&card->hardsetup_sema);
-	atomic_set(&card->write_busy, 0);
-
-	do {
-		if (in_recovery) {
-			PRINT_STUPID("qeth: recovery: quiescing %s...\n",
-				     card->dev_name);
-			QETH_DBF_CARD2(0, trace, "Rqsc", card);
-			qeth_wait_nonbusy(QETH_QUIESCE_WAIT_BEFORE_CLEAR);
-		}
-		clear_laps = QETH_HARDSETUP_CLEAR_LAPS;
-		do {
-			if (in_recovery)
-				PRINT_STUPID("clearing card %s\n",
-					     card->dev_name);
-			qeth_clear_card(card, cleanup_qdio,
-					(card->type == QETH_CARD_TYPE_OSAE));
-			result = qeth_send_nops(card);
-			breakout = atomic_read(&card->break_out);
-		} while ((--clear_laps) && (result));
-		if (result) {
-			goto exit;
-		}
-
-		if (in_recovery) {
-			PRINT_STUPID("qeth: recovery: still quiescing %s...\n",
-				     card->dev_name);
-			QETH_DBF_CARD2(0, trace, "RQsc", card);
-			qeth_wait_nonbusy(QETH_QUIESCE_WAIT_AFTER_CLEAR);
-		} else {
-			atomic_set(&card->shutdown_phase, 0);
-		}
-
-		cleanup_qdio = 0;	/* qdio was cleaned now, if necessary */
-
-		result = qeth_get_unitaddr(card);
-		if (result)
-			goto exit;
-
-		qeth_generate_tokens(card);
-
-/* dump the first 4 bytes of every sequence number and token to the trace */
-#define PRINT_TOKENS do { \
-		sprintf(dbf_text,"stra    "); \
-		memcpy(&dbf_text[4],&card->seqno.trans_hdr,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"spdu    "); \
-		memcpy(&dbf_text[4],&card->seqno.pdu_hdr,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"spda    "); \
-		memcpy(&dbf_text[4],&card->seqno.pdu_hdr_ack,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"sipa    "); \
-		memcpy(&dbf_text[4],&card->seqno.ipa,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tisw    "); \
-		memcpy(&dbf_text[4],&card->token.issuer_rm_w,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tisr    "); \
-		memcpy(&dbf_text[4],&card->token.issuer_rm_r,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tcfw    "); \
-		memcpy(&dbf_text[4],&card->token.cm_filter_w,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tcfr    "); \
-		memcpy(&dbf_text[4],&card->token.cm_filter_r,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tccw    "); \
-		memcpy(&dbf_text[4],&card->token.cm_connection_w,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tccr    "); \
-		memcpy(&dbf_text[4],&card->token.cm_connection_r,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tufw    "); \
-		memcpy(&dbf_text[4],&card->token.ulp_filter_w,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tufr    "); \
-		memcpy(&dbf_text[4],&card->token.ulp_filter_r,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tucw    "); \
-		memcpy(&dbf_text[4],&card->token.ulp_connection_w,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-		sprintf(dbf_text,"tucr    "); \
-		memcpy(&dbf_text[4],&card->token.ulp_connection_r,4); \
-		QETH_DBF_HEX3(0,trace,dbf_text,QETH_DBF_TRACE_LEN); \
-	} while (0)
-
-		PRINT_TOKENS;
-
-		/* card->break_out and problem will be set here to 0
-		 * (in each lap) (there can't be a problem at this
-		 * early time) */
-		atomic_set(&card->problem, 0);
-		atomic_set(&card->break_out, 0);
-
-/*
- * Evaluate card->break_out after a setup step: QETH_BREAKOUT_AGAIN
- * jumps to the loop condition for another lap, QETH_BREAKOUT_LEAVE and
- * any non-zero result of the step leave through the exit label.
- */
-#define CHECK_ERRORS \
-		breakout=atomic_read(&card->break_out); \
-		if (breakout==QETH_BREAKOUT_AGAIN) \
-			continue; \
-		else if (breakout==QETH_BREAKOUT_LEAVE) { \
-			result=-EIO; \
-			goto exit; \
-		} \
-		if (result) goto exit
-
-		QETH_DBF_TEXT2(0, trace, "hsidxard");
-		result = qeth_idx_activate_read(card);
-		CHECK_ERRORS;
-
-		PRINT_TOKENS;
-		QETH_DBF_TEXT2(0, trace, "hsidxawr");
-		result = qeth_idx_activate_write(card);
-		CHECK_ERRORS;
-
-		QETH_DBF_TEXT2(0, trace, "hsissurd");
-		/* from here, there will always be an outstanding read */
-		spin_lock_irqsave(get_ccwdev_lock(CARD_RDEV(card)), flags);
-		qeth_issue_next_read(card);
-		spin_unlock_irqrestore(get_ccwdev_lock(CARD_RDEV(card)), flags);
-
-		PRINT_TOKENS;
-		QETH_DBF_TEXT2(0, trace, "hscmenab");
-		result = qeth_cm_enable(card);
-		CHECK_ERRORS;
-
-		PRINT_TOKENS;
-		QETH_DBF_TEXT2(0, trace, "hscmsetu");
-		result = qeth_cm_setup(card);
-		CHECK_ERRORS;
-
-		PRINT_TOKENS;
-		QETH_DBF_TEXT2(0, trace, "hsulpena");
-		result = qeth_ulp_enable(card);
-		CHECK_ERRORS;
-
-		PRINT_TOKENS;
-		QETH_DBF_TEXT2(0, trace, "hsulpset");
-		result = qeth_ulp_setup(card);
-		CHECK_ERRORS;
-
-		cleanup_qdio = 1;
-
-		QETH_DBF_TEXT2(0, trace, "hsqdioes");
-		result = qeth_qdio_establish(card);
-		CHECK_ERRORS;
-
-		PRINT_TOKENS;
-		QETH_DBF_TEXT2(0, trace, "hsqdioac");
-		result = qeth_qdio_activate(card);
-		CHECK_ERRORS;
-
-		PRINT_TOKENS;
-		QETH_DBF_TEXT2(0, trace, "hsdmact");
-		result = qeth_dm_act(card);
-		CHECK_ERRORS;
-	} while ((laps--) && (breakout == QETH_BREAKOUT_AGAIN));
-	if (breakout == QETH_BREAKOUT_AGAIN) {
-		QETH_DBF_CARD2(0, trace, "hsnr", card);
-		PRINT_ERR("qeth: recovery not successful on device "
-			  "%s/%s/%s; giving up.\n",
-			  CARD_RDEV_ID(card),
-			  CARD_WDEV_ID(card), CARD_DDEV_ID(card));
-		result = -EIO;
-		goto exit;
-	}
-
-	qeth_clear_ifa4_list(&card->ip_current_state.ip_ifa);
-	qeth_clear_ifa4_list(&card->ip_new_state.ip_ifa);
-	qeth_clear_ifamc_list(&card->ip_mc_current_state.ipm_ifa);
-	qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm_ifa);
-
-#ifdef QETH_IPV6
-	qeth_clear_ifa6_list(&card->ip_current_state.ip6_ifa);
-	qeth_clear_ifa6_list(&card->ip_new_state.ip6_ifa);
-	qeth_clear_ifamc_list(&card->ip_mc_current_state.ipm6_ifa);
-	qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm6_ifa);
-#endif /* QETH_IPV6 */
-
-	if (!atomic_read(&card->is_registered)) {
-		card->dev->dev_addr[0] = 0;	/* we don't know the mac addr yet */
-		card->dev->dev_addr[1] = 0;
-		card->dev->dev_addr[2] = 0;
-		card->dev->dev_addr[3] = 0;
-		card->dev->dev_addr[4] = 0;
-		card->dev->dev_addr[5] = 0;
-		card->dev->broadcast[0] = card->dev->broadcast[1] = 0xff;
-		card->dev->broadcast[2] = card->dev->broadcast[3] = 0xff;
-		card->dev->broadcast[4] = card->dev->broadcast[5] = 0xff;
-
-		card->dev->type = qeth_get_arphrd_type(card->type,
-						       card->link_type);
-
-		card->dev->init = qeth_init_dev;
-
-		card->ipa_timeout = qeth_get_ipa_timeout(card->type);
-	}
-
-	atomic_set(&card->is_hardsetup, 1);
-	atomic_set(&card->is_softsetup, 0);
-	atomic_set(&card->startlan_attempts, 1);
-
-	for (q = 0; q < card->no_queues; q++)
-		card->send_state[q] = SEND_STATE_DONT_PACK;
-
-	/* we need to know first, whether we should include a value
-	 * into eui-64 address generation */
-	QETH_DBF_TEXT2(0, trace, "qipassi4");
-	r = qeth_send_qipassist(card, 4);
-	if (r) {
-		PRINT_WARN("couldn't send QIPASSIST4 on %s: "
-			   "0x%x\n", card->dev_name, r);
-		snprintf(dbf_text, sizeof(dbf_text), "QIP4%4x", r);
-		QETH_DBF_TEXT2(0, trace, dbf_text);
-	}
-
-	/* %4x is only a minimum width: two values above 0xffff would
-	 * need 17 bytes, so bound the write to the 15-byte buffer */
-	snprintf(dbf_text, sizeof(dbf_text), "%4x%4x",
-		 card->ipa_supported, card->ipa_enabled);
-	QETH_DBF_TEXT2(0, trace, dbf_text);
-
-	qeth_get_unique_id(card);
-
-	/* print out status */
-	if (in_recovery) {
-		qeth_clear_card_structures(card);
-		qeth_init_input_buffers(card);
-		QETH_DBF_TEXT1(0, trace, "RECOVSUC");
-		PRINT_INFO("qeth: recovered device %s/%s/%s (%s) "
-			   "successfully.\n",
-			   CARD_RDEV_ID(card),
-			   CARD_WDEV_ID(card),
-			   CARD_DDEV_ID(card), card->dev_name);
-	} else {
-		QETH_DBF_TEXT2(0, trace, "hrdsetok");
-
-		switch (card->type) {
-		case QETH_CARD_TYPE_OSAE:
-			/*
-			 * VM will use a non-zero first character to indicate
-			 * a HiperSockets like reporting of the level
-			 * OSA sets the first character to zero
-			 */
-			if (!card->level[0]) {
-				sprintf(card->level, "%02x%02x", card->level[2],
-					card->level[3]);
-				card->level[QETH_MCL_LENGTH] = 0;
-				break;
-			}
-			/* fallthrough */
-		case QETH_CARD_TYPE_IQD:
-			card->level[0] = (char) _ebcasc[(__u8) card->level[0]];
-			card->level[1] = (char) _ebcasc[(__u8) card->level[1]];
-			card->level[2] = (char) _ebcasc[(__u8) card->level[2]];
-			card->level[3] = (char) _ebcasc[(__u8) card->level[3]];
-			card->level[QETH_MCL_LENGTH] = 0;
-			break;
-		default:
-			memset(&card->level[0], 0, QETH_MCL_LENGTH + 1);
-		}
-
-		snprintf(dbf_text, sizeof(dbf_text), "lvl:%s", card->level);
-		QETH_DBF_TEXT2(0, setup, dbf_text);
-
-		if (card->portname_required) {
-			/* the name occupies portname[1..8]; copy exactly
-			 * those 8 bytes instead of relying on a NUL
-			 * terminator behind them */
-			memcpy(dbf_text, card->options.portname + 1, 8);
-			for (i = 0; i < 8; i++)
-				dbf_text[i] =
-				    (char) _ebcasc[(__u8) dbf_text[i]];
-			dbf_text[8] = 0;
-			printk("qeth: Device %s/%s/%s is a%s card%s%s%s\n"
-			       "with link type %s (portname: %s)\n",
-			       CARD_RDEV_ID(card),
-			       CARD_WDEV_ID(card),
-			       CARD_DDEV_ID(card),
-			       qeth_get_cardname(card->type,
-						 card->is_guest_lan),
-			       (card->level[0]) ? " (level: " : "",
-			       (card->level[0]) ? card->level : "",
-			       (card->level[0]) ? ")" : "",
-			       qeth_get_link_type_name(card->type,
-						       card->link_type),
-			       dbf_text);
-		} else {
-			if (card->options.portname[0])
-				printk("qeth: Device %s/%s/%s is a%s "
-				       "card%s%s%s\nwith link type %s "
-				       "(no portname needed by interface).\n",
-				       CARD_RDEV_ID(card),
-				       CARD_WDEV_ID(card),
-				       CARD_DDEV_ID(card),
-				       qeth_get_cardname(card->type,
-							 card->is_guest_lan),
-				       (card->level[0]) ? " (level: " : "",
-				       (card->level[0]) ? card->level : "",
-				       (card->level[0]) ? ")" : "",
-				       qeth_get_link_type_name(card->type,
-							       card->link_type));
-			else
-				printk("qeth: Device %s/%s/%s is a%s "
-				       "card%s%s%s\nwith link type %s.\n",
-				       CARD_RDEV_ID(card),
-				       CARD_WDEV_ID(card),
-				       CARD_DDEV_ID(card),
-				       qeth_get_cardname(card->type,
-							 card->is_guest_lan),
-				       (card->level[0]) ? " (level: " : "",
-				       (card->level[0]) ? card->level : "",
-				       (card->level[0]) ? ")" : "",
-				       qeth_get_link_type_name(card->type,
-							       card->link_type));
-		}
-	}
-
-exit:
-	up(&card->hardsetup_sema);
-	return result;
-}
-
-/*
- * Recovery (reinit) kernel thread for one card.
- *
- * After becoming a daemon and taking reinit_thread_sem, the thread
- * stops the queue of an already hardsetup card, frees all skbs and
- * repeats qeth_hardsetup_card() in recovery mode until it succeeds or
- * QETH_RECOVERY_HARDSETUP_RETRY retries are used up.  On success the
- * card is softsetup again, registered if it was not before, and its
- * queue is woken up.
- *
- * @param: the struct qeth_card to recover
- *
- * Always returns 0.
- *
- * NOTE(review): several early exits taken after list_lock and
- * setup_lock were acquired ("goto out_wakeup" / "goto out" inside the
- * already_hardsetup branch) skip read_unlock() and/or
- * spin_unlock(&setup_lock), and the failure branch does not release
- * softsetup_sema.  Whether that is intended depends on helpers that
- * are not visible here -- confirm before relying on it.  The return
- * value of down_interruptible() is ignored as well.
- */
-static int
-qeth_reinit_thread(void *param)
-{
-	struct qeth_card *card = (struct qeth_card *) param;
-	int already_registered;
-	int already_hardsetup;
-	int retry = QETH_RECOVERY_HARDSETUP_RETRY;
-	int result;
-	char name[15];
-
-	QETH_DBF_CARD1(0, trace, "RINI", card);
-
-	/* set a nice name ... bounded, because the 9-character prefix
-	 * plus the bus id may not fit into name[] */
-	snprintf(name, sizeof(name), "qethrinid%s", CARD_BUS_ID(card));
-	daemonize(name);
-
-	if (atomic_read(&card->shutdown_phase))
-		goto out_wakeup;
-	down_interruptible(&card->reinit_thread_sem);
-	if (atomic_read(&card->shutdown_phase))
-		goto out_wakeup;
-
-	QETH_DBF_TEXT1(0, trace, "ri-gotin");
-	PRINT_STUPID("entering recovery (reinit) thread for device %s\n",
-		     card->dev_name);
-
-	atomic_set(&card->is_startlaned, 0);
-	atomic_set(&card->is_softsetup, 0);
-
-	read_lock(&list_lock);
-	if (!qeth_verify_card(card))
-		goto out;
-	QETH_DBF_TEXT1(0, trace, "ri-vrfd");
-
-	atomic_set(&card->write_busy, 0);
-	qeth_set_dev_flag_norunning(card);
-	already_hardsetup = atomic_read(&card->is_hardsetup);
-	already_registered = atomic_read(&card->is_registered);
-	if (already_hardsetup) {
-		atomic_set(&card->is_hardsetup, 0);
-
-		if (-1 == my_spin_lock_nonbusy(card, &setup_lock))
-			goto out;
-		if (atomic_read(&card->shutdown_phase))
-			goto out_wakeup;
-
-		/* escape_softsetup is raised only while we wait for
-		 * softsetup_sema */
-		atomic_set(&card->escape_softsetup, 1);
-
-		if (-1 == my_down_trylock_nonbusy(card, &card->softsetup_sema)) {
-			atomic_set(&card->escape_softsetup, 0);
-			goto out;
-		}
-		atomic_set(&card->escape_softsetup, 0);
-		if (atomic_read(&card->shutdown_phase)) {
-			up(&card->softsetup_sema);
-			goto out_wakeup;
-		}
-		if (!qeth_verify_card(card))
-			goto out;
-
-		if (already_registered)
-			netif_stop_queue(card->dev);
-
-		qeth_wait_nonbusy(QETH_QUIESCE_NETDEV_TIME);
-
-		atomic_set(&card->is_startlaned, 0);
-
-		QETH_DBF_TEXT1(0, trace, "ri-frskb");
-		qeth_free_all_skbs(card);
-		do {
-			QETH_DBF_TEXT1(0, trace, "ri-hrdst");
-			result = qeth_hardsetup_card(card, 1);
-		} while (result && (retry--));
-
-		/* tries to remove old ips, that's paranoid, but ok */
-		qeth_clear_ifa4_list(&card->ip_new_state.ip_ifa);
-		qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm_ifa);
-
-#ifdef QETH_IPV6
-		qeth_clear_ifa6_list(&card->ip_new_state.ip6_ifa);
-		qeth_clear_ifamc_list(&card->ip_mc_new_state.ipm6_ifa);
-#endif /* QETH_IPV6 */
-
-		if (result) {
-			QETH_DBF_TEXT1(0, trace, "ri-nosuc");
-			PRINT_ERR("qeth: RECOVERY WAS NOT SUCCESSFUL ON %s "
-				  "(%s/%s/%s), GIVING UP, "
-				  "OUTGOING PACKETS WILL BE DISCARDED!\n",
-				  card->dev_name,
-				  CARD_RDEV_ID(card),
-				  CARD_WDEV_ID(card),
-				  CARD_DDEV_ID(card));
-			/* early leave hard_start_xmit! */
-			atomic_set(&card->is_startlaned, 0);
-			qeth_wakeup_procfile();
-		} else {
-			QETH_DBF_TEXT1(0, trace, "ri-sftst");
-			qeth_softsetup_card(card, QETH_LOCK_ALREADY_HELD);
-			up(&card->softsetup_sema);
-
-			if (!already_registered) {
-				QETH_DBF_TEXT1(0, trace, "ri-regcd");
-				qeth_register_netdev(card);
-			}
-			qeth_restore_dev_flag_state(card);
-			netif_wake_queue(card->dev);
-			qeth_wakeup_procfile();
-		}
-		spin_unlock(&setup_lock);
-	}
-out:
-	atomic_set(&card->in_recovery, 0);
-	read_unlock(&list_lock);
-	QETH_DBF_TEXT1(0, trace, "ri-leave");
-out_wakeup:
-	up(&card->reinit_thread_sem);
-	atomic_dec(&card->reinit_counter);
-
-	return 0;
-}
-
-static void
-qeth_fill_qeth_card_options(struct qeth_card *card)
-{
-	int i;
-
-	card->options.portname[0] = 0;
-	for (i = 1; i < 9; i++)
-		card->options.portname[i] = _ascebc[' '];
-	strcpy(card->options.devname, " ");
-	card->options.routing_type4 = NO_ROUTER;
-#ifdef QETH_IPV6
-	card->options.routing_type6 = NO_ROUTER;
-#endif /* QETH_IPV6 */
-	card->options.portno = 0;
-	card->options.checksum_type = QETH_CHECKSUM_DEFAULT;
-	card->options.do_prio_queueing = QETH_PRIOQ_DEFAULT;
-	card->options.default_queue = QETH_DEFAULT_QUEUE;
-	card->options.inbound_buffer_count = DEFAULT_BUFFER_COUNT;
-	card->options.polltime = QETH_MAX_INPUT_THRESHOLD;
-	card->options.macaddr_mode = MACADDR_NONCANONICAL;
-	card->options.broadcast_mode = BROADCAST_ALLRINGS;
-	card->options.fake_broadcast = DONT_FAKE_BROADCAST;
-	card->options.ena_ipat = ENABLE_TAKEOVER;
-	card->options.add_hhlen = DEFAULT_ADD_HHLEN;
-	card->options.fake_ll = DONT_FAKE_LL;
-}
-
-static void qeth_setup(struct net_device *dev)
-{
-	dev->tx_timeout = &qeth_tx_timeout;
-	dev->watchdog_timeo = QETH_TX_TIMEOUT;
-	dev->open = qeth_open;
-	dev->stop = qeth_stop;
-	dev->set_config = qeth_set_config;
-	dev->hard_start_xmit = qeth_hard_start_xmit;
-	dev->do_ioctl = qeth_do_ioctl;
-	dev->get_stats = qeth_get_stats;
-	dev->change_mtu = qeth_change_mtu;
-#ifdef QETH_VLAN
-	dev->vlan_rx_register = qeth_vlan_rx_register;
-	dev->vlan_rx_kill_vid = qeth_vlan_rx_kill_vid;
-#endif
-	dev->set_multicast_list = qeth_set_multicast_list;
-	dev->set_mac_address = qeth_set_mac_address;
-	dev->neigh_setup = qeth_neigh_setup;
-	dev->addr_len = OSA_ADDR_LEN;	/* is ok for eth, tr, atm lane */
-	SET_MODULE_OWNER(dev);
-}
-
-static int
-qeth_alloc_card_stuff(struct qeth_card *card)
-{
-	if (!card)
-		return -EINVAL;
-
-	QETH_DBF_TEXT3(0, trace, "alccrdst");
-
-	card->dma_stuff =
-	    (struct qeth_dma_stuff *) kmalloc(sizeof (struct qeth_dma_stuff),
-					      GFP_KERNEL | GFP_DMA);
-	if (!card->dma_stuff)
-		goto exit_dma;
-	memset(card->dma_stuff, 0, sizeof (struct qeth_dma_stuff));
-
-	card->dma_stuff->recbuf = (char *) kmalloc(QETH_BUFSIZE,
-						   GFP_KERNEL | GFP_DMA);
-	if (!card->dma_stuff->recbuf)
-		goto exit_dma1;
-	memset(card->dma_stuff->recbuf, 0, QETH_BUFSIZE);
-
-	card->dma_stuff->sendbuf = (char *) kmalloc(QETH_BUFSIZE,
-						    GFP_KERNEL | GFP_DMA);
-	if (!card->dma_stuff->sendbuf)
-		goto exit_dma2;
-	memset(card->dma_stuff->sendbuf, 0, QETH_BUFSIZE);
-
-	card->dev = alloc_netdev(0, "", qeth_setup);
-	if (!card->dev)
-		goto exit_dev;
-
-	card->stats =
-	    (struct net_device_stats *)
-	    kmalloc(sizeof (struct net_device_stats), GFP_KERNEL);
-	if (!card->stats)
-		goto exit_stats;
-	memset(card->stats, 0, sizeof (struct net_device_stats));
-
-	/* setup net_device stuff */
-	card->dev->priv = card;
-
-	/* setup net_device_stats stuff */
-	/* =nothing yet */
-
-	return 0;
-
-	/* these are quick exits in case of failures of the kmallocs */
-exit_stats:
-	free_netdev(card->dev);
-exit_dev:
-	kfree(card->dma_stuff->sendbuf);
-exit_dma2:
-	kfree(card->dma_stuff->recbuf);
-exit_dma1:
-	kfree(card->dma_stuff);
-exit_dma:
-	return -ENOMEM;
-}
-
-static struct qeth_card *
-qeth_alloc_card(void)
-{
-	struct qeth_card *card;
-
-	QETH_DBF_TEXT3(0, trace, "alloccrd");
-	card = (struct qeth_card *) vmalloc(sizeof (struct qeth_card));
-	if (!card)
-		return NULL;
-	memset(card, 0, sizeof (struct qeth_card));
-	init_waitqueue_head(&card->wait_q);
-	init_waitqueue_head(&card->ioctl_wait_q);
-
-	qeth_fill_qeth_card_options(card);
-
-	init_MUTEX(&card->softsetup_sema);
-	init_MUTEX(&card->hardsetup_sema);
-	spin_lock_init(&card->ioctl_lock);
-#ifdef QETH_VLAN
-	spin_lock_init(&card->vlan_lock);
-	card->vlangrp = NULL;
-#endif
-	card->unique_id = 0;
-	sema_init(&card->reinit_thread_sem, 0);
-	up(&card->reinit_thread_sem);
-
-	/* setup card stuff */
-	card->ip_current_state.ip_ifa = NULL;
-	card->ip_new_state.ip_ifa = NULL;
-	card->ip_mc_current_state.ipm_ifa = NULL;
-	card->ip_mc_new_state.ipm_ifa = NULL;
-
-#ifdef QETH_IPV6
-	card->ip_current_state.ip6_ifa = NULL;
-	card->ip_new_state.ip6_ifa = NULL;
-	card->ip_mc_current_state.ipm6_ifa = NULL;
-	card->ip_mc_new_state.ipm6_ifa = NULL;
-#endif /* QETH_IPV6 */
-
-	card->csum_enable_mask = IPA_CHECKSUM_DEFAULT_ENABLE_MASK;
-
-	/* and return to the sender */
-	return card;
-
-}
-
-static int
-qeth_init_ringbuffers1(struct qeth_card *card)
-{
-	int i, j;
-
-	QETH_DBF_CARD3(0, trace, "irb1", card);
-
-	for (i = 0; i < card->no_queues; i++) {
-		card->outbound_ringbuffer[i] =
-		    vmalloc(sizeof (struct qeth_ringbuffer));
-		if (!card->outbound_ringbuffer[i]) {
-			for (j = i - 1; j >= 0; j--) {
-				vfree(card->outbound_ringbuffer[j]);
-				card->outbound_ringbuffer[j] = NULL;
-			}
-			return -ENOMEM;
-		}
-		memset(card->outbound_ringbuffer[i], 0,
-		       sizeof (struct qeth_ringbuffer));
-		for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; j++)
-			skb_queue_head_init(&card->outbound_ringbuffer[i]->
-					    ringbuf_element[j].skb_list);
-	}
-
-	return 0;
-}
-
-static int
-qeth_init_ringbuffers2(struct qeth_card *card)
-{
-	int i, j;
-
-	QETH_DBF_CARD3(0, trace, "irb2", card);
-
-	for (i = 0; i < card->options.inbound_buffer_count; i++) {
-		for (j = 0; j < BUFFER_MAX_ELEMENTS; j++) {
-			card->inbound_buffer_pool_entry[i][j] =
-				kmalloc(PAGE_SIZE, GFP_KERNEL);
-			if (!card->inbound_buffer_pool_entry[i][j]) {
-				goto out;
-			}
-		}
-		card->inbound_buffer_pool_entry_used[i] = BUFFER_UNUSED;
-	}
-
-	spin_lock_init(&card->requeue_input_lock);
-
-	return 0;
-out:
-	for (i = 0; i < card->options.inbound_buffer_count; i++) {
-		for (j = 0; j < QDIO_MAX_ELEMENTS_PER_BUFFER; j++) {
-			if (card->inbound_buffer_pool_entry[i][j]) {
-				if (j < BUFFER_MAX_ELEMENTS)
-					kfree(card->
-					      inbound_buffer_pool_entry[i][j]);
-				card->inbound_buffer_pool_entry[i][j] = NULL;
-			}
-		}
-	}
-	for (i = 0; i < card->no_queues; i++) {
-		vfree(card->outbound_ringbuffer[i]);
-		card->outbound_ringbuffer[i] = NULL;
-	}
-	return -ENOMEM;
-
-}
-
-/* also locked from outside (setup_lock) */
-static void
-qeth_insert_card_into_list(struct qeth_card *card)
-{
-	QETH_DBF_CARD3(0, trace, "icil", card);
-
-	write_lock(&list_lock);
-	card->next = firstcard;
-	firstcard = card;
-	write_unlock(&list_lock);
-}
-
-static int
-qeth_determine_card_type(struct qeth_card *card)
-{
-	int i = 0;
-	char dbf_text[15];
-
-	while (known_devices[i][4]) {
-		if ((CARD_RDEV(card)->id.dev_type == known_devices[i][2]) &&
-		    (CARD_RDEV(card)->id.dev_model == known_devices[i][3])) {
-			card->type = known_devices[i][4];
-			if (card->options.ena_ipat == ENABLE_TAKEOVER)
-				card->func_level = known_devices[i][6];
-			else
-				card->func_level = known_devices[i][7];
-			card->no_queues = known_devices[i][8];
-			card->is_multicast_different = known_devices[i][9];
-			QETH_DBF_TEXT2(0, setup, CARD_BUS_ID(card));
-			sprintf(dbf_text, "ctyp%4x", card->type);
-			QETH_DBF_TEXT2(0, setup, dbf_text);
-			return 0;
-		}
-		i++;
-	}
-	card->type = QETH_CARD_TYPE_UNKNOWN;
-	QETH_DBF_TEXT2(0, setup, CARD_BUS_ID(card));
-	sprintf(dbf_text, "ctypUNKN");
-	QETH_DBF_TEXT2(0, setup, dbf_text);
-	PRINT_ERR("unknown card type on device %s\n", CARD_BUS_ID(card));
-	return -ENOENT;
-}
-
-static int
-qeth_getint(char *s, int longint)
-{
-	int cnt;
-	int hex;
-	int result;
-	char c;
-
-	if (!s)
-		return -1;
-	hex = ((s[0] == '0') && ((s[1] == 'x') || (s[1] == 'X'))) ? 1 : 0;
-	cnt = (hex) ? 2 : 0;	/* start from the first real digit */
-	if (!(s[cnt]))
-		return -1;
-	result = 0;
-	while ((c = s[cnt++])) {
-		if (hex) {
-			if (isxdigit(c))
-				result = result * 16 + qeth_getxdigit(c);
-			else
-				return -1;
-		} else {
-			if (isdigit(c))
-				result = result * 10 + c - '0';
-			else
-				return -1;
-		}
-		/* prevent overflow, 0xffff is enough for us */
-		if (longint) {
-			if (result > 0xfffffff)
-				return -1;
-		} else {
-			if (result > 0xffff)
-				return -1;
-		}
-	}
-	return result;
-}
-
-static void
-__qeth_correct_routing_status_v4(struct qeth_card *card)
-{
-	if (card->options.routing_type4 == NO_ROUTER)
-		return;
-
-	if (card->type == QETH_CARD_TYPE_IQD) {
-		/* if it's not a mc router, it's no router */
-		if ((card->options.routing_type4 == PRIMARY_ROUTER) ||
-		    (card->options.routing_type4 == SECONDARY_ROUTER)) {
-			PRINT_WARN("routing not applicable, reset "
-				   "routing status for ipv4. \n");
-			card->options.routing_type4 = NO_ROUTER;
-		}
-		card->options.do_prio_queueing = NO_PRIO_QUEUEING;
-	} else {
-		/* if it's a mc router, it's no router */
-		if ((!qeth_is_supported(IPA_OSA_MC_ROUTER_AVAIL) &&
-		     (card->options.routing_type4 == MULTICAST_ROUTER)) ||
-		    (card->options.routing_type4 == PRIMARY_CONNECTOR) ||
-		    (card->options.routing_type4 == SECONDARY_CONNECTOR)) {
-			PRINT_WARN("routing not applicable, reset "
-				   "routing status for ipv4. (Did you mean "
-				   "primary_router or secondary_router?)\n");
-			card->options.routing_type4 = NO_ROUTER;
-		}
-	}
-}
-
-static void
-__qeth_correct_routing_status_v6(struct qeth_card *card)
-{
-#ifdef QETH_IPV6
-	if (card->options.routing_type6 == NO_ROUTER)
-		return;
-	if (card->type == QETH_CARD_TYPE_IQD) {
-		/* if it's not a mc router, it's no router */
-		if ((card->options.routing_type6 == PRIMARY_ROUTER) ||
-		    (card->options.routing_type6 == SECONDARY_ROUTER)) {
-			PRINT_WARN("routing not applicable, reset "
-				   "routing status for ipv6. \n");
-			card->options.routing_type6 = NO_ROUTER;
-		}
-		card->options.do_prio_queueing = NO_PRIO_QUEUEING;
-	} else {
-		/* if it's a mc router, it's no router */
-		if ((!qeth_is_supported(IPA_OSA_MC_ROUTER_AVAIL) &&
-		     (card->options.routing_type6 == MULTICAST_ROUTER)) ||
-		    (card->options.routing_type6 == PRIMARY_CONNECTOR) ||
-		    (card->options.routing_type6 == SECONDARY_CONNECTOR)) {
-			PRINT_WARN("routing not applicable, reset "
-				   "routing status for ipv6. (Did you mean "
-				   "primary_router or secondary_router?)\n");
-			card->options.routing_type6 = NO_ROUTER;
-		}
-	}
-#endif /* QETH_IPV6 */
-}
-
-static void
-qeth_correct_routing_status(struct qeth_card *card)
-{
-	__qeth_correct_routing_status_v4(card);
-	__qeth_correct_routing_status_v6(card);
-}
-
-static int
-qeth_init_netdev(struct qeth_card *card)
-{
-
-	int result;
-	char dbf_text[15];
-
-	result = qeth_register_netdev(card);
-	if (result) {
-		PRINT_ALL("         register_netdev %s -- rc=%i\n",
-			  card->dev_name, result);
-		sprintf(dbf_text, "rgnd%4x", (__u16) result);
-		QETH_DBF_TEXT2(1, trace, dbf_text);
-		atomic_set(&card->is_registered, 0);
-		goto out;
-	}
-	strcpy(card->dev_name, card->dev->name);
-	atomic_set(&card->write_busy, 0);
-	atomic_set(&card->is_registered, 1);
-
-	result = qeth_softsetup_card(card, QETH_WAIT_FOR_LOCK);
-
-	if (!result) {
-		qeth_init_input_buffers(card);
-	} else {
-		QETH_DBF_TEXT2(0, trace, "SSFAILED");
-		PRINT_WARN("soft-setup of card failed!\n");
-	}
-
-	INIT_WORK(&card->tqueue, qeth_softsetup_thread_starter, card);
-	schedule_work(&card->tqueue);
-out:
-	qeth_wakeup_procfile();
-	return result;
-
-}
-
-static int
-qeth_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
-	struct qeth_card *card;
-	struct net_device *dev = (struct net_device *) ptr;
-
-	QETH_DBF_TEXT3(0, trace, "devevent");
-	QETH_DBF_HEX3(0, trace, &event, sizeof (unsigned long));
-	QETH_DBF_HEX3(0, trace, &dev, sizeof (void *));
-
-	card = __qeth_get_card_from_dev(dev);
-	if (qeth_does_card_exist(card)) {
-		qeth_save_dev_flag_state(card);
-		switch (event) {
-		default:
-			qeth_start_softsetup_thread(card);
-			break;
-		}
-	}
-
-	return NOTIFY_DONE;
-}
-
-static int
-qeth_ip_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
-	struct qeth_card *card;
-	struct in_ifaddr *ifa = (struct in_ifaddr *) ptr;
-	struct net_device *dev = ifa->ifa_dev->dev;
-	char dbf_text[15];
-
-	QETH_DBF_TEXT3(0, trace, "ipevent");
-	QETH_DBF_HEX3(0, trace, &event, sizeof (unsigned long));
-	QETH_DBF_HEX3(0, trace, &dev, sizeof (void *));
-	sprintf(dbf_text, "%08x", ifa->ifa_address);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-	sprintf(dbf_text, "%08x", ifa->ifa_mask);
-	QETH_DBF_TEXT3(0, trace, dbf_text);
-
-	card = __qeth_get_card_from_dev(dev);
-	if (qeth_does_card_exist(card)) {
-		QETH_DBF_HEX3(0, trace, &card, sizeof (void *));
-		qeth_save_dev_flag_state(card);
-		qeth_start_softsetup_thread(card);
-	}
-
-	return NOTIFY_DONE;
-}
-
-#ifdef QETH_IPV6
-static int
-qeth_ip6_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
-	struct qeth_card *card;
-	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *) ptr;
-	struct net_device *dev = ifa->idev->dev;
-
-	QETH_DBF_TEXT3(0, trace, "ip6event");
-	QETH_DBF_HEX3(0, trace, &event, sizeof (unsigned long));
-	QETH_DBF_HEX3(0, trace, &dev, sizeof (void *));
-	QETH_DBF_HEX3(0, trace, ifa->addr.s6_addr, QETH_DBF_TRACE_LEN);
-	QETH_DBF_HEX3(0, trace, ifa->addr.s6_addr + QETH_DBF_TRACE_LEN,
-		      QETH_DBF_TRACE_LEN);
-
-	card = __qeth_get_card_from_dev(dev);
-	if (qeth_does_card_exist(card)) {
-		QETH_DBF_HEX3(0, trace, &card, sizeof (void *));
-		qeth_save_dev_flag_state(card);
-		qeth_start_softsetup_thread(card);
-	}
-
-	return NOTIFY_DONE;
-}
-#endif /* QETH_IPV6 */
-
-static int
-qeth_reboot_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
-	struct qeth_card *card;
-
-	read_lock(&list_lock);
-	if (firstcard) {
-		card = firstcard;
-	clear_another_one:
-		if (card->type == QETH_CARD_TYPE_IQD) {
-			ccw_device_halt(CARD_DDEV(card), 0);
-			ccw_device_clear(CARD_RDEV(card), 0);
-			ccw_device_clear(CARD_WDEV(card), 0);
-			ccw_device_clear(CARD_DDEV(card), 0);
-		} else {
-			ccw_device_clear(CARD_DDEV(card), 0);
-			ccw_device_clear(CARD_RDEV(card), 0);
-			ccw_device_clear(CARD_WDEV(card), 0);
-		}
-		if (card->next) {
-			card = card->next;
-			goto clear_another_one;
-		}
-	}
-	read_unlock(&list_lock);
-
-	return 0;
-}
-
-static struct notifier_block qeth_dev_notifier = {
-	qeth_dev_event,
-	0
-};
-
-static struct notifier_block qeth_ip_notifier = {
-	qeth_ip_event,
-	0
-};
-
-#ifdef QETH_IPV6
-static struct notifier_block qeth_ip6_notifier = {
-	qeth_ip6_event,
-	0
-};
-#endif /* QETH_IPV6 */
-
-static struct notifier_block qeth_reboot_notifier = {
-	qeth_reboot_event,
-	0
-};
-
-static void
-qeth_register_notifiers(void)
-{
-	int r;
-
-	QETH_DBF_TEXT5(0, trace, "regnotif");
-	/* register to be notified on events */
-	r = register_netdevice_notifier(&qeth_dev_notifier);
-
-	r = register_inetaddr_notifier(&qeth_ip_notifier);
-#ifdef QETH_IPV6
-	r = register_inet6addr_notifier(&qeth_ip6_notifier);
-#endif /* QETH_IPV6 */
-	r = register_reboot_notifier(&qeth_reboot_notifier);
-}
-
-static void __exit
-qeth_unregister_notifiers(void)
-{
-	int r;
-
-	QETH_DBF_TEXT5(0, trace, "unregnot");
-	r = unregister_netdevice_notifier(&qeth_dev_notifier);
-	r = unregister_inetaddr_notifier(&qeth_ip_notifier);
-#ifdef QETH_IPV6
-	r = unregister_inet6addr_notifier(&qeth_ip6_notifier);
-#endif /* QETH_IPV6 */
-	r = unregister_reboot_notifier(&qeth_reboot_notifier);
-
-}
-
-static int
-qeth_procfile_open(struct inode *inode, struct file *file)
-{
-	int length = 0;
-	struct qeth_card *card;
-	char checksum_str[5], queueing_str[14], router_str[8], bufsize_str[4];
-	char *buffer;
-	int rc = 0;
-	int size;
-	struct tempinfo *info;
-
-	info = (struct tempinfo *) vmalloc(sizeof (struct tempinfo));
-	if (info == NULL) {
-		PRINT_WARN("No memory available for data\n");
-		return -ENOMEM;
-	} else {
-		file->private_data = (void *) info;
-	}
-
-	/* lock all the stuff */
-	read_lock(&list_lock);
-	card = firstcard;
-	size = 200;		/* 2 lines plus some sanity space */
-	while (card) {
-		size += 90;	/* if device name is > 10 chars, (should never
-				   happen...), we'll need that */
-		card = card->next;
-	}
-
-	buffer = info->data = (char *) vmalloc(size);
-	if (info->data == NULL) {
-		PRINT_WARN("No memory available for data\n");
-		vfree(info);
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	QETH_DBF_TEXT2(0, trace, "procread");
-	length += sprintf(buffer + length,
-			  "devices                  CHPID     "
-			  "device     cardtype port chksum prio-q'ing "
-			  "rtr fsz cnt\n");
-	length += sprintf(buffer + length,
-			  "-------------------------- --- ----"
-			  "------ -------------- --     -- ---------- "
-			  "--- --- ---\n");
-	card = firstcard;
-	while (card) {
-		strcpy(checksum_str,
-		       (card->options.checksum_type == SW_CHECKSUMMING) ? "SW" :
-		       (card->options.checksum_type == HW_CHECKSUMMING) ? "HW" :
-		       "no");
-		if (card->options.do_prio_queueing == NO_PRIO_QUEUEING) {
-			sprintf(queueing_str, "always_q_%i",
-				card->options.default_queue);
-		} else {
-			strcpy(queueing_str, (card->options.do_prio_queueing
-					      ==
-					      PRIO_QUEUEING_PREC) ? "by_prec." :
-			       "by_ToS");
-		}
-
-		/* FIXME: this is really a mess... */
-
-#ifdef QETH_IPV6
-		if (atomic_read(&card->rt4fld) || atomic_read(&card->rt6fld))
-			strcpy(router_str, "FLD");
-#else/* QETH_IPV6 */
-		if (atomic_read(&card->rt4fld))
-			strcpy(router_str, "FLD");
-#endif /* QETH_IPV6 */
-		else if (((card->options.routing_type4 & ROUTER_MASK) ==
-			  PRIMARY_ROUTER)
-#ifdef QETH_IPV6
-			 &&
-			 (((card->options.routing_type6 & ROUTER_MASK) ==
-			  PRIMARY_ROUTER) ||
-			  (!qeth_is_supported(IPA_IPv6)))
-#endif /* QETH_IPV6 */
-		    ) {
-			strcpy(router_str, "pri");
-		} else
-		    if (((card->options.routing_type4 & ROUTER_MASK) ==
-			 SECONDARY_ROUTER)
-#ifdef QETH_IPV6
-			&&
-			(((card->options.routing_type6 & ROUTER_MASK) ==
-			 SECONDARY_ROUTER) ||
-			 (!qeth_is_supported(IPA_IPv6)))
-#endif /* QETH_IPV6 */
-		    ) {
-			strcpy(router_str, "sec");
-		} else
-		    if (((card->options.routing_type4 & ROUTER_MASK) ==
-			 MULTICAST_ROUTER)
-#ifdef QETH_IPV6
-			&&
-			(((card->options.routing_type6 & ROUTER_MASK) ==
-			 MULTICAST_ROUTER) ||
-			 (!qeth_is_supported(IPA_IPv6)))
-#endif /* QETH_IPV6 */
-		    ) {
-			strcpy(router_str, "mc");
-		} else
-		    if (((card->options.routing_type4 & ROUTER_MASK) ==
-			 PRIMARY_CONNECTOR)
-#ifdef QETH_IPV6
-			&&
-			(((card->options.routing_type6 & ROUTER_MASK) ==
-			 PRIMARY_CONNECTOR) ||
-			 (!qeth_is_supported(IPA_IPv6)))
-#endif /* QETH_IPV6 */
-		    ) {
-			strcpy(router_str, "p.c");
-		} else
-		    if (((card->options.routing_type4 & ROUTER_MASK) ==
-			 SECONDARY_CONNECTOR)
-#ifdef QETH_IPV6
-			&&
-			(((card->options.routing_type6 & ROUTER_MASK) ==
-			 SECONDARY_CONNECTOR) ||
-			 (!qeth_is_supported(IPA_IPv6)))
-#endif /* QETH_IPV6 */
-		    ) {
-			strcpy(router_str, "s.c");
-		} else
-		    if (((card->options.routing_type4 & ROUTER_MASK) ==
-			 NO_ROUTER)
-#ifdef QETH_IPV6
-			&&
-			(((card->options.routing_type6 & ROUTER_MASK) ==
-			 NO_ROUTER) ||
-			 (!qeth_is_supported(IPA_IPv6)))
-#endif /* QETH_IPV6 */
-		    ) {
-			strcpy(router_str, "no");
-		} else {
-			strcpy(router_str, "mix");
-		}
-		strcpy(bufsize_str,
-		       (BUFFER_SIZE == 16384) ? "16k" :
-		       (BUFFER_SIZE == 24576) ? "24k" :
-		       (BUFFER_SIZE == 32768) ? "32k" :
-		       (BUFFER_SIZE == 40960) ? "40k" : "64k");
-
-		if (!atomic_read(&card->is_startlaned)) {
-			length += sprintf(buffer + length,
-					  "%s/%s/%s x%02X %10s %14s %2i"
-					  "  +++ CABLE PULLED +++\n",
-					  CARD_RDEV_ID(card),
-					  CARD_WDEV_ID(card),
-					  CARD_DDEV_ID(card),
-					  card->chpid,
-					  card->dev_name,
-					  qeth_get_cardname_short
-					  (card->type, card->link_type,
-					   card->is_guest_lan),
-					  card->options.portno);
-		} else {
-			length += sprintf(buffer + length,
-					  "%s/%s/%s x%02X %10s %14s %2i"
-					  "     %2s %10s %3s %3s %3i\n",
-					  CARD_RDEV_ID(card),
-					  CARD_WDEV_ID(card),
-					  CARD_DDEV_ID(card),
-					  card->chpid, card->dev_name,
-					  qeth_get_cardname_short
-					  (card->type, card->link_type,
-					   card->is_guest_lan),
-					  card->options.portno, checksum_str,
-					  queueing_str, router_str, bufsize_str,
-					  card->options.inbound_buffer_count);
-		}
-		card = card->next;
-	}
-
-out:
-	info->len = length;
-	/* unlock all the stuff */
-	read_unlock(&list_lock);
-	return rc;
-}
-
-#define _OUTP_IT(x...) c+=sprintf(buffer+c,x)
-
-#ifdef QETH_PERFORMANCE_STATS
-static int
-qeth_perf_procfile_read(char *buffer, char **buffer_location,
-			off_t offset, int buffer_length, int *eof, void *data)
-{
-	int c = 0;
-	struct qeth_card *card;
-	/* we are always called with buffer_length=4k, so we all
-	   deliver on the first read */
-	if (offset > 0)
-		return 0;
-
-	QETH_DBF_TEXT2(0, trace, "perfpfrd");
-
-	card = firstcard;
-
-	while (card) {
-		_OUTP_IT("For card with devnos %s/%s/%s (%s):\n",
-			 CARD_RDEV_ID(card),
-			 CARD_WDEV_ID(card),
-			 CARD_DDEV_ID(card), card->dev_name);
-		_OUTP_IT("  Skb's/buffers received                 : %i/%i\n",
-			 card->perf_stats.skbs_rec, card->perf_stats.bufs_rec);
-		_OUTP_IT("  Skb's/buffers sent                     : %i/%i\n",
-			 card->perf_stats.skbs_sent,
-			 card->perf_stats.bufs_sent);
-		_OUTP_IT("\n");
-		_OUTP_IT("  Skb's/buffers sent without packing     : %i/%i\n",
-			 card->perf_stats.skbs_sent_dont_pack,
-			 card->perf_stats.bufs_sent_dont_pack);
-		_OUTP_IT("  Skb's/buffers sent with packing        : %i/%i\n",
-			 card->perf_stats.skbs_sent_pack,
-			 card->perf_stats.bufs_sent_pack);
-		_OUTP_IT("\n");
-		_OUTP_IT("  Packing state changes no pkg.->packing : %i/%i\n",
-			 card->perf_stats.sc_dp_p, card->perf_stats.sc_p_dp);
-		_OUTP_IT("  Current buffer usage (outbound q's)    : "
-			 "%i/%i/%i/%i\n",
-			 atomic_read(&card->outbound_used_buffers[0]),
-			 atomic_read(&card->outbound_used_buffers[1]),
-			 atomic_read(&card->outbound_used_buffers[2]),
-			 atomic_read(&card->outbound_used_buffers[3]));
-		_OUTP_IT("\n");
-		_OUTP_IT("  Inbound time (in us)                   : %i\n",
-			 card->perf_stats.inbound_time);
-		_OUTP_IT("  Inbound cnt                            : %i\n",
-			 card->perf_stats.inbound_cnt);
-		_OUTP_IT("  Outbound time (in us, incl QDIO)       : %i\n",
-			 card->perf_stats.outbound_time);
-		_OUTP_IT("  Outbound cnt                           : %i\n",
-			 card->perf_stats.outbound_cnt);
-		_OUTP_IT("  Watermarks: L/H=%i/%i\n",
-			 LOW_WATERMARK_PACK, HIGH_WATERMARK_PACK);
-		_OUTP_IT("\n");
-
-		card = card->next;
-	}
-
-	return c;
-}
-
-static struct proc_dir_entry *qeth_perf_proc_file;
-
-#endif /* QETH_PERFORMANCE_STATS */
-
-static int
-qeth_ipato_procfile_open(struct inode *inode, struct file *file)
-{
-	char text[33];
-	struct ipato_entry *ipato_entry;
-	struct qeth_card *card;
-	struct qeth_vipa_entry *vipa_entry;
-	int rc = 0;
-	struct tempinfo *info;
-	int size;
-	char entry_type[5];
-
-	info = (struct tempinfo *) vmalloc(sizeof (struct tempinfo));
-	if (info == NULL) {
-		PRINT_WARN("No memory available for data\n");
-		return -ENOMEM;
-	} else {
-		file->private_data = (void *) info;
-	}
-	info->len = 0;
-
-	QETH_DBF_TEXT2(0, trace, "ipatorea");
-	/* lock all the stuff */
-	spin_lock(&ipato_list_lock);
-	read_lock(&list_lock);
-
-	size = 64;		/* for inv4/6 etc. */
-
-	ipato_entry = ipato_entries;
-	while (ipato_entry) {
-		ipato_entry = ipato_entry->next;
-		size += 64;
-	}
-	card = firstcard;
-	while (card) {
-		read_lock(&card->vipa_list_lock);
-		vipa_entry = card->vipa_list;
-		while (vipa_entry) {
-			vipa_entry = vipa_entry->next;
-			size += 64;
-		}
-		/*read_unlock(&card->vipa_list_lock); don't unlock it here */
-		card = card->next;
-	}
-	info->data = (char *) vmalloc(size);
-	if (info->data == NULL) {
-		PRINT_WARN("No memory available for data\n");
-		vfree(info);
-		rc = -ENOMEM;
-		goto out;
-	}
-#define _IOUTP_IT(x...) info->len+=sprintf(info->data+info->len,x)
-	if (ipato_inv4)
-		_IOUTP_IT("inv4\n");
-	ipato_entry = ipato_entries;
-	text[8] = 0;
-	while (ipato_entry) {
-		if (ipato_entry->version == 4) {
-			qeth_convert_addr_to_text(4, ipato_entry->addr, text);
-			_IOUTP_IT("add4 %s/%i%s%s\n", text,
-				  ipato_entry->mask_bits,
-				  ipato_entry->dev_name[0] ? ":" : "",
-				  ipato_entry->dev_name[0] ?
-				  ipato_entry->dev_name : "");
-		}
-		ipato_entry = ipato_entry->next;
-	}
-
-	if (ipato_inv6)
-		_IOUTP_IT("inv6\n");
-	ipato_entry = ipato_entries;
-	text[32] = 0;
-	while (ipato_entry) {
-		if (ipato_entry->version == 6) {
-			qeth_convert_addr_to_text(6, ipato_entry->addr, text);
-			_IOUTP_IT("add6 %s/%i%s%s\n", text,
-				  ipato_entry->mask_bits,
-				  ipato_entry->dev_name[0] ? ":" : "",
-				  ipato_entry->dev_name[0] ?
-				  ipato_entry->dev_name : "");
-		}
-		ipato_entry = ipato_entry->next;
-	}
-	card = firstcard;
-	while (card) {
-		vipa_entry = card->vipa_list;
-		while (vipa_entry) {
-			strcpy(entry_type, (vipa_entry->flag ==
-					    IPA_SETIP_VIPA_FLAGS) ?
-			       "vipa" : "rxip");
-			if (vipa_entry->version == 4) {
-				_IOUTP_IT("add_%s4 %02x%02x%02x%02x:%s\n",
-					  entry_type,
-					  vipa_entry->ip[0],
-					  vipa_entry->ip[1],
-					  vipa_entry->ip[2],
-					  vipa_entry->ip[3], card->dev_name);
-			} else {
-				_IOUTP_IT("add_%s6 %02x%02x%02x%02x"
-					  "%02x%02x%02x%02x"
-					  "%02x%02x%02x%02x"
-					  "%02x%02x%02x%02x:%s\n",
-					  entry_type,
-					  vipa_entry->ip[0],
-					  vipa_entry->ip[1],
-					  vipa_entry->ip[2],
-					  vipa_entry->ip[3],
-					  vipa_entry->ip[4],
-					  vipa_entry->ip[5],
-					  vipa_entry->ip[6],
-					  vipa_entry->ip[7],
-					  vipa_entry->ip[8],
-					  vipa_entry->ip[9],
-					  vipa_entry->ip[10],
-					  vipa_entry->ip[11],
-					  vipa_entry->ip[12],
-					  vipa_entry->ip[13],
-					  vipa_entry->ip[14],
-					  vipa_entry->ip[15], card->dev_name);
-			}
-			vipa_entry = vipa_entry->next;
-		}
-		card = card->next;
-	}
-out:
-	/* unlock all the stuff */
-	card = firstcard;
-	while (card) {
-		/*read_lock(&card->vipa_list_lock); don't lock it here */
-		read_unlock(&card->vipa_list_lock);
-		card = card->next;
-	}
-	read_unlock(&list_lock);
-	spin_unlock(&ipato_list_lock);
-
-	return rc;
-}
-
-static ssize_t
-qeth_procfile_read(struct file *file, char *user_buf,
-		   size_t user_len, loff_t * offset)
-{
-	loff_t len;
-	struct tempinfo *p_info = (struct tempinfo *) file->private_data;
-
-	if (*offset >= p_info->len) {
-		return 0;
-	} else {
-		len = __min(user_len, (p_info->len - *offset));
-		if (copy_to_user(user_buf, &(p_info->data[*offset]), len))
-			return -EFAULT;
-		(*offset) += len;
-		return len;
-	}
-}
-
-/* ATT: this is also the procfile release function for the ipato
- * procfs entry */
-static int
-qeth_procfile_release(struct inode *inode, struct file *file)
-{
-	struct tempinfo *p_info = (struct tempinfo *) file->private_data;
-
-	if (p_info) {
-		if (p_info->data)
-			vfree(p_info->data);
-		vfree(p_info);
-	}
-
-	return 0;
-}
-
-static ssize_t
-qeth_ipato_procfile_write(struct file *file,
-			  const char *user_buffer,
-			  size_t user_len, loff_t * offset)
-{
-	int add, version;
-	char text[33];
-	__u8 addr[16];
-	int len, i, flag;
-	int mask_bits;
-	char *buffer;
-	int dev_name_there;
-	char *dev_name_ptr;
-	struct qeth_card *card;
-#define BUFFER_LEN (10+32+1+5+1+DEV_NAME_LEN+1)
-
-	if (*offset > 0)
-		return user_len;
-	buffer =
-	    vmalloc(__max(__max(user_len + 1, BUFFER_LEN), QETH_DBF_MISC_LEN));
-
-	if (buffer == NULL)
-		return -ENOMEM;
-	/* BUFFER_LEN=command incl. blank+addr+slash+mask_bits+
-	 * colon+DEV_NAME_LEN+zero */
-	memset(buffer, 0, BUFFER_LEN);
-
-	if (copy_from_user(buffer, user_buffer, user_len)) {
-		vfree(buffer);
-		return -EFAULT;
-	}
-
-	QETH_DBF_TEXT2(0, trace, "ipatowri");
-	QETH_DBF_TEXT2(0, misc, buffer);
-	if (!strncmp(buffer, "inv4", 4)) {
-		ipato_inv4 = 1 - ipato_inv4;
-		goto out;
-	}
-	if (!strncmp(buffer, "inv6", 4)) {
-		ipato_inv6 = 1 - ipato_inv6;
-		goto out;
-	}
-	if ((!strncmp(buffer, "add4 ", 5)) ||
-	    (!strncmp(buffer, "add6 ", 5)) ||
-	    (!strncmp(buffer, "del4 ", 5)) || (!strncmp(buffer, "del6 ", 5))) {
-		text[8] = 0;
-		text[32] = 0;
-		add = !strncmp(buffer, "add", 3);
-		version = (buffer[3] == '4') ? 4 : 6;
-		len = (version == 4) ? 8 : 32;
-		strncpy(text, buffer + 5, len);
-		if (qeth_convert_text_to_addr(version, text, addr)) {
-			PRINT_ERR("error in parsing ipato information "
-				  "(addr)\n");
-			goto out;
-		}
-		strncpy(text, buffer + 5 + len + 1, 10);
-		/* we prepare mask_bits for qeth_getints */
-		dev_name_there = 0;
-		for (i = 5 + len + 1; i < BUFFER_LEN; i++) {
-			if (*(buffer + i) == '\n') {
-				*(buffer + i) = 0;
-				break;
-			}
-			if (*(buffer + i) == ':') {
-				*(buffer + i) = 0;	/* so that qeth_getint works */
-				dev_name_there = i;
-				break;
-			}
-			if (*(buffer + i) == 0)
-				break;
-		}
-		mask_bits = qeth_getint(buffer + 5 + len + 1, 0);
-		if ((mask_bits < 0)
-		    || (mask_bits > ((version == 4) ? 32 : 128))) {
-			PRINT_ERR("error in parsing ipato information "
-				  "(mask bits)\n");
-			goto out;
-		}
-		if (dev_name_there) {
-			dev_name_ptr = buffer + dev_name_there + 1;
-			/* wipe out the linefeed */
-			for (i = dev_name_there + 1;
-			     i < dev_name_there + 1 + DEV_NAME_LEN + 1; i++)
-				if (*(buffer + i) == '\n')
-					*(buffer + i) = 0;
-		} else
-			dev_name_ptr = NULL;
-
-		if (add)
-			qeth_add_ipato_entry(version, addr, mask_bits,
-					     dev_name_ptr);
-		else
-			qeth_del_ipato_entry(version, addr, mask_bits,
-					     dev_name_ptr);
-		goto out;
-	}
-	if ((!strncmp(buffer, "add_vipa4 ", 10)) ||
-	    (!strncmp(buffer, "add_rxip4 ", 10)) ||
-	    (!strncmp(buffer, "add_vipa6 ", 10)) ||
-	    (!strncmp(buffer, "add_rxip6 ", 10)) ||
-	    (!strncmp(buffer, "del_vipa4 ", 10)) ||
-	    (!strncmp(buffer, "del_rxip4 ", 10)) ||
-	    (!strncmp(buffer, "del_vipa6 ", 10)) ||
-	    (!strncmp(buffer, "del_rxip6 ", 10))) {
-		text[8] = 0;
-		text[32] = 0;
-		add = !strncmp(buffer, "add", 3);
-		flag =
-		    (!strncmp(buffer + 4, "vipa", 4)) ? IPA_SETIP_VIPA_FLAGS :
-		    IPA_SETIP_TAKEOVER_FLAGS;
-		version = (buffer[8] == '4') ? 4 : 6;
-		len = (version == 4) ? 8 : 32;
-		strncpy(text, buffer + 10, len);
-		if (qeth_convert_text_to_addr(version, text, addr)) {
-			PRINT_ERR("error in parsing vipa/rxip information "
-				  "(addr)\n");
-			goto out;
-		}
-		if (*(buffer + 10 + len) != ':') {
-			PRINT_ERR("error in parsing vipa/rxip information "
-				  "(no interface)\n");
-			goto out;
-		}
-		/* interface name is at buffer+10+len+1 */
-		/* wipe out the \n */
-		for (i = 10 + len + 1; i < 10 + len + 1 + DEV_NAME_LEN + 1; i++)
-			if (*(buffer + i) == '\n')
-				*(buffer + i) = 0;
-		card = qeth_get_card_by_name(buffer + 10 + len + 1);
-		if (!card) {
-			PRINT_ERR("error in parsing vipa/rxip information "
-				  "(unknown interface)\n");
-			goto out;
-		}
-		if (add)
-			i = qeth_add_vipa_entry(card, version, addr, flag);
-		else
-			i = qeth_del_vipa_entry(card, version, addr, flag);
-		if (!i)
-			qeth_start_softsetup_thread(card);
-		goto out;
-	}
-	PRINT_ERR("unknown ipato information command\n");
-out:
-	vfree(buffer);
-	*offset = *offset + user_len;
-#undef BUFFER_LEN
-	return user_len;
-}
-
-static int
-qeth_procfile_getinterfaces(unsigned long arg)
-{
-	struct qeth_card *card;
-
-	char parms[16];
-	char *buffer;
-	char *buffer_pointer;
-	__u32 version, valid_fields, qeth_version, number_of_devices, if_index;
-	__u32 data_size, data_len;
-	unsigned long ioctl_flags;
-	int result = 0;
-
-	/* the struct of version 0 is:
-	   typedef struct dev_list
-	   {
-	   char device_name[IFNAME_MAXLEN]; // OSA-Exp device name (e.g. eth0)
-	   __u32 if_index;                  // interface index from kernel
-	   __u32 flags;                    // device charateristics
-	   } __attribute__((packed)) DEV_LIST;
-
-	   typedef struct osaexp_dev_ver0
-	   {
-	   __u32 version;                // structure version
-	   __u32 valid_fields;           // bitmask of fields that are really filled
-	   __u32 qeth_version;           // qeth driver version
-	   __u32 number_of_devices;      // number of OSA Express devices
-	   struct dev_list devices[0]; // list of OSA Express devices
-	   } __attribute__((packed)) OSAEXP_DEV_VER0;
-	 */
-
-	version = 0;
-	valid_fields = 0;
-	qeth_version = 0;
-	number_of_devices = 0;
-
-	if (copy_from_user((void *) parms, (void *) arg, sizeof (parms)))
-		return -EFAULT;
-	memcpy(&data_size, parms, sizeof (__u32));
-
-	if (!(data_size > 0))
-		return -EFAULT;
-	if (data_size > IOCTL_MAX_TRANSFER_SIZE)
-		return -EFAULT;
-	if (!access_ok(VERIFY_WRITE, (void *) arg, data_size))
-		return -EFAULT;
-
-	read_lock(&list_lock);
-	card = firstcard;
-#define IOCTL_USER_STRUCT_SIZE (DEV_NAME_LEN*sizeof(char)) + \
-	sizeof(__u32) + sizeof(__u32)
-	while (card) {
-		if (card->type == QETH_CARD_TYPE_OSAE)
-			number_of_devices =
-			    number_of_devices + IOCTL_USER_STRUCT_SIZE;
-		card = card->next;
-	}
-#undef IOCTL_USER_STRUCT_SIZE
-	if ((number_of_devices + 4 * sizeof (__u32)) >= data_size) {
-		result = -ENOMEM;
-		goto out;
-	}
-
-	number_of_devices = 0;
-	card = firstcard;
-	buffer = (char *) vmalloc(data_size);
-	if (!buffer) {
-		result = -EFAULT;
-		goto out;
-	}
-	buffer_pointer = ((char *) (buffer)) + (4 * sizeof (__u32));
-	while (card) {
-		if ((card->type == QETH_CARD_TYPE_OSAE) &&
-		    (atomic_read(&card->is_hardsetup)) &&
-		    (atomic_read(&card->is_registered))) {
-
-			memcpy(buffer_pointer, card->dev_name, DEV_NAME_LEN);
-			buffer_pointer = buffer_pointer + DEV_NAME_LEN;
-			if_index = card->dev->ifindex;
-			memcpy(buffer_pointer, &if_index, sizeof (__u32));
-			buffer_pointer = buffer_pointer + sizeof (__u32);
-			memcpy(buffer_pointer, &ioctl_flags, sizeof (__u32));
-			buffer_pointer = buffer_pointer + sizeof (__u32);
-			number_of_devices = number_of_devices + 1;
-		}
-		card = card->next;
-	}
-
-	/* we copy the real size */
-	data_len = buffer_pointer - buffer;
-
-	buffer_pointer = buffer; 
-	/* copy the header information at the beginning of the buffer */
-	memcpy(buffer_pointer, &version, sizeof (__u32));
-	memcpy(((char *) buffer_pointer) + sizeof (__u32), &valid_fields,
-	       sizeof (__u32));
-	memcpy(((char *) buffer_pointer) + (2 * sizeof (__u32)), &qeth_version,
-	       sizeof (__u32));
-	memcpy(((char *) buffer_pointer) + (3 * sizeof (__u32)),
-	       &number_of_devices, sizeof (__u32));
-	if (copy_to_user((char *) arg, buffer, data_len))
-		result = -EFAULT;
-	vfree(buffer);
-out:
-	read_unlock(&list_lock);
-	return result;
-
-#undef PARMS_BUFFERLENGTH
-
-};
-
-static int
-qeth_procfile_interfacechanges(unsigned long arg)
-{
-	return qeth_sleepon_procfile();
-
-}
-
-static int
-qeth_procfile_ioctl(struct inode *inode, struct file *file,
-		    unsigned int cmd, unsigned long arg)
-{
-
-	int result;
-	if (!down_interruptible(&qeth_procfile_ioctl_lock)) {
-		switch (cmd) {
-			case QETH_IOCPROC_OSAEINTERFACES:
-				result = qeth_procfile_getinterfaces(arg);
-				break;
-			case QETH_IOCPROC_INTERFACECHANGES:
-				result = qeth_procfile_interfacechanges(arg);
-				break;
-			default:
-				result = -EOPNOTSUPP;
-		}
-		up(&qeth_procfile_ioctl_lock);
-	} else
-		result = -ERESTARTSYS;
-	return result;
-};
-
-static struct file_operations qeth_procfile_fops = {
-	.owner = THIS_MODULE,
-	.ioctl = qeth_procfile_ioctl,
-	.read = qeth_procfile_read,
-	.open = qeth_procfile_open,
-	.release = qeth_procfile_release,
-};
-
-static struct proc_dir_entry *qeth_proc_file;
-
-static struct file_operations qeth_ipato_procfile_fops = {
-	.owner = THIS_MODULE,
-	.read = qeth_procfile_read,	/* same as above! */
-	.write = qeth_ipato_procfile_write,
-	.open = qeth_ipato_procfile_open,
-	.release = qeth_procfile_release	/* same as above! */
-};
-
-static struct proc_dir_entry *qeth_ipato_proc_file;
-
-static inline void
-__qeth_add_procfs_perf(void)
-{
-#ifdef QETH_PERFORMANCE_STATS
-	proc_perf_file_registration = 0;
-	qeth_perf_proc_file = create_proc_entry(QETH_PERF_PROCFILE_NAME,
-						S_IFREG | 0444, &proc_root);
-	if (qeth_perf_proc_file) {
-		qeth_perf_proc_file->read_proc = &qeth_perf_procfile_read;
-	} else
-		proc_perf_file_registration = -1;
-
-	if (proc_perf_file_registration)
-		PRINT_WARN("was not able to register perf. proc-file (%i).\n",
-			   proc_perf_file_registration);
-#endif /* QETH_PERFORMANCE_STATS */
-}
-
-static void
-qeth_add_procfs_entries(void)
-{
-	proc_file_registration = 0;
-	qeth_proc_file = create_proc_entry(QETH_PROCFILE_NAME,
-					   S_IFREG | 0444, &proc_root);
-	if (qeth_proc_file) {
-		qeth_proc_file->proc_fops = &qeth_procfile_fops;
-		sema_init(&qeth_procfile_ioctl_sem,
-			  PROCFILE_SLEEP_SEM_MAX_VALUE);
-		sema_init(&qeth_procfile_ioctl_lock,
-			  PROCFILE_IOCTL_SEM_MAX_VALUE);
-	} else
-		proc_file_registration = -1;
-
-	if (proc_file_registration)
-		PRINT_WARN("was not able to register proc-file (%i).\n",
-			   proc_file_registration);
-	proc_ipato_file_registration = 0;
-	qeth_ipato_proc_file = create_proc_entry(QETH_IPA_PROCFILE_NAME,
-						 S_IFREG | 0644, &proc_root);
-	if (qeth_ipato_proc_file) {
-		qeth_ipato_proc_file->proc_fops = &qeth_ipato_procfile_fops;
-	} else
-		proc_ipato_file_registration = -1;
-
-	if (proc_ipato_file_registration)
-		PRINT_WARN("was not able to register ipato-proc-file (%i).\n",
-			   proc_ipato_file_registration);
-	__qeth_add_procfs_perf();
-}
-
-static void __exit
-qeth_remove_procfs_entries(void)
-{
-	if (!proc_file_registration)	/* means if it went ok earlier */
-		remove_proc_entry(QETH_PROCFILE_NAME, &proc_root);
-
-	if (!proc_ipato_file_registration)	/* means if it went ok earlier */
-		remove_proc_entry(QETH_IPA_PROCFILE_NAME, &proc_root);
-
-#ifdef QETH_PERFORMANCE_STATS
-	if (!proc_perf_file_registration)	/* means if it went ok earlier */
-		remove_proc_entry(QETH_PERF_PROCFILE_NAME, &proc_root);
-#endif /* QETH_PERFORMANCE_STATS */
-}
-
-static int
-qeth_register_dbf_views(void)
-{
-	qeth_dbf_setup = debug_register(QETH_DBF_SETUP_NAME,
-					QETH_DBF_SETUP_INDEX,
-					QETH_DBF_SETUP_NR_AREAS,
-					QETH_DBF_SETUP_LEN);
-	if (!qeth_dbf_setup)
-		return -ENOMEM;
-
-	debug_register_view(qeth_dbf_setup, &debug_hex_ascii_view);
-	debug_set_level(qeth_dbf_setup, QETH_DBF_SETUP_LEVEL);
-
-	qeth_dbf_misc = debug_register(QETH_DBF_MISC_NAME,
-				       QETH_DBF_MISC_INDEX,
-				       QETH_DBF_MISC_NR_AREAS,
-				       QETH_DBF_MISC_LEN);
-	if (!qeth_dbf_misc)
-		return -ENOMEM;
-
-	debug_register_view(qeth_dbf_misc, &debug_hex_ascii_view);
-	debug_set_level(qeth_dbf_misc, QETH_DBF_MISC_LEVEL);
-
-	qeth_dbf_data = debug_register(QETH_DBF_DATA_NAME,
-				       QETH_DBF_DATA_INDEX,
-				       QETH_DBF_DATA_NR_AREAS,
-				       QETH_DBF_DATA_LEN);
-	if (!qeth_dbf_data)
-		return -ENOMEM;
-
-	debug_register_view(qeth_dbf_data, &debug_hex_ascii_view);
-	debug_set_level(qeth_dbf_data, QETH_DBF_DATA_LEVEL);
-
-	qeth_dbf_control = debug_register(QETH_DBF_CONTROL_NAME,
-					  QETH_DBF_CONTROL_INDEX,
-					  QETH_DBF_CONTROL_NR_AREAS,
-					  QETH_DBF_CONTROL_LEN);
-	if (!qeth_dbf_control)
-		return -ENOMEM;
-
-	debug_register_view(qeth_dbf_control, &debug_hex_ascii_view);
-	debug_set_level(qeth_dbf_control, QETH_DBF_CONTROL_LEVEL);
-
-	qeth_dbf_sense = debug_register(QETH_DBF_SENSE_NAME,
-					QETH_DBF_SENSE_INDEX,
-					QETH_DBF_SENSE_NR_AREAS,
-					QETH_DBF_SENSE_LEN);
-	if (!qeth_dbf_sense)
-		return -ENOMEM;
-
-	debug_register_view(qeth_dbf_sense, &debug_hex_ascii_view);
-	debug_set_level(qeth_dbf_sense, QETH_DBF_SENSE_LEVEL);
-
-	qeth_dbf_qerr = debug_register(QETH_DBF_QERR_NAME,
-				       QETH_DBF_QERR_INDEX,
-				       QETH_DBF_QERR_NR_AREAS,
-				       QETH_DBF_QERR_LEN);
-	if (!qeth_dbf_qerr)
-		return -ENOMEM;
-
-	debug_register_view(qeth_dbf_qerr, &debug_hex_ascii_view);
-	debug_set_level(qeth_dbf_qerr, QETH_DBF_QERR_LEVEL);
-
-	qeth_dbf_trace = debug_register(QETH_DBF_TRACE_NAME,
-					QETH_DBF_TRACE_INDEX,
-					QETH_DBF_TRACE_NR_AREAS,
-					QETH_DBF_TRACE_LEN);
-	if (!qeth_dbf_trace)
-		return -ENOMEM;
-
-	debug_register_view(qeth_dbf_trace, &debug_hex_ascii_view);
-	debug_set_level(qeth_dbf_trace, QETH_DBF_TRACE_LEVEL);
-
-	return 0;
-}
-
-static void
-qeth_unregister_dbf_views(void)
-{
-	if (qeth_dbf_setup)
-		debug_unregister(qeth_dbf_setup);
-	if (qeth_dbf_qerr)
-		debug_unregister(qeth_dbf_qerr);
-	if (qeth_dbf_sense)
-		debug_unregister(qeth_dbf_sense);
-	if (qeth_dbf_misc)
-		debug_unregister(qeth_dbf_misc);
-	if (qeth_dbf_data)
-		debug_unregister(qeth_dbf_data);
-	if (qeth_dbf_control)
-		debug_unregister(qeth_dbf_control);
-	if (qeth_dbf_trace)
-		debug_unregister(qeth_dbf_trace);
-}
-
-#ifdef QETH_IPV6
-static int
-qeth_ipv6_init(void)
-{
-	qeth_old_arp_constructor = arp_tbl.constructor;
-	write_lock(&arp_tbl.lock);
-	arp_tbl.constructor = qeth_arp_constructor;
-	write_unlock(&arp_tbl.lock);
-
- 	/* generate the memory leak here - FIXME*/
- 	arp_direct_ops = (struct neigh_ops*)
- 		kmalloc(sizeof(struct neigh_ops), GFP_KERNEL);
- 	if (!arp_direct_ops)
- 		return -ENOMEM;
-
- 	memcpy(arp_direct_ops, &arp_direct_ops_template,
- 	       sizeof(struct neigh_ops));
- 	return 0;
-
-}
-
-static void
-qeth_ipv6_uninit(void)
-{
-	write_lock(&arp_tbl.lock);
-	arp_tbl.constructor = qeth_old_arp_constructor;
-	write_unlock(&arp_tbl.lock);
-}
-#endif /* QETH_IPV6 */
-
-static int
-qeth_get_internal_functions(void)
-{
-	struct net_device *dev;
-#ifdef CONFIG_NET_ETHERNET
-	dev = alloc_etherdev(0);
-	if (!dev) {
-		PRINT_ERR("Not enough memory for internal functions.\n");
-		return -ENOMEM;
-	}
-	qeth_my_eth_header = dev->hard_header;
-	qeth_my_eth_rebuild_header = dev->rebuild_header;
-	qeth_my_eth_header_cache = dev->hard_header_cache;
-	qeth_my_eth_header_cache_update = dev->header_cache_update;
-	free_netdev(dev);
-#endif
-#ifdef CONFIG_TR
-	dev = alloc_trdev(0);
-	if (!dev) {
-		PRINT_ERR("Not enough memory for internal functions.\n");
-		return -ENOMEM;
-	}
-	qeth_my_tr_header = dev->hard_header;
-	qeth_my_tr_rebuild_header = dev->rebuild_header;
-	free_netdev(dev);
-#endif
-	return 0;
-}
-
-static struct ccw_device_id qeth_ids[] = {
-      {CCW_DEVICE(0x1731, 0x01), driver_info:QETH_CARD_TYPE_OSAE},
-      {CCW_DEVICE(0x1731, 0x05), driver_info:QETH_CARD_TYPE_IQD},
-	{},
-};
-
-MODULE_DEVICE_TABLE(ccw, qeth_ids);
-
-static struct ccw_driver qeth_ccw_driver = {
-	.name = "qeth",
-	.ids = qeth_ids,
-	.probe = ccwgroup_probe_ccwdev,
-	.remove = ccwgroup_remove_ccwdev,
-};
-
-static struct device *qeth_root_dev;
-
-static struct ccwgroup_driver qeth_ccwgroup_driver;
-static ssize_t
-qeth_group_store(struct device_driver *drv, const char *buf, size_t count)
-{
-	const char *start, *end;
-	char bus_ids[3][BUS_ID_SIZE], *argv[3];
-	int i;
-
-	pr_debug("group_store %s\n", buf);
-	start = buf;
-	for (i = 0; i < 3; i++) {
-		static const char delim[] = { ',', ',', '\n' };
-		int len;
-
-		if (!(end = strchr(start, delim[i])))
-			return count;
-		len = min_t(ptrdiff_t, BUS_ID_SIZE, end - start);
-		strncpy(bus_ids[i], start, len);
-		bus_ids[i][len] = '\0';
-		start = end + 1;
-		argv[i] = bus_ids[i];
-	}
-	pr_debug("creating qeth group device from '%s', '%s' and '%s'\n",
-		 bus_ids[0], bus_ids[1], bus_ids[2]);
-	ccwgroup_create(qeth_root_dev, qeth_ccwgroup_driver.driver_id,
-			&qeth_ccw_driver, 3, argv);
-	return count;
-}
-
-static DRIVER_ATTR(group, 0200, 0, qeth_group_store);
-
-static ssize_t
-qeth_bufcnt_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	return sprintf(buf, "%i\n", card->options.inbound_buffer_count);
-}
-
-static ssize_t
-qeth_bufcnt_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	unsigned long cnt;
-	char *tmp;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_softsetup))
-		return -EPERM;
-
-	cnt = simple_strtoul(buf, &tmp, 16);
-	cnt = (cnt < BUFCNT_MIN) ? BUFCNT_MIN :
-		((cnt > BUFCNT_MAX) ? BUFCNT_MAX : cnt);
-	card->options.inbound_buffer_count = cnt;
-
-	return count;
-}
-
-static DEVICE_ATTR(bufcnt, 0644, qeth_bufcnt_show, qeth_bufcnt_store);
-
-static ssize_t
-qeth_portname_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-	char tmp[9];
-	int i;
-
-	if (!card)
-		return -EINVAL;
-
-	if (card->portname_required) {
-		sprintf(tmp, "%s", card->options.portname + 1);
-		for (i = 0; i < 8; i++)
-			tmp[i] = (char) _ebcasc[(__u8) tmp[i]];
-		tmp[8] = 0;
-		return sprintf(buf, "%s\n", tmp);
-	} else
-		return sprintf(buf, "%s\n", "no portname required");
-}
-
-static ssize_t
-qeth_portname_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	char *tmp;
-	int i;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	/* Remove trailing '\n'. */
-	tmp = strsep((char **) &buf, "\n");
-	if ((strlen(tmp) > 8) || (strlen(tmp) < 2))
-		return -EINVAL;
-
-	card->options.portname[0] = strlen(tmp);
-	/* for beauty reasons: */
-	for (i = 1; i < 9; i++)
-		card->options.portname[i] = ' ';
-	strcpy(card->options.portname + 1, tmp);
-	for (i = 1; i < 9; i++)
-		card->options.portname[i] =
-			_ascebc[(unsigned char)card->options.portname[i]];
-
-	return count;
-}
-
-static DEVICE_ATTR(portname, 0644, qeth_portname_show, qeth_portname_store);
-
-static ssize_t
-qeth_route4_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	if (atomic_read(&card->rt4fld))
-		return sprintf(buf, "%s\n", "FLD");
-
-	switch (card->options.routing_type4 & ROUTER_MASK) {
-	case PRIMARY_ROUTER:
-		return sprintf(buf, "%s\n", "primary router");
-	case SECONDARY_ROUTER:
-		return sprintf(buf, "%s\n", "secondary router");
-	case MULTICAST_ROUTER:
-		return sprintf(buf, "%s\n", "multicast router");
-	case PRIMARY_CONNECTOR:
-		return sprintf(buf, "%s\n", "primary connector");
-	case SECONDARY_CONNECTOR:
-		return sprintf(buf, "%s\n", "secondary connector");
-	default:
-		return sprintf(buf, "%s\n", "no");
-	}
-}
-
-static ssize_t
-qeth_route4_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	int cnt;
-	char *tmp;
-
-	if (!card)
-		return count;
-
-	/* Remove trailing '\n'. */
-	tmp = strsep((char **) &buf, "\n");
-	cnt = strlen(tmp);
-	if (!strncmp(tmp, "primary_router", cnt)) {
-		QETH_DBF_CARD2(0, trace, "pri4", card);
-		card->options.routing_type4 =
-			PRIMARY_ROUTER | RESET_ROUTING_FLAG;
-	} else if (!strncmp(tmp, "secondary_router", cnt)) {
-		QETH_DBF_CARD2(0, trace, "sec4", card);
-		card->options.routing_type4 =
-			SECONDARY_ROUTER | RESET_ROUTING_FLAG;
-	}  else if (!strncmp(tmp, "multicast_router", cnt)) {
-		QETH_DBF_CARD2(0, trace, "mcr4", card);
-		card->options.routing_type4 =
-			MULTICAST_ROUTER | RESET_ROUTING_FLAG;
-	} else if (!strncmp(tmp, "primary_connector", cnt)) {
-		QETH_DBF_CARD2(0, trace, "prc4", card);
-		card->options.routing_type4 =
-			PRIMARY_CONNECTOR | RESET_ROUTING_FLAG;
-	} else if (!strncmp(tmp, "secondary_connector", cnt)) {
-		QETH_DBF_CARD2(0, trace, "scc4", card);
-		card->options.routing_type4 =
-			SECONDARY_CONNECTOR | RESET_ROUTING_FLAG;
-	} else if (!strncmp(tmp, "no_router", cnt)) {
-		QETH_DBF_CARD2(0, trace, "nor4", card);
-		card->options.routing_type4 = NO_ROUTER | RESET_ROUTING_FLAG;
-	} else {
-		PRINT_WARN("unknown command input in route4 attribute\n");
-		return -EINVAL;
-	}
-	__qeth_correct_routing_status_v4(card);
-	atomic_set(&card->enable_routing_attempts4, QETH_ROUTING_ATTEMPTS);
-	if (atomic_read(&card->is_softsetup))
-		qeth_start_softsetup_thread(card);
-	return count;
-}
-
-static DEVICE_ATTR(route4, 0644, qeth_route4_show, qeth_route4_store);
-
-static ssize_t
-qeth_route6_show(struct device *dev, char *buf)
-{
-#ifdef QETH_IPV6
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	if (atomic_read(&card->rt6fld))
-		return sprintf(buf, "%s\n", "FLD");
-
-	if (!qeth_is_supported(IPA_IPv6))
-		return sprintf(buf, "%s\n", "n/a");
-
-	switch (card->options.routing_type6 & ROUTER_MASK) {
-	case PRIMARY_ROUTER:
-		return sprintf(buf, "%s\n", "primary router");
-	case SECONDARY_ROUTER:
-		return sprintf(buf, "%s\n", "secondary router");
-	case MULTICAST_ROUTER:
-		return sprintf(buf, "%s\n", "multicast router");
-	case PRIMARY_CONNECTOR:
-		return sprintf(buf, "%s\n", "primary connector");
-	case SECONDARY_CONNECTOR:
-		return sprintf(buf, "%s\n", "secondary connector");
-	default:
-		return sprintf(buf, "%s\n", "no");
-	}
-#endif /* QETH_IPV6 */
-	return sprintf(buf, "%s\n", "n/a");
-}
-
-static ssize_t
-qeth_route6_store(struct device *dev, const char *buf, size_t count)
-{
-#ifdef QETH_IPV6
-	struct qeth_card *card = dev->driver_data;
-	int cnt;
-	char *tmp;
-
-	if (!card)
-		return count;
-
-	/* Remove trailing '\n'. */
-	tmp = strsep((char **) &buf, "\n");
-	cnt = strlen(tmp);
-	if (!strncmp(tmp, "primary_router", cnt)) {
-		QETH_DBF_CARD2(0, trace, "pri6", card);
-		card->options.routing_type6 =
-			PRIMARY_ROUTER | RESET_ROUTING_FLAG;
-	} else if (!strncmp(tmp, "secondary_router", cnt)) {
-				QETH_DBF_TEXT2(0, trace, "sec6");
-		QETH_DBF_CARD2(0, trace, "sec6", card);
-		card->options.routing_type6 =
-			SECONDARY_ROUTER | RESET_ROUTING_FLAG;
-	}  else if (!strncmp(tmp, "multicast_router", cnt)) {
-		QETH_DBF_CARD2(0, trace, "mcr6", card);
-		card->options.routing_type6 =
-			MULTICAST_ROUTER | RESET_ROUTING_FLAG;
-	} else if (!strncmp(tmp, "primary_connector", cnt)) {
-		QETH_DBF_CARD2(0, trace, "prc6", card);
-		card->options.routing_type6 =
-			PRIMARY_CONNECTOR | RESET_ROUTING_FLAG;
-	} else if (!strncmp(tmp, "secondary_connector", cnt)) {
-		QETH_DBF_CARD2(0, trace, "scc6", card);
-		card->options.routing_type6 =
-			SECONDARY_CONNECTOR | RESET_ROUTING_FLAG;
-	} else if (!strncmp(tmp, "no_router", cnt)) {
-		QETH_DBF_CARD2(0, trace, "nor6", card);
-		card->options.routing_type6 = NO_ROUTER | RESET_ROUTING_FLAG;
-	} else {
-		PRINT_WARN("unknown command input in route6 attribute\n");
-		return -EINVAL;
-	}
-	__qeth_correct_routing_status_v6(card);
-	atomic_set(&card->enable_routing_attempts6, QETH_ROUTING_ATTEMPTS);
-	if (atomic_read(&card->is_softsetup))
-		qeth_start_softsetup_thread(card);
-	return count;
-#endif /* QETH_IPV6 */
-	return -EINVAL;
-}
-
-static DEVICE_ATTR(route6, 0644, qeth_route6_show, qeth_route6_store);
-
-
-static ssize_t
-qeth_checksum_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	switch (card->options.checksum_type) {
-	case SW_CHECKSUMMING:
-		return sprintf(buf, "%s\n", "sw");
-	case HW_CHECKSUMMING:
-		return sprintf(buf, "%s\n", "hw");
-	default:
-		return sprintf(buf, "%s\n", "no");
-	}
-}
-
-static ssize_t
-qeth_checksum_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	char *tmp;
-	int cnt;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	/* Remove trailing '\n'. */
-	tmp = strsep((char **) &buf, "\n");
-	cnt = strlen(tmp);
-	if (!strncmp(tmp, "sw_checksumming", cnt))
-		card->options.checksum_type = SW_CHECKSUMMING;
-	else if (!strncmp(tmp, "hw_checksumming", cnt))
-		card->options.checksum_type = HW_CHECKSUMMING;
-	else if (!strncmp(tmp, "no_checksumming", cnt))
-		card->options.checksum_type = NO_CHECKSUMMING;
-	else
-		PRINT_WARN("unknown checksumming type '%s'\n", tmp);
-
-	return count;
-}
-
-static DEVICE_ATTR(checksumming, 0644, qeth_checksum_show, qeth_checksum_store);
-
-static ssize_t
-qeth_prioq_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	switch (card->options.do_prio_queueing) {
-	case PRIO_QUEUEING_PREC:
-		return sprintf(buf, "%s\n", "by precedence");
-	case PRIO_QUEUEING_TOS:
-		return sprintf(buf, "%s\n", "by type of service");
-	default:
-		return sprintf(buf, "always queue %i\n",
-			       card->options.default_queue);
-	}
-}
-
-static ssize_t
-qeth_prioq_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	char *tmp;
-	int cnt;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	/* Remove trailing '\n'. */
-	tmp = strsep((char **) &buf, "\n");
-	cnt = strlen(tmp);
-	if (!strncmp(tmp, "prio_queueing_prec", cnt))
-		card->options.do_prio_queueing = PRIO_QUEUEING_PREC;
-	else if (!strncmp(tmp, "prio_queueing_tos", cnt))
-		card->options.do_prio_queueing = PRIO_QUEUEING_TOS;
-	else if (!strncmp(tmp, "no_prio_queueing:0", cnt)) {
-		card->options.do_prio_queueing = NO_PRIO_QUEUEING;
-		card->options.default_queue = 0;
-	} else if (!strncmp(tmp, "no_prio_queueing:1", cnt)) {
-		card->options.do_prio_queueing = NO_PRIO_QUEUEING;
-		card->options.default_queue = 1;
-	} else if (!strncmp(tmp, "no_prio_queueing:2", cnt)) {
-		card->options.do_prio_queueing = NO_PRIO_QUEUEING;
-		card->options.default_queue = 2;
-	} else if (!strncmp(tmp, "no_prio_queueing:3", cnt)) {
-		card->options.do_prio_queueing = NO_PRIO_QUEUEING;
-		card->options.default_queue = 3;
-	} else if (!strncmp(tmp, "no_prio_queueing", cnt)) {
-		card->options.do_prio_queueing = NO_PRIO_QUEUEING;
-		card->options.default_queue = QETH_DEFAULT_QUEUE;
-	} else
-		PRINT_WARN("unknown queueing type '%s'\n", tmp);
-
-	return count;
-}
-
-static DEVICE_ATTR(priority_queueing, 0644, qeth_prioq_show, qeth_prioq_store);
-
-static ssize_t
-qeth_portno_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	return sprintf(buf, "%i\n", card->options.portno);
-}
-
-static ssize_t
-qeth_portno_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	char *tmp;
-	int i;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	i = simple_strtoul(buf, &tmp, 16);
-	if ((i < 0) || (i > MAX_PORTNO)) {
-		PRINT_ERR("portno %i out of range\n", i);
-		return -EINVAL;
-	}
-	card->options.portno = i;
-
-	return count;
-}
-
-static DEVICE_ATTR(portno, 0644, qeth_portno_show, qeth_portno_store);
-
-static ssize_t
-qeth_polltime_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	return sprintf(buf, "%i\n", card->options.polltime);
-}
-
-static ssize_t
-qeth_polltime_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	char *tmp;
-	int i;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	i = simple_strtoul(buf, &tmp, 16);
-	if (i < 0) {
-		PRINT_ERR("polltime %i invalid\n", i);
-		return -EINVAL;
-	}
-	card->options.polltime = i;
-
-	return count;
-}
-
-static DEVICE_ATTR(polltime, 0644, qeth_polltime_show, qeth_polltime_store);
-
-static ssize_t
-qeth_hhlen_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	return sprintf(buf, "%i\n", card->options.add_hhlen);
-}
-
-static ssize_t
-qeth_hhlen_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	char *tmp;
-	int i;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	i = simple_strtoul(buf, &tmp, 16);
-	if ((i < 0) || (i > MAX_ADD_HHLEN)) {
-		PRINT_ERR("add_hhlen out of range\n");
-		return -EINVAL;
-	}
-	card->options.add_hhlen = i;
-
-	return count;
-}
-
-static DEVICE_ATTR(add_hhlen, 0644, qeth_hhlen_show, qeth_hhlen_store);
-
-static ssize_t
-qeth_takeover_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	return sprintf(buf, "%s\n",
-		       (card->options.ena_ipat == ENABLE_TAKEOVER)?"1":"0");
-}
-
-static ssize_t
-qeth_takeover_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	int i;
-	char *tmp;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	i = simple_strtoul(buf, &tmp, 16);
-	if (i == 1)
-		card->options.ena_ipat = ENABLE_TAKEOVER;
-	else if (i == 0)
-		card->options.ena_ipat = DISABLE_TAKEOVER;
-	else
-		return -EINVAL;
-
-	return count;
-}
-
-static DEVICE_ATTR(enable_takeover, 0644, qeth_takeover_show, qeth_takeover_store);
-
-static ssize_t
-qeth_macaddr_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	return sprintf(buf, "%s\n",
-		       (card->options.macaddr_mode == MACADDR_CANONICAL)?"1":"0");
-}
-
-static ssize_t
-qeth_macaddr_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	int i;
-	char *tmp;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	i = simple_strtoul(buf, &tmp, 16);
-	if (i == 0)
-		card->options.macaddr_mode = MACADDR_NONCANONICAL;
-	else if (i == 1)
-		card->options.macaddr_mode = MACADDR_CANONICAL;
-	else
-		return -EINVAL;
-
-	return count;
-}
-
-static DEVICE_ATTR(canonical_macaddr, 0644, qeth_macaddr_show, qeth_macaddr_store);
-
-static ssize_t
-qeth_fakebr_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	return sprintf(buf, "%s\n",
-		       (card->options.fake_broadcast == FAKE_BROADCAST)?"1":"0");
-}
-
-static ssize_t
-qeth_fakebr_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	int i;
-	char *tmp;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	i = simple_strtoul(buf, &tmp, 16);
-	if (i == 0)
-		card->options.fake_broadcast = DONT_FAKE_BROADCAST;
-	else if (i == 1)
-		card->options.fake_broadcast = FAKE_BROADCAST;
-	else
-		return -EINVAL;
-
-	return count;
-}
-
-static DEVICE_ATTR(fake_broadcast, 0644, qeth_fakebr_show, qeth_fakebr_store);
-
-static ssize_t
-qeth_fakell_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	return sprintf(buf, "%s\n",
-		       (card->options.fake_ll == FAKE_LL)?"1":"0");
-}
-
-static ssize_t
-qeth_fakell_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	int i;
-	char *tmp;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	i = simple_strtoul(buf, &tmp, 16);
-	if (i == 0)
-		card->options.fake_ll = DONT_FAKE_LL;
-	else if (i == 1)
-		card->options.fake_ll = FAKE_LL;
-	else
-		return -EINVAL;
-
-	return count;
-}
-
-static DEVICE_ATTR(fake_ll, 0644, qeth_fakell_show, qeth_fakell_store);
-
-static ssize_t
-qeth_broadcast_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	return sprintf(buf, "%s\n",
-		       (card->options.broadcast_mode == BROADCAST_ALLRINGS)
-		       ?"allrings":"local");
-}
-
-static ssize_t
-qeth_broadcast_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	char *tmp;
-	int cnt;
-
-	if (!card)
-		return count;
-
-	if (atomic_read(&card->is_hardsetup))
-		return -EPERM;
-
-	/* Remove trailing '\n'. */
-	tmp = strsep((char **) &buf, "\n");
-	cnt = strlen(tmp);
-	if (!strncmp(tmp, "broadcast_allrings", cnt))
-		card->options.broadcast_mode = BROADCAST_ALLRINGS;
-	else if (!strncmp(tmp, "broadcast_local", cnt))
-		card->options.broadcast_mode = BROADCAST_LOCAL;
-	else
-		PRINT_WARN("unknown broadcast type '%s'\n", tmp);
-
-	return count;
-}
-
-static DEVICE_ATTR(broadcast_mode, 0644, qeth_broadcast_show, qeth_broadcast_store);
-
-static ssize_t
-qeth_recover_store(struct device *dev, const char *buf, size_t count)
-{
-	struct qeth_card *card = dev->driver_data;
-	int i;
-	char *tmp;
-
-	if (!card)
-		return count;
-
-	i = simple_strtoul(buf, &tmp, 16);
-	if (i == 1) {
-		QETH_DBF_CARD2(0, trace, "UTRC", card);
-		atomic_set(&card->problem, PROBLEM_USER_TRIGGERED_RECOVERY);
-		qeth_schedule_recovery(card);
-		return count;
-	} else
-		return -EINVAL;
-}
-
-static DEVICE_ATTR(recover, 0200, 0, qeth_recover_store);
-
-static ssize_t
-qeth_card_type_show(struct device *dev, char *buf)
-{
-	struct qeth_card *card = dev->driver_data;
-
-	if (!card)
-		return -EINVAL;
-
-	if (!atomic_read(&card->is_softsetup))
-		return sprintf(buf, "n/a\n");
-
-	return sprintf(buf, "%s\n",
-		       qeth_get_cardname_short(card->type, card->link_type,
-					       card->is_guest_lan));
-}
-
-static DEVICE_ATTR(card_type, 0444, qeth_card_type_show, NULL);
-
-static struct attribute * qeth_attrs[] = {
-	&dev_attr_bufcnt.attr,
-	&dev_attr_portname.attr,
-	&dev_attr_route4.attr,
-	&dev_attr_route6.attr,
-	&dev_attr_checksumming.attr,
-	&dev_attr_priority_queueing.attr,
-	&dev_attr_portno.attr,
-	&dev_attr_polltime.attr,
-	&dev_attr_add_hhlen.attr,
-	&dev_attr_enable_takeover.attr,
-	&dev_attr_canonical_macaddr.attr,
-	&dev_attr_fake_broadcast.attr,
-	&dev_attr_fake_ll.attr,
-	&dev_attr_broadcast_mode.attr,
-	&dev_attr_recover.attr,
-	&dev_attr_card_type.attr,
-	NULL,
-};
-
-static struct attribute_group qeth_attr_group = {
-	.attrs = qeth_attrs,
-};
-
-static inline int
-__qeth_create_attributes(struct device *dev)
-{
-	return sysfs_create_group(&dev->kobj, &qeth_attr_group);
-}
-
-static inline void
-__qeth_remove_attributes(struct device *dev)
-{
-	sysfs_remove_group(&dev->kobj, &qeth_attr_group);
-}
-
-static int
-qeth_probe_device(struct ccwgroup_device *gdev)
-{
-	struct qeth_card *card;
-	int ret;
-
-	if (!get_device(&gdev->dev))
-		return -ENODEV;
-
-	card = qeth_alloc_card();
-	if (!card) {
-		put_device(&gdev->dev);
-		return -ENOMEM;
-	}
-
-	gdev->dev.driver_data = card;
-	card->gdev = gdev;
-
-	gdev->cdev[0]->handler = qeth_interrupt_handler_read;
-
-	gdev->cdev[1]->handler = qeth_interrupt_handler_write;
-
-	gdev->cdev[2]->handler = qeth_interrupt_handler_qdio;
-
-	ret = __qeth_create_attributes(&gdev->dev);
-	if (ret != 0)
-		goto out;
-
-	return 0;
-out:
-	put_device(&gdev->dev);
-	qeth_free_card(card);
-	return ret;
-}
-
-/* 
- * Replaces qeth_probe and qeth_attach_handler. 
- * This is called after piping to the 'online' attribute,
- * when all parameters are ready.
- */
-static int
-qeth_activate(struct qeth_card *card)
-{
-	int result;
-
-	ccw_device_set_online(CARD_RDEV(card));
-	ccw_device_set_online(CARD_WDEV(card));
-	ccw_device_set_online(CARD_DDEV(card));
-
-	QETH_DBF_CARD1(0, setup, "activ", card);
-	QETH_DBF_HEX1(0, setup, &card, sizeof (void *));
-	QETH_DBF_HEX1(0, setup, &card->dev, sizeof (void *));
-	QETH_DBF_HEX1(0, setup, &card->stats, sizeof (void *));
-
-	QETH_DBF_HEX2(0, misc, &card->options, QETH_DBF_MISC_LEN);
-
-	if (qeth_determine_card_type(card)) {
-		PRINT_WARN("%s: not a valid card type\n", __func__);
-		goto out;
-	}
-
-	qeth_insert_card_into_list(card);
-
-	qeth_correct_routing_status(card);
-
-	result = qeth_init_ringbuffers1(card);
-	if (result) {
-		PRINT_WARN("%s: could not init ringbuffers1\n", __func__);
-		goto out_remove;
-	}
-
-	result = qeth_hardsetup_card(card, 0);
-	if (result) {
-		goto out_remove;
-	}
-
-	result = qeth_init_ringbuffers2(card);
-	if (result) {
-		PRINT_WARN("%s: could not init ringbuffers2\n", __func__);
-		goto out_remove;
-	}
-
-	/* this was previously done in chandev_initnetdevice */
-	snprintf(card->dev->name, 8, "%s%%d",
-		 qeth_get_dev_basename(card->type, card->link_type));
-	if (qeth_init_netdev(card))
-		goto out_remove;
-
-	return 0;		/* success */
-
-out_remove:
-	qeth_remove_card(card, QETH_REMOVE_CARD_QUICK);
-	qeth_remove_card_from_list(card);
-
-out:
-	QETH_DBF_TEXT4(0, trace, "freecard");
-
-	ccw_device_set_offline(CARD_DDEV(card));
-	ccw_device_set_offline(CARD_WDEV(card));
-	ccw_device_set_offline(CARD_RDEV(card));
-
-	return -ENODEV;
-}
-
-static int
-qeth_set_online(struct ccwgroup_device *gdev)
-{
-	int rc;
-	struct qeth_card *card = gdev->dev.driver_data;
-
-	BUG_ON(!card);
-
-	rc = qeth_alloc_card_stuff(card);
-
-	return rc ? rc : qeth_activate(card);
-
-}
-
-static int
-qeth_set_offline(struct ccwgroup_device *gdev)
-{
-	struct qeth_card *card = gdev->dev.driver_data;
-
-	if (!card)
-		return -ENODEV;
-
-	qeth_remove_card(card, QETH_REMOVE_CARD_PROPER);
-	qeth_remove_card_from_list(card);
-
-	QETH_DBF_TEXT4(0, trace, "freecard");
-
-	ccw_device_set_offline(CARD_DDEV(card));
-	ccw_device_set_offline(CARD_WDEV(card));
-	ccw_device_set_offline(CARD_RDEV(card));
-
-	qeth_free_card_stuff(card);
-
-	return 0;
-}
-
-static void
-qeth_remove_device(struct ccwgroup_device *gdev)
-{
-	struct qeth_card *card = gdev->dev.driver_data;
-
-	if (card && qeth_does_card_exist(card))
-		/* Means that card is already in list. */
-		qeth_set_offline(gdev);
-	__qeth_remove_attributes(&gdev->dev);
-	gdev->dev.driver_data = NULL;
-	if (card)
-		qeth_free_card(card);
-	put_device(&gdev->dev);
-}
-
-static struct ccwgroup_driver qeth_ccwgroup_driver = {
-	.owner = THIS_MODULE,
-	.name = "qeth",
-	.driver_id = 0xD8C5E3C8,
-	.probe = qeth_probe_device,
-	.remove = qeth_remove_device,
-	.set_online = qeth_set_online,
-	.set_offline = qeth_set_offline,
-};
-
-static int __init
-qeth_init(void)
-{
-	int result;
-
-	qeth_eyecatcher();
-
-	printk(KERN_INFO "qeth: loading %s\n", version);
-
-	result = qeth_get_internal_functions();
-	if (result)
-		goto out;
-
-	qeth_alloc_spare_bufs();
-
-#ifdef QETH_IPV6
-	if (qeth_ipv6_init()) {
-		PRINT_ERR("Out of memory during ipv6 init.\n");
-		goto out_sparebufs;
-	}
-#endif /* QETH_IPV6 */
-
-	result = qeth_register_dbf_views();
-	if (result) {
-		PRINT_ERR("not enough memory for dbf. Will not load module.\n");
-		goto out_ipv6;
-	}
-
-	result = ccwgroup_driver_register(&qeth_ccwgroup_driver);
-	if (result)
-		goto out_dbf;
-
-	result = ccw_driver_register(&qeth_ccw_driver);
-	if (result)
-		goto out_gdrv;
-
-	result = driver_create_file(&qeth_ccwgroup_driver.driver,
-				    &driver_attr_group);
-	if (result)
-		goto out_cdrv;
-
-	qeth_root_dev = s390_root_dev_register("qeth");
-	if (IS_ERR(qeth_root_dev)) {
-		result = PTR_ERR(qeth_root_dev);
-		goto out_file;
-	}
-	qeth_register_notifiers();
-	qeth_add_procfs_entries();
-
-	return 0;
-
-out_file:
-	driver_remove_file(&qeth_ccwgroup_driver.driver, &driver_attr_group);
-out_cdrv:
-	ccw_driver_unregister(&qeth_ccw_driver);
-out_gdrv:
-	ccwgroup_driver_unregister(&qeth_ccwgroup_driver);
-out_dbf:
-	qeth_unregister_dbf_views();
-out_ipv6:
-#ifdef QETH_IPV6
-	qeth_ipv6_uninit();
-out_sparebufs:
-#endif /* QETH_IPV6 */
-	qeth_free_all_spare_bufs();
-out:
-	return result;
-}
-
-static void __exit
-qeth_exit(void)
-{
-#ifdef QETH_IPV6
-	qeth_ipv6_uninit();
-#endif /* QETH_IPV6 */
-	qeth_unregister_notifiers();
-
-	qeth_remove_procfs_entries();
-
-	QETH_DBF_TEXT1(0, trace, "cleanup.");
-
-	driver_remove_file(&qeth_ccwgroup_driver.driver, &driver_attr_group);
-	ccw_driver_unregister(&qeth_ccw_driver);
-	ccwgroup_driver_unregister(&qeth_ccwgroup_driver);
-	s390_root_dev_unregister(qeth_root_dev);
-
-	while (firstcard) {
-		struct qeth_card *card = firstcard;
-		qeth_remove_card(card, QETH_REMOVE_CARD_QUICK);
-		qeth_remove_card_from_list(card);
-		qeth_free_card(card);
-	}
-
-	qeth_free_all_spare_bufs();
-
-	qeth_unregister_dbf_views();
-
-	printk("qeth: %s: module removed\n", version);
-}
-
-EXPORT_SYMBOL(qeth_eyecatcher);
-
-module_init(qeth_init);
-module_exit(qeth_exit);
diff --git a/drivers/s390/net/qeth.h b/drivers/s390/net/qeth.h
index 728623f85f0c..1175c1e8deac 100644
--- a/drivers/s390/net/qeth.h
+++ b/drivers/s390/net/qeth.h
@@ -1,1147 +1,789 @@
-/*
- * linux/drivers/s390/net/qeth.h
- *
- * Linux on zSeries OSA Express and HiperSockets support
- *
- * Copyright 2000,2003 IBM Corporation
- * Author(s): Utz Bacher <utz.bacher@de.ibm.com>
- *
- */
 #ifndef __QETH_H__
 #define __QETH_H__
 
-#include <asm/qdio.h>
+#include <linux/if.h>
+#include <linux/if_arp.h>
+
+#include <linux/if_tr.h>
+#include <linux/trdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
 
-#define QETH_NAME " qeth"
+#include <net/ipv6.h>
+#include <linux/in6.h>
+#include <net/if_inet6.h>
+#include <net/addrconf.h>
 
-#define VERSION_QETH_H "$Revision: 1.60 $"
 
-/******************** CONFIG STUFF ***********************/
-//#define QETH_DBF_LIKE_HELL
+#include <asm/bitops.h>
+#include <asm/debug.h>
+#include <asm/qdio.h>
+#include <asm/ccwdev.h>
+#include <asm/ccwgroup.h>
+
+#include "qeth_mpc.h"
+
+#define VERSION_QETH_H 		"$Revision: 1.98 $"
 
 #ifdef CONFIG_QETH_IPV6
-#define QETH_IPV6
-#define QETH_VERSION_IPV6 ":IPv6"
+#define QETH_VERSION_IPV6 	":IPv6"
 #else
-#define QETH_VERSION_IPV6 ""
-#endif	/* CONFIG_QETH_IPV6 */
-
+#define QETH_VERSION_IPV6 	""
+#endif
 #ifdef CONFIG_QETH_VLAN
-#define QETH_VLAN
-#define QETH_VERSION_VLAN ":VLAN"
+#define QETH_VERSION_VLAN 	":VLAN"
 #else
-#define QETH_VERSION_VLAN ""
-#endif	/* CONFIG_QETH_VLAN */
-
-/* these values match CHECKSUM_* in include/linux/skbuff.h */
-#define SW_CHECKSUMMING 0
-#define HW_CHECKSUMMING 1
-#define NO_CHECKSUMMING 2
-
-#define QETH_CHECKSUM_DEFAULT NO_CHECKSUMMING
-
-#define QETH_PRIOQ_DEFAULT NO_PRIO_QUEUEING
-#define QETH_DEFAULT_QUEUE 2
-
-/******************** CONFIG STUFF END ***********************/
-/********************* TUNING STUFF **************************/
-#define HIGH_WATERMARK_PACK		5
-#define LOW_WATERMARK_PACK		2
-#define WATERMARK_FUZZ			2
-
-#define QETH_MAX_INPUT_THRESHOLD 500
-#define QETH_MAX_OUTPUT_THRESHOLD 300	/* ? */
-
-/* only the MAX values are used */
-#define QETH_MIN_INPUT_THRESHOLD 1
-#define QETH_MIN_OUTPUT_THRESHOLD 1
-
-#define QETH_REQUEUE_THRESHOLD (card->options.inbound_buffer_count/4)
-
-#ifdef CONFIG_QETH_PERF_STATS
-#define QETH_PERFORMANCE_STATS
-#endif	/* CONFIG_QETH_PERF_STATS */
-
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_VERBOSE_LEVEL 8
-#else /* QETH_DBF_LIKE_HELL */
-#define QETH_VERBOSE_LEVEL 5
-#endif /* QETH_DBF_LIKE_HELL */
-
-#define PCI_THRESHOLD_A (card->options.inbound_buffer_count+1)	
-/* buffers we have to be behind before we get a PCI */
-#define PCI_THRESHOLD_B 0	/* enqueued free buffers left before we get a PCI */
-#define PCI_TIMER_VALUE 3	/* not used, unless the microcode gets patched */
-
-#define DEFAULT_SPARE_BUFFERS 0
-#define MAX_SPARE_BUFFERS 1024
-#define SPAREBUF_MASK 65536
-#define MAX_PORTNO 15
-
-#define QETH_PROCFILE_NAME "qeth"
-#define QETH_PERF_PROCFILE_NAME "qeth_perf"
-#define QETH_IPA_PROCFILE_NAME "qeth_ipa_takeover"
-
-#define SEND_RETRIES_ALLOWED 5
-#define QETH_ROUTING_ATTEMPTS 2
-
-#define QETH_HARDSETUP_LAPS 5
-#define QETH_HARDSETUP_CLEAR_LAPS 3
-#define QETH_RECOVERY_HARDSETUP_RETRY 2
-
-/************************* DEBUG FACILITY STUFF *********************/
-
-#define QETH_DBF_HEX(ex,name,level,addr,len) \
-        do { \
-        if (ex) \
-                debug_exception(qeth_dbf_##name,level,(void*)addr,len); \
-        else \
-                debug_event(qeth_dbf_##name,level,(void*)addr,len); \
-        } while (0)
-#define QETH_DBF_TEXT(ex,name,level,text) \
-        do { \
-        if (ex) \
-                debug_text_exception(qeth_dbf_##name,level,text); \
-        else \
-                debug_text_event(qeth_dbf_##name,level,text); \
-        } while (0)
-
-#define QETH_DBF_CARD(ex,name,level,text,card) \
-	do { \
-		QETH_DBF_TEXT(ex,name,level,text); \
-		QETH_DBF_TEXT(ex,name,level,card->gdev->dev.bus_id); \
-	} while (0)
-
-#define QETH_DBF_HEX0(ex,name,addr,len) QETH_DBF_HEX(ex,name,0,addr,len)
-#define QETH_DBF_HEX1(ex,name,addr,len) QETH_DBF_HEX(ex,name,1,addr,len)
-#define QETH_DBF_HEX2(ex,name,addr,len) QETH_DBF_HEX(ex,name,2,addr,len)
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_HEX3(ex,name,addr,len) QETH_DBF_HEX(ex,name,3,addr,len)
-#define QETH_DBF_HEX4(ex,name,addr,len) QETH_DBF_HEX(ex,name,4,addr,len)
-#define QETH_DBF_HEX5(ex,name,addr,len) QETH_DBF_HEX(ex,name,5,addr,len)
-#define QETH_DBF_HEX6(ex,name,addr,len) QETH_DBF_HEX(ex,name,6,addr,len)
-#else /* QETH_DBF_LIKE_HELL */
-#define QETH_DBF_HEX3(ex,name,addr,len) do {} while (0)
-#define QETH_DBF_HEX4(ex,name,addr,len) do {} while (0)
-#define QETH_DBF_HEX5(ex,name,addr,len) do {} while (0)
-#define QETH_DBF_HEX6(ex,name,addr,len) do {} while (0)
-#endif /* QETH_DBF_LIKE_HELL */
-
-#define QETH_DBF_TEXT0(ex,name,text) QETH_DBF_TEXT(ex,name,0,text)
-#define QETH_DBF_TEXT1(ex,name,text) QETH_DBF_TEXT(ex,name,1,text)
-#define QETH_DBF_TEXT2(ex,name,text) QETH_DBF_TEXT(ex,name,2,text)
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_TEXT3(ex,name,text) QETH_DBF_TEXT(ex,name,3,text)
-#define QETH_DBF_TEXT4(ex,name,text) QETH_DBF_TEXT(ex,name,4,text)
-#define QETH_DBF_TEXT5(ex,name,text) QETH_DBF_TEXT(ex,name,5,text)
-#define QETH_DBF_TEXT6(ex,name,text) QETH_DBF_TEXT(ex,name,6,text)
-#else /* QETH_DBF_LIKE_HELL */
-#define QETH_DBF_TEXT3(ex,name,text) do {} while (0)
-#define QETH_DBF_TEXT4(ex,name,text) do {} while (0)
-#define QETH_DBF_TEXT5(ex,name,text) do {} while (0)
-#define QETH_DBF_TEXT6(ex,name,text) do {} while (0)
-#endif /* QETH_DBF_LIKE_HELL */
-
-#define QETH_DBF_CARD0(ex,name,text,card) QETH_DBF_CARD(ex,name,0,text,card)
-#define QETH_DBF_CARD1(ex,name,text,card) QETH_DBF_CARD(ex,name,1,text,card)
-#define QETH_DBF_CARD2(ex,name,text,card) QETH_DBF_CARD(ex,name,2,text,card)
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_CARD3(ex,name,text,card) QETH_DBF_CARD(ex,name,3,text,card)
-#define QETH_DBF_CARD4(ex,name,text,card) QETH_DBF_CARD(ex,name,4,text,card)
-#define QETH_DBF_CARD5(ex,name,text,card) QETH_DBF_CARD(ex,name,5,text,card)
-#define QETH_DBF_CARD6(ex,name,text,card) QETH_DBF_CARD(ex,name,6,text,card)
-#else /* QETH_DBF_LIKE_HELL */
-#define QETH_DBF_CARD3(ex,name,text,card) do {} while (0)
-#define QETH_DBF_CARD4(ex,name,text,card) do {} while (0)
-#define QETH_DBF_CARD5(ex,name,text,card) do {} while (0)
-#define QETH_DBF_CARD6(ex,name,text,card) do {} while (0)
-#endif /* QETH_DBF_LIKE_HELL */
+#define QETH_VERSION_VLAN 	""
+#endif
 
+/**
+ * Debug Facility stuff
+ */
 #define QETH_DBF_SETUP_NAME "qeth_setup"
 #define QETH_DBF_SETUP_LEN 8
 #define QETH_DBF_SETUP_INDEX 3
 #define QETH_DBF_SETUP_NR_AREAS 1
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_SETUP_LEVEL 6
-#else /* QETH_DBF_LIKE_HELL */
 #define QETH_DBF_SETUP_LEVEL 3
-#endif /* QETH_DBF_LIKE_HELL */
 
 #define QETH_DBF_MISC_NAME "qeth_misc"
 #define QETH_DBF_MISC_LEN 128
 #define QETH_DBF_MISC_INDEX 1
 #define QETH_DBF_MISC_NR_AREAS 1
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_MISC_LEVEL 6
-#else /* QETH_DBF_LIKE_HELL */
 #define QETH_DBF_MISC_LEVEL 2
-#endif /* QETH_DBF_LIKE_HELL */
 
 #define QETH_DBF_DATA_NAME "qeth_data"
 #define QETH_DBF_DATA_LEN 96
 #define QETH_DBF_DATA_INDEX 3
 #define QETH_DBF_DATA_NR_AREAS 1
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_DATA_LEVEL 6
-#else /* QETH_DBF_LIKE_HELL */
 #define QETH_DBF_DATA_LEVEL 2
-#endif /* QETH_DBF_LIKE_HELL */
 
 #define QETH_DBF_CONTROL_NAME "qeth_control"
-/* buffers are 255 bytes long, but no prob */
 #define QETH_DBF_CONTROL_LEN 256
 #define QETH_DBF_CONTROL_INDEX 3
 #define QETH_DBF_CONTROL_NR_AREAS 2
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_CONTROL_LEVEL 6
-#else /* QETH_DBF_LIKE_HELL */
 #define QETH_DBF_CONTROL_LEVEL 2
-#endif /* QETH_DBF_LIKE_HELL */
 
 #define QETH_DBF_TRACE_NAME "qeth_trace"
 #define QETH_DBF_TRACE_LEN 8
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_TRACE_INDEX 3
-#else /* QETH_DBF_LIKE_HELL */
 #define QETH_DBF_TRACE_INDEX 2
-#endif /* QETH_DBF_LIKE_HELL */
 #define QETH_DBF_TRACE_NR_AREAS 2
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_TRACE_LEVEL 6
-#else /* QETH_DBF_LIKE_HELL */
-#define QETH_DBF_TRACE_LEVEL 2
-#endif /* QETH_DBF_LIKE_HELL */
+#define QETH_DBF_TRACE_LEVEL 3
 
 #define QETH_DBF_SENSE_NAME "qeth_sense"
 #define QETH_DBF_SENSE_LEN 64
 #define QETH_DBF_SENSE_INDEX 1
 #define QETH_DBF_SENSE_NR_AREAS 1
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_SENSE_LEVEL 6
-#else /* QETH_DBF_LIKE_HELL */
 #define QETH_DBF_SENSE_LEVEL 2
-#endif /* QETH_DBF_LIKE_HELL */
 
 #define QETH_DBF_QERR_NAME "qeth_qerr"
 #define QETH_DBF_QERR_LEN 8
 #define QETH_DBF_QERR_INDEX 1
 #define QETH_DBF_QERR_NR_AREAS 2
-#ifdef QETH_DBF_LIKE_HELL
-#define QETH_DBF_QERR_LEVEL 6
-#else /* QETH_DBF_LIKE_HELL */
 #define QETH_DBF_QERR_LEVEL 2
-#endif /* QETH_DBF_LIKE_HELL */
-/****************** END OF DEBUG FACILITY STUFF *********************/
-
-/********************* CARD DATA STUFF **************************/
 
-#define QETH_MAX_PARAMS 150
-
-#define QETH_CARD_TYPE_UNKNOWN	0
-#define QETH_CARD_TYPE_OSAE	10
-#define QETH_CARD_TYPE_IQD	1234
+#define QETH_DBF_TEXT(name,level,text) \
+	do { \
+		debug_text_event(qeth_dbf_##name,level,text); \
+	} while (0)
 
-#define QETH_IDX_FUNC_LEVEL_OSAE_ENA_IPAT 0x0101
-#define QETH_IDX_FUNC_LEVEL_OSAE_DIS_IPAT 0x0101
-#define QETH_IDX_FUNC_LEVEL_IQD_ENA_IPAT 0x4108
-#define QETH_IDX_FUNC_LEVEL_IQD_DIS_IPAT 0x5108
+#define QETH_DBF_HEX(name,level,addr,len) \
+	do { \
+		debug_event(qeth_dbf_##name,level,(void*)(addr),len); \
+	} while (0)
 
-#define QETH_MAX_QUEUES 4
+#define QETH_DBF_TEXT_(name,level,text...)				  \
+	do {								  \
+		sprintf(qeth_dbf_text_buf, text);			  \
+		debug_text_event(qeth_dbf_##name,level,qeth_dbf_text_buf);\
+	} while (0)
 
-#define UNIQUE_ID_IF_CREATE_ADDR_FAILED 0xfffe
-#define UNIQUE_ID_NOT_BY_CARD 0x10000
+#define QETH_DBF_SPRINTF(name,level,text...) \
+	do { \
+		/* one record per call; 'name' is ignored, target is qeth_dbf_trace */ \
+		debug_sprintf_event(qeth_dbf_trace, level, ##text ); \
+	} while (0)
 
-/* 
- * CU type & model, Dev type & model, card_type, odd_even_restriction,
- * func level, no of queues, multicast is different (multicast-queue_no + 0x100)
+/**
+ * some more debug stuff
  */
-#define QETH_MODELLIST_ARRAY \
-	{{0x1731,0x01,0x1732,0x01,QETH_CARD_TYPE_OSAE,1, \
-	  QETH_IDX_FUNC_LEVEL_OSAE_ENA_IPAT, \
-	  QETH_IDX_FUNC_LEVEL_OSAE_DIS_IPAT, \
-	  QETH_MAX_QUEUES,0}, \
-	 {0x1731,0x05,0x1732,0x05,QETH_CARD_TYPE_IQD,0, \
-	  QETH_IDX_FUNC_LEVEL_IQD_ENA_IPAT, \
-	  QETH_IDX_FUNC_LEVEL_IQD_DIS_IPAT, \
-	  QETH_MAX_QUEUES,0x103}, \
-	 {0,0,0,0,0,0,0,0,0}}
-
-#define QETH_MPC_DIFINFO_LEN_INDICATES_LINK_TYPE 0x18
- /* only the first two bytes are looked at in qeth_get_cardname_short */
-#define QETH_MPC_LINK_TYPE_FAST_ETHERNET 0x01
-#define QETH_MPC_LINK_TYPE_HSTR 0x02
-#define QETH_MPC_LINK_TYPE_GIGABIT_ETHERNET 0x03
-#define QETH_MPC_LINK_TYPE_10GIGABIT_ETHERNET 0x10
-#define QETH_MPC_LINK_TYPE_LANE_ETH100 0x81
-#define QETH_MPC_LINK_TYPE_LANE_TR 0x82
-#define QETH_MPC_LINK_TYPE_LANE_ETH1000 0x83
-#define QETH_MPC_LINK_TYPE_LANE 0x88
-#define QETH_MPC_LINK_TYPE_ATM_NATIVE 0x90
-
-#define DEFAULT_ADD_HHLEN 0
-#define MAX_ADD_HHLEN 1024
-
-#define QETH_HEADER_SIZE	32
-#define QETH_IP_HEADER_SIZE	40
-#define QETH_HEADER_LEN_POS	8
-/* flags for the header: */
-#define QETH_HEADER_PASSTHRU	0x10
-#define QETH_HEADER_IPV6	0x80
-
-#define QETH_ETH_MAC_V4      0x0100 /* like v4 */
-#define QETH_ETH_MAC_V6      0x3333 /* like v6 */
-/* tr mc mac is longer, but that will be enough to detect mc frames */
-#define QETH_TR_MAC_NC       0xc000 /* non-canonical */
-#define QETH_TR_MAC_C        0x0300 /* canonical */
-
-#define QETH_CAST_FLAGS		0x07
-#define QETH_CAST_UNICAST	6
-#define QETH_CAST_MULTICAST	4
-#define QETH_CAST_BROADCAST	5
-#define QETH_CAST_ANYCAST	7
-#define QETH_CAST_NOCAST	0
-
-/* VLAN defines */
-#define QETH_EXT_HEADER_VLAN_FRAME	  0x01
-#define QETH_EXT_HEADER_TOKEN_ID	  0x02
-#define QETH_EXT_HEADER_INCLUDE_VLAN_TAG  0x04
+#define PRINTK_HEADER 	"qeth: "
 
-#define QETH_EXT_HEADER_SRC_MAC_ADDRESS   0x08
-#define QETH_EXT_HEADER_CSUM_HDR_REQ      0x10
-#define QETH_EXT_HEADER_CSUM_TRANSP_REQ   0x20
-#define QETH_EXT_HEADER_CSUM_TRANSP_FRAME_TYPE   0x40
-
-#define QETH_UDP_CSUM_OFFSET	6
-#define QETH_TCP_CSUM_OFFSET	16
-
-#define QETH_VERIFY_IS_REAL_DEV               1
-#define QETH_VERIFY_IS_VLAN_DEV               2
+#define HEXDUMP16(importance,header,ptr) do { /* dumps 32 bytes, 16 per line */ \
+PRINT_##importance(header "%02x %02x %02x %02x  %02x %02x %02x %02x  " \
+		   "%02x %02x %02x %02x  %02x %02x %02x %02x\n", \
+		   *(((char*)(ptr))),*(((char*)(ptr))+1),*(((char*)(ptr))+2), \
+		   *(((char*)(ptr))+3),*(((char*)(ptr))+4),*(((char*)(ptr))+5), \
+		   *(((char*)(ptr))+6),*(((char*)(ptr))+7),*(((char*)(ptr))+8), \
+		   *(((char*)(ptr))+9),*(((char*)(ptr))+10),*(((char*)(ptr))+11), \
+		   *(((char*)(ptr))+12),*(((char*)(ptr))+13), \
+		   *(((char*)(ptr))+14),*(((char*)(ptr))+15)); \
+PRINT_##importance(header "%02x %02x %02x %02x  %02x %02x %02x %02x  " \
+		   "%02x %02x %02x %02x  %02x %02x %02x %02x\n", \
+		   *(((char*)(ptr))+16),*(((char*)(ptr))+17), \
+		   *(((char*)(ptr))+18),*(((char*)(ptr))+19), \
+		   *(((char*)(ptr))+20),*(((char*)(ptr))+21), \
+		   *(((char*)(ptr))+22),*(((char*)(ptr))+23), \
+		   *(((char*)(ptr))+24),*(((char*)(ptr))+25), \
+		   *(((char*)(ptr))+26),*(((char*)(ptr))+27), \
+		   *(((char*)(ptr))+28),*(((char*)(ptr))+29), \
+		   *(((char*)(ptr))+30),*(((char*)(ptr))+31)); } while (0)
 
-inline static unsigned int
-qeth_get_ipa_timeout(int cardtype)
+static inline void
+qeth_hex_dump(unsigned char *buf, size_t len)
 {
-	switch (cardtype) {
-	case QETH_CARD_TYPE_IQD:
-		return 2000;
-	default:
-		return 20000;
-	}
-}
+	size_t i;
 
-inline static unsigned short
-qeth_get_additional_dev_flags(int cardtype)
-{
-	switch (cardtype) {
-	case QETH_CARD_TYPE_IQD:
-		return IFF_NOARP;
-#ifdef QETH_IPV6
-	default:
-		return 0;
-#else /* QETH_IPV6 */
-	default:
-		return IFF_NOARP;
-#endif /* QETH_IPV6 */
+	for (i = 0; i < len; i++) {
+		if (i && !(i % 16))
+			printk("\n");
+		printk("%02x ", *(buf + i));
 	}
+	printk("\n");
 }
 
-inline static int
-qeth_get_hlen(__u8 link_type)
-{
-#ifdef QETH_IPV6
-	switch (link_type) {
-	case QETH_MPC_LINK_TYPE_HSTR:
-	case QETH_MPC_LINK_TYPE_LANE_TR:
-		return QETH_HEADER_SIZE + TR_HLEN;
-	default:
-#ifdef QETH_VLAN
-		return QETH_HEADER_SIZE + VLAN_ETH_HLEN;
-#else
-		return QETH_HEADER_SIZE + ETH_HLEN;
-#endif
-	}
-#else /* QETH_IPV6 */
-#ifdef QETH_VLAN
-	return QETH_HEADER_SIZE + VLAN_HLEN;
-#else
-	return QETH_HEADER_SIZE;
-#endif
-
-#endif /* QETH_IPV6 */
-}
+#define SENSE_COMMAND_REJECT_BYTE 0
+#define SENSE_COMMAND_REJECT_FLAG 0x80
+#define SENSE_RESETTING_EVENT_BYTE 1
+#define SENSE_RESETTING_EVENT_FLAG 0x80
 
-static int (*qeth_my_eth_header) (struct sk_buff *, struct net_device *,
-				  unsigned short, void *, void *, unsigned);
-#ifdef CONFIG_TR
-static int (*qeth_my_tr_header) (struct sk_buff *, struct net_device *,
-				 unsigned short, void *, void *, unsigned);
-#endif /* CONFIG_TR */
-static int (*qeth_my_eth_rebuild_header) (struct sk_buff *);
-#ifdef CONFIG_TR
-static int (*qeth_my_tr_rebuild_header) (struct sk_buff *);
-#endif /* CONFIG_TR */
-static int (*qeth_my_eth_header_cache) (struct neighbour *, struct hh_cache *);
-static void (*qeth_my_eth_header_cache_update) (struct hh_cache *,
-						struct net_device *,
-						unsigned char *);
-
-#ifdef QETH_IPV6
-typedef int (*__qeth_temp1) (struct sk_buff *, struct net_device *,
-			     unsigned short, void *, void *, unsigned);
-inline static __qeth_temp1
-qeth_get_hard_header(__u8 link_type)
-{
-	switch (link_type) {
-#ifdef CONFIG_TR
-	case QETH_MPC_LINK_TYPE_HSTR:
-	case QETH_MPC_LINK_TYPE_LANE_TR:
-		return qeth_my_tr_header;
-#endif /* CONFIG_TR */
-	default:
-		return qeth_my_eth_header;
-	}
-}
+/*
+ * Common IO related definitions
+ */
+extern struct device *qeth_root_dev;
+extern struct ccw_driver qeth_ccw_driver;
+extern struct ccwgroup_driver qeth_ccwgroup_driver;
 
-typedef int (*__qeth_temp2) (struct sk_buff *);
-inline static __qeth_temp2
-qeth_get_rebuild_header(__u8 link_type)
-{
-	switch (link_type) {
-#ifdef CONFIG_TR
-	case QETH_MPC_LINK_TYPE_HSTR:
-	case QETH_MPC_LINK_TYPE_LANE_TR:
-		return qeth_my_tr_rebuild_header;
-#endif /* CONFIG_TR */
-	default:
-		return qeth_my_eth_rebuild_header;
-	}
-}
+#define CARD_RDEV(card) ((card)->read.ccwdev)	/* read channel ccw device */
+#define CARD_WDEV(card) ((card)->write.ccwdev)	/* write channel ccw device */
+#define CARD_DDEV(card) ((card)->data.ccwdev)	/* data channel ccw device */
+#define CARD_BUS_ID(card) ((card)->gdev->dev.bus_id)
+#define CARD_RDEV_ID(card) ((card)->read.ccwdev->dev.bus_id)
+#define CARD_WDEV_ID(card) ((card)->write.ccwdev->dev.bus_id)
+#define CARD_DDEV_ID(card) ((card)->data.ccwdev->dev.bus_id)
+#define CHANNEL_ID(channel) ((channel)->ccwdev->dev.bus_id)
 
-typedef int (*__qeth_temp3) (struct neighbour *, struct hh_cache *);
-inline static __qeth_temp3
-qeth_get_hard_header_cache(__u8 link_type)
-{
-	switch (link_type) {
-	case QETH_MPC_LINK_TYPE_HSTR:
-	case QETH_MPC_LINK_TYPE_LANE_TR:
-		return NULL;
-	default:
-		return qeth_my_eth_header_cache;
-	}
-}
+#define CARD_FROM_CDEV(cdev) ((struct qeth_card *) \
+		((struct ccwgroup_device *)(cdev)->dev.driver_data)\
+		->dev.driver_data) /* pure expression: caller supplies the ';' */
 
-typedef void (*__qeth_temp4) (struct hh_cache *, struct net_device *,
-			      unsigned char *);
-inline static __qeth_temp4
-qeth_get_header_cache_update(__u8 link_type)
-{
-	switch (link_type) {
-	case QETH_MPC_LINK_TYPE_HSTR:
-	case QETH_MPC_LINK_TYPE_LANE_TR:
-		return NULL;
-	default:
-		return qeth_my_eth_header_cache_update;
-	}
-}
+/**
+ * card stuff
+ */
+#ifdef CONFIG_QETH_PERF_STATS
+struct qeth_perf_stats {
+	unsigned int bufs_rec;
+	unsigned int bufs_sent;
 
-static unsigned short
-qeth_eth_type_trans(struct sk_buff *skb, struct net_device *dev)
-{
-	struct ethhdr *eth;
-
-	skb->mac.raw = skb->data;
-	skb_pull(skb, ETH_ALEN * 2 + sizeof (short));
-	eth = skb->mac.ethernet;
-
-	if (*eth->h_dest & 1) {
-		if (memcmp(eth->h_dest, dev->broadcast, ETH_ALEN) == 0)
-			skb->pkt_type = PACKET_BROADCAST;
-		else
-			skb->pkt_type = PACKET_MULTICAST;
-	} else {
-		skb->pkt_type = PACKET_OTHERHOST;
-	}
-	if (ntohs(eth->h_proto) >= 1536)
-		return eth->h_proto;
-	if (*(unsigned short *) (skb->data) == 0xFFFF)
-		return htons(ETH_P_802_3);
-	return htons(ETH_P_802_2);
-}
+	unsigned int skbs_sent_pack;
+	unsigned int bufs_sent_pack;
 
-typedef unsigned short (*__qeth_temp5) (struct sk_buff *, struct net_device *);
-inline static __qeth_temp5
-qeth_get_type_trans(__u8 link_type)
-{
-	switch (link_type) {
-#ifdef CONFIG_TR
-	case QETH_MPC_LINK_TYPE_HSTR:
-	case QETH_MPC_LINK_TYPE_LANE_TR:
-		return tr_type_trans;
-#endif
-	default:
-		return qeth_eth_type_trans;
-	}
-}
-#endif /* QETH_IPV6 */
+	unsigned int sc_dp_p;
+	unsigned int sc_p_dp;
 
-inline static const char *
-qeth_get_link_type_name(int cardtype, __u8 linktype)
-{
-	switch (cardtype) {
-	case QETH_CARD_TYPE_UNKNOWN:
-		return "unknown";
-	case QETH_CARD_TYPE_OSAE:
-		switch (linktype) {
-		case QETH_MPC_LINK_TYPE_FAST_ETHERNET:
-			return "Fast Eth";
-		case QETH_MPC_LINK_TYPE_HSTR:
-			return "HSTR";
-		case QETH_MPC_LINK_TYPE_GIGABIT_ETHERNET:
-			return "Gigabit Eth";
-		case QETH_MPC_LINK_TYPE_LANE_ETH100:
-			return "LANE Eth100";
-		case QETH_MPC_LINK_TYPE_LANE_TR:
-			return "LANE TR";
-		case QETH_MPC_LINK_TYPE_LANE_ETH1000:
-			return "LANE Eth1000";
-		default:
-			return "unknown";
-		}
-	case QETH_CARD_TYPE_IQD:
-		return "magic";
-	default:
-		return "unknown";
-	}
-}
+	__u64 inbound_start_time;
+	unsigned int inbound_cnt;
+	unsigned int inbound_time;
+	__u64 outbound_start_time;
+	unsigned int outbound_cnt;
+	unsigned int outbound_time;
+};
+#endif /* CONFIG_QETH_PERF_STATS */
 
-inline static const char *
-qeth_get_dev_basename(int cardtype, __u8 link_type)
-{
-	switch (cardtype) {
-	case QETH_CARD_TYPE_UNKNOWN:
-		return "eth";
-	case QETH_CARD_TYPE_OSAE:
-		switch (link_type) {
-		case QETH_MPC_LINK_TYPE_LANE_TR:
-			/* fallthrough */
-		case QETH_MPC_LINK_TYPE_HSTR:
-			return "tr";
-		default:
-			return "eth";
-		}
-	case QETH_CARD_TYPE_IQD:
-		return "hsi";
-	default:
-		return "eth";
-	}
-}
+/* Routing stuff */
+struct qeth_routing_info {
+	enum qeth_routing_types type;
+};
 
-/* inbound: */
-#define DEFAULT_BUFFER_SIZE 65536
-#define DEFAULT_BUFFER_COUNT 128
-#define BUFCNT_MIN 8
-#define BUFCNT_MAX 128
-#define BUFFER_SIZE (card->inbound_buffer_size)
-#define BUFFER_MAX_ELEMENTS (BUFFER_SIZE>>12)
-	/* 8k for each pair header-buffer: */
+/* IPA stuff */
+struct qeth_ipa_info {
+	__u32 supported_funcs;
+	__u32 enabled_funcs;
+};
 
-inline static int
-qeth_sbal_packing_on_card(int cardtype)
+static inline int
+qeth_is_ipa_supported(struct qeth_ipa_info *ipa, enum qeth_ipa_funcs func)
 {
-	switch (cardtype) {
-	case QETH_CARD_TYPE_IQD:
-		return 0;
-	default:
-		return 1;
-	}
+	return (ipa->supported_funcs & func);
 }
 
-/* 
- * do it this way round -> __MODULE_STRING needs with
- * QETH_PRIO_NICE_LEVELS a single number
- */
-#define QETH_MAX_PRIO_QUEUES QETH_PRIO_NICE_LEVELS+1
-
 static inline int
-qeth_sbalf15_in_retrieable_range(int sbalf15)
+qeth_is_ipa_enabled(struct qeth_ipa_info *ipa, enum qeth_ipa_funcs func)
 {
-	return ((sbalf15 >= 15) && (sbalf15 <= 31));
+	return (ipa->supported_funcs & ipa->enabled_funcs & func);
 }
 
-#define INBOUND_BUFFER_POS(card,bufno,sbale) \
-	( (bufno&SPAREBUF_MASK)? \
-	  ( \
-	    (sparebufs[bufno&(~SPAREBUF_MASK)].buf+ \
-	     PAGE_SIZE*sbale) \
-	  ):( \
-	      (card->inbound_buffer_pool_entry[card-> \
-	       inbound_buffer_entry_no[bufno]][sbale]) \
-	    ) )
-
-#define SPAREBUF_UNAVAIL 0
-#define SPAREBUF_FREE 1
-#define SPAREBUF_USED 2
-
-struct sparebufs {
-	char *buf;
-	atomic_t status;
-};
+#define qeth_adp_supported(c,f) \
+	qeth_is_ipa_supported(&c->options.adp, f)
+#define qeth_adp_enabled(c,f) \
+	qeth_is_ipa_enabled(&c->options.adp, f)
+#define qeth_is_supported(c,f) \
+	qeth_is_ipa_supported(&c->options.ipa4, f)
+#define qeth_is_enabled(c,f) \
+	qeth_is_ipa_enabled(&c->options.ipa4, f)
+#ifdef CONFIG_QETH_IPV6
+#define qeth_is_supported6(c,f) \
+	qeth_is_ipa_supported(&c->options.ipa6, f)
+#define qeth_is_enabled6(c,f) \
+	qeth_is_ipa_enabled(&c->options.ipa6, f)
+#else /* CONFIG_QETH_IPV6 */
+#define qeth_is_supported6(c,f) 0
+#define qeth_is_enabled6(c,f) 0
+#endif /* CONFIG_QETH_IPV6 */
+#define qeth_is_ipafunc_supported(c,prot,f) /* dispatch on protocol */ \
+	 (((prot)==QETH_PROT_IPV6)? qeth_is_supported6(c,f):qeth_is_supported(c,f))
+#define qeth_is_ipafunc_enabled(c,prot,f) /* dispatch on protocol */ \
+	 (((prot)==QETH_PROT_IPV6)? qeth_is_enabled6(c,f):qeth_is_enabled(c,f))
+
 
-#define SEND_STATE_INACTIVE		0
-#define SEND_STATE_DONT_PACK		1
-#define SEND_STATE_PACK			2
-
-#define QETH_LOCK_UNLOCKED 0
-#define QETH_LOCK_NORMAL 1
-#define QETH_LOCK_FLUSH 2
-
-#define QETH_TX_TIMEOUT 100*HZ	/* 100 seconds */
-
-#define QETH_REMOVE_WAIT_TIME 200
-#define QETH_WAIT_FOR_THREAD_TIME 20
-#define QETH_IDLE_WAIT_TIME 10
-#define QETH_WAIT_BEFORE_2ND_DOIO 1000
-
-#define QETH_FAKE_LL_LEN ETH_HLEN	/* 14 */
-#define QETH_FAKE_LL_PROT_LEN 2
-#define QETH_FAKE_LL_ADDR_LEN ETH_ALEN	/* 6 */
-#define QETH_FAKE_LL_DEST_MAC_POS 0
-#define QETH_FAKE_LL_SRC_MAC_POS 6
-#define QETH_FAKE_LL_SRC_MAC_POS_IN_QDIO_HDR 6
-#define QETH_FAKE_LL_PROT_POS 12
-#define QETH_FAKE_LL_V4_ADDR_POS 16
+#define QETH_IDX_FUNC_LEVEL_OSAE_ENA_IPAT 0x0101
+#define QETH_IDX_FUNC_LEVEL_OSAE_DIS_IPAT 0x0101
+#define QETH_IDX_FUNC_LEVEL_IQD_ENA_IPAT 0x4108
+#define QETH_IDX_FUNC_LEVEL_IQD_DIS_IPAT 0x5108
+
+#define QETH_MODELLIST_ARRAY \
+	{{0x1731,0x01,0x1732,0x01,QETH_CARD_TYPE_OSAE,1, \
+	QETH_IDX_FUNC_LEVEL_OSAE_ENA_IPAT, \
+	QETH_IDX_FUNC_LEVEL_OSAE_DIS_IPAT, \
+	QETH_MAX_QUEUES,0}, \
+	{0x1731,0x05,0x1732,0x05,QETH_CARD_TYPE_IQD,0, \
+	QETH_IDX_FUNC_LEVEL_IQD_ENA_IPAT, \
+	QETH_IDX_FUNC_LEVEL_IQD_DIS_IPAT, \
+	QETH_MAX_QUEUES,0x103}, \
+	{0,0,0,0,0,0,0,0,0}}
+
+#define QETH_REAL_CARD		1
+#define QETH_VLAN_CARD		2
+#define QETH_BUFSIZE	 	4096
+
+/**
+ * some more defs
+ */
+#define IF_NAME_LEN	 	16
+#define QETH_TX_TIMEOUT		(100 * HZ)	/* 100 seconds */
+#define QETH_HEADER_SIZE	32
+#define MAX_PORTNO 		15
+#define QETH_FAKE_LL_LEN 	ETH_HLEN
 #define QETH_FAKE_LL_V6_ADDR_POS 24
 
-#define DEV_NAME_LEN 16
-#define IOCTL_MAX_TRANSFER_SIZE 65535
+/*IPv6 address autoconfiguration stuff*/
+#define UNIQUE_ID_IF_CREATE_ADDR_FAILED 0xfffe
+#define UNIQUE_ID_NOT_BY_CARD 		0x10000
 
+/*****************************************************************************/
+/* QDIO queue and buffer handling                                            */
+/*****************************************************************************/
+#define QETH_MAX_QUEUES 4
+#define QETH_IN_BUF_SIZE_DEFAULT 65536
+#define QETH_IN_BUF_COUNT_DEFAULT 16
+#define QETH_IN_BUF_COUNT_MIN 8
+#define QETH_IN_BUF_COUNT_MAX 128
+#define QETH_MAX_BUFFER_ELEMENTS(card) ((card)->qdio.in_buf_size >> 12)
+#define QETH_IN_BUF_REQUEUE_THRESHOLD(card) \
+		((card)->qdio.in_buf_pool.buf_count / 4)
+
+/* buffers we have to be behind before we get a PCI */
+#define QETH_PCI_THRESHOLD_A(card) ((card)->qdio.in_buf_pool.buf_count+1)
+/*enqueued free buffers left before we get a PCI*/
+#define QETH_PCI_THRESHOLD_B(card) 0
+/*not used unless the microcode gets patched*/
+#define QETH_PCI_TIMER_VALUE(card) 3
+
+#define QETH_MIN_INPUT_THRESHOLD 1
+#define QETH_MAX_INPUT_THRESHOLD 500
+#define QETH_MIN_OUTPUT_THRESHOLD 1
+#define QETH_MAX_OUTPUT_THRESHOLD 300
+
+/* priority queueing */
+#define QETH_PRIOQ_DEFAULT QETH_NO_PRIO_QUEUEING
+#define QETH_DEFAULT_QUEUE    2
+#define QETH_NO_PRIO_QUEUEING 0
+#define QETH_PRIO_Q_ING_PREC  1
+#define QETH_PRIO_Q_ING_TOS   2
 #define IP_TOS_LOWDELAY 0x10
 #define IP_TOS_HIGHTHROUGHPUT 0x08
 #define IP_TOS_HIGHRELIABILITY 0x04
 #define IP_TOS_NOTIMPORTANT 0x02
 
-#define QETH_RCD_LENGTH 128
-
-#define __max(a,b) ( ((a)>(b))?(a):(b) )
-#define __min(a,b) ( ((a)<(b))?(a):(b) )
-#define QETH_BUFSIZE __max(__max(IPA_PDU_HEADER_SIZE+sizeof(struct arp_cmd), \
-				 IPA_PDU_HEADER_SIZE+sizeof(struct ipa_cmd)), \
-			   QETH_RCD_LENGTH)
-
-#define QETH_NOP_TIMEOUT 1500
-#define QETH_QUIESCE_NETDEV_TIME 300
-#define QETH_QUIESCE_WAIT_BEFORE_CLEAR 4000
-#define QETH_QUIESCE_WAIT_AFTER_CLEAR 4000
-
-#define NOP_STATE 0x1001
-#define IDX_ACTIVATE_READ_STATE 0x1003
-#define IDX_ACTIVATE_WRITE_STATE 0x1004
-#define MPC_SETUP_STATE 0x1005
-#define CLEAR_STATE 0x1006
-#define IPA_CMD_STATE 0x1007
-#define IPA_IOCTL_STATE 0x1009
-#define IPA_SETIP_FLAG 0x100000
-
-#define QETH_REMOVE_CARD_PROPER 1
-#define QETH_REMOVE_CARD_QUICK 2
-
-#define NO_PRIO_QUEUEING 0
-#define PRIO_QUEUEING_PREC 1
-#define PRIO_QUEUEING_TOS 2
-#define NO_ROUTER 0
-#define PRIMARY_ROUTER 1
-#define SECONDARY_ROUTER 2
-#define MULTICAST_ROUTER 3
-#define PRIMARY_CONNECTOR 4
-#define SECONDARY_CONNECTOR 5
-#define ROUTER_MASK 0xf		/* used to remove SET_ROUTING_FLAG
-				   from routing_type */
-#define RESET_ROUTING_FLAG 0x10	/* used to indicate, that setting
-				   the routing type is desired */
-#define BROADCAST_ALLRINGS 0
-#define BROADCAST_LOCAL 1
-#define MACADDR_NONCANONICAL 0
-#define MACADDR_CANONICAL 1
-#define ENABLE_TAKEOVER 0
-#define DISABLE_TAKEOVER 1
-#define FAKE_BROADCAST 0
-#define DONT_FAKE_BROADCAST 1
-
-#define FAKE_LL 0
-#define DONT_FAKE_LL 1
-
-#define QETH_BREAKOUT_LEAVE 1
-#define QETH_BREAKOUT_AGAIN 2
-
-#define QETH_WAIT_FOR_LOCK 0
-#define QETH_DONT_WAIT_FOR_LOCK 1
-#define QETH_LOCK_ALREADY_HELD 2
-
-#define PROBLEM_CARD_HAS_STARTLANED 1
-#define PROBLEM_RECEIVED_IDX_TERMINATE 2
-#define PROBLEM_ACTIVATE_CHECK_CONDITION 3
-#define PROBLEM_RESETTING_EVENT_INDICATOR 4
-#define PROBLEM_COMMAND_REJECT 5
-#define PROBLEM_ZERO_SENSE_DATA 6
-#define PROBLEM_GENERAL_CHECK 7
-#define PROBLEM_BAD_SIGA_RESULT 8
-#define PROBLEM_USER_TRIGGERED_RECOVERY 9
-#define PROBLEM_AFFE 10
-#define PROBLEM_MACHINE_CHECK 11
-#define PROBLEM_TX_TIMEOUT 12
-
-#define CARD_RDEV(card) card->gdev->cdev[0]
-#define CARD_WDEV(card) card->gdev->cdev[1]
-#define CARD_DDEV(card) card->gdev->cdev[2]
-#define CARD_BUS_ID(card) card->gdev->dev.bus_id
-#define CARD_RDEV_ID(card) card->gdev->cdev[0]->dev.bus_id
-#define CARD_WDEV_ID(card) card->gdev->cdev[1]->dev.bus_id
-#define CARD_DDEV_ID(card) card->gdev->cdev[2]->dev.bus_id
-#define CARD_FROM_CDEV(cdev) (struct qeth_card *) \
-	((struct ccwgroup_device *) cdev->dev.driver_data)->dev.driver_data
+/* Packing */
+#define QETH_LOW_WATERMARK_PACK  2
+#define QETH_HIGH_WATERMARK_PACK 5
+#define QETH_WATERMARK_PACK_FUZZ 1
 
-#define SENSE_COMMAND_REJECT_BYTE 0
-#define SENSE_COMMAND_REJECT_FLAG 0x80
-#define SENSE_RESETTING_EVENT_BYTE 1
-#define SENSE_RESETTING_EVENT_FLAG 0x80
+#define QETH_IP_HEADER_SIZE 40
+/* VLAN defines */
+#define QETH_EXT_HDR_VLAN_FRAME        0x01
+#define QETH_EXT_HDR_TOKEN_ID          0x02
+#define QETH_EXT_HDR_INCLUDE_VLAN_TAG  0x04
 
-#define BUFFER_USED 1
-#define BUFFER_UNUSED -1
+struct qeth_hdr {
+	__u8  id;
+	__u8  flags;
+	__u16 inbound_checksum;
+	__u32 token;
+	__u16 length;
+	__u8  vlan_prio;
+	__u8  ext_flags;
+	__u16 vlan_id;
+	__u16 frame_offset;
+	__u8  dest_addr[16];
+} __attribute__ ((packed));
 
-typedef int (*reg_notifier_t) (struct notifier_block *);
+/* flags for qeth_hdr.flags */
+#define QETH_HDR_PASSTHRU 0x10
+#define QETH_HDR_IPV6     0x80
+#define QETH_HDR_CAST_MASK 0x07
+enum qeth_cast_flags {
+	QETH_CAST_UNICAST   = 0x06,
+	QETH_CAST_MULTICAST = 0x04,
+	QETH_CAST_BROADCAST = 0x05,
+	QETH_CAST_ANYCAST   = 0x07,
+	QETH_CAST_NOCAST    = 0x00,
+};
 
-struct ipato_entry {
-	int version;
-	__u8 addr[16];
-	int mask_bits;
-	char dev_name[DEV_NAME_LEN];
-	struct ipato_entry *next;
+/* flags for qeth_hdr.ext_flags */
+#define QETH_HDR_EXT_VLAN_FRAME      0x01
+#define QETH_HDR_EXT_CSUM_HDR_REQ    0x10
+#define QETH_HDR_EXT_CSUM_TRANSP_REQ 0x20
+#define QETH_HDR_EXT_SRC_MAC_ADDR    0x08
+
+static inline int
+qeth_is_last_sbale(struct qdio_buffer_element *sbale)
+{
+	return (sbale->flags & SBAL_FLAGS_LAST_ENTRY);
+}
+
+enum qeth_qdio_buffer_states {
+	/*
+	 * inbound: read out by driver; owned by hardware in order to be filled
+	 * outbound: owned by driver in order to be filled
+	 */
+	QETH_QDIO_BUF_EMPTY,
+	/*
+	 * inbound: filled by hardware; owned by driver in order to be read out
+	 * outbound: filled by driver; owned by hardware in order to be sent
+	 */
+	QETH_QDIO_BUF_PRIMED,
+	/*
+	 * inbound only: an error condition has been detected for a buffer
+	 *     the buffer will be discarded (not read out)
+	 */
+	QETH_QDIO_BUF_ERROR,
 };
 
-struct qeth_vipa_entry {
-	int version;
-	__u8 ip[16];
-	int flag;
-	volatile int state;
-	struct qeth_vipa_entry *next;
+enum qeth_qdio_info_states {
+	QETH_QDIO_UNINITIALIZED,
+	QETH_QDIO_ALLOCATED,
+	QETH_QDIO_ESTABLISHED,
 };
 
-struct ip_state {
-	struct in_ifaddr *ip_ifa;	/* pointer to IPv4 adresses */
-	struct inet6_ifaddr *ip6_ifa;
+struct qeth_buffer_pool_entry {
+	struct list_head list;
+	struct list_head init_list;
+	void *elements[QDIO_MAX_ELEMENTS_PER_BUFFER];
 };
 
-struct qeth_ipm_mac {
-	__u8 mac[ETH_ALEN];
-	__u8 ip[16];
-	struct qeth_ipm_mac *next;
+struct qeth_qdio_buffer_pool {
+	struct list_head entry_list;
+	int buf_count;
 };
 
-struct ip_mc_state {
-	struct qeth_ipm_mac *ipm_ifa;
-	struct qeth_ipm_mac *ipm6_ifa;
+struct qeth_qdio_buffer {
+	struct qdio_buffer *buffer;
+	volatile enum qeth_qdio_buffer_states state;
+	/* the buffer pool entry currently associated with this buffer */
+	struct qeth_buffer_pool_entry *pool_entry;
 };
 
-struct addr_request {
-	struct addr_request *next;
-	int request_type;
-	__u8 mac[ETH_ALEN];
-	__u8 ip[16];
+struct qeth_qdio_q {
+	struct qdio_buffer qdio_bufs[QDIO_MAX_BUFFERS_PER_Q];
+	struct qeth_qdio_buffer bufs[QDIO_MAX_BUFFERS_PER_Q];
+	/*
+	 * buf_to_process means "buffer primed by hardware,
+	 * has to be read in by driver"; current state PRIMED
+	 */
+	volatile int next_buf_to_process;
+	/*
+	 * buf_to_init means "buffer must be initialized by driver and must
+	 * be made available for hardware" -> state is set to EMPTY
+	 */
+	volatile int next_buf_to_init;
+} __attribute__ ((aligned(256)));
+
+struct qeth_qdio_out_buffer {
+	struct qdio_buffer *buffer;
+	volatile enum qeth_qdio_buffer_states state;
+	volatile int next_element_to_fill;
+	struct sk_buff_head skb_list;
 };
 
-struct qeth_card_options {
-	char devname[DEV_NAME_LEN];
-	volatile int routing_type4;
-#ifdef QETH_IPV6
-	volatile int routing_type6;
-#endif /* QETH_IPV6 */
-	int checksum_type;
+struct qeth_card;
+
+struct qeth_qdio_out_q {
+	struct qdio_buffer qdio_bufs[QDIO_MAX_BUFFERS_PER_Q];
+	struct qeth_qdio_out_buffer bufs[QDIO_MAX_BUFFERS_PER_Q];
+	int queue_no;
+	struct qeth_card *card;
+	struct tasklet_struct tasklet;
+	spinlock_t lock;
+	volatile int do_pack;
+	/*
+	 * index of buffer to be filled by driver; state EMPTY or PACKING
+	 */
+	volatile int next_buf_to_fill;
+	volatile int next_buf_to_flush;
+	/*
+	 * number of buffers that are currently filled (PRIMED)
+	 * -> these buffers are hardware-owned
+	 */
+	atomic_t used_buffers;
+	/* indicates whether PCI flag must be set (or if one is outstanding) */
+	atomic_t set_pci_flags_count;
+} __attribute__ ((aligned(256)));
+
+struct qeth_qdio_info {
+	volatile enum qeth_qdio_info_states state;
+	/* input */
+	struct qeth_qdio_q *in_q;
+	struct qeth_qdio_buffer_pool in_buf_pool;
+	struct qeth_qdio_buffer_pool init_pool;
+	int in_buf_size;
+	struct tasklet_struct in_tasklet;
+
+	/* output */
+	int no_out_queues;
+	struct qeth_qdio_out_q **out_qs;
+
+	/* priority queueing */
 	int do_prio_queueing;
-	int default_queue;
-	int inbound_buffer_count;
-	int polltime;
-	char portname[9];
-	int portno;
-	int broadcast_mode;
-	int macaddr_mode;
-	int ena_ipat;
-	int fake_broadcast;
-	int add_hhlen;
-	int fake_ll;
+	int default_out_queue;
 };
 
-struct qeth_hdr {
-	__u8 id;
-	__u8 flags;
-	__u16 inbound_checksum;
-	__u32 token;
-	__u16 length;
-	__u8 vlan_prio;
-	__u8 ext_flags;
-	__u16 vlan_id;
-	__u16 frame_offset;
-	__u8 dest_addr[16];
+enum qeth_send_errors {
+	QETH_SEND_ERROR_NONE,
+	QETH_SEND_ERROR_LINK_FAILURE,
+	QETH_SEND_ERROR_RETRY,
+	QETH_SEND_ERROR_KICK_IT,
 };
 
-struct qeth_ringbuffer_element {
-	struct sk_buff_head skb_list;
-	int next_element_to_fill;
-} __attribute__ ((packed));
+#define QETH_ETH_MAC_V4      0x0100 /* like v4 */
+#define QETH_ETH_MAC_V6      0x3333 /* like v6 */
+/* tr mc mac is longer, but that will be enough to detect mc frames */
+#define QETH_TR_MAC_NC       0xc000 /* non-canonical */
+#define QETH_TR_MAC_C        0x0300 /* canonical */
 
-struct qeth_ringbuffer {
-	struct qdio_buffer buffer[QDIO_MAX_BUFFERS_PER_Q];
-	struct qeth_ringbuffer_element ringbuf_element[QDIO_MAX_BUFFERS_PER_Q];
-}__attribute__ ((packed, aligned(PAGE_SIZE)));
+#define DEFAULT_ADD_HHLEN 0
+#define MAX_ADD_HHLEN 1024
 
-struct qeth_dma_stuff {
-	unsigned char *sendbuf;
-	unsigned char *recbuf;
-	struct ccw1 read_ccw;
-	struct ccw1 write_ccw;
-}__attribute__ ((packed, aligned(PAGE_SIZE)));
+/**
+ * buffer stuff for read channel
+ */
+#define QETH_CMD_BUFFER_NO	8
 
-struct qeth_perf_stats {
-	unsigned int skbs_rec;
-	unsigned int bufs_rec;
+/**
+ *  channel state machine
+ */
+enum qeth_channel_states {
+	CH_STATE_UP,
+	CH_STATE_DOWN,
+	CH_STATE_ACTIVATING,
+	CH_STATE_HALTED,
+	CH_STATE_STOPPED,
+};
+/**
+ * card state machine
+ */
+enum qeth_card_states {
+	CARD_STATE_DOWN,
+	CARD_STATE_HARDSETUP,
+	CARD_STATE_SOFTSETUP,
+	CARD_STATE_UP_LAN_OFFLINE,
+	CARD_STATE_UP_LAN_ONLINE,
+	CARD_STATE_RECOVER,
+};
 
-	unsigned int skbs_sent;
-	unsigned int bufs_sent;
+/**
+ * Protocol versions
+ */
+enum qeth_prot_versions {
+	QETH_PROT_SNA  = 0x0001,
+	QETH_PROT_IPV4 = 0x0004,
+	QETH_PROT_IPV6 = 0x0006,
+};
 
-	unsigned int skbs_sent_dont_pack;
-	unsigned int bufs_sent_dont_pack;
-	unsigned int skbs_sent_pack;
-	unsigned int bufs_sent_pack;
-	unsigned int skbs_sent_pack_better;
-	unsigned int bufs_sent_pack_better;
+enum qeth_ip_types {
+	QETH_IP_TYPE_NORMAL,
+	QETH_IP_TYPE_VIPA,
+	QETH_IP_TYPE_RXIP,
+};
 
-	unsigned int sc_dp_p;
-	unsigned int sc_p_dp;
+enum qeth_cmd_buffer_state {
+	BUF_STATE_FREE,
+	BUF_STATE_LOCKED,
+	BUF_STATE_PROCESSED,
+};
+/**
+ * IP address and multicast list
+ */
+struct qeth_ipaddr {
+	struct list_head entry;
+	enum qeth_ip_types type;
+	enum qeth_ipa_setdelip_flags set_flags;
+	enum qeth_ipa_setdelip_flags del_flags;
+	int is_multicast;
+	volatile int users;
+	enum qeth_prot_versions proto;
+	unsigned char mac[OSA_ADDR_LEN];
+	union {
+		struct {
+			unsigned int addr;
+			unsigned int mask;
+		} a4;
+		struct {
+			struct in6_addr addr;
+			unsigned int pfxlen;
+		} a6;
+	} u;
+};
 
-	__u64 inbound_start_time;
-	unsigned int inbound_cnt;
-	unsigned int inbound_time;
-	__u64 outbound_start_time;
-	unsigned int outbound_cnt;
-	unsigned int outbound_time;
+struct qeth_ipato_entry {
+	struct list_head entry;
+	enum qeth_prot_versions proto;
+	char addr[16];
+	int mask_bits;
 };
 
-/* ugly. I know. */
-struct qeth_card {	/* pointed to by dev->priv */
+struct qeth_ipato {
+	int enabled;
+	int invert4;
+	int invert6;
+	struct list_head entries;
+};
 
-	/* pointer to options (defaults + parameters) */
-	struct qeth_card_options options;
+struct qeth_channel;
 
-	atomic_t is_startlaned;	/* card did not get a stoplan */
-	                        /* also 0 when card is gone after a
-	                         * machine check */
-
-	__u8 link_type;
-
-	int is_guest_lan;
-
-	/* inbound buffer management */
-	atomic_t inbound_buffer_refcnt[QDIO_MAX_BUFFERS_PER_Q];
-	struct qdio_buffer inbound_qdio_buffers[QDIO_MAX_BUFFERS_PER_Q];
-	/* inbound data area */
-	void *inbound_buffer_pool_entry[QDIO_MAX_BUFFERS_PER_Q]
-	    [QDIO_MAX_ELEMENTS_PER_BUFFER];
-	volatile int inbound_buffer_pool_entry_used[QDIO_MAX_BUFFERS_PER_Q];
-	int inbound_buffer_entry_no[QDIO_MAX_BUFFERS_PER_Q];
-
-	/* for requeueing of buffers */
-	spinlock_t requeue_input_lock;
-	atomic_t requeue_position;
-	atomic_t requeue_counter;
-
-	/* outbound QDIO stuff */
-	volatile int send_state[QETH_MAX_QUEUES];
-	volatile int outbound_first_free_buffer[QETH_MAX_QUEUES];
-	atomic_t outbound_used_buffers[QETH_MAX_QUEUES];
-	int outbound_buffer_send_state[QETH_MAX_QUEUES]
-	    [QDIO_MAX_BUFFERS_PER_Q];
-	int send_retries[QETH_MAX_QUEUES][QDIO_MAX_BUFFERS_PER_Q];
-	volatile int outbound_bytes_in_buffer[QETH_MAX_QUEUES];
-	struct qeth_ringbuffer *outbound_ringbuffer[QETH_MAX_QUEUES];
-	atomic_t outbound_ringbuffer_lock[QETH_MAX_QUEUES];
-	atomic_t last_pci_pos[QETH_MAX_QUEUES];
-
-#ifdef QETH_IPV6
-	int (*hard_header) (struct sk_buff *, struct net_device *,
-			    unsigned short, void *, void *, unsigned);
-	int (*rebuild_header) (struct sk_buff *);
-	int (*hard_header_cache) (struct neighbour *, struct hh_cache *);
-	void (*header_cache_update) (struct hh_cache *, struct net_device *,
-				     unsigned char *);
-	unsigned short (*type_trans) (struct sk_buff *, struct net_device *);
-#endif /* QETH_IPV6 */
+struct qeth_cmd_buffer {
+	enum qeth_cmd_buffer_state state;
+	struct qeth_channel *channel;
+	unsigned char *data;
+	int rc;
+	void (*callback) (struct qeth_channel *, struct qeth_cmd_buffer *);
+};
 
-#ifdef QETH_VLAN
-	struct vlan_group *vlangrp;
-	spinlock_t vlan_lock;
-#endif
 
-	char dev_name[DEV_NAME_LEN];	/* pointed to by dev->name */
-	struct net_device *dev;
-	struct net_device_stats *stats;
+/**
+ * definition of a qeth channel, used for read and write
+ */
+struct qeth_channel {
+	enum qeth_channel_states state;
+	struct ccw1 ccw;
+	spinlock_t iob_lock;
+	wait_queue_head_t wait_q;
+	struct tasklet_struct irq_tasklet;
+	struct ccw_device *ccwdev;
+/*command buffer for control data*/
+	struct qeth_cmd_buffer iob[QETH_CMD_BUFFER_NO];
+	atomic_t irq_pending;
+	volatile int io_buf_no;
+	volatile int buf_no;
+};
 
-	int no_queues;
+/**
+ *  OSA card related definitions
+ */
+struct qeth_token {
+	__u32 issuer_rm_w;
+	__u32 issuer_rm_r;
+	__u32 cm_filter_w;
+	__u32 cm_filter_r;
+	__u32 cm_connection_w;
+	__u32 cm_connection_r;
+	__u32 ulp_filter_w;
+	__u32 ulp_filter_r;
+	__u32 ulp_connection_w;
+	__u32 ulp_connection_r;
+};
 
-#ifdef QETH_PERFORMANCE_STATS
-	struct qeth_perf_stats perf_stats;
-#endif /* QETH_PERFORMANCE_STATS */
-
-	/* our state */
-	atomic_t is_registered;	/* card registered as netdev? */
-	atomic_t is_hardsetup;	/* card has gone through hardsetup */
-	atomic_t is_softsetup;	/* card is setup by softsetup */
-	atomic_t is_open;	/* card is in use */
-
-	/* prevents deadlocks :-O */
-	struct semaphore softsetup_sema;
-	struct semaphore hardsetup_sema;
-	spinlock_t ioctl_lock;
-	atomic_t softsetup_thread_is_running;
-	struct semaphore softsetup_thread_sem;
-	struct work_struct tqueue_sst;
-
-	atomic_t escape_softsetup;	/* active, when recovery has to
-					   wait for softsetup */
-	struct semaphore reinit_thread_sem;
-	atomic_t in_recovery;
-	atomic_t reinit_counter;
-
-	/* problem management */
-	atomic_t break_out;
-	atomic_t problem;
-	struct work_struct tqueue;
-
-	struct {
-		__u32 trans_hdr;
-		__u32 pdu_hdr;
-		__u32 pdu_hdr_ack;
-		__u32 ipa;
-	} seqno;
-
-	struct {
-		__u32 issuer_rm_w;
-		__u32 issuer_rm_r;
-		__u32 cm_filter_w;
-		__u32 cm_filter_r;
-		__u32 cm_connection_w;
-		__u32 cm_connection_r;
-		__u32 ulp_filter_w;
-		__u32 ulp_filter_r;
-		__u32 ulp_connection_w;
-		__u32 ulp_connection_r;
-	} token;
-
-	/* this is card-related */
-	int type;
-	__u16 func_level;
-	int initial_mtu;
-	int max_mtu;
-	int inbound_buffer_size;
-
-	int is_multicast_different;	/* if multicast traffic is to be sent
-					   on a different queue, this is the
-					   queue+no_queues */
-	__u32 ipa_supported;
-	__u32 ipa_enabled;
-	__u32 ipa6_supported;
-	__u32 ipa6_enabled;
-	__u32 adp_supported;
-
-	__u32 csum_enable_mask;
-
-	atomic_t startlan_attempts;
-	atomic_t enable_routing_attempts4;
-	atomic_t rt4fld;
-#ifdef QETH_IPV6
-	atomic_t enable_routing_attempts6;
-	atomic_t rt6fld;
-#endif /* QETH_IPV6 */
-	int unique_id;
+struct qeth_seqno {
+	__u32 trans_hdr;
+	__u32 pdu_hdr;
+	__u32 pdu_hdr_ack;
+	__u32 ipa;
+};
 
-	/* device and I/O data */
-	struct ccwgroup_device *gdev;
+struct qeth_reply {
+	struct list_head list;
+	wait_queue_head_t wait_q;
+	int (*callback)(struct qeth_card *,struct qeth_reply *,unsigned long);
+ 	int seqno;
+	int received;
+	int rc;
+	void *param;
+	struct qeth_card *card;
+	atomic_t refcnt;
+};
+
+struct qeth_card_info {
+
+	char if_name[IF_NAME_LEN];
 	unsigned short unit_addr2;
 	unsigned short cula;
 	unsigned short chpid;
+	__u16 func_level;
+	char mcl_level[QETH_MCL_LENGTH + 1];
+	int guestlan;
+	int portname_required;
+	int portno;
+	char portname[9];
+	enum qeth_card_types type;
+	enum qeth_link_types link_type;
+	int is_multicast_different;
+	int initial_mtu;
+	int max_mtu;
+	int broadcast_capable;
+	int unique_id;
+	__u32 csum_mask;
+};
 
-	unsigned char ipa_buf[QETH_BUFSIZE];
-	unsigned char send_buf[QETH_BUFSIZE];
-
-/* IOCTL Stuff */
-	unsigned char *ioctl_data_buffer;
-	unsigned char *ioctl_buffer_pointer;
-	int ioctl_returncode;
-	int ioctl_buffersize;
-	int number_of_entries;
-
-	atomic_t ioctl_data_has_arrived;
-	wait_queue_head_t ioctl_wait_q;
+struct qeth_card_options {
+	struct qeth_routing_info route4;
+	struct qeth_ipa_info ipa4;
+	struct qeth_ipa_info adp; /*Adapter parameters*/
+#ifdef CONFIG_QETH_IPV6
+	struct qeth_routing_info route6;
+	struct qeth_ipa_info ipa6;
+#endif /* CONFIG_QETH_IPV6 */
+	enum qeth_checksum_types checksum_type;
+	int broadcast_mode;
+	int macaddr_mode;
+	int enable_takeover;
+	int fake_broadcast;
+	int add_hhlen;
+	int fake_ll;
+};
 
-/* stuff under 2 gb */
-	struct qeth_dma_stuff *dma_stuff;
+/*
+ * thread bits for qeth_card thread masks
+ */
+enum qeth_threads {
+	QETH_SET_IP_THREAD  = 1,
+	QETH_SET_MC_THREAD  = 2,
+	QETH_RECOVER_THREAD = 4,
+};
 
-	unsigned int ipa_timeout;
+struct qeth_card {
+	struct list_head list;
+	enum qeth_card_states state;
+	int lan_online;
+	spinlock_t lock;
+/*hardware and sysfs stuff*/
+	struct ccwgroup_device *gdev;
+	struct qeth_channel read;
+	struct qeth_channel write;
+	struct qeth_channel data;
 
-	atomic_t write_busy;
+	struct net_device *dev;
+	struct net_device_stats stats;
 
-	/* vipa stuff */
-	rwlock_t vipa_list_lock;
-	struct qeth_vipa_entry *vipa_list;
+	struct qeth_card_info info;
+	struct qeth_token token;
+	struct qeth_seqno seqno;
+	struct qeth_card_options options;
 
-	/* state information when doing I/O */
-	atomic_t shutdown_phase;
-	atomic_t data_has_arrived;
 	wait_queue_head_t wait_q;
+#ifdef CONFIG_QETH_VLAN
+	spinlock_t vlanlock;
+	struct vlan_group *vlangrp;
+#endif
+	struct work_struct kernel_thread_starter;
+	spinlock_t thread_mask_lock;
+	volatile unsigned long thread_start_mask;
+	volatile unsigned long thread_allowed_mask;
+	volatile unsigned long thread_running_mask;
+	spinlock_t ip_lock;
+	struct list_head ip_list;
+	struct list_head ip_tbd_list;
+	struct qeth_ipato ipato;
+	struct list_head cmd_waiter_list;
+	/* QDIO buffer handling */
+	struct qeth_qdio_info qdio;
+#ifdef CONFIG_QETH_PERF_STATS
+	struct qeth_perf_stats perf_stats;
+#endif /* CONFIG_QETH_PERF_STATS */
+	int use_hard_stop;
+};
 
-	atomic_t clear_succeeded0;
-	atomic_t clear_succeeded1;
-	atomic_t clear_succeeded2;
-
-	/* bookkeeping of IP and multicast addresses */
-	struct ip_state ip_current_state;
-	struct ip_state ip_new_state;
-
-	struct ip_mc_state ip_mc_current_state;
-	struct ip_mc_state ip_mc_new_state;
-
-	int broadcast_capable;
-	int portname_required;
-
-	int realloc_message;
-
-	char level[QETH_MCL_LENGTH + 1];
-
-	volatile int saved_dev_flags;
-
-	/* for our linked list */
-	struct qeth_card *next;
+struct qeth_card_list_struct {
+	struct list_head list;
+	rwlock_t rwlock;
 };
 
-inline static int
-qeth_get_arphrd_type(int cardtype, int linktype)
-{
-	switch (cardtype) {
-	case QETH_CARD_TYPE_OSAE:
-		switch (linktype) {
-		case QETH_MPC_LINK_TYPE_LANE_TR:
-			/* fallthrough */
-		case QETH_MPC_LINK_TYPE_HSTR:
-			return ARPHRD_IEEE802_TR;
-		default:
-			return ARPHRD_ETHER;
-		}
-	case QETH_CARD_TYPE_IQD:
-		return ARPHRD_ETHER;
-	default:
-		return ARPHRD_ETHER;
-	}
-}
+extern struct qeth_card_list_struct qeth_card_list;
+
+/*some helper functions*/
 
 inline static __u8
-qeth_get_adapter_type_for_ipa(int link_type)
+qeth_get_ipa_adp_type(enum qeth_link_types link_type)
 {
 	switch (link_type) {
-	case QETH_MPC_LINK_TYPE_HSTR:
+	case QETH_LINK_TYPE_HSTR:
 		return 2;
 	default:
 		return 1;
 	}
 }
 
-inline static const char *
-qeth_get_cardname(int cardtype, int is_guest_lan)
+inline static int
+qeth_get_hlen(__u8 link_type)
 {
-
- 	if (is_guest_lan) {
- 		switch (cardtype) {
- 		case QETH_CARD_TYPE_UNKNOWN:
-			return "n unknown";
- 		case QETH_CARD_TYPE_OSAE:
-			return " Guest LAN QDIO";
- 		case QETH_CARD_TYPE_IQD:
-			return " Guest LAN Hiper";
- 		default: return
-				 " strange";
- 		}
-	} else {
-		switch (cardtype) {
-		case QETH_CARD_TYPE_UNKNOWN:
-			return "n unknown";
-		case QETH_CARD_TYPE_OSAE:
-			return "n OSD Express";
-		case QETH_CARD_TYPE_IQD:
-			return " HiperSockets";
-		default:
-			return " strange";
-		}
+#ifdef CONFIG_QETH_IPV6
+	switch (link_type) {
+	case QETH_LINK_TYPE_HSTR:
+	case QETH_LINK_TYPE_LANE_TR:
+		return sizeof(struct qeth_hdr) + TR_HLEN;
+	default:
+#ifdef CONFIG_QETH_VLAN
+		return sizeof(struct qeth_hdr) + VLAN_ETH_HLEN;
+#else
+		return sizeof(struct qeth_hdr) + ETH_HLEN;
+#endif
 	}
+#else  /* CONFIG_QETH_IPV6 */
+#ifdef CONFIG_QETH_VLAN
+	return sizeof(struct qeth_hdr) + VLAN_HLEN;
+#else
+	return sizeof(struct qeth_hdr);
+#endif
+#endif /* CONFIG_QETH_IPV6 */
 }
 
-/* max length to be returned: 14 */
-inline static const char *
-qeth_get_cardname_short(int cardtype, __u8 link_type, int is_guest_lan)
+inline static unsigned short
+qeth_get_netdev_flags(int cardtype)
 {
 	switch (cardtype) {
-	case QETH_CARD_TYPE_UNKNOWN:
-		return "unknown";
-	case QETH_CARD_TYPE_OSAE:
-		if (is_guest_lan)
-			return "GuestLAN QDIO";
-		switch (link_type) {
-		case QETH_MPC_LINK_TYPE_FAST_ETHERNET:
-			return "OSD_100";
-		case QETH_MPC_LINK_TYPE_HSTR:
-			return "HSTR";
-		case QETH_MPC_LINK_TYPE_GIGABIT_ETHERNET:
-			return "OSD_1000";
-		case QETH_MPC_LINK_TYPE_LANE_ETH100:
-			return "OSD_FE_LANE";
-		case QETH_MPC_LINK_TYPE_LANE_TR:
-			return "OSD_TR_LANE";
-		case QETH_MPC_LINK_TYPE_LANE_ETH1000:
-			return "OSD_GbE_LANE";
-		case QETH_MPC_LINK_TYPE_LANE:
-			return "OSD_ATM_LANE";
-		default:
-			return "OSD_Express";
-		}
 	case QETH_CARD_TYPE_IQD:
-		return is_guest_lan ? "GuestLAN Hiper" : "HiperSockets";
+		return IFF_NOARP;
+#ifdef CONFIG_QETH_IPV6
 	default:
-		return " strange";
-	}
-}
-
-inline static int
-qeth_mtu_is_valid(struct qeth_card * card, int mtu)
-{
-	switch (card->type) {
-	case QETH_CARD_TYPE_UNKNOWN:
-		return 1;
-	case QETH_CARD_TYPE_OSAE:
-		return ((mtu >= 576) && (mtu <= 61440));
-	case QETH_CARD_TYPE_IQD:
-		return ((mtu >= 576) && (mtu <= card->max_mtu + 4096 - 32));
+		return 0;
+#else
 	default:
-		return 1;
+		return IFF_NOARP;
+#endif
 	}
 }
 
 inline static int
 qeth_get_initial_mtu_for_card(struct qeth_card * card)
 {
-	switch (card->type) {
+	switch (card->info.type) {
 	case QETH_CARD_TYPE_UNKNOWN:
 		return 1500;
 	case QETH_CARD_TYPE_IQD:
-		return card->max_mtu;
+		return card->info.max_mtu;
 	case QETH_CARD_TYPE_OSAE:
-		switch (card->link_type) {
-		case QETH_MPC_LINK_TYPE_HSTR:
-		case QETH_MPC_LINK_TYPE_LANE_TR:
+		switch (card->info.link_type) {
+		case QETH_LINK_TYPE_HSTR:
+		case QETH_LINK_TYPE_LANE_TR:
 			return 2000;
 		default:
 			return 1492;
@@ -1195,39 +837,50 @@ qeth_get_mtu_outof_framesize(int framesize)
 }
 
 inline static int
-qeth_get_buffersize_for_card(int cardtype)
+qeth_mtu_is_valid(struct qeth_card * card, int mtu)
 {
-	switch (cardtype) {
-	case QETH_CARD_TYPE_UNKNOWN:
-		return 65536;
+	switch (card->info.type) {
 	case QETH_CARD_TYPE_OSAE:
-		return 65536;
+		return ((mtu >= 576) && (mtu <= 61440));
 	case QETH_CARD_TYPE_IQD:
-		return 16384;
+		return ((mtu >= 576) &&
+			(mtu <= card->info.max_mtu + 4096 - 32));
+	case QETH_CARD_TYPE_UNKNOWN:
 	default:
-		return 65536;
+		return 1;
 	}
 }
 
 inline static int
-qeth_get_min_number_of_buffers(int cardtype)
+qeth_get_arphdr_type(int cardtype, int linktype)
 {
 	switch (cardtype) {
-	case QETH_CARD_TYPE_UNKNOWN:
-		return 32;
 	case QETH_CARD_TYPE_OSAE:
-		return 32;
+		switch (linktype) {
+		case QETH_LINK_TYPE_LANE_TR:
+		case QETH_LINK_TYPE_HSTR:
+			return ARPHRD_IEEE802_TR;
+		default:
+			return ARPHRD_ETHER;
+		}
 	case QETH_CARD_TYPE_IQD:
-		return 64;
 	default:
-		return 64;
+		return ARPHRD_ETHER;
 	}
 }
 
+#ifdef CONFIG_QETH_PERF_STATS
 inline static int
-qeth_get_q_format(int cardtype)
+qeth_get_micros(void)
 {
-	switch (cardtype) {
+	return (int) (get_clock() >> 12);
+}
+#endif
+
+static inline int
+qeth_get_qdio_q_format(struct qeth_card *card)
+{
+	switch (card->info.type) {
 	case QETH_CARD_TYPE_IQD:
 		return 2;
 	default:
@@ -1235,100 +888,120 @@ qeth_get_q_format(int cardtype)
 	}
 }
 
-inline static int
-qeth_get_device_tx_q_len(int cardtype)
+static inline void
+qeth_ipaddr4_to_string(const __u8 *addr, char *buf)
 {
-	return 100;
+	sprintf(buf, "%i.%i.%i.%i", addr[0], addr[1], addr[2], addr[3]);
 }
 
-inline static int
-qeth_get_max_number_of_buffers(int cardtype)
+/* Parse dotted-decimal IPv4 text into addr[0..3]; -EINVAL on a missing '.'
+ * or a field longer than 3 chars. Only 3 chars of the last field are read. */
+static inline int
+qeth_string_to_ipaddr4(const char *buf, __u8 *addr)
 {
-	return 127;
+	const char *start, *end;
+	char abuf[4];
+	char *tmp;
+	int len, i;
+
+	start = buf;
+	for (i = 0; i < 3; i++) {
+		if (!(end = strchr(start, '.')) || (len = end - start) > 3)
+			return -EINVAL;
+		memset(abuf, 0, 4);
+		strncpy(abuf, start, len);
+		addr[i] = simple_strtoul(abuf, &tmp, 10);
+		start = end + 1;
+	}
+	memset(abuf, 0, 4);
+	strncpy(abuf, start, 3);	/* bounded: abuf[3] stays NUL */
+	addr[3] = simple_strtoul(abuf, &tmp, 10);
+	return 0;
 }
 
-/******************** OUTPUT FACILITIES **************************/
+static inline void
+qeth_ipaddr6_to_string(const __u8 *addr, char *buf)
+{
+	sprintf(buf, "%02x%02x:%02x%02x:%02x%02x:%02x%02x"
+		     ":%02x%02x:%02x%02x:%02x%02x:%02x%02x",
+		     addr[0], addr[1], addr[2], addr[3],
+		     addr[4], addr[5], addr[6], addr[7],
+		     addr[8], addr[9], addr[10], addr[11],
+		     addr[12], addr[13], addr[14], addr[15]);
+}
 
-#ifdef PRINT_INFO
-#undef PRINTK_HEADER
-#undef PRINT_STUPID
-#undef PRINT_ALL
-#undef PRINT_INFO
-#undef PRINT_WARN
-#undef PRINT_ERR
-#undef PRINT_CRIT
-#undef PRINT_ALERT
-#undef PRINT_EMERG
-#endif				/* PRINT_INFO */
+/* Parse 8 ':'-separated hex groups into addr as 8 u16; -EINVAL on a missing
+ * ':' or a group longer than 4 chars. Only 4 chars of the last group are read. */
+static inline int
+qeth_string_to_ipaddr6(const char *buf, __u8 *addr)
+{
+	const char *start, *end;
+	u16 *tmp_addr;
+	char abuf[5];
+	char *tmp;
+	int len, i;
+
+	tmp_addr = (u16 *)addr;
+	start = buf;
+	for (i = 0; i < 7; i++) {
+		if (!(end = strchr(start, ':')) || (len = end - start) > 4)
+			return -EINVAL;
+		memset(abuf, 0, 5);
+		strncpy(abuf, start, len);
+		tmp_addr[i] = simple_strtoul(abuf, &tmp, 16);
+		start = end + 1;
+	}
+	memset(abuf, 0, 5);
+	strncpy(abuf, start, 4);	/* bounded: abuf[4] stays NUL */
+	tmp_addr[7] = simple_strtoul(abuf, &tmp, 16);
+	return 0;
+}
 
-#define PRINTK_HEADER QETH_NAME ": "
+static inline void
+qeth_ipaddr_to_string(enum qeth_prot_versions proto, const __u8 *addr,
+		      char *buf)
+{
+	if (proto == QETH_PROT_IPV4)
+		qeth_ipaddr4_to_string(addr, buf);
+	else if (proto == QETH_PROT_IPV6)
+		qeth_ipaddr6_to_string(addr, buf);
+}	/* any other proto: buf is left unwritten */
 
-#if QETH_VERBOSE_LEVEL>8
-#define PRINT_STUPID(x...) printk( KERN_DEBUG PRINTK_HEADER x)
-#else
-#define PRINT_STUPID(x...)
-#endif
+static inline int
+qeth_string_to_ipaddr(const char *buf, enum qeth_prot_versions proto,
+		      __u8 *addr)
+{
+	if (proto == QETH_PROT_IPV4)
+		return qeth_string_to_ipaddr4(buf, addr);
+	else if (proto == QETH_PROT_IPV6)
+		return qeth_string_to_ipaddr6(buf, addr);
+	else
+		return -EINVAL;
+}
 
-#if QETH_VERBOSE_LEVEL>7
-#define PRINT_ALL(x...) printk( KERN_DEBUG PRINTK_HEADER x)
-#else
-#define PRINT_ALL(x...)
-#endif
+extern int
+qeth_setrouting_v4(struct qeth_card *);
+extern int
+qeth_setrouting_v6(struct qeth_card *);
 
-#if QETH_VERBOSE_LEVEL>6
-#define PRINT_INFO(x...) printk( KERN_INFO PRINTK_HEADER x)
-#else
-#define PRINT_INFO(x...)
-#endif
+int
+qeth_add_ipato_entry(struct qeth_card *, struct qeth_ipato_entry *);
 
-#if QETH_VERBOSE_LEVEL>5
-#define PRINT_WARN(x...) printk( KERN_WARNING PRINTK_HEADER x)
-#else
-#define PRINT_WARN(x...)
-#endif
+void
+qeth_del_ipato_entry(struct qeth_card *, enum qeth_prot_versions, u8 *, int);
 
-#if QETH_VERBOSE_LEVEL>4
-#define PRINT_ERR(x...) printk( KERN_ERR PRINTK_HEADER x)
-#else
-#define PRINT_ERR(x...)
-#endif
+int
+qeth_add_vipa(struct qeth_card *, enum qeth_prot_versions, const u8 *);
 
-#if QETH_VERBOSE_LEVEL>3
-#define PRINT_CRIT(x...) printk( KERN_CRIT PRINTK_HEADER x)
-#else
-#define PRINT_CRIT(x...)
-#endif
+void
+qeth_del_vipa(struct qeth_card *, enum qeth_prot_versions, const u8 *);
 
-#if QETH_VERBOSE_LEVEL>2
-#define PRINT_ALERT(x...) printk( KERN_ALERT PRINTK_HEADER x)
-#else
-#define PRINT_ALERT(x...)
-#endif
+int
+qeth_add_rxip(struct qeth_card *, enum qeth_prot_versions, const u8 *);
 
-#if QETH_VERBOSE_LEVEL>1
-#define PRINT_EMERG(x...) printk( KERN_EMERG PRINTK_HEADER x)
-#else
-#define PRINT_EMERG(x...)
-#endif
-
-#define HEXDUMP16(importance,header,ptr) \
-PRINT_##importance(header "%02x %02x %02x %02x  %02x %02x %02x %02x  " \
-		   "%02x %02x %02x %02x  %02x %02x %02x %02x\n", \
-		   *(((char*)ptr)),*(((char*)ptr)+1),*(((char*)ptr)+2), \
-		   *(((char*)ptr)+3),*(((char*)ptr)+4),*(((char*)ptr)+5), \
-		   *(((char*)ptr)+6),*(((char*)ptr)+7),*(((char*)ptr)+8), \
-		   *(((char*)ptr)+9),*(((char*)ptr)+10),*(((char*)ptr)+11), \
-		   *(((char*)ptr)+12),*(((char*)ptr)+13), \
-		   *(((char*)ptr)+14),*(((char*)ptr)+15)); \
-PRINT_##importance(header "%02x %02x %02x %02x  %02x %02x %02x %02x  " \
-		   "%02x %02x %02x %02x  %02x %02x %02x %02x\n", \
-		   *(((char*)ptr)+16),*(((char*)ptr)+17), \
-		   *(((char*)ptr)+18),*(((char*)ptr)+19), \
-		   *(((char*)ptr)+20),*(((char*)ptr)+21), \
-		   *(((char*)ptr)+22),*(((char*)ptr)+23), \
-		   *(((char*)ptr)+24),*(((char*)ptr)+25), \
-		   *(((char*)ptr)+26),*(((char*)ptr)+27), \
-		   *(((char*)ptr)+28),*(((char*)ptr)+29), \
-		   *(((char*)ptr)+30),*(((char*)ptr)+31));
+void
+qeth_del_rxip(struct qeth_card *, enum qeth_prot_versions, const u8 *);
 
+void
+qeth_schedule_recovery(struct qeth_card *);
 #endif /* __QETH_H__ */
diff --git a/drivers/s390/net/qeth_fs.h b/drivers/s390/net/qeth_fs.h
new file mode 100644
index 000000000000..030aa6a13645
--- /dev/null
+++ b/drivers/s390/net/qeth_fs.h
@@ -0,0 +1,156 @@
+/*
+ * linux/drivers/s390/net/qeth_fs.h
+ *
+ * Linux on zSeries OSA Express and HiperSockets support.
+ *
+ * This header file contains definitions related to sysfs and procfs.
+ *
+ * Copyright 2000,2003 IBM Corporation
+ * Author(s): Thomas Spatzier <tspat@de.ibm.com>
+ *
+ */
+#ifndef __QETH_FS_H__
+#define __QETH_FS_H__
+
+#ifdef CONFIG_PROC_FS
+extern int
+qeth_create_procfs_entries(void);
+
+extern void
+qeth_remove_procfs_entries(void);
+#else
+static inline int
+qeth_create_procfs_entries(void)
+{
+	return 0;
+}
+
+static inline void
+qeth_remove_procfs_entries(void)
+{
+}
+#endif /* CONFIG_PROC_FS */
+
+extern int
+qeth_create_device_attributes(struct device *dev);
+
+extern void
+qeth_remove_device_attributes(struct device *dev);
+
+extern int
+qeth_create_driver_attributes(void);
+
+extern void
+qeth_remove_driver_attributes(void);
+
+/*
+ * utility functions used in qeth_proc.c and qeth_sys.c
+ */
+
+static inline const char *
+qeth_get_checksum_str(struct qeth_card *card)
+{
+	if (card->options.checksum_type == SW_CHECKSUMMING)
+		return "sw";
+	else if (card->options.checksum_type == HW_CHECKSUMMING)
+		return "hw";
+	else
+		return "no";
+}
+
+static inline const char *
+qeth_get_prioq_str(struct qeth_card *card, char *buf)
+{
+	if (card->qdio.do_prio_queueing == QETH_NO_PRIO_QUEUEING)
+		sprintf(buf, "always_q_%i", card->qdio.default_out_queue);
+	else
+		strcpy(buf, (card->qdio.do_prio_queueing ==
+					QETH_PRIO_Q_ING_PREC)?
+				"by_prec." : "by_ToS");
+	return buf;
+}
+
+static inline const char *
+qeth_get_bufsize_str(struct qeth_card *card)
+{
+	if (card->qdio.in_buf_size == 16384)
+		return "16k";
+	else if (card->qdio.in_buf_size == 24576)
+		return "24k";
+	else if (card->qdio.in_buf_size == 32768)
+		return "32k";
+	else if (card->qdio.in_buf_size == 40960)
+		return "40k";
+	else
+		return "64k";
+}
+
+static inline const char *
+qeth_get_cardname(struct qeth_card *card)
+{
+ 	if (card->info.guestlan) {
+ 		switch (card->info.type) {
+ 		case QETH_CARD_TYPE_OSAE:
+			return " Guest LAN QDIO";
+ 		case QETH_CARD_TYPE_IQD:
+			return " Guest LAN Hiper";
+		default:
+			return " unknown";
+ 		}
+	} else {
+		switch (card->info.type) {
+		case QETH_CARD_TYPE_OSAE:
+			return " OSD Express";
+		case QETH_CARD_TYPE_IQD:
+			return " HiperSockets";
+		default:
+			return " unknown";
+		}
+	}
+	return " n/a";
+}
+
+/* max length to be returned: 14 */
+static inline const char *
+qeth_get_cardname_short(struct qeth_card *card)
+{
+	if (card->info.guestlan){
+		switch (card->info.type){
+		case QETH_CARD_TYPE_OSAE:
+			return "GuestLAN QDIO";
+		case QETH_CARD_TYPE_IQD:
+			return "GuestLAN Hiper";
+		default:
+			return "unknown";
+		}
+	} else {
+		switch (card->info.type) {
+		case QETH_CARD_TYPE_OSAE:
+			switch (card->info.link_type) {
+			case QETH_LINK_TYPE_FAST_ETH:
+				return "OSD_100";
+			case QETH_LINK_TYPE_HSTR:
+				return "HSTR";
+			case QETH_LINK_TYPE_GBIT_ETH:
+				return "OSD_1000";
+			case QETH_LINK_TYPE_LANE_ETH100:
+				return "OSD_FE_LANE";
+			case QETH_LINK_TYPE_LANE_TR:
+				return "OSD_TR_LANE";
+			case QETH_LINK_TYPE_LANE_ETH1000:
+				return "OSD_GbE_LANE";
+			case QETH_LINK_TYPE_LANE:
+				return "OSD_ATM_LANE";
+			default:
+				return "OSD_Express";
+			}
+		case QETH_CARD_TYPE_IQD:
+			return "HiperSockets";
+		default:
+			return "unknown";
+		}
+	}
+	return "n/a";
+}
+
+#endif /* __QETH_FS_H__ */
diff --git a/drivers/s390/net/qeth_main.c b/drivers/s390/net/qeth_main.c
new file mode 100644
index 000000000000..414d9c3de0d5
--- /dev/null
+++ b/drivers/s390/net/qeth_main.c
@@ -0,0 +1,6820 @@
+/*
+ *
+ * linux/drivers/s390/net/qeth_main.c ($Revision: 1.77 $)
+ *
+ * Linux on zSeries OSA Express and HiperSockets support
+ *
+ * Copyright 2000,2003 IBM Corporation
+ *
+ *    Author(s): Original Code written by
+ *			  Utz Bacher (utz.bacher@de.ibm.com)
+ *		 Rewritten by
+ *			  Frank Pavlic (pavlic@de.ibm.com) and
+ *		 	  Thomas Spatzier <tspat@de.ibm.com>
+ *
+ *    $Revision: 1.77 $	 $Date: 2004/04/06 14:38:19 $
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+
+/***
+ * eye catcher; an empty function kept just for debugging purposes
+ */
+void volatile
+qeth_eyecatcher(void)
+{
+	return;
+}
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/mm.h>
+
+#include <asm/io.h>
+#include <asm/ebcdic.h>
+#include <linux/ctype.h>
+#include <asm/semaphore.h>
+#include <asm/timex.h>
+#include <linux/ip.h>
+#include <linux/inetdevice.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/tcp.h>
+#include <linux/icmp.h>
+#include <linux/skbuff.h>
+#include <net/route.h>
+#include <net/arp.h>
+#include <linux/in.h>
+#include <linux/igmp.h>
+#include <net/ip.h>
+#include <asm/uaccess.h>
+#include <linux/init.h>
+#include <linux/reboot.h>
+#include <asm/qeth.h>
+#include <linux/mii.h>
+
+#include "qeth.h"
+#include "qeth_mpc.h"
+#include "qeth_fs.h"
+
+#define VERSION_QETH_C "$Revision: 1.77 $"
+static const char *version = "qeth S/390 OSA-Express driver ("
+	VERSION_QETH_C "/" VERSION_QETH_H "/" VERSION_QETH_MPC_H
+	QETH_VERSION_IPV6 QETH_VERSION_VLAN ")";
+/**
+ * Debug facility: one debug_info_t handle per area, all initialised to NULL
+ */
+static debug_info_t *qeth_dbf_setup = NULL;
+static debug_info_t *qeth_dbf_data = NULL;
+static debug_info_t *qeth_dbf_misc = NULL;
+static debug_info_t *qeth_dbf_control = NULL;
+static debug_info_t *qeth_dbf_trace = NULL;
+static debug_info_t *qeth_dbf_sense = NULL;
+static debug_info_t *qeth_dbf_qerr = NULL;
+static char qeth_dbf_text_buf[255];	/* presumably for QETH_DBF_TEXT_() - confirm */
+
+/**
+ * table of known devices; scanned by qeth_determine_card_type()
+ */
+static unsigned int known_devices[][10] = QETH_MODELLIST_ARRAY;
+
+/* list of our cards */
+struct qeth_card_list_struct qeth_card_list;
+
+static void qeth_send_control_data_cb(struct qeth_channel *,
+				      struct qeth_cmd_buffer *);
+
+static atomic_t qeth_hsi_count;	/* decremented when an IQD card is removed */
+
+/**
+ * forward declarations of functions that are used before their definition
+ */
+static void
+qeth_init_qdio_info(struct qeth_card *card);
+
+static int
+qeth_init_qdio_queues(struct qeth_card *card);
+
+static int
+qeth_alloc_qdio_buffers(struct qeth_card *card);
+
+static void
+qeth_free_qdio_buffers(struct qeth_card *);
+
+static void
+qeth_clear_qdio_buffers(struct qeth_card *);
+
+static void
+qeth_clear_ip_list(struct qeth_card *, int, int);
+
+static void
+qeth_clear_ipacmd_list(struct qeth_card *);
+
+static int
+qeth_qdio_clear_card(struct qeth_card *, int);
+
+static void
+qeth_clear_working_pool_list(struct qeth_card *);
+
+static void
+qeth_clear_cmd_buffers(struct qeth_channel *);
+
+static int
+qeth_stop(struct net_device *);
+
+static void
+qeth_clear_ipato_list(struct qeth_card *);
+
+static int
+qeth_is_addr_covered_by_ipato(struct qeth_card *, struct qeth_ipaddr *);
+
+static void
+qeth_irq_tasklet(unsigned long);
+
+static int
+qeth_set_online(struct ccwgroup_device *);
+/**
+ * release the data area of every command buffer of a channel
+ */
+static void
+qeth_clean_channel(struct qeth_channel *channel)
+{
+	struct qeth_cmd_buffer *iob = channel->iob;
+
+	QETH_DBF_TEXT(setup, 2, "freech");
+	for (; iob < channel->iob + QETH_CMD_BUFFER_NO; iob++)
+		kfree(iob->data);
+}
+
+/**
+ * free card: release everything that hangs off the card, then the card itself
+ */
+static void
+qeth_free_card(struct qeth_card *card)
+{
+	/* the trace below dumps the card pointer itself, not the structure */
+	QETH_DBF_TEXT(setup, 2, "freecrd");
+	QETH_DBF_HEX(setup, 2, &card, sizeof(void *));
+	qeth_clean_channel(&card->read);
+	qeth_clean_channel(&card->write);
+	if (card->dev)		/* NULL as long as no net_device is attached */
+		free_netdev(card->dev);
+	qeth_clear_ip_list(card, 0, 0);
+	qeth_clear_ipato_list(card);
+	qeth_free_qdio_buffers(card);
+	kfree(card);
+}
+
+/**
+ * alloc the data area of every command buffer of a channel (all or nothing)
+ */
+static int
+qeth_setup_channel(struct qeth_channel *channel)
+{
+	struct qeth_cmd_buffer *iob;
+	int i;
+
+	QETH_DBF_TEXT(setup, 2, "setupch");
+	for (i = 0; i < QETH_CMD_BUFFER_NO; i++) {
+		iob = &channel->iob[i];
+		iob->data = (char *) kmalloc(QETH_BUFSIZE, GFP_DMA|GFP_KERNEL);
+		if (iob->data == NULL)
+			goto out_free;
+		iob->state = BUF_STATE_FREE;
+		iob->channel = channel;
+		iob->callback = qeth_send_control_data_cb;
+		iob->rc = 0;
+	}
+	channel->buf_no = 0;
+	channel->io_buf_no = 0;
+	atomic_set(&channel->irq_pending, 0);
+	spin_lock_init(&channel->iob_lock);
+	init_waitqueue_head(&channel->wait_q);
+	channel->irq_tasklet.data = (unsigned long) channel;
+	channel->irq_tasklet.func = qeth_irq_tasklet;
+	return 0;
+
+out_free:
+	while (i-- > 0)
+		kfree(channel->iob[i].data);
+	return -ENOMEM;
+}
+
+/**
+ * alloc a zeroed card structure and the buffers of its read/write channels
+ */
+static struct qeth_card *
+qeth_alloc_card(void)
+{
+	struct qeth_card *card;
+
+	QETH_DBF_TEXT(setup, 2, "alloccrd");
+	card = (struct qeth_card *) kmalloc(sizeof(*card), GFP_DMA|GFP_KERNEL);
+	if (!card)
+		return NULL;
+	QETH_DBF_HEX(setup, 2, &card, sizeof(void *));
+	memset(card, 0, sizeof(*card));
+	if (qeth_setup_channel(&card->read))
+		goto out_card;
+	if (qeth_setup_channel(&card->write))
+		goto out_read;
+	return card;
+
+out_read:
+	qeth_clean_channel(&card->read);
+out_card:
+	kfree(card);
+	return NULL;
+}
+
+static long
+__qeth_check_irb_error(struct ccw_device *cdev, struct irb *irb)
+{
+	long err;
+
+	/* a regular irb pointer carries no encoded error */
+	if (!IS_ERR(irb))
+		return 0;
+	err = PTR_ERR(irb);
+	if (err == -EIO) {
+		PRINT_WARN("i/o-error on device %s\n", cdev->dev.bus_id);
+		QETH_DBF_TEXT(trace, 2, "ckirberr");
+		QETH_DBF_TEXT_(trace, 2, "  rc%d", -EIO);
+	} else if (err == -ETIMEDOUT) {
+		PRINT_WARN("timeout on device %s\n", cdev->dev.bus_id);
+		QETH_DBF_TEXT(trace, 2, "ckirberr");
+		QETH_DBF_TEXT_(trace, 2, "  rc%d", -ETIMEDOUT);
+	} else {
+		PRINT_WARN("unknown error %ld on device %s\n", err,
+			   cdev->dev.bus_id);
+		QETH_DBF_TEXT(trace, 2, "ckirberr");
+		QETH_DBF_TEXT(trace, 2, "  rc???");
+	}
+	return err;
+}
+
+/*
+ * Returns 1 for an irb status the caller has to act on, 0 otherwise.
+ */
+static int
+qeth_get_problem(struct ccw_device *cdev, struct irb *irb)
+{
+	char *sense = (char *) irb->ecw;
+	int cstat = irb->scsw.cstat;
+	int dstat = irb->scsw.dstat;
+
+	/* any of these cstat check bits is reported as a problem */
+	if (cstat & (SCHN_STAT_CHN_CTRL_CHK | SCHN_STAT_INTF_CTRL_CHK |
+		     SCHN_STAT_CHN_DATA_CHK | SCHN_STAT_CHAIN_CHECK |
+		     SCHN_STAT_PROT_CHECK | SCHN_STAT_PROG_CHECK)) {
+		QETH_DBF_TEXT(trace,2, "CGENCHK");
+		PRINT_WARN("check on device %s, dstat=x%x, cstat=x%x ",
+			   cdev->dev.bus_id, dstat, cstat);
+		HEXDUMP16(WARN, "irb: ", irb);
+		HEXDUMP16(WARN, "irb: ", ((char *) irb) + 32);
+		return 1;
+	}
+
+	/* without a unit check there is nothing more to look at */
+	if (!(dstat & DEV_STAT_UNIT_CHECK))
+		return 0;
+	/* unit check: classify by the sense bytes */
+	if (sense[SENSE_RESETTING_EVENT_BYTE] & SENSE_RESETTING_EVENT_FLAG) {
+		QETH_DBF_TEXT(trace,2,"REVIND");
+		return 1;
+	}
+	if (sense[SENSE_COMMAND_REJECT_BYTE] & SENSE_COMMAND_REJECT_FLAG) {
+		QETH_DBF_TEXT(trace,2,"CMDREJi");
+		return 0;
+	}
+	if ((sense[2] == 0xaf) && (sense[3] == 0xfe)) {
+		QETH_DBF_TEXT(trace,2,"AFFE");
+		return 1;
+	}
+	if ((!sense[0]) && (!sense[1]) && (!sense[2]) && (!sense[3])) {
+		QETH_DBF_TEXT(trace,2,"ZEROSEN");
+		return 0;
+	}
+	QETH_DBF_TEXT(trace,2,"DGENCHK");
+	return 1;
+}
+static int qeth_issue_next_read(struct qeth_card *);
+
+/**
+ * interrupt handler for all three ccw devices of a card (read, write, data)
+ */
+static void
+qeth_irq(struct ccw_device *cdev, unsigned long intparm, struct irb *irb)
+{
+	int rc;
+	int cstat,dstat;
+	struct qeth_cmd_buffer *buffer;
+	struct qeth_channel *channel;
+	struct qeth_card *card;
+
+	QETH_DBF_TEXT(trace,5,"irq");
+	/* irb may be an ERR_PTR() value; such errors are only reported */
+	if (__qeth_check_irb_error(cdev, irb))
+		return;
+	cstat = irb->scsw.cstat;
+	dstat = irb->scsw.dstat;
+
+	card = CARD_FROM_CDEV(cdev);
+	if (!card)
+		return;
+	/* map the ccw device to the card channel it belongs to */
+	if (card->read.ccwdev == cdev){
+		channel = &card->read;
+		QETH_DBF_TEXT(trace,5,"read");
+	} else if (card->write.ccwdev == cdev) {
+		channel = &card->write;
+		QETH_DBF_TEXT(trace,5,"write");
+	} else {
+		channel = &card->data;
+		QETH_DBF_TEXT(trace,5,"data");
+	}
+	atomic_set(&channel->irq_pending, 0);
+
+	if (irb->scsw.fctl & (SCSW_FCTL_CLEAR_FUNC))
+		channel->state = CH_STATE_STOPPED;
+
+	if (irb->scsw.fctl & (SCSW_FCTL_HALT_FUNC))
+		channel->state = CH_STATE_HALTED;
+
+	/*let's wake up immediately on data channel*/
+	if ((channel == &card->data) && (intparm != 0))
+		goto out;
+
+	if (intparm == QETH_CLEAR_CHANNEL_PARM) {
+		QETH_DBF_TEXT(trace, 6, "clrchpar");
+		/* we don't have to handle this further */
+		intparm = 0;
+	}
+	if (intparm == QETH_HALT_CHANNEL_PARM) {
+		QETH_DBF_TEXT(trace, 6, "hltchpar");
+		/* we don't have to handle this further */
+		intparm = 0;
+	}
+	if ((dstat & DEV_STAT_UNIT_EXCEP) ||
+	    (dstat & DEV_STAT_UNIT_CHECK) ||
+	    (cstat)) {
+		if (irb->esw.esw0.erw.cons) {
+			/* TODO: we should make this s390dbf */
+			PRINT_WARN("sense data available on channel %s.\n",
+				   CHANNEL_ID(channel));
+			PRINT_WARN(" cstat 0x%X\n dstat 0x%X\n", cstat, dstat);
+			HEXDUMP16(WARN,"irb: ",irb);
+			HEXDUMP16(WARN,"sense data: ",irb->ecw);
+		}
+		rc = qeth_get_problem(cdev,irb);
+		if (rc) {
+			qeth_schedule_recovery(card);
+			goto out;
+		}
+	}
+	/* mark the buffer designated by a non-zero intparm as processed */
+	if (intparm) {
+		buffer = (struct qeth_cmd_buffer *) __va((addr_t)intparm);
+		buffer->state = BUF_STATE_PROCESSED;
+	}
+	if (channel == &card->data)
+		return;
+
+	if (channel == &card->read &&
+	    channel->state == CH_STATE_UP)
+		qeth_issue_next_read(card);
+
+	tasklet_schedule(&channel->irq_tasklet);
+	return;
+out:
+	wake_up(&card->wait_q);
+}
+
+/**
+ * tasklet function scheduled from irq handler
+ */
+static void
+qeth_irq_tasklet(unsigned long data)
+{
+	struct qeth_channel *channel = (struct qeth_channel *) data;
+	struct qeth_cmd_buffer *iob;
+	struct qeth_card *card;
+	__u8 index;
+
+	QETH_DBF_TEXT(trace,5,"irqtlet");
+	card = CARD_FROM_CDEV(channel->ccwdev);
+	/* run the callbacks of all processed buffers, starting at buf_no */
+	for (index = channel->buf_no;
+	     channel->iob[index].state == BUF_STATE_PROCESSED;
+	     index = (index + 1) % QETH_CMD_BUFFER_NO) {
+		iob = &channel->iob[index];
+		if (iob->callback != NULL)
+			iob->callback(channel, iob);
+	}
+
+	channel->buf_no = index;
+	wake_up(&card->wait_q);
+}
+
+static int qeth_stop_card(struct qeth_card *);
+
+static int
+qeth_set_offline(struct ccwgroup_device *cgdev)
+{
+	struct qeth_card *card = (struct qeth_card *) cgdev->dev.driver_data;
+	enum qeth_card_states old_state;
+
+	QETH_DBF_TEXT(setup, 3, "setoffl");
+	QETH_DBF_HEX(setup, 3, &card, sizeof(void *));
+	old_state = card->state;
+	if (qeth_stop_card(card) == -ERESTARTSYS){
+		PRINT_WARN("Stopping card %s interrupted by user!\n",
+			   CARD_BUS_ID(card));
+		return -ERESTARTSYS;
+	}
+	ccw_device_set_offline(CARD_DDEV(card));
+	ccw_device_set_offline(CARD_WDEV(card));
+	ccw_device_set_offline(CARD_RDEV(card));
+	/* a card that was up before the stop ends in CARD_STATE_RECOVER */
+	if ((old_state == CARD_STATE_UP_LAN_ONLINE) ||
+	    (old_state == CARD_STATE_UP_LAN_OFFLINE))
+		card->state = CARD_STATE_RECOVER;
+	return 0;
+}
+
+/* tear down a card; card->dev may still be NULL as set by qeth_setup_card() */
+static void
+qeth_remove_device(struct ccwgroup_device *cgdev)
+{
+	struct qeth_card *card = (struct qeth_card *) cgdev->dev.driver_data;
+	unsigned long flags;
+
+	QETH_DBF_TEXT(setup, 3, "rmdev");
+	QETH_DBF_HEX(setup, 3, &card, sizeof(void *));
+	if (!card)
+		return;
+	if (cgdev->state == CCWGROUP_ONLINE){
+		card->use_hard_stop = 1;
+		qeth_set_offline(cgdev);
+	}
+	if (card->info.type == QETH_CARD_TYPE_IQD)
+		atomic_dec(&qeth_hsi_count);
+	/* remove from our internal list */
+	write_lock_irqsave(&qeth_card_list.rwlock, flags);
+	list_del(&card->list);
+	write_unlock_irqrestore(&qeth_card_list.rwlock, flags);
+	if (card->dev)
+		unregister_netdev(card->dev);
+	qeth_free_card(card);
+	cgdev->dev.driver_data = NULL;
+	put_device(&cgdev->dev);
+}
+
+static int
+qeth_register_addr_entry(struct qeth_card *, struct qeth_ipaddr *);
+static int
+qeth_deregister_addr_entry(struct qeth_card *, struct qeth_ipaddr *);
+
+/**
+ * Add/remove address to/from card's ip list, i.e. try to add or remove
+ * reference to/from an IP address that is already registered on the card.
+ * Returns:
+ * 	0  address was on card and its reference count has been adjusted,
+ * 	   but is still > 0, so nothing has to be done
+ * 	   also returns 0 if address was not on card and the todo was to
+ * 	   delete the address -> there is also nothing to be done
+ * 	1  address was not on card and the todo is to add it to the ip list
+ * 	-1 address was on card and its reference count has been decremented
+ * 	   to <= 0 by the todo -> address must be removed from card; only
+ * 	   in this case *__addr is set (to the on-card entry)
+ */
+static int
+__qeth_ref_ip_on_card(struct qeth_card *card, struct qeth_ipaddr *todo,
+		      struct qeth_ipaddr **__addr)
+{
+	struct qeth_ipaddr *addr;
+	int found = 0;
+
+	list_for_each_entry(addr, &card->ip_list, entry) {
+		if ((addr->proto     == QETH_PROT_IPV4)  &&
+		    (todo->proto     == QETH_PROT_IPV4)  &&
+		    (addr->type      == todo->type)      &&
+		    (addr->u.a4.addr == todo->u.a4.addr) &&
+		    (addr->u.a4.mask == todo->u.a4.mask)   ){
+			found = 1;
+			break;
+		}
+		if ((addr->proto       == QETH_PROT_IPV6)     &&
+		    (todo->proto       == QETH_PROT_IPV6)     &&
+		    (addr->type        == todo->type)         &&
+		    (addr->u.a6.pfxlen == todo->u.a6.pfxlen)  &&
+		    (memcmp(&addr->u.a6.addr, &todo->u.a6.addr,
+			    sizeof(struct in6_addr)) == 0))     {
+			found = 1;
+			break;
+		}
+	}
+	if (found){
+		addr->users += todo->users;
+		if (addr->users <= 0){
+			*__addr = addr;
+			return -1;
+		}
+		/* for VIPA and RXIP limit refcount to 1 */
+		if (addr->type != QETH_IP_TYPE_NORMAL)
+			addr->users = 1;
+		return 0;
+	}
+	/* no match: addr is not a valid list entry here, only use todo */
+	if (todo->users > 0){
+		/* for VIPA and RXIP limit refcount to 1 */
+		if (todo->type != QETH_IP_TYPE_NORMAL)
+			todo->users = 1;
+		return 1;
+	}
+	return 0;
+}
+
+/* returns 1 if list holds addr's IP with the same (or a different) type */
+static inline int
+__qeth_address_exists_in_list(struct list_head *list, struct qeth_ipaddr *addr,
+		              int same_type)
+{
+	struct qeth_ipaddr *cur;
+	int type_equal;
+
+	list_for_each_entry(cur, list, entry) {
+		/* caller wants either an equal or a differing type */
+		type_equal = (cur->type == addr->type);
+		if (same_type ? !type_equal : type_equal)
+			continue;
+		if (cur->proto != addr->proto)
+			continue;
+		if ((cur->proto == QETH_PROT_IPV4) &&
+		    (cur->u.a4.addr == addr->u.a4.addr))
+			return 1;
+		if ((cur->proto == QETH_PROT_IPV6) &&
+		    (memcmp(&cur->u.a6.addr, &addr->u.a6.addr,
+			    sizeof(struct in6_addr)) == 0))
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Queue an add (add != 0) or delete (add == 0) todo. If an equal todo is
+ * already in this list we just adjust its reference count.
+ * Returns 0 if we just adjusted a reference count, 1 if addr was queued.
+ */
+static int
+__qeth_insert_ip_todo(struct qeth_card *card, struct qeth_ipaddr *addr, int add)
+{
+	struct qeth_ipaddr *tmp, *t;
+	int found = 0;
+
+	list_for_each_entry_safe(tmp, t, &card->ip_tbd_list, entry) {
+		if ((tmp->proto        == QETH_PROT_IPV4)     &&
+		    (addr->proto       == QETH_PROT_IPV4)     &&
+		    (tmp->type         == addr->type)         &&
+		    (tmp->is_multicast == addr->is_multicast) &&
+		    (tmp->u.a4.addr    == addr->u.a4.addr)    &&
+		    (tmp->u.a4.mask    == addr->u.a4.mask)      ){
+			found = 1;
+			break;
+		}
+		if ((tmp->proto        == QETH_PROT_IPV6)      &&
+		    (addr->proto       == QETH_PROT_IPV6)      &&
+		    (tmp->type         == addr->type)          &&
+		    (tmp->is_multicast == addr->is_multicast)  &&
+		    (tmp->u.a6.pfxlen  == addr->u.a6.pfxlen)   &&
+		    (memcmp(&tmp->u.a6.addr, &addr->u.a6.addr,
+			    sizeof(struct in6_addr)) == 0)        ){
+			found = 1;
+			break;
+		}
+	}
+	if (found){
+		if (addr->users != 0)	/* addr carries its own count */
+			tmp->users += addr->users;
+		else
+			tmp->users += add? 1:-1;
+		if (tmp->users == 0){	/* adds and deletes cancelled out */
+			list_del(&tmp->entry);
+			kfree(tmp);
+		}
+		return 0;	/* addr itself was not linked into the list */
+	} else {
+		if (addr->users == 0)
+			addr->users += add? 1:-1;
+		if (add && (addr->type == QETH_IP_TYPE_NORMAL) &&
+		    qeth_is_addr_covered_by_ipato(card, addr)){
+			QETH_DBF_TEXT(trace, 2, "tkovaddr");
+			addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
+		}
+		list_add_tail(&addr->entry, &card->ip_tbd_list);
+		return 1;
+	}
+}
+
+/**
+ * Queue a delete todo for an IP address; see __qeth_insert_ip_todo()
+ */
+static int
+qeth_delete_ip(struct qeth_card *card, struct qeth_ipaddr *addr)
+{
+	unsigned long flags;
+	int queued;
+
+	QETH_DBF_TEXT(trace,4,"delip");
+	if (addr->proto != QETH_PROT_IPV4) {
+		QETH_DBF_HEX(trace,4,&addr->u.a6.addr,4);
+		QETH_DBF_HEX(trace,4,((char *)&addr->u.a6.addr)+4,4);
+	} else
+		QETH_DBF_HEX(trace,4,&addr->u.a4.addr,4);
+
+	spin_lock_irqsave(&card->ip_lock, flags);
+	queued = __qeth_insert_ip_todo(card, addr, 0);
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+	return queued;
+}
+
+/* Queue an add todo for an IP address; see __qeth_insert_ip_todo() */
+static int
+qeth_add_ip(struct qeth_card *card, struct qeth_ipaddr *addr)
+{
+	unsigned long flags;
+	int queued;
+
+	QETH_DBF_TEXT(trace,4,"addip");
+	if (addr->proto != QETH_PROT_IPV4) {
+		QETH_DBF_HEX(trace,4,&addr->u.a6.addr,4);
+		QETH_DBF_HEX(trace,4,((char *)&addr->u.a6.addr)+4,4);
+	} else
+		QETH_DBF_HEX(trace,4,&addr->u.a4.addr,4);
+	spin_lock_irqsave(&card->ip_lock, flags);
+	queued = __qeth_insert_ip_todo(card, addr, 1);
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+	return queued;
+}
+
+/* move every entry of todos back onto the card's todo list */
+static void
+qeth_reinsert_todos(struct qeth_card *card, struct list_head *todos)
+{
+	struct qeth_ipaddr *todo, *next;
+	int queued;
+
+	list_for_each_entry_safe(todo, next, todos, entry){
+		list_del_init(&todo->entry);
+		/* a negative user count marks a delete request */
+		queued = (todo->users < 0) ? qeth_delete_ip(card, todo)
+					   : qeth_add_ip(card, todo);
+		if (!queued)
+			kfree(todo);
+	}
+}
+
+static void
+qeth_set_ip_addr_list(struct qeth_card *card)
+{
+	struct list_head failed_todos;
+	struct qeth_ipaddr *todo, *addr, *tmp;
+	unsigned long flags;
+	int rc;
+
+	QETH_DBF_TEXT(trace, 2, "sdiplist");
+	QETH_DBF_HEX(trace, 2, &card, sizeof(void *));
+
+	INIT_LIST_HEAD(&failed_todos);
+	/* the todo list is rescanned from here whenever ip_lock was dropped */
+process_todos:
+	spin_lock_irqsave(&card->ip_lock, flags);
+	list_for_each_entry_safe(todo, tmp, &card->ip_tbd_list, entry) {
+		list_del_init(&todo->entry);
+		rc = __qeth_ref_ip_on_card(card, todo, &addr);
+		if (rc == 0) {
+			/* nothing to be done; only adjusted refcount */
+			kfree(todo);
+		} else if (rc == 1) {
+			/* new entry to be added to on-card list */
+			spin_unlock_irqrestore(&card->ip_lock, flags);
+			rc = qeth_register_addr_entry(card, todo);
+			if (!rc){
+				spin_lock_irqsave(&card->ip_lock, flags);
+				list_add_tail(&todo->entry, &card->ip_list);
+				spin_unlock_irqrestore(&card->ip_lock, flags);
+			} else	/* failed_todos is local, no lock needed */
+				list_add_tail(&todo->entry, &failed_todos);
+			goto process_todos;
+		} else if (rc == -1) {
+			/* on-card entry to be removed */
+			list_del_init(&addr->entry);
+			spin_unlock_irqrestore(&card->ip_lock, flags);
+			rc = qeth_deregister_addr_entry(card, addr);
+			if (!rc) {
+				kfree(addr);
+				kfree(todo);
+			} else {	/* put addr back, retry the todo later */
+				spin_lock_irqsave(&card->ip_lock, flags);
+				list_add_tail(&addr->entry, &card->ip_list);
+				list_add_tail(&todo->entry, &failed_todos);
+				spin_unlock_irqrestore(&card->ip_lock, flags);
+			}
+			goto process_todos;
+		}
+	}
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+	qeth_reinsert_todos(card, &failed_todos);
+}
+
+static void qeth_delete_mc_addresses(struct qeth_card *);
+static void qeth_add_multicast_ipv4(struct qeth_card *);
+#ifdef CONFIG_QETH_IPV6
+static void qeth_add_multicast_ipv6(struct qeth_card *);
+#endif
+
+static void
+qeth_set_thread_start_bit(struct qeth_card *card, unsigned long thread)
+{
+	unsigned long irq_flags;
+	/* request a start of the given thread(s) */
+	spin_lock_irqsave(&card->thread_mask_lock, irq_flags);
+	card->thread_start_mask = card->thread_start_mask | thread;
+	spin_unlock_irqrestore(&card->thread_mask_lock, irq_flags);
+}
+
+static void
+qeth_clear_thread_start_bit(struct qeth_card *card, unsigned long thread)
+{
+	unsigned long irq_flags;
+	/* withdraw a start request, then wake whoever waits on the card */
+	spin_lock_irqsave(&card->thread_mask_lock, irq_flags);
+	card->thread_start_mask = card->thread_start_mask & ~thread;
+	spin_unlock_irqrestore(&card->thread_mask_lock, irq_flags);
+	wake_up(&card->wait_q);
+}
+
+static void
+qeth_clear_thread_running_bit(struct qeth_card *card, unsigned long thread)
+{
+	unsigned long irq_flags;
+	/* mark the thread as finished, then wake whoever waits on the card */
+	spin_lock_irqsave(&card->thread_mask_lock, irq_flags);
+	card->thread_running_mask = card->thread_running_mask & ~thread;
+	spin_unlock_irqrestore(&card->thread_mask_lock, irq_flags);
+	wake_up(&card->wait_q);
+}
+
+static inline int
+__qeth_do_run_thread(struct qeth_card *card, unsigned long thread)
+{
+	unsigned long flags;
+	int rc = 0;	/* stays 0 when no start was requested */
+
+	spin_lock_irqsave(&card->thread_mask_lock, flags);
+	if (card->thread_start_mask & thread) {
+		rc = -EPERM;	/* unless the thread can be claimed below */
+		if ((card->thread_allowed_mask & thread) &&
+		    !(card->thread_running_mask & thread)) {
+			rc = 1;
+			card->thread_start_mask &= ~thread;
+			card->thread_running_mask |= thread;
+		}
+	}
+	spin_unlock_irqrestore(&card->thread_mask_lock, flags);
+	return rc;
+}
+
+static int
+qeth_do_run_thread(struct qeth_card *card, unsigned long thread)
+{
+	int rc = 0;
+	/* sleep on card->wait_q for as long as the helper returns -EPERM */
+	wait_event(card->wait_q,
+		   (rc = __qeth_do_run_thread(card, thread)) >= 0);
+	return rc;
+}
+
+static int
+qeth_register_mc_addresses(void *ptr)
+{
+	struct qeth_card *card = (struct qeth_card *) ptr;
+
+	daemonize("getmcaddr");
+	QETH_DBF_TEXT(trace,4,"regmcth1");
+	/* 0 means that no start of this thread was requested */
+	if (!qeth_do_run_thread(card, QETH_SET_MC_THREAD))
+		return 0;
+	QETH_DBF_TEXT(trace,4,"regmcth2");
+	qeth_delete_mc_addresses(card);
+	qeth_add_multicast_ipv4(card);
+#ifdef CONFIG_QETH_IPV6
+	qeth_add_multicast_ipv6(card);
+#endif
+	qeth_set_ip_addr_list(card);
+	qeth_clear_thread_running_bit(card, QETH_SET_MC_THREAD);
+	return 0;
+}
+
+static int
+qeth_register_ip_address(void *ptr)
+{
+	struct qeth_card *card = (struct qeth_card *) ptr;
+
+	daemonize("regip");
+	QETH_DBF_TEXT(trace,4,"regipth1");
+	/* 0 means that no start of this thread was requested */
+	if (!qeth_do_run_thread(card, QETH_SET_IP_THREAD))
+		return 0;
+	QETH_DBF_TEXT(trace,4,"regipth2");
+	qeth_set_ip_addr_list(card);
+	qeth_clear_thread_running_bit(card, QETH_SET_IP_THREAD);
+	return 0;
+}
+
+static int
+qeth_recover(void *ptr)
+{
+	struct qeth_card *card = (struct qeth_card *) ptr;
+	int rc;
+
+	daemonize("recover");
+	QETH_DBF_TEXT(trace,2,"recover1");
+	QETH_DBF_HEX(trace, 2, &card, sizeof(void *));
+	if (!qeth_do_run_thread(card, QETH_RECOVER_THREAD))
+		return 0;
+	QETH_DBF_TEXT(trace,2,"recover2");
+	PRINT_WARN("Recovery of device %s started ...\n",
+		   CARD_BUS_ID(card));
+	/* recovery: set offline with hard stop requested, then online again */
+	card->use_hard_stop = 1;
+	qeth_set_offline(card->gdev);
+	rc = qeth_set_online(card->gdev);
+	if (rc)
+		PRINT_INFO("Device %s could not be recovered!\n",
+			   CARD_BUS_ID(card));
+	else
+		PRINT_INFO("Device %s successfully recovered!\n",
+			   CARD_BUS_ID(card));
+	/* don't run another scheduled recovery */
+	qeth_clear_thread_start_bit(card, QETH_RECOVER_THREAD);
+	qeth_clear_thread_running_bit(card, QETH_RECOVER_THREAD);
+	return 0;
+}
+
+void
+qeth_schedule_recovery(struct qeth_card *card)
+{
+	QETH_DBF_TEXT(trace,2,"startrec");
+	/* flag the recovery thread, then queue the card's thread starter work */
+	qeth_set_thread_start_bit(card, QETH_RECOVER_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+}
+
+static int
+qeth_do_start_thread(struct qeth_card *card, unsigned long thread)
+{
+	unsigned long flags;
+	int requested;
+	/* non-zero if a start of the given thread has been requested */
+	spin_lock_irqsave(&card->thread_mask_lock, flags);
+	QETH_DBF_TEXT_(trace, 4, "  %02x%02x%02x",
+			(u8) card->thread_start_mask,
+			(u8) card->thread_allowed_mask,
+			(u8) card->thread_running_mask);
+	requested = (card->thread_start_mask & thread);
+	spin_unlock_irqrestore(&card->thread_mask_lock, flags);
+	return requested;
+}
+
+static void
+qeth_start_kernel_thread(struct qeth_card *card)
+{
+	QETH_DBF_TEXT(trace, 2, "strthrd");
+	/* nothing is started unless the read or the write channel is up */
+	if (!((card->read.state == CH_STATE_UP) ||
+	      (card->write.state == CH_STATE_UP)))
+		return;
+
+	if (qeth_do_start_thread(card, QETH_SET_IP_THREAD))
+		kernel_thread(qeth_register_ip_address, (void *) card, SIGCHLD);
+	if (qeth_do_start_thread(card, QETH_SET_MC_THREAD))
+		kernel_thread(qeth_register_mc_addresses, (void *)card,SIGCHLD);
+	if (qeth_do_start_thread(card, QETH_RECOVER_THREAD))
+		kernel_thread(qeth_recover, (void *) card, SIGCHLD);
+}
+
+/* set the options of a card to their initial values */
+static void
+qeth_set_intial_options(struct qeth_card *card)
+{
+	card->options.route4.type = NO_ROUTER;
+#ifdef CONFIG_QETH_IPV6
+	card->options.route6.type = NO_ROUTER;
+#endif /* QETH_IPV6 */
+	card->options.checksum_type = QETH_CHECKSUM_DEFAULT;
+	card->options.broadcast_mode = QETH_TR_BROADCAST_ALLRINGS;
+	card->options.macaddr_mode = QETH_TR_MACADDR_NONCANONICAL;
+	card->options.enable_takeover = 1;
+	card->options.fake_broadcast = 0;
+	card->options.add_hhlen = DEFAULT_ADD_HHLEN;
+	card->options.fake_ll = 0;
+}
+
+/**
+ * initialize channels, card and all state machines
+ */
+static int
+qeth_setup_card(struct qeth_card *card)
+{
+	/* always returns 0 */
+	QETH_DBF_TEXT(setup, 2, "setupcrd");
+	QETH_DBF_HEX(setup, 2, &card, sizeof(void *));
+
+	card->read.state  = CH_STATE_DOWN;
+	card->write.state = CH_STATE_DOWN;
+	card->data.state  = CH_STATE_DOWN;
+	card->state = CARD_STATE_DOWN;
+	card->lan_online = 0;
+	card->use_hard_stop = 0;
+	card->dev = NULL;
+#ifdef CONFIG_QETH_VLAN
+	spin_lock_init(&card->vlanlock);
+	card->vlangrp = NULL;
+#endif
+	spin_lock_init(&card->ip_lock);
+	spin_lock_init(&card->thread_mask_lock);
+	card->thread_start_mask = 0;
+	card->thread_allowed_mask = 0;
+	card->thread_running_mask = 0;
+	INIT_WORK(&card->kernel_thread_starter,
+		  (void *)qeth_start_kernel_thread,card);
+	INIT_LIST_HEAD(&card->ip_list);
+	INIT_LIST_HEAD(&card->ip_tbd_list);
+	INIT_LIST_HEAD(&card->cmd_waiter_list);
+	init_waitqueue_head(&card->wait_q);
+	/* initial options */
+	qeth_set_intial_options(card);
+	/* IP address takeover */
+	INIT_LIST_HEAD(&card->ipato.entries);
+	card->ipato.enabled = 0;
+	card->ipato.invert4 = 0;
+	card->ipato.invert6 = 0;
+	/* init QDIO stuff */
+	qeth_init_qdio_info(card);
+	return 0;
+}
+
+static int
+qeth_determine_card_type(struct qeth_card *card)
+{
+	unsigned int *dev;
+	int i;
+
+	QETH_DBF_TEXT(setup, 2, "detcdtyp");
+
+	/* the scan stops at the first row whose card type (column 4) is 0 */
+	for (i = 0; known_devices[i][4]; i++) {
+		dev = known_devices[i];
+		if ((CARD_RDEV(card)->id.dev_type != dev[2]) ||
+		    (CARD_RDEV(card)->id.dev_model != dev[3]))
+			continue;
+		card->info.type = dev[4];
+		card->info.func_level = card->options.enable_takeover ?
+					dev[6] : dev[7];
+		card->qdio.no_out_queues = dev[8];
+		card->info.is_multicast_different = dev[9];
+		return 0;
+	}
+	card->info.type = QETH_CARD_TYPE_UNKNOWN;
+	PRINT_ERR("unknown card type on device %s\n", CARD_BUS_ID(card));
+	return -ENOENT;
+}
+
+static int
+qeth_probe_device(struct ccwgroup_device *gdev)
+{
+	struct qeth_card *card;
+	struct device *dev = &gdev->dev;
+	unsigned long flags;
+	int rc;
+
+	QETH_DBF_TEXT(setup, 2, "probedev");
+
+	if (!get_device(dev))
+		return -ENODEV;
+
+	card = qeth_alloc_card();
+	if (!card) {
+		put_device(dev);
+		QETH_DBF_TEXT_(setup, 2, "1err%d", -ENOMEM);
+		return -ENOMEM;
+	}
+	rc = qeth_setup_card(card);
+	if (rc) {
+		QETH_DBF_TEXT_(setup, 2, "2err%d", rc);
+		goto out_free;
+	}
+	gdev->dev.driver_data = card;
+	card->gdev = gdev;
+	gdev->cdev[0]->handler = qeth_irq;
+	gdev->cdev[1]->handler = qeth_irq;
+	gdev->cdev[2]->handler = qeth_irq;
+
+	rc = qeth_create_device_attributes(dev);
+	if (rc)
+		goto out_free;
+	card->read.ccwdev  = gdev->cdev[0];
+	card->write.ccwdev = gdev->cdev[1];
+	card->data.ccwdev  = gdev->cdev[2];
+	rc = qeth_determine_card_type(card);
+	if (rc) {
+		PRINT_WARN("%s: not a valid card type\n", __func__);
+		QETH_DBF_TEXT_(setup, 2, "3err%d", rc);
+		goto out_free;
+	}
+	/* insert into our internal list */
+	write_lock_irqsave(&qeth_card_list.rwlock, flags);
+	list_add_tail(&card->list, &qeth_card_list.list);
+	write_unlock_irqrestore(&qeth_card_list.rwlock, flags);
+	return rc;
+
+out_free:
+	/* common failure exit: drop the device reference, free the card */
+	put_device(dev);
+	qeth_free_card(card);
+	return rc;
+}
+
+
+static int
+qeth_get_unitaddr(struct qeth_card *card)
+{
+	char *conf_data;	/* filled in by read_conf_data() */
+	int conf_len;
+	int rc;
+
+	QETH_DBF_TEXT(setup, 2, "getunit");
+	rc = read_conf_data(CARD_DDEV(card), (void **) &conf_data, &conf_len);
+	if (rc) {
+		PRINT_ERR("read_conf_data for device %s returned %i\n",
+			  CARD_DDEV_ID(card), rc);
+		return rc;
+	}
+	card->info.chpid = conf_data[30];
+	card->info.unit_addr2 = conf_data[31];
+	card->info.cula = conf_data[63];
+	card->info.guestlan = (conf_data[0x10] == _ascebc['V']) &&
+			      (conf_data[0x11] == _ascebc['M']);
+	return 0;
+}
+
+static void
+qeth_init_tokens(struct qeth_card *card)
+{
+	card->token.issuer_rm_w = 0x00010103UL;	/* fixed start values */
+	card->token.cm_filter_w = 0x00010108UL;
+	card->token.cm_connection_w = 0x0001010aUL;
+	card->token.ulp_filter_w = 0x0001010bUL;
+	card->token.ulp_connection_w = 0x0001010dUL;
+}
+
+static inline __u16
+raw_devno_from_bus_id(char *id)
+{
+	char *last4 = id + (strlen(id) - 4);	/* last four characters */
+	return (__u16) simple_strtoul(last4, &last4, 16);
+}
+/**
+ * setup channel ccw: READ_CCW for the read channel, WRITE_CCW otherwise
+ */
+static void
+qeth_setup_ccw(struct qeth_channel *channel,unsigned char *iob, __u32 len)
+{
+	struct qeth_card *card;
+
+	QETH_DBF_TEXT(trace, 4, "setupccw");
+	card = CARD_FROM_CDEV(channel->ccwdev);
+	if (channel != &card->read)
+		memcpy(&channel->ccw, WRITE_CCW, sizeof(struct ccw1));
+	else
+		memcpy(&channel->ccw, READ_CCW, sizeof(struct ccw1));
+	channel->ccw.count = len;
+	channel->ccw.cda = (__u32) __pa(iob);
+}
+
+/**
+ * get free buffer for ccws (IDX activation, lancmds,ipassists...)
+ */
+static struct qeth_cmd_buffer *
+__qeth_get_buffer(struct qeth_channel *channel)
+{
+	struct qeth_cmd_buffer *iob;
+	__u8 index = channel->io_buf_no;
+
+	QETH_DBF_TEXT(trace, 6, "getbuff");
+	do {
+		iob = channel->iob + index;
+		if (iob->state == BUF_STATE_FREE) {
+			iob->state = BUF_STATE_LOCKED;
+			channel->io_buf_no = (channel->io_buf_no + 1) %
+				QETH_CMD_BUFFER_NO;
+			memset(iob->data, 0, QETH_BUFSIZE);
+			return iob;
+		}
+		index = (index + 1) % QETH_CMD_BUFFER_NO;
+	} while(index != channel->io_buf_no);
+	return NULL;
+}
+
+/**
+ * release command buffer
+ */
+static void
+qeth_release_buffer(struct qeth_channel *channel, struct qeth_cmd_buffer *iob)
+{
+	unsigned long flags;
+	/* reset the buffer to the state qeth_setup_channel() gives it */
+	QETH_DBF_TEXT(trace, 6, "relbuff");
+	spin_lock_irqsave(&channel->iob_lock, flags);
+	memset(iob->data, 0, QETH_BUFSIZE);
+	iob->state = BUF_STATE_FREE;
+	iob->callback = qeth_send_control_data_cb;
+	iob->rc = 0;
+	spin_unlock_irqrestore(&channel->iob_lock, flags);
+}
+
+static struct qeth_cmd_buffer *
+qeth_get_buffer(struct qeth_channel *channel)
+{
+	struct qeth_cmd_buffer *iob;
+	unsigned long flags;
+	/* __qeth_get_buffer() under iob_lock; NULL if no buffer is free */
+	spin_lock_irqsave(&channel->iob_lock, flags);
+	iob = __qeth_get_buffer(channel);
+	spin_unlock_irqrestore(&channel->iob_lock, flags);
+	return iob;
+}
+
+static struct qeth_cmd_buffer *
+qeth_wait_for_buffer(struct qeth_channel *channel)
+{
+	struct qeth_cmd_buffer *buffer;	/* assigned inside the wait condition */
+	wait_event(channel->wait_q,
+		   ((buffer = qeth_get_buffer(channel)) != NULL));
+	return buffer;
+}
+
+static void
+qeth_clear_cmd_buffers(struct qeth_channel *channel)
+{
+	int i;
+
+	for (i = 0; i != QETH_CMD_BUFFER_NO; i++)
+		qeth_release_buffer(channel, channel->iob + i);
+	channel->buf_no = 0;		/* both buffer indices start over */
+	channel->io_buf_no = 0;
+}
+
+/**
+ * Second half of the IDX activation of a read or write channel: issue a
+ * READ ccw to fetch the IDX answer and wait for the channel to reach
+ * CH_STATE_UP.
+ *
+ * idx_reply_cb is installed as the completion callback of the command
+ * buffer used for the read.
+ *
+ * Returns 0 once channel->state is CH_STATE_UP, -ENOMEM if no command
+ * buffer is free, the ccw_device_start() return code if the I/O could not
+ * be started, -ERESTARTSYS if the wait was interrupted and -ETIME if the
+ * state was not reached after waiting QETH_TIMEOUT.
+ */
+static int
+qeth_idx_activate_get_answer(struct qeth_channel *channel,
+			      void (*idx_reply_cb)(struct qeth_channel *,
+						   struct qeth_cmd_buffer *))
+{
+	struct qeth_cmd_buffer *iob;
+	unsigned long flags;
+	int rc;
+	struct qeth_card *card;
+
+	QETH_DBF_TEXT(setup, 2, "idxanswr");
+	card = CARD_FROM_CDEV(channel->ccwdev);
+	iob = qeth_get_buffer(channel);
+	/* qeth_get_buffer() returns NULL when every buffer is in use */
+	if (!iob)
+		return -ENOMEM;
+	iob->callback = idx_reply_cb;
+	memcpy(&channel->ccw, READ_CCW, sizeof(struct ccw1));
+	channel->ccw.count = QETH_BUFSIZE;
+	channel->ccw.cda = (__u32) __pa(iob->data);
+
+	/* claim the channel: only one I/O may be pending at a time */
+	wait_event(card->wait_q,
+		   atomic_compare_and_swap(0,1,&channel->irq_pending) == 0);
+	QETH_DBF_TEXT(setup, 6, "noirqpnd");
+	spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
+	rc = ccw_device_start(channel->ccwdev,
+			      &channel->ccw,(addr_t) iob, 0, 0);
+	spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags);
+
+	if (rc) {
+		PRINT_ERR("qeth: Error2 in activating channel rc=%d\n",rc);
+		QETH_DBF_TEXT_(setup, 2, "2err%d", rc);
+		/* no interrupt will follow, so give the channel back */
+		atomic_set(&channel->irq_pending, 0);
+		wake_up(&card->wait_q);
+		return rc;
+	}
+	rc = wait_event_interruptible_timeout(card->wait_q,
+			 channel->state == CH_STATE_UP, QETH_TIMEOUT);
+	if (rc == -ERESTARTSYS)
+		return rc;
+	if (channel->state != CH_STATE_UP){
+		rc = -ETIME;
+		QETH_DBF_TEXT_(setup, 2, "3err%d", rc);
+	} else
+		rc = 0;
+	return rc;
+}
+
+/*
+ * First half of the IDX activation of a read or write channel: build the
+ * IDX_ACTIVATE request (READ or WRITE flavour, depending on whether the
+ * channel is card->write), send it with a WRITE ccw and wait for the
+ * channel to reach CH_STATE_ACTIVATING.  On success the answer is then
+ * fetched through qeth_idx_activate_get_answer().
+ *
+ * idx_reply_cb is installed as the completion callback of the command
+ * buffer and is passed on to qeth_idx_activate_get_answer().
+ *
+ * Returns -ENOMEM if no command buffer is free, the ccw_device_start()
+ * return code if the I/O could not be started, -ERESTARTSYS if the wait
+ * was interrupted, -ETIME if CH_STATE_ACTIVATING was not reached, and
+ * otherwise the result of qeth_idx_activate_get_answer().
+ */
+static int
+qeth_idx_activate_channel(struct qeth_channel *channel,
+			   void (*idx_reply_cb)(struct qeth_channel *,
+						struct qeth_cmd_buffer *))
+{
+	struct qeth_card *card;
+	struct qeth_cmd_buffer *iob;
+	unsigned long flags;
+	__u16 temp;
+	int rc;
+
+	card = CARD_FROM_CDEV(channel->ccwdev);
+
+	QETH_DBF_TEXT(setup, 2, "idxactch");
+
+	iob = qeth_get_buffer(channel);
+	/* qeth_get_buffer() returns NULL when every buffer is in use */
+	if (!iob)
+		return -ENOMEM;
+	iob->callback = idx_reply_cb;
+	memcpy(&channel->ccw, WRITE_CCW, sizeof(struct ccw1));
+	channel->ccw.count = IDX_ACTIVATE_SIZE;
+	channel->ccw.cda = (__u32) __pa(iob->data);
+	if (channel == &card->write) {
+		memcpy(iob->data, IDX_ACTIVATE_WRITE, IDX_ACTIVATE_SIZE);
+		memcpy(QETH_TRANSPORT_HEADER_SEQ_NO(iob->data),
+		       &card->seqno.trans_hdr, QETH_SEQ_NO_LENGTH);
+		/* only the write channel consumes a transport sequence no. */
+		card->seqno.trans_hdr++;
+	} else {
+		memcpy(iob->data, IDX_ACTIVATE_READ, IDX_ACTIVATE_SIZE);
+		memcpy(QETH_TRANSPORT_HEADER_SEQ_NO(iob->data),
+		       &card->seqno.trans_hdr, QETH_SEQ_NO_LENGTH);
+	}
+	memcpy(QETH_IDX_ACT_ISSUER_RM_TOKEN(iob->data),
+	       &card->token.issuer_rm_w,QETH_MPC_TOKEN_LENGTH);
+	memcpy(QETH_IDX_ACT_FUNC_LEVEL(iob->data),
+	       &card->info.func_level,sizeof(__u16));
+	temp = raw_devno_from_bus_id(CARD_DDEV_ID(card));
+	memcpy(QETH_IDX_ACT_QDIO_DEV_CUA(iob->data), &temp, 2);
+	temp = (card->info.cula << 8) + card->info.unit_addr2;
+	memcpy(QETH_IDX_ACT_QDIO_DEV_REALADDR(iob->data), &temp, 2);
+
+	/* claim the channel: only one I/O may be pending at a time */
+	wait_event(card->wait_q,
+		   atomic_compare_and_swap(0,1,&channel->irq_pending) == 0);
+	QETH_DBF_TEXT(setup, 6, "noirqpnd");
+	spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
+	rc = ccw_device_start(channel->ccwdev,
+			      &channel->ccw,(addr_t) iob, 0, 0);
+	spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags);
+
+	if (rc) {
+		PRINT_ERR("qeth: Error1 in activating channel. rc=%d\n",rc);
+		QETH_DBF_TEXT_(setup, 2, "1err%d", rc);
+		/* no interrupt will follow, so give the channel back */
+		atomic_set(&channel->irq_pending, 0);
+		wake_up(&card->wait_q);
+		return rc;
+	}
+	rc = wait_event_interruptible_timeout(card->wait_q,
+			channel->state == CH_STATE_ACTIVATING, QETH_TIMEOUT);
+	if (rc == -ERESTARTSYS)
+		return rc;
+	if (channel->state != CH_STATE_ACTIVATING) {
+		PRINT_WARN("qeth: IDX activate timed out!\n");
+		QETH_DBF_TEXT_(setup, 2, "2err%d", -ETIME);
+		return -ETIME;
+	}
+	return qeth_idx_activate_get_answer(channel,idx_reply_cb);
+}
+
+/*
+ * Map a function level to the one expected back in an IDX reply:
+ * a low byte of 8 yields 0x400 + low byte; otherwise, if bits 8-9 equal 1,
+ * the result is 0x200 + low byte; any other level is returned unchanged.
+ */
+static int
+qeth_peer_func_level(int level)
+{
+	int low_byte = level & 0xff;
+
+	if (low_byte == 8)
+		return low_byte + 0x400;
+	if (((level >> 8) & 3) == 1)
+		return low_byte + 0x200;
+	return level;
+}
+
+/*
+ * Completion callback for the IDX activation of the write channel.
+ *
+ * A channel still in CH_STATE_DOWN is only moved to CH_STATE_ACTIVATING.
+ * Otherwise the buffer is checked: a negative reply or a function level
+ * that does not match qeth_peer_func_level() (bit 0x0100 is ignored) is
+ * reported and leaves the state alone; a good reply moves the channel to
+ * CH_STATE_UP.  The buffer is released in every case.
+ */
+static void
+qeth_idx_write_cb(struct qeth_channel *channel, struct qeth_cmd_buffer *iob)
+{
+	struct qeth_card *card;
+	__u16 peer_level;
+
+	QETH_DBF_TEXT(setup ,2, "idxwrcb");
+
+	if (channel->state == CH_STATE_DOWN) {
+		channel->state = CH_STATE_ACTIVATING;
+	} else {
+		card = CARD_FROM_CDEV(channel->ccwdev);
+
+		if (!(QETH_IS_IDX_ACT_POS_REPLY(iob->data))) {
+			PRINT_ERR("IDX_ACTIVATE on write channel device %s: "
+				  "negative reply\n", CARD_WDEV_ID(card));
+		} else {
+			memcpy(&peer_level,
+			       QETH_IDX_ACT_FUNC_LEVEL(iob->data), 2);
+			if ((peer_level & ~0x0100) !=
+			    qeth_peer_func_level(card->info.func_level)) {
+				PRINT_WARN("IDX_ACTIVATE on write channel "
+					   "device %s: "
+					   "function level mismatch "
+					   "(sent: 0x%x, received: 0x%x)\n",
+					   CARD_WDEV_ID(card),
+					   card->info.func_level, peer_level);
+			} else {
+				channel->state = CH_STATE_UP;
+			}
+		}
+	}
+	qeth_release_buffer(channel, iob);
+}
+
+/*
+ * Check a received control buffer for an IDX TERMINATE.
+ *
+ * Returns 0 for a NULL buffer or when the two top bits of buffer[2] are
+ * not both set.  Otherwise the terminate (cause code in buffer[4]) is
+ * logged and -EIO is returned.
+ */
+static int
+qeth_check_idx_response(unsigned char *buffer)
+{
+	if (!buffer)
+		return 0;
+
+	QETH_DBF_HEX(control, 2, buffer, QETH_DBF_CONTROL_LEN);
+	if ((buffer[2] & 0xc0) != 0xc0)
+		return 0;
+
+	PRINT_WARN("received an IDX TERMINATE "
+		   "with cause code 0x%02x%s\n",
+		   buffer[4],
+		   ((buffer[4] == 0x22) ?
+		    " -- try another portname" : ""));
+	QETH_DBF_TEXT(trace, 2, "ckidxres");
+	QETH_DBF_TEXT(trace, 2, " idxterm");
+	QETH_DBF_TEXT_(trace, 2, "  rc%d", -EIO);
+	return -EIO;
+}
+
+/*
+ * Completion callback for the IDX activation of the read channel.
+ *
+ * A channel still in CH_STATE_DOWN is only moved to CH_STATE_ACTIVATING.
+ * Otherwise the reply in iob->data is validated (no IDX TERMINATE,
+ * positive reply, matching function level); on success the issuer token
+ * and the MCL level are copied into the card and the channel becomes
+ * CH_STATE_UP.  The buffer is released on every path.
+ */
+static void
+qeth_idx_read_cb(struct qeth_channel *channel, struct qeth_cmd_buffer *iob)
+{
+	struct qeth_card *card;
+	__u16 temp;
+
+	QETH_DBF_TEXT(setup , 2, "idxrdcb");
+	if (channel->state == CH_STATE_DOWN) {
+		channel->state = CH_STATE_ACTIVATING;
+		goto out;
+	}
+
+	card = CARD_FROM_CDEV(channel->ccwdev);
+	if (qeth_check_idx_response(iob->data)) {
+			goto out;
+	}
+	if (!(QETH_IS_IDX_ACT_POS_REPLY(iob->data))) {
+		PRINT_ERR("IDX_ACTIVATE on read channel device %s: negative "
+			  "reply\n", CARD_RDEV_ID(card));
+		goto out;
+	}
+
+/**
+ * temporary fix for microcode bug:
+ * OSAE cards always get portname_required set, whatever the reply says.
+ * to revert it, replace OR by AND
+ */
+	if ( (!QETH_IDX_NO_PORTNAME_REQUIRED(iob->data)) ||
+	     (card->info.type == QETH_CARD_TYPE_OSAE) )
+		card->info.portname_required = 1;
+
+	memcpy(&temp, QETH_IDX_ACT_FUNC_LEVEL(iob->data), 2);
+	if (temp != qeth_peer_func_level(card->info.func_level)) {
+		PRINT_WARN("IDX_ACTIVATE on read channel device %s: function "
+			   "level mismatch (sent: 0x%x, received: 0x%x)\n",
+			   CARD_RDEV_ID(card), card->info.func_level, temp);
+		goto out;
+	}
+	memcpy(&card->token.issuer_rm_r,
+	       QETH_IDX_ACT_ISSUER_RM_TOKEN(iob->data),
+	       QETH_MPC_TOKEN_LENGTH);
+	memcpy(&card->info.mcl_level[0],
+	       QETH_IDX_REPLY_LEVEL(iob->data), QETH_MCL_LENGTH);
+	channel->state = CH_STATE_UP;
+out:
+	qeth_release_buffer(channel,iob);
+}
+
+/*
+ * Start the next READ on the card's read channel.
+ *
+ * Returns -EIO if the read channel is not CH_STATE_UP, -ENOMEM if no
+ * command buffer is free, and otherwise the ccw_device_start() return
+ * code.  When starting the I/O fails, the pending flag is cleared,
+ * recovery is scheduled and waiters on card->wait_q are woken.
+ */
+static int
+qeth_issue_next_read(struct qeth_card *card)
+{
+	struct qeth_channel *rd = &card->read;
+	struct qeth_cmd_buffer *iob;
+	int rc;
+
+	QETH_DBF_TEXT(trace,5,"issnxrd");
+	if (rd->state != CH_STATE_UP)
+		return -EIO;
+	iob = qeth_get_buffer(rd);
+	if (iob == NULL) {
+		PRINT_WARN("issue_next_read failed: no iob available!\n");
+		return -ENOMEM;
+	}
+	qeth_setup_ccw(rd, iob->data, QETH_BUFSIZE);
+	/* claim the channel: only one I/O may be pending at a time */
+	wait_event(card->wait_q,
+		   atomic_compare_and_swap(0,1,&rd->irq_pending) == 0);
+	QETH_DBF_TEXT(trace, 6, "noirqpnd");
+	rc = ccw_device_start(rd->ccwdev, &rd->ccw, (addr_t) iob, 0, 0);
+	if (!rc)
+		return 0;
+
+	PRINT_ERR("Error in starting next read ccw! rc=%i\n", rc);
+	atomic_set(&rd->irq_pending, 0);
+	qeth_schedule_recovery(card);
+	wake_up(&card->wait_q);
+	return rc;
+}
+
+/*
+ * Allocate a zeroed qeth_reply bound to card, with a reference count of 1.
+ * Returns NULL if the allocation fails.
+ */
+static struct qeth_reply *
+qeth_alloc_reply(struct qeth_card *card)
+{
+	struct qeth_reply *reply;
+
+	reply = kmalloc(sizeof(struct qeth_reply), GFP_KERNEL|GFP_ATOMIC);
+	if (!reply)
+		return NULL;
+
+	memset(reply, 0, sizeof(struct qeth_reply));
+	atomic_set(&reply->refcnt, 1);
+	reply->card = card;
+	return reply;
+}
+
+/*
+ * Take an additional reference on reply.  Warns if the count was already
+ * zero or negative.
+ */
+static void
+qeth_get_reply(struct qeth_reply *reply)
+{
+	atomic_t *refs = &reply->refcnt;
+
+	WARN_ON(atomic_read(refs) <= 0);
+	atomic_inc(refs);
+}
+
+/*
+ * Drop one reference on reply and free it when that was the last one.
+ * Warns if the count was already zero or negative.
+ */
+static void
+qeth_put_reply(struct qeth_reply *reply)
+{
+	atomic_t *refs = &reply->refcnt;
+
+	WARN_ON(atomic_read(refs) <= 0);
+	if (!atomic_dec_and_test(refs))
+		return;
+	kfree(reply);
+}
+
+/*
+ * Timer function for a control command; data is the qeth_reply.
+ *
+ * If the reply is still on the card's cmd_waiter_list it is unlinked,
+ * completed with rc = -ETIME and its waiter is woken.  A reply that is no
+ * longer on the list is left untouched.
+ */
+static void
+qeth_cmd_timeout(unsigned long data)
+{
+	struct qeth_reply *expired = (struct qeth_reply *) data;
+	struct qeth_reply *cursor, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&expired->card->lock, flags);
+	list_for_each_entry_safe(cursor, next,
+				 &expired->card->cmd_waiter_list, list) {
+		if (cursor != expired)
+			continue;
+		/* hold a reference while working on it without the lock */
+		qeth_get_reply(expired);
+		list_del_init(&expired->list);
+		spin_unlock_irqrestore(&expired->card->lock, flags);
+		expired->rc = -ETIME;
+		expired->received = 1;
+		wake_up(&expired->wait_q);
+		qeth_put_reply(expired);
+		return;
+	}
+	spin_unlock_irqrestore(&expired->card->lock, flags);
+}
+
+/*
+ * Classify the contents of a received control buffer.
+ *
+ * Returns NULL if the buffer holds no IPA data, or if it held a STOPLAN
+ * or STARTLAN command (which are handled right here by closing/opening
+ * the net device).  Returns the IPA command for IPA replies and for all
+ * other unsolicited IPA commands, which are only logged.
+ */
+static struct qeth_ipa_cmd *
+qeth_check_ipa_data(struct qeth_card *card, struct qeth_cmd_buffer *iob)
+{
+	struct qeth_ipa_cmd *cmd;
+	enum qeth_card_states prev_state;
+
+	QETH_DBF_TEXT(trace,5,"chkipad");
+	if (!(IS_IPA(iob->data)))
+		return NULL;
+
+	cmd = (struct qeth_ipa_cmd *) PDU_ENCAPSULATION(iob->data);
+	if (IS_IPA_REPLY(cmd))
+		return cmd;
+
+	/* an IPA command that was not sent in reply to one of ours */
+	switch (cmd->hdr.command) {
+	case IPA_CMD_STOPLAN:
+		PRINT_WARN("Link failure on %s (CHPID 0x%X) - "
+			   "there is a network problem or "
+			   "someone pulled the cable or "
+			   "disabled the port. Setting state "
+			   "of interface to DOWN.\n",
+			   card->info.if_name,
+			   card->info.chpid);
+		card->lan_online = 0;
+		/* sampled before dev_close() runs */
+		prev_state = card->state;
+		rtnl_lock();
+		dev_close(card->dev);
+		rtnl_unlock();
+		if ((prev_state == CARD_STATE_UP_LAN_ONLINE) ||
+		    (prev_state == CARD_STATE_UP_LAN_OFFLINE))
+			card->state = CARD_STATE_UP_LAN_OFFLINE;
+		return NULL;
+	case IPA_CMD_STARTLAN:
+		PRINT_INFO("Link reestablished on %s "
+			   "(CHPID 0x%X)\n",
+			   card->info.if_name,
+			   card->info.chpid);
+		card->lan_online = 1;
+		if (card->state == CARD_STATE_UP_LAN_OFFLINE){
+			rtnl_lock();
+			dev_open(card->dev);
+			rtnl_unlock();
+		}
+		return NULL;
+	case IPA_CMD_REGISTER_LOCAL_ADDR:
+		QETH_DBF_TEXT(trace,3, "irla");
+		break;
+	case IPA_CMD_UNREGISTER_LOCAL_ADDR:
+		PRINT_WARN("probably problem on %s: "
+			   "received IPA command 0x%X\n",
+			   card->info.if_name,
+			   cmd->hdr.command);
+		break;
+	default:
+		PRINT_WARN("Received data is IPA "
+			   "but not a reply!\n");
+		break;
+	}
+	return cmd;
+}
+
+/**
+ * Wake all waiting ipa commands.
+ *
+ * Every reply on card->cmd_waiter_list is completed with rc = -EIO,
+ * unlinked and its waiter is woken; all of this happens under card->lock.
+ */
+static void
+qeth_clear_ipacmd_list(struct qeth_card *card)
+{
+	struct qeth_reply *waiter, *next;
+	unsigned long irq_flags;
+
+	QETH_DBF_TEXT(trace, 4, "clipalst");
+
+	spin_lock_irqsave(&card->lock, irq_flags);
+	list_for_each_entry_safe(waiter, next, &card->cmd_waiter_list, list) {
+		/* keep the reply alive until we are done with it */
+		qeth_get_reply(waiter);
+		waiter->rc = -EIO;
+		waiter->received = 1;
+		list_del_init(&waiter->list);
+		wake_up(&waiter->wait_q);
+		qeth_put_reply(waiter);
+	}
+	spin_unlock_irqrestore(&card->lock, irq_flags);
+}
+
+/*
+ * Completion callback for control data: match the received buffer against
+ * the commands waiting on card->cmd_waiter_list and complete the matching
+ * one.
+ *
+ * An IDX TERMINATE fails all waiters and schedules recovery.  On every
+ * path the PDU header sequence number found in the buffer is saved in
+ * card->seqno.pdu_hdr_ack and the buffer is released.
+ */
+static void
+qeth_send_control_data_cb(struct qeth_channel *channel,
+			  struct qeth_cmd_buffer *iob)
+{
+	struct qeth_card *card;
+	struct qeth_reply *reply, *r;
+	struct qeth_ipa_cmd *cmd;
+	unsigned long flags;
+	int keep_reply;
+
+	QETH_DBF_TEXT(trace,4,"sndctlcb");
+
+	card = CARD_FROM_CDEV(channel->ccwdev);
+	if (qeth_check_idx_response(iob->data)) {
+		qeth_clear_ipacmd_list(card);
+		qeth_schedule_recovery(card);
+		goto out;
+	}
+
+	/* without an IPA command only a card in CARD_STATE_DOWN goes on */
+	cmd = qeth_check_ipa_data(card, iob);
+	if ((cmd == NULL) && (card->state != CARD_STATE_DOWN))
+		goto out;
+
+	spin_lock_irqsave(&card->lock, flags);
+	list_for_each_entry_safe(reply, r, &card->cmd_waiter_list, list) {
+		/* match by the IDX marker seqno or by the IPA seqno */
+		if ((reply->seqno == QETH_IDX_COMMAND_SEQNO) ||
+		    ((cmd) && (reply->seqno == cmd->hdr.seqno))) {
+			/* take a reference and unlink before dropping the lock */
+			qeth_get_reply(reply);
+			list_del_init(&reply->list);
+			spin_unlock_irqrestore(&card->lock, flags);
+			keep_reply = 0;
+			/* the callback gets the IPA command if any, else the buffer */
+			if (reply->callback != NULL) {
+				if (cmd)
+					keep_reply = reply->callback(card,
+							reply,
+							(unsigned long)cmd);
+				else
+					keep_reply = reply->callback(card,
+							reply,
+							(unsigned long)iob);
+			}
+			if (cmd)
+				reply->rc = cmd->hdr.return_code;
+			else if (iob->rc)
+				reply->rc = iob->rc;
+			/* nonzero from the callback: re-queue, do not wake yet */
+			if (keep_reply) {
+				spin_lock_irqsave(&card->lock, flags);
+				list_add_tail(&reply->list,
+					      &card->cmd_waiter_list);
+				spin_unlock_irqrestore(&card->lock, flags);
+			} else {
+				reply->received = 1;
+				wake_up(&reply->wait_q);
+			}
+			qeth_put_reply(reply);
+			goto out;
+		}
+	}
+	spin_unlock_irqrestore(&card->lock, flags);
+out:
+	memcpy(&card->seqno.pdu_hdr_ack,
+		QETH_PDU_HEADER_SEQ_NO(iob->data),
+		QETH_SEQ_NO_LENGTH);
+	qeth_release_buffer(channel,iob);
+}
+
+/*
+ * Send a control command on the write channel and sleep until its reply
+ * has arrived or timed out.
+ *
+ * @card:        card to send on
+ * @len:         number of bytes of iob->data to transfer
+ * @iob:         prepared command buffer; it is consumed by this function
+ *               (released here on failure, otherwise by its completion
+ *               callback)
+ * @reply_cb:    optional callback run when the matching reply arrives
+ * @reply_param: stored in the reply for use by reply_cb
+ *
+ * Returns -ENOMEM if no reply structure could be allocated, the
+ * ccw_device_start() return code if the I/O could not be started, and
+ * otherwise the rc recorded in the reply (-ETIME when qeth_cmd_timeout()
+ * fired first).
+ */
+static int
+qeth_send_control_data(struct qeth_card *card, int len,
+		       struct qeth_cmd_buffer *iob,
+		       int (*reply_cb)
+		       (struct qeth_card *, struct qeth_reply*, unsigned long),
+		       void *reply_param)
+{
+	int rc;
+	unsigned long flags;
+	struct qeth_reply *reply;
+	struct timer_list timer;
+
+	QETH_DBF_TEXT(trace, 2, "sendctl");
+
+	qeth_setup_ccw(&card->write,iob->data,len);
+
+	memcpy(QETH_TRANSPORT_HEADER_SEQ_NO(iob->data),
+	       &card->seqno.trans_hdr, QETH_SEQ_NO_LENGTH);
+	card->seqno.trans_hdr++;
+
+	memcpy(QETH_PDU_HEADER_SEQ_NO(iob->data),
+	       &card->seqno.pdu_hdr, QETH_SEQ_NO_LENGTH);
+	card->seqno.pdu_hdr++;
+	memcpy(QETH_PDU_HEADER_ACK_SEQ_NO(iob->data),
+	       &card->seqno.pdu_hdr_ack, QETH_SEQ_NO_LENGTH);
+	/* completion of the write only has to give the buffer back */
+	iob->callback = qeth_release_buffer;
+
+	reply = qeth_alloc_reply(card);
+	if (!reply) {
+		PRINT_WARN("Could no alloc qeth_reply!\n");
+		/* nobody else will free the buffer: it was never started */
+		qeth_release_buffer(iob->channel, iob);
+		return -ENOMEM;
+	}
+	reply->callback = reply_cb;
+	reply->param = reply_param;
+	if (card->state == CARD_STATE_DOWN)
+		reply->seqno = QETH_IDX_COMMAND_SEQNO;
+	else
+		reply->seqno = card->seqno.ipa++;
+	init_timer(&timer);
+	timer.function = qeth_cmd_timeout;
+	timer.data = (unsigned long) reply;
+	timer.expires = jiffies + QETH_TIMEOUT;
+	init_waitqueue_head(&reply->wait_q);
+	/* queue the reply before the I/O starts so the answer finds it */
+	spin_lock_irqsave(&card->lock, flags);
+	list_add_tail(&reply->list, &card->cmd_waiter_list);
+	spin_unlock_irqrestore(&card->lock, flags);
+	QETH_DBF_HEX(control, 2, iob->data, QETH_DBF_CONTROL_LEN);
+	/* claim the channel: only one I/O may be pending at a time */
+	wait_event(card->wait_q,
+		   atomic_compare_and_swap(0,1,&card->write.irq_pending) == 0);
+	QETH_DBF_TEXT(trace, 6, "noirqpnd");
+	spin_lock_irqsave(get_ccwdev_lock(card->write.ccwdev), flags);
+	rc = ccw_device_start(card->write.ccwdev, &card->write.ccw,
+			      (addr_t) iob, 0, 0);
+	spin_unlock_irqrestore(get_ccwdev_lock(card->write.ccwdev), flags);
+	if (rc){
+		PRINT_WARN("qeth_send_control_data: "
+			   "ccw_device_start rc = %i\n", rc);
+		QETH_DBF_TEXT_(trace, 2, " err%d", rc);
+		/* undo everything: unlink and drop the reply, free the buffer */
+		spin_lock_irqsave(&card->lock, flags);
+		list_del_init(&reply->list);
+		qeth_put_reply(reply);
+		spin_unlock_irqrestore(&card->lock, flags);
+		qeth_release_buffer(iob->channel, iob);
+		atomic_set(&card->write.irq_pending, 0);
+		wake_up(&card->wait_q);
+		return rc;
+	}
+	add_timer(&timer);
+	wait_event(reply->wait_q, reply->received);
+	del_timer(&timer);
+	rc = reply->rc;
+	qeth_put_reply(reply);
+	return rc;
+}
+
+/*
+ * Send an IPA command: put the IPA PDU header and the ULP connection
+ * token in front of the command already stored in iob->data (at offset
+ * IPA_PDU_HEADER_SIZE) and hand the buffer to qeth_send_control_data().
+ *
+ * reply_cb and reply_param are passed through unchanged.  Returns the
+ * result of qeth_send_control_data().
+ */
+static int
+qeth_send_ipa_cmd(struct qeth_card *card, struct qeth_cmd_buffer *iob,
+		  int (*reply_cb)
+		  (struct qeth_card *,struct qeth_reply*, unsigned long),
+		  void *reply_param)
+{
+	QETH_DBF_TEXT(trace,4,"sendipa");
+
+	memcpy(iob->data, IPA_PDU_HEADER, IPA_PDU_HEADER_SIZE);
+	memcpy(QETH_IPA_CMD_DEST_ADDR(iob->data),
+	       &card->token.ulp_connection_r, QETH_MPC_TOKEN_LENGTH);
+
+	return qeth_send_control_data(card, IPA_CMD_LENGTH, iob,
+				      reply_cb, reply_param);
+}
+
+
+/*
+ * Reply callback for CM_ENABLE; data is the received command buffer.
+ * Stores the filter token from the response in card->token.cm_filter_r.
+ * Always returns 0.
+ */
+static int
+qeth_cm_enable_cb(struct qeth_card *card, struct qeth_reply *reply,
+		  unsigned long data)
+{
+	struct qeth_cmd_buffer *answer = (struct qeth_cmd_buffer *) data;
+
+	QETH_DBF_TEXT(setup, 2, "cmenblcb");
+
+	memcpy(&card->token.cm_filter_r,
+	       QETH_CM_ENABLE_RESP_FILTER_TOKEN(answer->data),
+	       QETH_MPC_TOKEN_LENGTH);
+	QETH_DBF_TEXT_(setup, 2, "  rc%d", answer->rc);
+	return 0;
+}
+
+/*
+ * Build and send the CM_ENABLE command on the write channel.  The reply
+ * is processed by qeth_cm_enable_cb().  Returns the result of
+ * qeth_send_control_data().
+ */
+static int
+qeth_cm_enable(struct qeth_card *card)
+{
+	struct qeth_cmd_buffer *iob;
+
+	QETH_DBF_TEXT(setup,2,"cmenable");
+
+	iob = qeth_wait_for_buffer(&card->write);
+	/* start from the template, then fill in the per-card tokens */
+	memcpy(iob->data, CM_ENABLE, CM_ENABLE_SIZE);
+	memcpy(QETH_CM_ENABLE_ISSUER_RM_TOKEN(iob->data),
+	       &card->token.issuer_rm_r, QETH_MPC_TOKEN_LENGTH);
+	memcpy(QETH_CM_ENABLE_FILTER_TOKEN(iob->data),
+	       &card->token.cm_filter_w, QETH_MPC_TOKEN_LENGTH);
+
+	return qeth_send_control_data(card, CM_ENABLE_SIZE, iob,
+				      qeth_cm_enable_cb, NULL);
+}
+
+/*
+ * Reply callback for CM_SETUP; data is the received command buffer.
+ * Stores the connection token from the response in
+ * card->token.cm_connection_r.  Always returns 0.
+ */
+static int
+qeth_cm_setup_cb(struct qeth_card *card, struct qeth_reply *reply,
+		 unsigned long data)
+{
+	struct qeth_cmd_buffer *answer = (struct qeth_cmd_buffer *) data;
+
+	QETH_DBF_TEXT(setup, 2, "cmsetpcb");
+
+	memcpy(&card->token.cm_connection_r,
+	       QETH_CM_SETUP_RESP_DEST_ADDR(answer->data),
+	       QETH_MPC_TOKEN_LENGTH);
+	QETH_DBF_TEXT_(setup, 2, "  rc%d", answer->rc);
+	return 0;
+}
+
+/*
+ * Build and send the CM_SETUP command on the write channel.  The reply is
+ * processed by qeth_cm_setup_cb().  Returns the result of
+ * qeth_send_control_data().
+ */
+static int
+qeth_cm_setup(struct qeth_card *card)
+{
+	struct qeth_cmd_buffer *iob;
+
+	QETH_DBF_TEXT(setup,2,"cmsetup");
+
+	iob = qeth_wait_for_buffer(&card->write);
+	/* start from the template, then fill in the per-card tokens */
+	memcpy(iob->data, CM_SETUP, CM_SETUP_SIZE);
+	memcpy(QETH_CM_SETUP_DEST_ADDR(iob->data),
+	       &card->token.issuer_rm_r, QETH_MPC_TOKEN_LENGTH);
+	memcpy(QETH_CM_SETUP_CONNECTION_TOKEN(iob->data),
+	       &card->token.cm_connection_w, QETH_MPC_TOKEN_LENGTH);
+	memcpy(QETH_CM_SETUP_FILTER_TOKEN(iob->data),
+	       &card->token.cm_filter_r, QETH_MPC_TOKEN_LENGTH);
+
+	return qeth_send_control_data(card, CM_SETUP_SIZE, iob,
+				      qeth_cm_setup_cb, NULL);
+}
+
+/*
+ * Reply callback for ULP_ENABLE; data is the received command buffer.
+ *
+ * Stores the ULP filter token, then derives the MTU settings and the
+ * inbound buffer size: from the frame size in the response when
+ * qeth_get_mtu_out_of_mpc() says so for this card type, from per-card
+ * defaults otherwise.  Finally the link type is taken from the response
+ * if its difinfo length indicates one is present, else it is set to 0.
+ *
+ * Always returns 0; an unusable frame size is reported through
+ * iob->rc = -EINVAL.
+ */
+static int
+qeth_ulp_enable_cb(struct qeth_card *card, struct qeth_reply *reply,
+		   unsigned long data)
+{
+	struct qeth_cmd_buffer *iob = (struct qeth_cmd_buffer *) data;
+	__u16 mtu, framesize;
+	__u16 difinfo_len;
+	__u8 link_type = 0;
+
+	QETH_DBF_TEXT(setup, 2, "ulpenacb");
+
+	memcpy(&card->token.ulp_filter_r,
+	       QETH_ULP_ENABLE_RESP_FILTER_TOKEN(iob->data),
+	       QETH_MPC_TOKEN_LENGTH);
+	if (!qeth_get_mtu_out_of_mpc(card->info.type)) {
+		card->info.initial_mtu = qeth_get_initial_mtu_for_card(card);
+		card->info.max_mtu = qeth_get_max_mtu_for_card(card->info.type);
+		card->qdio.in_buf_size = QETH_IN_BUF_SIZE_DEFAULT;
+	} else {
+		memcpy(&framesize, QETH_ULP_ENABLE_RESP_MAX_MTU(iob->data), 2);
+		mtu = qeth_get_mtu_outof_framesize(framesize);
+		if (!mtu) {
+			iob->rc = -EINVAL;
+			QETH_DBF_TEXT_(setup, 2, "  rc%d", iob->rc);
+			return 0;
+		}
+		card->info.max_mtu = mtu;
+		card->info.initial_mtu = mtu;
+		card->qdio.in_buf_size = mtu + 2 * PAGE_SIZE;
+	}
+
+	memcpy(&difinfo_len, QETH_ULP_ENABLE_RESP_DIFINFO_LEN(iob->data), 2);
+	if (difinfo_len >= QETH_MPC_DIFINFO_LEN_INDICATES_LINK_TYPE)
+		memcpy(&link_type,
+		       QETH_ULP_ENABLE_RESP_LINK_TYPE(iob->data), 1);
+	card->info.link_type = link_type;
+	QETH_DBF_TEXT_(setup, 2, "  rc%d", iob->rc);
+	return 0;
+}
+
+/*
+ * Build and send the ULP_ENABLE command on the write channel: port
+ * number, CM connection token, ULP filter token and 9 bytes of
+ * card->info.portname are filled into the template.  The reply is
+ * processed by qeth_ulp_enable_cb().  Returns the result of
+ * qeth_send_control_data().
+ */
+static int
+qeth_ulp_enable(struct qeth_card *card)
+{
+	struct qeth_cmd_buffer *iob;
+
+	/*FIXME: trace view callbacks*/
+	QETH_DBF_TEXT(setup,2,"ulpenabl");
+
+	iob = qeth_wait_for_buffer(&card->write);
+	memcpy(iob->data, ULP_ENABLE, ULP_ENABLE_SIZE);
+
+	*(QETH_ULP_ENABLE_LINKNUM(iob->data)) =
+		(__u8) card->info.portno;
+
+	memcpy(QETH_ULP_ENABLE_DEST_ADDR(iob->data),
+	       &card->token.cm_connection_r, QETH_MPC_TOKEN_LENGTH);
+	memcpy(QETH_ULP_ENABLE_FILTER_TOKEN(iob->data),
+	       &card->token.ulp_filter_w, QETH_MPC_TOKEN_LENGTH);
+	memcpy(QETH_ULP_ENABLE_PORTNAME_AND_LL(iob->data),
+	       card->info.portname, 9);
+
+	return qeth_send_control_data(card, ULP_ENABLE_SIZE, iob,
+				      qeth_ulp_enable_cb, NULL);
+}
+
+/*
+ * Parse the last four characters of a bus id as a hexadecimal number and
+ * return it truncated to 16 bits.
+ * NOTE(review): assumes id is at least four characters long - confirm
+ * against the callers.
+ */
+static inline __u16
+__raw_devno_from_bus_id(char *id)
+{
+	char *tail = id + (strlen(id) - 4);
+
+	return (__u16) simple_strtoul(tail, &tail, 16);
+}
+
+/*
+ * Reply callback for ULP_SETUP; data is the received command buffer.
+ * Stores the connection token from the response in
+ * card->token.ulp_connection_r.  Always returns 0.
+ */
+static int
+qeth_ulp_setup_cb(struct qeth_card *card, struct qeth_reply *reply,
+		  unsigned long data)
+{
+	struct qeth_cmd_buffer *answer = (struct qeth_cmd_buffer *) data;
+
+	QETH_DBF_TEXT(setup, 2, "ulpstpcb");
+
+	memcpy(&card->token.ulp_connection_r,
+	       QETH_ULP_SETUP_RESP_CONNECTION_TOKEN(answer->data),
+	       QETH_MPC_TOKEN_LENGTH);
+	QETH_DBF_TEXT_(setup, 2, "  rc%d", answer->rc);
+	return 0;
+}
+
+/*
+ * Build and send the ULP_SETUP command on the write channel: the tokens,
+ * the device number parsed from the data device's bus id and the real
+ * device address (cula << 8 plus unit_addr2) are filled into the
+ * template.  The reply is processed by qeth_ulp_setup_cb().  Returns the
+ * result of qeth_send_control_data().
+ */
+static int
+qeth_ulp_setup(struct qeth_card *card)
+{
+	struct qeth_cmd_buffer *iob;
+	__u16 devno;
+	__u16 real_addr;
+
+	QETH_DBF_TEXT(setup,2,"ulpsetup");
+
+	iob = qeth_wait_for_buffer(&card->write);
+	memcpy(iob->data, ULP_SETUP, ULP_SETUP_SIZE);
+
+	memcpy(QETH_ULP_SETUP_DEST_ADDR(iob->data),
+	       &card->token.cm_connection_r, QETH_MPC_TOKEN_LENGTH);
+	memcpy(QETH_ULP_SETUP_CONNECTION_TOKEN(iob->data),
+	       &card->token.ulp_connection_w, QETH_MPC_TOKEN_LENGTH);
+	memcpy(QETH_ULP_SETUP_FILTER_TOKEN(iob->data),
+	       &card->token.ulp_filter_r, QETH_MPC_TOKEN_LENGTH);
+
+	devno = __raw_devno_from_bus_id(CARD_DDEV_ID(card));
+	memcpy(QETH_ULP_SETUP_CUA(iob->data), &devno, 2);
+	real_addr = (card->info.cula << 8) + card->info.unit_addr2;
+	memcpy(QETH_ULP_SETUP_REAL_DEVADDR(iob->data), &real_addr, 2);
+
+	return qeth_send_control_data(card, ULP_SETUP_SIZE, iob,
+				      qeth_ulp_setup_cb, NULL);
+}
+
+/*
+ * Returns 1 if either qdio_error or siga_error is nonzero, after tracing
+ * both values together with the flags of buffer elements 14 and 15.
+ * Returns 0 otherwise.
+ */
+static inline int
+qeth_check_for_inbound_error(struct qeth_qdio_buffer *buf,
+			     unsigned int qdio_error,
+			     unsigned int siga_error)
+{
+	if (!qdio_error && !siga_error)
+		return 0;
+
+	QETH_DBF_TEXT(trace, 2, "qdinerr");
+	QETH_DBF_TEXT(qerr, 2, "qdinerr");
+	QETH_DBF_TEXT_(qerr, 2, " F15=%02X",
+		       buf->buffer->element[15].flags & 0xff);
+	QETH_DBF_TEXT_(qerr, 2, " F14=%02X",
+		       buf->buffer->element[14].flags & 0xff);
+	QETH_DBF_TEXT_(qerr, 2, " qerr=%X", qdio_error);
+	QETH_DBF_TEXT_(qerr, 2, " serr=%X", siga_error);
+	return 1;
+}
+
+/*
+ * QDIO input handler; card_ptr is the qeth_card.
+ *
+ * An activate check condition schedules recovery and returns.  Otherwise
+ * the 'count' buffers starting at first_element (wrapping at
+ * QDIO_MAX_BUFFERS_PER_Q) are marked QETH_QDIO_BUF_ERROR or
+ * QETH_QDIO_BUF_PRIMED and the input tasklet is scheduled.
+ */
+static void
+qeth_qdio_input_handler(struct ccw_device * ccwdev, unsigned int status,
+			unsigned int qdio_err, unsigned int siga_err,
+			unsigned int queue, int first_element, int count,
+			unsigned long card_ptr)
+{
+	struct net_device *net_dev;
+	struct qeth_card *card;
+	struct qeth_qdio_buffer *buffer;
+	int i;
+
+	QETH_DBF_TEXT(trace, 6, "qdinput");
+	card = (struct qeth_card *) card_ptr;
+	net_dev = card->dev;
+#ifdef CONFIG_QETH_PERF_STATS
+	card->perf_stats.inbound_start_time = qeth_get_micros();
+#endif
+	if ((status & QDIO_STATUS_LOOK_FOR_ERROR) &&
+	    (status & QDIO_STATUS_ACTIVATE_CHECK_CONDITION)) {
+		QETH_DBF_TEXT(trace, 1,"qdinchk");
+		QETH_DBF_TEXT_(trace,1,"%s",CARD_BUS_ID(card));
+		QETH_DBF_TEXT_(trace,1,"%04X%04X",first_element,count);
+		QETH_DBF_TEXT_(trace,1,"%04X%04X", queue, status);
+		qeth_schedule_recovery(card);
+		return;
+	}
+	for (i = first_element; i < (first_element + count); ++i) {
+		buffer = &card->qdio.in_q->bufs[i % QDIO_MAX_BUFFERS_PER_Q];
+		/*
+		 * NOTE(review): status is compared with '==' here but
+		 * tested with '&' above, so the error check is skipped
+		 * whenever any other status bit is set - confirm that
+		 * this is intended.
+		 */
+		if ((status != QDIO_STATUS_LOOK_FOR_ERROR) ||
+		    !qeth_check_for_inbound_error(buffer, qdio_err, siga_err))
+			buffer->state = QETH_QDIO_BUF_PRIMED;
+		else
+			buffer->state = QETH_QDIO_BUF_ERROR;
+	}
+
+	tasklet_schedule(&card->qdio.in_tasklet);
+}
+
+/*
+ * Allocate an skb for 'length' bytes of data.  With CONFIG_QETH_VLAN an
+ * extra VLAN_HLEN bytes are allocated and reserved as headroom.  Returns
+ * NULL if the allocation fails.
+ */
+static inline struct sk_buff *
+qeth_get_skb(unsigned int length)
+{
+	struct sk_buff *skb;
+
+#ifdef CONFIG_QETH_VLAN
+	skb = dev_alloc_skb(length + VLAN_HLEN);
+	if (skb)
+		skb_reserve(skb, VLAN_HLEN);
+#else
+	skb = dev_alloc_skb(length);
+#endif
+	return skb;
+}
+
+/*
+ * Extract the next packet from an inbound qdio buffer into a new skb.
+ *
+ * @card:      card the buffer belongs to
+ * @buffer:    the qdio buffer (only used for the debug dump)
+ * @__element: in/out - buffer element to continue reading from
+ * @__offset:  in/out - byte offset within that element
+ * @hdr:       out - points at the qeth_hdr of the packet inside the buffer
+ *
+ * The cursor (*__element, *__offset) is only advanced when an skb is
+ * returned.  Returns NULL when no further header fits, when the header
+ * length is 0, when no skb could be allocated (rx_dropped is counted) or
+ * when the buffer ends before the packet does (rx_errors is counted).
+ */
+static inline struct sk_buff *
+qeth_get_next_skb(struct qeth_card *card, struct qdio_buffer *buffer,
+		  struct qdio_buffer_element **__element, int *__offset,
+		  struct qeth_hdr **hdr)
+{
+	struct qdio_buffer_element *element = *__element;
+	int offset = *__offset;
+	struct sk_buff *skb = NULL;
+	int skb_len;
+	void *data_ptr;
+	int data_len;
+
+	QETH_DBF_TEXT(trace,6,"nextskb");
+	/* qeth_hdr must not cross element boundaries */
+	if (element->length < offset + sizeof(struct qeth_hdr)){
+		if (qeth_is_last_sbale(element))
+			return NULL;
+		element++;
+		offset = 0;
+		if (element->length < sizeof(struct qeth_hdr))
+			return NULL;
+	}
+	*hdr = element->addr + offset;
+
+	offset += sizeof(struct qeth_hdr);
+	skb_len = (*hdr)->length;
+	if (!skb_len)
+		return NULL;
+	if (card->options.fake_ll){
+		if (!(skb = qeth_get_skb(skb_len + QETH_FAKE_LL_LEN)))
+			goto no_mem;
+		/*
+		 * Keep QETH_FAKE_LL_LEN bytes of headroom for the fake
+		 * link level header.  The skb is still empty here, so this
+		 * has to be skb_reserve(); skb_pull() cannot remove bytes
+		 * from an empty skb and would leave the headroom as is.
+		 */
+		skb_reserve(skb, QETH_FAKE_LL_LEN);
+	} else if (!(skb = qeth_get_skb(skb_len)))
+		goto no_mem;
+	data_ptr = element->addr + offset;
+	/* gather the packet, which may span several buffer elements */
+	while (skb_len) {
+		data_len = min(skb_len, (int)(element->length - offset));
+		if (data_len)
+			memcpy(skb_put(skb, data_len), data_ptr, data_len);
+		skb_len -= data_len;
+		if (skb_len){
+			if (qeth_is_last_sbale(element)){
+				/* buffer ends in the middle of the packet */
+				QETH_DBF_TEXT(trace,4,"unexeob");
+				QETH_DBF_TEXT_(trace,4,"%s",CARD_BUS_ID(card));
+				QETH_DBF_TEXT(qerr,2,"unexeob");
+				QETH_DBF_TEXT_(qerr,2,"%s",CARD_BUS_ID(card));
+				QETH_DBF_HEX(misc,4,buffer,sizeof(*buffer));
+				dev_kfree_skb_irq(skb);
+				card->stats.rx_errors++;
+				return NULL;
+			}
+			element++;
+			offset = 0;
+			data_ptr = element->addr;
+		} else {
+			offset += data_len;
+		}
+	}
+	*__element = element;
+	*__offset = offset;
+	return skb;
+no_mem:
+	if (net_ratelimit()){
+		PRINT_WARN("No memory for packet received on %s.\n",
+			   card->info.if_name);
+		QETH_DBF_TEXT(trace,2,"noskbmem");
+		QETH_DBF_TEXT_(trace,2,"%s",CARD_BUS_ID(card));
+	}
+	card->stats.rx_dropped++;
+	return NULL;
+}
+
+/*
+ * Determine the protocol of a received frame that carries a link level
+ * header, set skb->mac.raw and skb->pkt_type, and pull the header (two
+ * MAC addresses plus a short) off the skb.
+ *
+ * With CONFIG_TR, token ring link types are delegated to tr_type_trans().
+ * Returns the protocol in network byte order.
+ */
+static inline unsigned short
+qeth_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+	struct qeth_card *card;
+	struct ethhdr *eth;
+
+	QETH_DBF_TEXT(trace,5,"typtrans");
+
+	card = (struct qeth_card *)dev->priv;
+#ifdef CONFIG_TR
+	if ((card->info.link_type == QETH_LINK_TYPE_HSTR) ||
+	    (card->info.link_type == QETH_LINK_TYPE_LANE_TR))
+		return tr_type_trans(skb,dev);
+#endif /* CONFIG_TR */
+
+	skb->mac.raw = skb->data;
+	skb_pull(skb, ETH_ALEN * 2 + sizeof (short));
+	eth = skb->mac.ethernet;
+
+	/* lowest bit of the first destination byte = group address */
+	if (!(*eth->h_dest & 1))
+		skb->pkt_type = PACKET_OTHERHOST;
+	else if (memcmp(eth->h_dest, dev->broadcast, ETH_ALEN) == 0)
+		skb->pkt_type = PACKET_BROADCAST;
+	else
+		skb->pkt_type = PACKET_MULTICAST;
+
+	/* values from 1536 on are taken as the protocol itself */
+	if (ntohs(eth->h_proto) >= 1536)
+		return eth->h_proto;
+	return (*(unsigned short *) (skb->data) == 0xFFFF) ?
+		htons(ETH_P_802_3) : htons(ETH_P_802_2);
+}
+
+
+/*
+ * Build a fake ethernet header in the QETH_FAKE_LL_LEN bytes in front of
+ * skb->data and point skb->mac.raw at it.
+ *
+ * Destination MAC: for multicast packets it is mapped from the IP
+ * destination address (IPv6 only with CONFIG_QETH_IPV6), for broadcast
+ * packets it is all ones, for everything else the device's own address.
+ * Source MAC: taken from hdr->dest_addr[2..] when the header flags say it
+ * is present, otherwise all zeroes.  The protocol is skb->protocol.
+ */
+static inline void
+qeth_rebuild_skb_fake_ll(struct qeth_card *card, struct sk_buff *skb,
+			 struct qeth_hdr *hdr)
+{
+	struct ethhdr *fake_hdr;
+	struct iphdr *ip_hdr;
+
+	QETH_DBF_TEXT(trace,5,"skbfake");
+	skb->mac.raw = skb->data - QETH_FAKE_LL_LEN;
+	/* this is a fake ethernet header */
+	fake_hdr = (struct ethhdr *) skb->mac.raw;
+
+	/* the destination MAC address */
+	switch (skb->pkt_type){
+	case PACKET_MULTICAST:
+		switch (skb->protocol){
+#ifdef CONFIG_QETH_IPV6
+		case __constant_htons(ETH_P_IPV6):
+			/*
+			 * Add the offset to the byte pointer before casting:
+			 * a cast binds tighter than '+', so without the
+			 * parentheses the offset would be scaled by
+			 * sizeof(struct in6_addr).
+			 * NOTE(review): assumes QETH_FAKE_LL_V6_ADDR_POS is
+			 * a byte offset from skb->data - confirm against its
+			 * definition.
+			 */
+			ndisc_mc_map((struct in6_addr *)
+				     (skb->data + QETH_FAKE_LL_V6_ADDR_POS),
+				     fake_hdr->h_dest, card->dev, 0);
+			break;
+#endif /* CONFIG_QETH_IPV6 */
+		case __constant_htons(ETH_P_IP):
+			ip_hdr = (struct iphdr *)skb->data;
+			if (card->dev->type == ARPHRD_IEEE802_TR)
+				ip_tr_mc_map(ip_hdr->daddr, fake_hdr->h_dest);
+			else
+				ip_eth_mc_map(ip_hdr->daddr, fake_hdr->h_dest);
+			break;
+		default:
+			memcpy(fake_hdr->h_dest, card->dev->dev_addr, ETH_ALEN);
+		}
+		break;
+	case PACKET_BROADCAST:
+		memset(fake_hdr->h_dest, 0xff, ETH_ALEN);
+		break;
+	default:
+		memcpy(fake_hdr->h_dest, card->dev->dev_addr, ETH_ALEN);
+	}
+	/* the source MAC address */
+	if (hdr->ext_flags & QETH_HDR_EXT_SRC_MAC_ADDR)
+		memcpy(fake_hdr->h_source, &hdr->dest_addr[2], ETH_ALEN);
+	else
+		memset(fake_hdr->h_source, 0, ETH_ALEN);
+	/* the protocol */
+	fake_hdr->h_proto = skb->protocol;
+}
+
+/*
+ * With CONFIG_QETH_VLAN: if the qeth header marks the frame as a VLAN
+ * frame, push a VLAN_HLEN byte tag (hdr->vlan_id followed by the current
+ * protocol) in front of the data and switch skb->protocol to ETH_P_8021Q.
+ * Without CONFIG_QETH_VLAN this is a no-op.
+ */
+static inline void
+qeth_rebuild_skb_vlan(struct qeth_card *card, struct sk_buff *skb,
+		      struct qeth_hdr *hdr)
+{
+#ifdef CONFIG_QETH_VLAN
+	u16 *tag;
+
+	if (!(hdr->ext_flags & QETH_HDR_EXT_VLAN_FRAME))
+		return;
+	tag = (u16 *) skb_push(skb, VLAN_HLEN);
+	tag[0] = hdr->vlan_id;
+	tag[1] = skb->protocol;
+	skb->protocol = __constant_htons(ETH_P_8021Q);
+#endif /* CONFIG_QETH_VLAN */
+}
+
+
+/*
+ * Fill in the skb meta data (protocol, pkt_type, mac header, checksum
+ * state, VLAN tag) from the qeth header of a received packet.
+ *
+ * With CONFIG_QETH_IPV6, passthru frames only get their protocol set via
+ * qeth_type_trans() and nothing else is touched.
+ */
+static inline void
+qeth_rebuild_skb(struct qeth_card *card, struct sk_buff *skb,
+		 struct qeth_hdr *hdr)
+{
+#ifdef CONFIG_QETH_IPV6
+	if (hdr->flags & QETH_HDR_PASSTHRU){
+		skb->protocol = qeth_type_trans(skb, card->dev);
+		return;
+	}
+#endif /* CONFIG_QETH_IPV6 */
+	if (hdr->flags & QETH_HDR_IPV6)
+		skb->protocol = htons(ETH_P_IPV6);
+	else
+		skb->protocol = htons(ETH_P_IP);
+
+	switch (hdr->flags & QETH_HDR_CAST_MASK){
+	case QETH_CAST_MULTICAST:
+		skb->pkt_type = PACKET_MULTICAST;
+		card->stats.multicast++;
+		break;
+	case QETH_CAST_BROADCAST:
+		skb->pkt_type = PACKET_BROADCAST;
+		/* broadcasts are counted in the multicast statistic too */
+		card->stats.multicast++;
+		break;
+	case QETH_CAST_UNICAST:
+	case QETH_CAST_ANYCAST:
+	case QETH_CAST_NOCAST:
+	default:
+		skb->pkt_type = PACKET_HOST;
+		break;
+	}
+
+	if (card->options.fake_ll)
+		qeth_rebuild_skb_fake_ll(card, skb, hdr);
+	else
+		skb->mac.raw = skb->data;
+
+	/*
+	 * With hardware checksumming the packet only counts as verified
+	 * if both the header and the transport checksum flag are set.
+	 */
+	if (card->options.checksum_type != HW_CHECKSUMMING)
+		skb->ip_summed = card->options.checksum_type;
+	else if ( (hdr->ext_flags &
+		   (QETH_HDR_EXT_CSUM_HDR_REQ |
+		    QETH_HDR_EXT_CSUM_TRANSP_REQ)) ==
+		  (QETH_HDR_EXT_CSUM_HDR_REQ |
+		   QETH_HDR_EXT_CSUM_TRANSP_REQ) )
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+	else
+		skb->ip_summed = SW_CHECKSUMMING;
+
+	qeth_rebuild_skb_vlan(card, skb, hdr);
+}
+
+
+/*
+ * Take the first entry off the card's inbound buffer pool list.
+ *
+ * Returns the unlinked entry, or NULL if the pool is empty.  The empty
+ * case must be tested explicitly: a list iterator run over an empty list
+ * leaves its cursor pointing at the container of the list head, which is
+ * not NULL and not a valid entry.
+ */
+static inline struct qeth_buffer_pool_entry *
+qeth_get_buffer_pool_entry(struct qeth_card *card)
+{
+	struct list_head *pool = &card->qdio.in_buf_pool.entry_list;
+	struct qeth_buffer_pool_entry *entry;
+
+	QETH_DBF_TEXT(trace, 6, "gtbfplen");
+	if (list_empty(pool))
+		return NULL;
+	entry = list_entry(pool->next, struct qeth_buffer_pool_entry, list);
+	list_del_init(&entry->list);
+	return entry;
+}
+
+/*
+ * Attach a pool entry to an inbound qdio buffer: every buffer element
+ * gets PAGE_SIZE as length and the matching element of the pool entry as
+ * address, the last element is flagged SBAL_FLAGS_LAST_ENTRY, and the
+ * buffer state becomes QETH_QDIO_BUF_EMPTY.
+ */
+static inline void
+qeth_init_input_buffer(struct qeth_card *card, struct qeth_qdio_buffer *buf)
+{
+	struct qeth_buffer_pool_entry *pool_entry;
+	int i;
+
+	pool_entry = qeth_get_buffer_pool_entry(card);
+	/*
+	 * since the buffer is accessed only from the input_tasklet
+	 * there shouldn't be a need to synchronize; also, since we use
+	 * the QETH_IN_BUF_REQUEUE_THRESHOLD we should never run out of
+	 * buffers
+	 */
+	BUG_ON(pool_entry == NULL);
+
+	buf->pool_entry = pool_entry;
+	for (i = 0; i < QETH_MAX_BUFFER_ELEMENTS(card); ++i) {
+		buf->buffer->element[i].length = PAGE_SIZE;
+		buf->buffer->element[i].addr = pool_entry->elements[i];
+		buf->buffer->element[i].flags =
+			(i == QETH_MAX_BUFFER_ELEMENTS(card) - 1) ?
+			SBAL_FLAGS_LAST_ENTRY : 0;
+	}
+	buf->state = QETH_QDIO_BUF_EMPTY;
+}
+
+/*
+ * Reset an outbound buffer: clear length, address and flags of every
+ * buffer element, drop and free all skbs queued on buf->skb_list, and
+ * mark the buffer QETH_QDIO_BUF_EMPTY with nothing filled in.
+ */
+static void
+qeth_clear_output_buffer(struct qeth_card *card,
+			 struct qeth_qdio_out_buffer *buf)
+{
+	struct sk_buff *queued;
+	int idx;
+
+	for (idx = 0; idx < QETH_MAX_BUFFER_ELEMENTS(card); ++idx) {
+		buf->buffer->element[idx].length = 0;
+		buf->buffer->element[idx].addr = NULL;
+		buf->buffer->element[idx].flags = 0;
+		/* the skb list is drained on each pass over an element */
+		while ((queued = skb_dequeue(&buf->skb_list)) != NULL) {
+			atomic_dec(&queued->users);
+			dev_kfree_skb_irq(queued);
+		}
+	}
+	buf->next_element_to_fill = 0;
+	buf->state = QETH_QDIO_BUF_EMPTY;
+}
+
+/*
+ * Give processed inbound buffers back to the hardware.
+ *
+ * 'index' is a position in the inbound ring.  The number of buffers to
+ * requeue is the pool's buf_count minus the ring distance from 'index' to
+ * queue->next_buf_to_init.  Nothing is done until that number reaches
+ * QETH_IN_BUF_REQUEUE_THRESHOLD(card); then that many buffers, starting
+ * at next_buf_to_init, are initialised and handed over with do_QDIO().
+ */
+static inline void
+qeth_queue_input_buffer(struct qeth_card *card, int index)
+{
+	struct qeth_qdio_q *queue = card->qdio.in_q;
+	int count;
+	int i;
+	int rc;
+
+	QETH_DBF_TEXT(trace,6,"queinbuf");
+	/* second branch: the distance to next_buf_to_init wraps the ring */
+	count = (index < queue->next_buf_to_init)?
+		card->qdio.in_buf_pool.buf_count -
+		(queue->next_buf_to_init - index) :
+		card->qdio.in_buf_pool.buf_count -
+		(queue->next_buf_to_init + QDIO_MAX_BUFFERS_PER_Q - index);
+	/* only requeue at a certain threshold to avoid SIGAs */
+	if (count >= QETH_IN_BUF_REQUEUE_THRESHOLD(card)){
+		for (i = queue->next_buf_to_init;
+		     i < queue->next_buf_to_init + count; ++i)
+			qeth_init_input_buffer(card,
+				&queue->bufs[i % QDIO_MAX_BUFFERS_PER_Q]);
+		/*
+		 * according to old code it should be avoided to requeue all
+		 * 128 buffers in order to benefit from PCI avoidance.
+		 * this function keeps at least one buffer (the buffer at
+		 * 'index') un-requeued -> this buffer is the first buffer that
+		 * will be requeued the next time
+		 */
+		rc = do_QDIO(CARD_DDEV(card),
+			     QDIO_FLAG_SYNC_INPUT,
+			     0, queue->next_buf_to_init, count, NULL);
+		if (rc){
+			PRINT_WARN("qeth_queue_input_buffer's do_QDIO "
+				   "return %i (device %s).\n",
+				   rc, CARD_DDEV_ID(card));
+			QETH_DBF_TEXT(trace,2,"qinberr");
+			QETH_DBF_TEXT_(trace,2,"%s",CARD_BUS_ID(card));
+		}
+		/* the ring position is advanced even if do_QDIO() failed */
+		queue->next_buf_to_init = (queue->next_buf_to_init + count) %
+					  QDIO_MAX_BUFFERS_PER_Q;
+	}
+}
+
+static inline void
+qeth_put_buffer_pool_entry(struct qeth_card *card,
+			   struct qeth_buffer_pool_entry *entry)
+{	/* append 'entry' to the tail of the working input buffer pool */
+	QETH_DBF_TEXT(trace, 6, "ptbfplen");
+	list_add_tail(&entry->list, &card->qdio.in_buf_pool.entry_list);
+}
+
+static void
+qeth_qdio_input_tasklet(unsigned long data)
+{
+	struct qeth_card *card = (struct qeth_card *) data;
+	int current_buf = card->qdio.in_q->next_buf_to_process;
+	struct qeth_qdio_buffer *buf;
+	struct qdio_buffer_element *element;
+	int offset;
+	struct sk_buff *skb;
+	struct qeth_hdr *hdr;
+
+	QETH_DBF_TEXT(trace,6,"qdintlet");
+	/* walk the ring from next_buf_to_process while buffers are ready */
+	buf = &card->qdio.in_q->bufs[current_buf];
+	while((buf->state == QETH_QDIO_BUF_PRIMED) ||
+	      (buf->state == QETH_QDIO_BUF_ERROR)){
+		if (buf->state == QETH_QDIO_BUF_ERROR)
+			goto clear_buffer;	/* contents are not delivered */
+		if (netif_queue_stopped(card->dev))
+			goto clear_buffer;	/* contents are not delivered */
+		/* get first element of current buffer */
+		element = (struct qdio_buffer_element *)
+			&buf->buffer->element[0];
+		offset = 0;
+#ifdef CONFIG_QETH_PERF_STATS
+		card->perf_stats.bufs_rec++;
+#endif
+		while((skb = qeth_get_next_skb(card, buf->buffer, &element,
+					       &offset, &hdr))){
+			qeth_rebuild_skb(card, skb, hdr);
+#ifdef CONFIG_QETH_PERF_STATS
+			card->perf_stats.inbound_time += qeth_get_micros() -
+				card->perf_stats.inbound_start_time;
+			card->perf_stats.inbound_cnt++;
+#endif
+			skb->dev = card->dev;
+			if (netif_queue_stopped(card->dev)) {
+				dev_kfree_skb_irq(skb);
+				card->stats.rx_dropped++;
+			} else {
+				/* netif_rx() consumes skb: account first */
+				card->stats.rx_packets++;
+				card->stats.rx_bytes += skb->len;
+				netif_rx(skb);
+				card->dev->last_rx = jiffies;
+			}
+		}
+clear_buffer:
+		qeth_put_buffer_pool_entry(card, buf->pool_entry);
+		/* give buffer back to hardware */
+		qeth_queue_input_buffer(card, current_buf);
+		current_buf = (current_buf + 1) % QDIO_MAX_BUFFERS_PER_Q;
+		buf = &card->qdio.in_q->bufs[current_buf];
+	}
+	/* set index for next time the tasklet is scheduled */
+	card->qdio.in_q->next_buf_to_process = current_buf;
+}
+
+static inline int
+qeth_handle_send_error(struct qeth_card *card,
+		       struct qeth_qdio_out_buffer *buffer,
+		       int qdio_err, int siga_err)
+{
+	int flags15 = buffer->buffer->element[15].flags & 0xff;
+
+	QETH_DBF_TEXT(trace, 6, "hdsnderr");
+	/* dispatch on the condition code in the low two bits of siga_err */
+	switch (siga_err & 3) {
+	case 0:
+		if (!qdio_err)
+			return QETH_SEND_ERROR_NONE;
+		QETH_DBF_TEXT(trace, 1,"lnkfail");
+		QETH_DBF_TEXT_(trace,1,"%s",CARD_BUS_ID(card));
+		QETH_DBF_TEXT_(trace,1,"%04x %02x",
+			       (u16)qdio_err, (u8)flags15);
+		return QETH_SEND_ERROR_LINK_FAILURE;
+	case 1:
+		QETH_DBF_TEXT(trace, 1, "SIGAcc1");
+		QETH_DBF_TEXT_(trace,1,"%s",CARD_BUS_ID(card));
+		break;
+	case 2:
+		if (siga_err & QDIO_SIGA_ERROR_B_BIT_SET) {
+			QETH_DBF_TEXT(trace, 1, "SIGAcc2B");
+			QETH_DBF_TEXT_(trace,1,"%s",CARD_BUS_ID(card));
+			return QETH_SEND_ERROR_KICK_IT;
+		}
+		/* B bit not set: sbalf 15 decides whether to retry */
+		if ((flags15 >= 15) && (flags15 <= 31))
+			return QETH_SEND_ERROR_RETRY;
+		break;
+	case 3:
+		QETH_DBF_TEXT(trace, 1, "SIGAcc3");
+		QETH_DBF_TEXT_(trace,1,"%s",CARD_BUS_ID(card));
+		return QETH_SEND_ERROR_KICK_IT;
+	}
+	/* cc 1, or cc 2 that is neither a kick nor a retry */
+	return QETH_SEND_ERROR_LINK_FAILURE;
+}
+
+static inline void
+qeth_flush_buffers(struct qeth_qdio_out_q *queue, int under_int,
+		   int index, int count)
+{
+	struct qeth_qdio_out_buffer *buf;
+	int rc;
+	int i;
+
+	QETH_DBF_TEXT(trace, 6, "flushbuf");
+	/* terminate each buffer and decide whether it must request a PCI */
+	for (i = index; i < index + count; ++i) {
+		buf = &queue->bufs[i % QDIO_MAX_BUFFERS_PER_Q];
+		buf->buffer->element[buf->next_element_to_fill - 1].flags |=
+				SBAL_FLAGS_LAST_ENTRY;
+
+		if (!queue->do_pack){
+			if ((atomic_read(&queue->used_buffers) >=
+		    		(QETH_HIGH_WATERMARK_PACK -
+				 QETH_WATERMARK_PACK_FUZZ)) &&
+		    	    !atomic_read(&queue->set_pci_flags_count)){
+				/* it's likely that we'll go to packing
+				 * mode soon */
+				atomic_inc(&queue->set_pci_flags_count);
+				buf->buffer->element[0].flags |= 0x40;
+			}
+		} else {
+			if (!atomic_read(&queue->set_pci_flags_count)){
+				/*
+				 * there's no outstanding PCI any more, so we
+				 * have to request a PCI to be sure that the
+				 * PCI will wake us at some time in the future;
+				 * then we can flush packed buffers that might
+				 * still be hanging around, which can happen if
+				 * no further send was requested by the stack
+				 */
+				atomic_inc(&queue->set_pci_flags_count);
+				buf->buffer->element[0].flags |= 0x40;
+			}
+#ifdef CONFIG_QETH_PERF_STATS
+			queue->card->perf_stats.bufs_sent_pack++;
+#endif
+		}
+	}
+
+	queue->card->dev->trans_start = jiffies;
+	if (under_int)
+		rc = do_QDIO(CARD_DDEV(queue->card),
+			     QDIO_FLAG_SYNC_OUTPUT | QDIO_FLAG_UNDER_INTERRUPT,
+			     queue->queue_no, index, count, NULL);
+	else
+		rc = do_QDIO(CARD_DDEV(queue->card), QDIO_FLAG_SYNC_OUTPUT,
+			     queue->queue_no, index, count, NULL);
+	if (rc){
+		QETH_DBF_SPRINTF(trace, 0, "qeth_flush_buffers: do_QDIO "
+				 "returned error (%i) on device %s.",
+				 rc, CARD_DDEV_ID(queue->card));
+		QETH_DBF_TEXT(trace, 2, "flushbuf");
+		QETH_DBF_TEXT_(trace, 2, " err%d", rc);
+		queue->card->stats.tx_errors += count;
+		return;	/* error is only counted, not reported to the caller */
+	}
+#ifdef CONFIG_QETH_PERF_STATS
+	queue->card->perf_stats.bufs_sent += count;
+	queue->card->perf_stats.outbound_cnt++;
+#endif
+}
+
+/*
+ * Switches the queue between PACKING and non-PACKING state, based on
+ * used_buffers and the pack watermarks. Call with queue->lock held.
+ */
+static inline void
+qeth_switch_packing_state(struct qeth_qdio_out_q *queue)
+{
+	struct qeth_qdio_out_buffer *buffer;
+
+	QETH_DBF_TEXT(trace, 6, "swipack");
+	if (!queue->do_pack) {
+		if (atomic_read(&queue->used_buffers)
+		    >= QETH_HIGH_WATERMARK_PACK){
+			/* switch non-PACKING -> PACKING */
+			QETH_DBF_TEXT(trace, 6, "np->pack");
+#ifdef CONFIG_QETH_PERF_STATS
+			queue->card->perf_stats.sc_dp_p++;
+#endif
+			queue->do_pack = 1;
+		}
+	} else {
+		if (atomic_read(&queue->used_buffers)
+		    <= QETH_LOW_WATERMARK_PACK) {
+			/* switch PACKING -> non-PACKING */
+			QETH_DBF_TEXT(trace, 6, "pack->np");
+#ifdef CONFIG_QETH_PERF_STATS
+			queue->card->perf_stats.sc_p_dp++;
+#endif
+			queue->do_pack = 0;
+			/* prime a partly filled packing buffer for flushing */
+			buffer = &queue->bufs[queue->next_buf_to_fill];
+			BUG_ON(buffer->state == QETH_QDIO_BUF_PRIMED);
+			if (buffer->next_element_to_fill > 0) {
+				buffer->state = QETH_QDIO_BUF_PRIMED;
+				atomic_inc(&queue->used_buffers);
+				queue->next_buf_to_fill =
+					(queue->next_buf_to_fill + 1) %
+					QDIO_MAX_BUFFERS_PER_Q;
+		 	}
+		}
+	}
+}
+
+static void
+qeth_qdio_output_handler(struct ccw_device * ccwdev, unsigned int status,
+		        unsigned int qdio_error, unsigned int siga_error,
+			unsigned int __queue, int first_element, int count,
+			unsigned long card_ptr)
+{
+	struct qeth_card *card        = (struct qeth_card *) card_ptr;
+	struct qeth_qdio_out_q *queue = card->qdio.out_qs[__queue];
+	struct qeth_qdio_out_buffer *buffer;
+	int i;
+
+	QETH_DBF_TEXT(trace, 6, "qdouhdl");
+	if (status & QDIO_STATUS_LOOK_FOR_ERROR) {
+		if (status & QDIO_STATUS_ACTIVATE_CHECK_CONDITION){
+			QETH_DBF_SPRINTF(trace, 2, "On device %s: "
+					 "received active check "
+				         "condition (0x%08x).",
+					 CARD_BUS_ID(card), status);
+			QETH_DBF_TEXT(trace, 2, "chkcond");
+			QETH_DBF_TEXT_(trace, 2, "%08x", status);
+			netif_stop_queue(card->dev);
+			qeth_schedule_recovery(card);
+			return;
+		}
+	}
+	/* check and clear each of the 'count' reported buffers */
+	for(i = first_element; i < (first_element + count); ++i){
+		buffer = &queue->bufs[i % QDIO_MAX_BUFFERS_PER_Q];
+		/* we only handle the KICK_IT error, by doing a recovery */
+		if (qeth_handle_send_error(card, buffer, qdio_error, siga_error)
+				== QETH_SEND_ERROR_KICK_IT){
+			netif_stop_queue(card->dev);
+			qeth_schedule_recovery(card);
+			return;	/* used_buffers is not adjusted on this path */
+		}
+		/* is PCI flag set on buffer? */
+		if (buffer->buffer->element[0].flags & 0x40)
+			atomic_dec(&queue->set_pci_flags_count);
+
+		qeth_clear_output_buffer(card, buffer);
+	}
+	atomic_sub(count, &queue->used_buffers);
+
+	//if (!atomic_read(&queue->set_pci_flags_count))
+		tasklet_schedule(&queue->tasklet);
+
+	netif_wake_queue(card->dev);
+}
+
+static void
+qeth_qdio_output_tasklet(unsigned long data)
+{
+	struct qeth_qdio_out_q *queue = (struct qeth_qdio_out_q *) data;
+	struct qeth_qdio_out_buffer *buffer;
+	int index;
+	int count;
+
+	QETH_DBF_TEXT(trace, 6, "outtlet");
+
+	/* flush all PRIMED buffers; NOTE(review): scan has no wrap bound */
+	index = queue->next_buf_to_flush;
+	count = 0;
+	while (queue->bufs[index].state == QETH_QDIO_BUF_PRIMED) {
+		count++;
+		index = (index + 1) % QDIO_MAX_BUFFERS_PER_Q;
+	}
+	qeth_flush_buffers(queue, 0, queue->next_buf_to_flush, count);
+	queue->next_buf_to_flush = index;
+
+	/* flush a buffer with data, if no more PCIs are
+	 * outstanding (this part runs under queue->lock) */
+	if (!atomic_read(&queue->set_pci_flags_count)){
+		spin_lock(&queue->lock);
+		buffer = &queue->bufs[index];
+		if (buffer->state == QETH_QDIO_BUF_PRIMED){
+			qeth_flush_buffers(queue, 0, index, 1);
+			index = (index + 1) % QDIO_MAX_BUFFERS_PER_Q;
+			queue->next_buf_to_flush = index;
+		} else if (buffer->next_element_to_fill > 0){
+			/* it's a packing buffer */
+			BUG_ON(index != queue->next_buf_to_fill);
+			buffer->state = QETH_QDIO_BUF_PRIMED;
+			atomic_inc(&queue->used_buffers);
+			qeth_flush_buffers(queue, 0, index, 1);
+			index = (index + 1) % QDIO_MAX_BUFFERS_PER_Q;
+			queue->next_buf_to_flush = index;
+			queue->next_buf_to_fill = index;
+		}
+		spin_unlock(&queue->lock);
+	}
+}
+
+static char*
+qeth_create_qib_param_field(struct qeth_card *card)
+{
+	const size_t len = QDIO_MAX_BUFFERS_PER_Q * sizeof(char);
+	char *field = kmalloc(len, GFP_KERNEL);
+
+	/* returns NULL on allocation failure; the caller kfrees the field */
+	if (!field)
+		return NULL;
+	memset(field, 0, len);
+
+	/* "PCIT" translated through _ascebc[], then three unsigned ints */
+	field[0] = _ascebc['P'];
+	field[1] = _ascebc['C'];
+	field[2] = _ascebc['I'];
+	field[3] = _ascebc['T'];
+	*(unsigned int *) (field + 4) = QETH_PCI_THRESHOLD_A(card);
+	*(unsigned int *) (field + 8) = QETH_PCI_THRESHOLD_B(card);
+	*(unsigned int *) (field + 12) = QETH_PCI_TIMER_VALUE(card);
+
+	return field;
+}
+
+static void
+qeth_initialize_working_pool_list(struct qeth_card *card)
+{
+	struct qeth_buffer_pool_entry *entry;
+
+	QETH_DBF_TEXT(trace,5,"inwrklst");
+	/* put every entry of the init pool onto the working pool list */
+	list_for_each_entry(entry,
+			    &card->qdio.init_pool.entry_list, init_list) {
+		qeth_put_buffer_pool_entry(card,entry);
+	}
+}
+
+static void
+qeth_clear_working_pool_list(struct qeth_card *card)
+{
+	struct qeth_buffer_pool_entry *pool_entry, *tmp;
+
+	QETH_DBF_TEXT(trace,5,"clwrklst");
+	list_for_each_entry_safe(pool_entry, tmp,
+			    &card->qdio.in_buf_pool.entry_list, list){
+			list_del(&pool_entry->list);	/* unlink only, no free */
+	}
+}
+
+static void
+qeth_free_buffer_pool(struct qeth_card *card)
+{
+	struct qeth_buffer_pool_entry *entry, *next;
+	int pg;	/* index into entry->elements[] */
+	QETH_DBF_TEXT(trace,5,"freepool");
+	list_for_each_entry_safe(entry, next,
+				 &card->qdio.init_pool.entry_list, init_list){
+		list_del(&entry->init_list);	/* off the init pool first */
+		for (pg = 0; pg < QETH_MAX_BUFFER_ELEMENTS(card); ++pg)
+			free_page((unsigned long)entry->elements[pg]);
+		kfree(entry);
+	}
+}
+
+static int
+qeth_alloc_buffer_pool(struct qeth_card *card)
+{
+	struct qeth_buffer_pool_entry *pool_entry;
+	void *ptr;
+	int i, j;
+	/* allocate buf_count entries, each with one page per element */
+	for (i = 0; i < card->qdio.init_pool.buf_count; ++i){
+	 	pool_entry = kmalloc(sizeof(*pool_entry), GFP_KERNEL);
+		if (!pool_entry){
+			qeth_free_buffer_pool(card);	/* drop earlier entries */
+			return -ENOMEM;
+		}
+		for(j = 0; j < QETH_MAX_BUFFER_ELEMENTS(card); ++j){
+			ptr = (void *) __get_free_page(GFP_KERNEL);
+			if (!ptr) {
+				while (j > 0)	/* undo this entry's pages */
+					free_page((unsigned long)
+						  pool_entry->elements[--j]);
+				kfree(pool_entry);
+				qeth_free_buffer_pool(card);
+				return -ENOMEM;
+			}
+			pool_entry->elements[j] = ptr;
+		}
+		list_add(&pool_entry->init_list,
+			 &card->qdio.init_pool.entry_list);
+		list_add(&pool_entry->list,
+			 &card->qdio.in_buf_pool.entry_list);
+	}
+	return 0;
+}
+
+static int
+qeth_alloc_qdio_buffers(struct qeth_card *card)
+{
+	int i, j;
+
+	QETH_DBF_TEXT(setup, 2, "allcqdbf");
+	/* already allocated: only refill the working pool list */
+	if (card->qdio.state == QETH_QDIO_ALLOCATED) {
+		qeth_initialize_working_pool_list(card);
+		return 0;
+	}
+	card->qdio.in_q = kmalloc(sizeof(struct qeth_qdio_q), GFP_KERNEL);
+	if (!card->qdio.in_q)
+		return - ENOMEM;
+	QETH_DBF_TEXT(setup, 2, "inq");
+	QETH_DBF_HEX(setup, 2, &card->qdio.in_q, sizeof(void *));
+	memset(card->qdio.in_q, 0, sizeof(struct qeth_qdio_q));
+	/* give inbound qeth_qdio_buffers their qdio_buffers */
+	for (i = 0; i < QDIO_MAX_BUFFERS_PER_Q; ++i)
+		card->qdio.in_q->bufs[i].buffer =
+			&card->qdio.in_q->qdio_bufs[i];
+	/* inbound buffer pool (cleans up after itself on failure) */
+	if (qeth_alloc_buffer_pool(card))
+		goto out_free_in_q;
+	/* outbound */
+	card->qdio.out_qs = kmalloc(card->qdio.no_out_queues *
+				    sizeof(*card->qdio.out_qs), GFP_KERNEL);
+	if (!card->qdio.out_qs)
+		goto out_free_pool;
+	for (i = 0; i < card->qdio.no_out_queues; ++i){
+		card->qdio.out_qs[i] = kmalloc(sizeof(struct qeth_qdio_out_q),
+					       GFP_KERNEL);
+		if (!card->qdio.out_qs[i]){
+			while (i > 0)
+				kfree(card->qdio.out_qs[--i]);
+			kfree(card->qdio.out_qs);
+			goto out_free_pool;
+		}
+		QETH_DBF_TEXT_(setup, 2, "outq %i", i);
+		QETH_DBF_HEX(setup, 2, &card->qdio.out_qs[i], sizeof(void *));
+		memset(card->qdio.out_qs[i], 0, sizeof(struct qeth_qdio_out_q));
+		card->qdio.out_qs[i]->queue_no = i;
+		/* give outbound qeth_qdio_buffers their qdio_buffers */
+		for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j){
+			card->qdio.out_qs[i]->bufs[j].buffer =
+				&card->qdio.out_qs[i]->qdio_bufs[j];
+			skb_queue_head_init(&card->qdio.out_qs[i]->bufs[j].
+					    skb_list);
+		}
+	}
+	card->qdio.state = QETH_QDIO_ALLOCATED;
+	return 0;
+out_free_pool:	/* unwind in reverse order of allocation */
+	qeth_free_buffer_pool(card);
+out_free_in_q:
+	kfree(card->qdio.in_q);
+	return -ENOMEM;
+}
+
+static void
+qeth_free_qdio_buffers(struct qeth_card *card)
+{
+	int i, j;
+
+	QETH_DBF_TEXT(trace, 2, "freeqdbf");
+	if (card->qdio.state == QETH_QDIO_UNINITIALIZED)
+		return;	/* nothing was allocated */
+	kfree(card->qdio.in_q);
+	/* inbound buffer pool */
+	qeth_free_buffer_pool(card);
+	/* free outbound qdio_qs, clearing each of their buffers first */
+	for (i = 0; i < card->qdio.no_out_queues; ++i){
+		for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j)
+			qeth_clear_output_buffer(card, &card->qdio.
+						out_qs[i]->bufs[j]);
+		kfree(card->qdio.out_qs[i]);
+	}
+	kfree(card->qdio.out_qs);
+	card->qdio.state = QETH_QDIO_UNINITIALIZED;
+}
+
+static void
+qeth_clear_qdio_buffers(struct qeth_card *card)
+{
+	int i, j;
+
+	QETH_DBF_TEXT(trace, 2, "clearqdbf");
+	/* clear outbound buffers to free skbs; the queues themselves stay */
+	for (i = 0; i < card->qdio.no_out_queues; ++i)
+		if (card->qdio.out_qs[i]){
+			for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j)
+				qeth_clear_output_buffer(card, &card->qdio.
+						out_qs[i]->bufs[j]);
+		}
+}
+
+static void
+qeth_init_qdio_info(struct qeth_card *card)
+{
+	QETH_DBF_TEXT(setup, 4, "intqdinf");
+	card->qdio.state = QETH_QDIO_UNINITIALIZED;
+	/* inbound: default buffer size/count, empty pools, input tasklet */
+	card->qdio.in_buf_size = QETH_IN_BUF_SIZE_DEFAULT;
+	card->qdio.init_pool.buf_count = QETH_IN_BUF_COUNT_DEFAULT;
+	card->qdio.in_buf_pool.buf_count = card->qdio.init_pool.buf_count;
+	INIT_LIST_HEAD(&card->qdio.in_buf_pool.entry_list);
+	INIT_LIST_HEAD(&card->qdio.init_pool.entry_list);
+	card->qdio.in_tasklet.data = (unsigned long) card;
+	card->qdio.in_tasklet.func = qeth_qdio_input_tasklet;
+	/* outbound: default priority queueing mode and default queue */
+	card->qdio.do_prio_queueing = QETH_PRIOQ_DEFAULT;
+	card->qdio.default_out_queue = QETH_DEFAULT_QUEUE;
+}
+
+static int
+qeth_init_qdio_queues(struct qeth_card *card)
+{
+	int i, j;
+	int rc;	/* 0 on success, otherwise the do_QDIO/qdio_synchronize error */
+
+	QETH_DBF_TEXT(setup, 2, "initqdqs");
+
+	/* inbound queue: zero the qdio buffers and reset both ring indices */
+	memset(card->qdio.in_q->qdio_bufs, 0,
+	       QDIO_MAX_BUFFERS_PER_Q * sizeof(struct qdio_buffer));
+	card->qdio.in_q->next_buf_to_process = 0;
+	card->qdio.in_q->next_buf_to_init = 0;
+	/* give only as many buffers to hardware as we have pool entries */
+	for (i = 0; i < card->qdio.in_buf_pool.buf_count; ++i)
+		qeth_init_input_buffer(card, &card->qdio.in_q->bufs[i]);
+	card->qdio.in_q->next_buf_to_init = card->qdio.in_buf_pool.buf_count;
+	rc = do_QDIO(CARD_DDEV(card), QDIO_FLAG_SYNC_INPUT, 0, 0,
+		     card->qdio.in_buf_pool.buf_count, NULL);
+	if (rc) {
+		QETH_DBF_TEXT_(setup, 2, "1err%d", rc);
+		return rc;
+	}
+	rc = qdio_synchronize(CARD_DDEV(card), QDIO_FLAG_SYNC_INPUT, 0);
+	if (rc) {
+		QETH_DBF_TEXT_(setup, 2, "2err%d", rc);
+		return rc;
+	}
+	/* outbound queues: clear all buffers and reset the queue state */
+	for (i = 0; i < card->qdio.no_out_queues; ++i){
+		memset(card->qdio.out_qs[i]->qdio_bufs, 0,
+		       QDIO_MAX_BUFFERS_PER_Q * sizeof(struct qdio_buffer));
+		for (j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j){
+			qeth_clear_output_buffer(card, &card->qdio.
+						 out_qs[i]->bufs[j]);
+		}
+		card->qdio.out_qs[i]->card = card;
+		card->qdio.out_qs[i]->next_buf_to_fill = 0;
+		card->qdio.out_qs[i]->next_buf_to_flush = 0;
+		card->qdio.out_qs[i]->do_pack = 0;
+		atomic_set(&card->qdio.out_qs[i]->used_buffers,0);
+		atomic_set(&card->qdio.out_qs[i]->set_pci_flags_count, 0);
+		card->qdio.out_qs[i]->tasklet.data =
+			(unsigned long) card->qdio.out_qs[i];
+		card->qdio.out_qs[i]->tasklet.func = qeth_qdio_output_tasklet;
+		spin_lock_init(&card->qdio.out_qs[i]->lock);
+	}
+	return 0;
+}
+
+static int
+qeth_qdio_establish(struct qeth_card *card)
+{
+	struct qdio_initialize init_data;
+	char *qib_param_field;
+	struct qdio_buffer **in_sbal_ptrs;
+	struct qdio_buffer **out_sbal_ptrs;
+	int i, j, k;	/* k: running index into out_sbal_ptrs */
+	int rc;
+
+	QETH_DBF_TEXT(setup, 2, "qdioest");
+	qib_param_field = qeth_create_qib_param_field(card);
+	if (!qib_param_field)
+		return -ENOMEM;
+
+	in_sbal_ptrs = kmalloc(QDIO_MAX_BUFFERS_PER_Q * sizeof(void *),
+			       GFP_KERNEL);
+	if (!in_sbal_ptrs) {
+		kfree(qib_param_field);
+		return -ENOMEM;
+	}
+	for(i = 0; i < QDIO_MAX_BUFFERS_PER_Q; ++i)
+		in_sbal_ptrs[i] = (struct qdio_buffer *)
+			virt_to_phys(card->qdio.in_q->bufs[i].buffer);
+
+	out_sbal_ptrs =
+		kmalloc(card->qdio.no_out_queues * QDIO_MAX_BUFFERS_PER_Q *
+			sizeof(void *), GFP_KERNEL);
+	if (!out_sbal_ptrs) {
+		kfree(in_sbal_ptrs);
+		kfree(qib_param_field);
+		return -ENOMEM;
+	}
+	for(i = 0, k = 0; i < card->qdio.no_out_queues; ++i)
+		for(j = 0; j < QDIO_MAX_BUFFERS_PER_Q; ++j, ++k){
+			out_sbal_ptrs[k] = (struct qdio_buffer *)
+				virt_to_phys(card->qdio.out_qs[i]->
+					     bufs[j].buffer);
+		}
+
+	memset(&init_data, 0, sizeof(struct qdio_initialize));
+	init_data.cdev                   = CARD_DDEV(card);
+	init_data.q_format               = qeth_get_qdio_q_format(card);
+	init_data.qib_param_field_format = 0;
+	init_data.qib_param_field        = qib_param_field;
+	init_data.min_input_threshold    = QETH_MIN_INPUT_THRESHOLD;
+	init_data.max_input_threshold    = QETH_MAX_INPUT_THRESHOLD;
+	init_data.min_output_threshold   = QETH_MIN_OUTPUT_THRESHOLD;
+	init_data.max_output_threshold   = QETH_MAX_OUTPUT_THRESHOLD;
+	init_data.no_input_qs            = 1;
+	init_data.no_output_qs           = card->qdio.no_out_queues;
+	init_data.input_handler          = (qdio_handler_t *)
+					   qeth_qdio_input_handler;
+	init_data.output_handler         = (qdio_handler_t *)
+					   qeth_qdio_output_handler;
+	init_data.int_parm               = (unsigned long) card;
+	init_data.flags                  = QDIO_INBOUND_0COPY_SBALS |
+					   QDIO_OUTBOUND_0COPY_SBALS |
+					   QDIO_USE_OUTBOUND_PCIS;
+	init_data.input_sbal_addr_array  = (void **) in_sbal_ptrs;
+	init_data.output_sbal_addr_array = (void **) out_sbal_ptrs;
+
+	if (!(rc = qdio_initialize(&init_data)))
+		card->qdio.state = QETH_QDIO_ESTABLISHED;
+
+	kfree(out_sbal_ptrs);	/* temporaries: freed on success and failure */
+	kfree(in_sbal_ptrs);
+	kfree(qib_param_field);
+	return rc;
+}
+
+static int
+qeth_qdio_activate(struct qeth_card *card)
+{	/* thin wrapper: qdio_activate() on CARD_DDEV(card) with flags 0 */
+	QETH_DBF_TEXT(setup,3,"qdioact");
+	return qdio_activate(CARD_DDEV(card), 0);
+}
+
+static int
+qeth_clear_channel(struct qeth_channel *channel)
+{
+	unsigned long flags;
+	struct qeth_card *card;
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"clearch");
+	card = CARD_FROM_CDEV(channel->ccwdev);
+	spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
+	rc = ccw_device_clear(channel->ccwdev, QETH_CLEAR_CHANNEL_PARM);
+	spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags);
+
+	if (rc)
+		return rc;	/* the clear could not be started */
+	rc = wait_event_interruptible_timeout(card->wait_q,
+			channel->state==CH_STATE_STOPPED, QETH_TIMEOUT);
+	if (rc == -ERESTARTSYS)
+		return rc;	/* the wait was interrupted */
+	if (channel->state != CH_STATE_STOPPED)
+		return -ETIME;	/* not STOPPED within QETH_TIMEOUT */
+	channel->state = CH_STATE_DOWN;
+	return 0;
+}
+
+static int
+qeth_halt_channel(struct qeth_channel *channel)
+{
+	unsigned long flags;
+	struct qeth_card *card;
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"haltch");
+	card = CARD_FROM_CDEV(channel->ccwdev);
+	spin_lock_irqsave(get_ccwdev_lock(channel->ccwdev), flags);
+	rc = ccw_device_halt(channel->ccwdev, QETH_HALT_CHANNEL_PARM);
+	spin_unlock_irqrestore(get_ccwdev_lock(channel->ccwdev), flags);
+
+	if (rc)
+		return rc;	/* the halt could not be started */
+	rc = wait_event_interruptible_timeout(card->wait_q,
+			channel->state==CH_STATE_HALTED, QETH_TIMEOUT);
+	if (rc == -ERESTARTSYS)
+		return rc;	/* the wait was interrupted */
+	if (channel->state != CH_STATE_HALTED)
+		return -ETIME;	/* not HALTED within QETH_TIMEOUT */
+	return 0;	/* unlike the clear path, state is left at HALTED */
+}
+
+static int
+qeth_halt_channels(struct qeth_card *card)
+{
+	int err;
+
+	QETH_DBF_TEXT(trace,3,"haltchs");
+	/* read, write, data - stop at the first channel that fails */
+	err = qeth_halt_channel(&card->read);
+	if (!err)
+		err = qeth_halt_channel(&card->write);
+	return err ? err : qeth_halt_channel(&card->data);
+}
+static int
+qeth_clear_channels(struct qeth_card *card)
+{
+	int err;
+
+	QETH_DBF_TEXT(trace,3,"clearchs");
+	/* read, write, data - stop at the first channel that fails */
+	err = qeth_clear_channel(&card->read);
+	if (!err)
+		err = qeth_clear_channel(&card->write);
+	return err ? err : qeth_clear_channel(&card->data);
+}
+
+static int
+qeth_clear_halt_card(struct qeth_card *card, int halt)
+{
+	QETH_DBF_TEXT(trace,3,"clhacrd");
+	QETH_DBF_HEX(trace, 3, &card, sizeof(void *));
+
+	/* optionally halt all channels first; a failed halt aborts */
+	if (halt) {
+		int err = qeth_halt_channels(card);
+		if (err)
+			return err;
+	}
+	return qeth_clear_channels(card);
+}
+
+static int
+qeth_qdio_clear_card(struct qeth_card *card, int use_halt)
+{
+	int rc = 0;
+
+	QETH_DBF_TEXT(trace,3,"qdioclr");
+	if (card->qdio.state == QETH_QDIO_ESTABLISHED){
+		qdio_cleanup(CARD_DDEV(card),	/* cleanup flag by card type */
+			     (card->info.type == QETH_CARD_TYPE_IQD) ?
+			     QDIO_FLAG_CLEANUP_USING_HALT :
+			     QDIO_FLAG_CLEANUP_USING_CLEAR);
+		card->qdio.state = QETH_QDIO_ALLOCATED;
+	}
+	rc = qeth_clear_halt_card(card, use_halt);
+	card->state = CARD_STATE_DOWN;	/* set even if the clear failed */
+	return rc;
+}
+
+static int
+qeth_dm_act(struct qeth_card *card)
+{
+	struct qeth_cmd_buffer *cmd;
+
+	QETH_DBF_TEXT(setup,2,"dmact");
+
+	/* start from the DM_ACT template ... */
+	cmd = qeth_wait_for_buffer(&card->write);
+	memcpy(cmd->data, DM_ACT, DM_ACT_SIZE);
+	/* ... and patch in the two connection tokens */
+	memcpy(QETH_DM_ACT_DEST_ADDR(cmd->data),
+	       &card->token.cm_connection_r, QETH_MPC_TOKEN_LENGTH);
+	memcpy(QETH_DM_ACT_CONNECTION_TOKEN(cmd->data),
+	       &card->token.ulp_connection_r, QETH_MPC_TOKEN_LENGTH);
+
+	return qeth_send_control_data(card, DM_ACT_SIZE, cmd, NULL, NULL);
+}
+
+static int
+qeth_mpc_initialize(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(setup,2,"mpcinit");
+	/* run the setup steps in order; the first failure ends the sequence */
+	if ((rc = qeth_issue_next_read(card))){
+		QETH_DBF_TEXT_(setup, 2, "1err%d", rc);
+		return rc;
+	}
+	if ((rc = qeth_cm_enable(card))){
+		QETH_DBF_TEXT_(setup, 2, "2err%d", rc);
+		return rc;
+	}
+	if ((rc = qeth_cm_setup(card))){
+		QETH_DBF_TEXT_(setup, 2, "3err%d", rc);
+		return rc;
+	}
+	if ((rc = qeth_ulp_enable(card))){
+		QETH_DBF_TEXT_(setup, 2, "4err%d", rc);
+		return rc;
+	}
+	if ((rc = qeth_ulp_setup(card))){
+		QETH_DBF_TEXT_(setup, 2, "5err%d", rc);
+		return rc;
+	}
+	if ((rc = qeth_alloc_qdio_buffers(card))){
+		QETH_DBF_TEXT_(setup, 2, "5err%d", rc);	/* same tag as above */
+		return rc;
+	}
+	if ((rc = qeth_qdio_establish(card))){
+		QETH_DBF_TEXT_(setup, 2, "6err%d", rc);
+		qeth_free_qdio_buffers(card);
+		goto out_qdio;
+	}
+ 	if ((rc = qeth_qdio_activate(card))){
+		QETH_DBF_TEXT_(setup, 2, "7err%d", rc);
+		goto out_qdio;
+	}
+	if ((rc = qeth_dm_act(card))){
+		QETH_DBF_TEXT_(setup, 2, "8err%d", rc);
+		goto out_qdio;
+	}
+
+	return 0;
+out_qdio:	/* halt is used only for OSAE cards */
+	qeth_qdio_clear_card(card, card->info.type==QETH_CARD_TYPE_OSAE);
+	return rc;
+}
+
+static void
+qeth_set_device_name(struct qeth_card *card)
+{
+	char buf[IF_NAME_LEN];
+
+	memset(buf, 0, IF_NAME_LEN);
+	if (card->info.type == QETH_CARD_TYPE_IQD) {	/* IQD cards only */
+		sprintf(buf,"hsi%d", atomic_read(&qeth_hsi_count));
+		atomic_inc(&qeth_hsi_count);	/* NOTE(review): read+inc race? */
+		memcpy(card->dev->name,buf,IF_NAME_LEN);
+	}
+	/* for every other card type the device name is left untouched */
+}
+
+static struct net_device *
+qeth_get_netdevice(enum qeth_card_types type, enum qeth_link_types linktype)
+{
+	struct net_device *dev = NULL;
+
+	switch (type) {
+	case QETH_CARD_TYPE_OSAE:
+		switch (linktype) {
+		case QETH_LINK_TYPE_LANE_TR:
+		case QETH_LINK_TYPE_HSTR:
+#ifdef CONFIG_TR
+			dev = alloc_trdev(0);
+#endif /* CONFIG_TR */
+			break;	/* dev stays NULL without CONFIG_TR */
+		default:
+			dev = alloc_etherdev(0);
+		}
+		break;
+	case QETH_CARD_TYPE_IQD:
+	default:
+		dev = alloc_etherdev(0);
+	}
+	return dev;	/* may be NULL; see the token ring case above */
+}
+
+static inline int
+qeth_send_packet(struct qeth_card *, struct sk_buff *);
+
+static int
+qeth_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	int rc;
+	struct qeth_card *card;
+
+	QETH_DBF_TEXT(trace, 6, "hrdstxmi");
+	card = (struct qeth_card *)dev->priv;
+	if (skb==NULL) {
+		card->stats.tx_dropped++;
+		card->stats.tx_errors++;
+		return -EIO;
+	}
+	if (card->state != CARD_STATE_UP_LAN_ONLINE) {
+		card->stats.tx_dropped++;
+		card->stats.tx_errors++;
+		card->stats.tx_carrier_errors++;
+		return -EIO;	/* the skb is not freed on this path */
+	}
+	if (netif_queue_stopped(dev) ) {
+		card->stats.tx_dropped++;
+		return -EBUSY;	/* the skb is not freed on this path */
+	}
+#ifdef CONFIG_QETH_PERF_STATS
+	card->perf_stats.outbound_start_time = qeth_get_micros();
+#endif
+	/*
+	 * dev_queue_xmit should ensure that we are called packet
+	 * after packet; the queue is stopped for the duration of the send
+	 */
+	netif_stop_queue(dev);
+	if (!(rc = qeth_send_packet(card, skb)))
+		netif_wake_queue(dev);	/* stays stopped if the send failed */
+
+	return rc;
+}
+
+static int
+qeth_verify_vlan_dev(struct net_device *dev, struct qeth_card *card)
+{
+	int rc = 0;	/* stays 0 unless dev is found in the card's vlangrp */
+#ifdef CONFIG_QETH_VLAN
+	struct vlan_group *vg;
+	int i;
+
+	if (!(vg = card->vlangrp))
+		return rc;
+
+	for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++){
+		if (vg->vlan_devices[i] == dev){
+			rc = QETH_VLAN_CARD;
+			break;
+		}
+	}
+#endif
+	return rc;	/* always 0 without CONFIG_QETH_VLAN */
+}
+
+static int
+qeth_verify_dev(struct net_device *dev)
+{
+	struct qeth_card *card;
+	unsigned long flags;
+	int rc = 0;	/* 0: dev belongs to no card on qeth_card_list */
+
+	read_lock_irqsave(&qeth_card_list.rwlock, flags);
+	list_for_each_entry(card, &qeth_card_list.list, list){
+		if (card->dev == dev){
+			rc = QETH_REAL_CARD;
+			break;
+		}
+		rc = qeth_verify_vlan_dev(dev, card);
+		if (rc)
+			break;	/* dev is a VLAN device on top of this card */
+	}
+	read_unlock_irqrestore(&qeth_card_list.rwlock, flags);
+
+	return rc;
+}
+
+static struct qeth_card *
+qeth_get_card_from_dev(struct net_device *dev)
+{
+	struct qeth_card *card = NULL;	/* returned if dev is not ours */
+	int rc;
+
+	rc = qeth_verify_dev(dev);
+	if (rc == QETH_REAL_CARD)
+		card = (struct qeth_card *)dev->priv;
+	else if (rc == QETH_VLAN_CARD)	/* use the underlying real device */
+		card = (struct qeth_card *)
+			VLAN_DEV_INFO(dev)->real_dev->priv;
+
+	QETH_DBF_TEXT_(trace, 4, "%d", rc);
+	return card ;
+}
+
+static void
+qeth_tx_timeout(struct net_device *dev)
+{
+	struct qeth_card *card;
+
+	card = (struct qeth_card *) dev->priv;
+	card->stats.tx_errors++;	/* a timeout counts as a tx error */
+	qeth_schedule_recovery(card);	/* and triggers card recovery */
+}
+
+static int
+qeth_open(struct net_device *dev)
+{
+	struct qeth_card *card;
+
+	QETH_DBF_TEXT(trace, 4, "qethopen");
+
+	card = (struct qeth_card *) dev->priv;
+	/* can only be opened from SOFTSETUP or UP_LAN_OFFLINE */
+	if ((card->state != CARD_STATE_SOFTSETUP) &&
+	    (card->state != CARD_STATE_UP_LAN_OFFLINE))
+		return -ENODEV;
+	if (!card->lan_online){
+		card->state = CARD_STATE_UP_LAN_OFFLINE;
+		return -EIO;	/* the state change above is kept */
+	}
+
+	card->dev->flags |= IFF_UP;
+	netif_start_queue(dev);
+	card->data.state = CH_STATE_UP;
+	card->state = CARD_STATE_UP_LAN_ONLINE;
+	return 0;
+}
+
+static int
+qeth_stop(struct net_device *dev)
+{
+	struct qeth_card *card;
+
+	QETH_DBF_TEXT(trace, 4, "qethstop");
+
+	card = (struct qeth_card *) dev->priv;
+
+	netif_stop_queue(dev);
+	card->dev->flags &= ~IFF_UP;
+	if ((card->state == CARD_STATE_UP_LAN_ONLINE) ||
+	    (card->state == CARD_STATE_UP_LAN_OFFLINE))
+		card->state = CARD_STATE_SOFTSETUP;	/* back to pre-open */
+	return 0;	/* always succeeds */
+}
+
+static inline int
+qeth_get_cast_type(struct qeth_card *card, struct sk_buff *skb)
+{
+	int cast_type = RTN_UNSPEC;
+
+	if (skb->dst && skb->dst->neighbour){
+		cast_type = skb->dst->neighbour->type;
+		if ((cast_type == RTN_BROADCAST) ||
+		    (cast_type == RTN_MULTICAST) ||
+		    (cast_type == RTN_ANYCAST))
+			return cast_type;
+		else
+			return RTN_UNSPEC;
+	}
+	/* no neighbour entry: inspect bytes of the network header instead */
+	if (skb->protocol == ETH_P_IPV6)
+		return (skb->nh.raw[24] == 0xff) ? RTN_MULTICAST : 0;
+	else if (skb->protocol == ETH_P_IP)
+		return ((skb->nh.raw[16] & 0xf0) == 0xe0) ? RTN_MULTICAST : 0;
+	/* neither IPv6 nor IPv4: compare against broadcast/multicast MACs */
+	if (!memcmp(skb->nh.raw, skb->dev->broadcast, 6))
+		return RTN_BROADCAST;
+	else {
+		u16 hdr_mac;
+
+	        hdr_mac = *((u16 *)skb->nh.raw);
+	        /* tr multicast? */
+	        switch (card->info.link_type) {
+	        case QETH_LINK_TYPE_HSTR:
+	        case QETH_LINK_TYPE_LANE_TR:
+	        	if ((hdr_mac == QETH_TR_MAC_NC) ||
+			    (hdr_mac == QETH_TR_MAC_C))
+				return RTN_MULTICAST;
+	        /* no break: falls through to the eth or so multicast check */
+                default:
+                      	if ((hdr_mac == QETH_ETH_MAC_V4) ||
+			    (hdr_mac == QETH_ETH_MAC_V6))
+			        return RTN_MULTICAST;
+	        }
+        }
+	return cast_type;
+}
+
+static inline int
+qeth_get_priority_queue(struct qeth_card *card, struct sk_buff *skb,
+		        int ipv, int cast_type)
+{
+	if (!ipv && (card->info.type == QETH_CARD_TYPE_OSAE))
+		return card->qdio.default_out_queue;
+	switch (card->qdio.no_out_queues) {
+	case 4:	/* prioritization only happens with four output queues */
+		if (cast_type && card->info.is_multicast_different)
+			return card->info.is_multicast_different &
+				(card->qdio.no_out_queues - 1);
+		if (card->qdio.do_prio_queueing && (ipv == 4)) {
+			if (card->qdio.do_prio_queueing==QETH_PRIO_Q_ING_TOS){
+				if (skb->nh.iph->tos & IP_TOS_NOTIMPORTANT)
+					return 3;
+				if (skb->nh.iph->tos & IP_TOS_HIGHRELIABILITY)
+					return 2;
+				if (skb->nh.iph->tos & IP_TOS_HIGHTHROUGHPUT)
+					return 1;
+				if (skb->nh.iph->tos & IP_TOS_LOWDELAY)
+					return 0;
+			}
+			if (card->qdio.do_prio_queueing==QETH_PRIO_Q_ING_PREC)
+				return 3 - (skb->nh.iph->tos >> 6);
+		} else if (card->qdio.do_prio_queueing && (ipv == 6)) {
+			/* TODO: IPv6!!! (currently uses the default queue) */
+		}
+		return card->qdio.default_out_queue;
+	default:
+		return 0;	/* any other queue count: always queue 0 */
+	}
+}
+
+static inline int
+qeth_get_ip_version(struct sk_buff *skb)
+{
+	/*
+	 * 6 for ETH_P_IPV6, 4 for ETH_P_IP, 0 for anything else.
+	 * NOTE(review): skb->protocol is compared without htons();
+	 * presumably fine on big-endian s390 - confirm.
+	 */
+	if (skb->protocol == ETH_P_IPV6)
+		return 6;
+	return (skb->protocol == ETH_P_IP) ? 4 : 0;
+}
+
+static inline int
+qeth_prepare_skb(struct qeth_card *card, struct sk_buff **skb,
+		 struct qeth_hdr **hdr, int ipv)
+{
+	struct sk_buff *new_skb;
+#ifdef CONFIG_QETH_VLAN
+	u16 *tag;
+#endif
+
+	QETH_DBF_TEXT(trace, 6, "prepskb");
+	if (skb_headroom(*skb) < sizeof(struct qeth_hdr)){
+		new_skb = skb_realloc_headroom(*skb, sizeof(struct qeth_hdr));
+		if (!new_skb) {
+			PRINT_ERR("qeth_prepare_skb: could "
+				  "not realloc headroom for qeth_hdr "
+				  "on interface %s", card->info.if_name);
+			return -ENOMEM;
+		}
+		*skb = new_skb;	/* NOTE(review): old skb is not freed here */
+	}
+#ifdef CONFIG_QETH_VLAN
+	if (card->vlangrp && vlan_tx_tag_present(*skb) && (ipv == 6)){
+		/*
+		 * Move the mac addresses (6 bytes src, 6 bytes dest)
+		 * to the beginning of the new header.  We are using three
+		 * memcpys instead of one memmove to save cycles.
+		 */
+		skb_push(*skb, VLAN_HLEN);
+		memcpy((*skb)->data, (*skb)->data + 4, 4);
+		memcpy((*skb)->data + 4, (*skb)->data + 8, 4);
+		memcpy((*skb)->data + 8, (*skb)->data + 12, 4);
+		tag = (u16 *) ((*skb)->data + 12);
+		/*
+		 * tag is 12 bytes in, right behind the two mac addresses:
+		 * first two bytes = ETH_P_8021Q (0x8100), then the VLANID
+		 */
+		*tag = __constant_htons(ETH_P_8021Q);
+		*(tag + 1) = vlan_tx_tag_get(*skb);
+		*(tag + 1) = htons(*(tag + 1));
+	}
+#endif
+	*hdr = (struct qeth_hdr *) skb_push(*skb, sizeof(struct qeth_hdr));
+	/*
+	 * sanity check, the Linux memory allocation scheme should
+	 * never present us cases like this one (the 32bytes header plus
+	 * the first 40 bytes of the packet cross a 4k boundary)
+	 */
+	if ((((unsigned long) *hdr) & (~(PAGE_SIZE - 1))) !=
+	    (((unsigned long) *hdr + sizeof(struct qeth_hdr) +
+	      QETH_IP_HEADER_SIZE) & (~(PAGE_SIZE - 1)))) {
+		PRINT_ERR("qeth_prepare_skb: misaligned "
+			  "packet on interface %s. Discarded.",
+			  card->info.if_name);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Translate a routing cast type into the qeth header flags used for
+ * IPv4 packets.  Anything that is neither multicast nor broadcast is
+ * flagged as unicast.
+ */
+static inline u8
+qeth_get_qeth_hdr_flags4(int cast_type)
+{
+	switch (cast_type) {
+	case RTN_MULTICAST:
+		return QETH_CAST_MULTICAST;
+	case RTN_BROADCAST:
+		return QETH_CAST_BROADCAST;
+	default:
+		return QETH_CAST_UNICAST;
+	}
+}
+
+/*
+ * Translate a routing cast type into the qeth header flags used for
+ * IPv6 packets.  QETH_HDR_PASSTHRU and QETH_HDR_IPV6 are always set;
+ * the cast bits default to unicast.
+ */
+static inline u8
+qeth_get_qeth_hdr_flags6(int cast_type)
+{
+	u8 cast;
+
+	switch (cast_type) {
+	case RTN_MULTICAST:
+		cast = QETH_CAST_MULTICAST;
+		break;
+	case RTN_ANYCAST:
+		cast = QETH_CAST_ANYCAST;
+		break;
+	case RTN_BROADCAST:
+		cast = QETH_CAST_BROADCAST;
+		break;
+	default:
+		cast = QETH_CAST_UNICAST;
+		break;
+	}
+	return QETH_HDR_PASSTHRU | QETH_HDR_IPV6 | cast;
+}
+
+/*
+ * Fill the qeth header that was pushed in front of the packet data.
+ *
+ * For IPv4 and IPv6 the destination address is the neighbour's primary
+ * key if the skb has a dst entry with a neighbour, otherwise the
+ * destination address from the IP header.  All other packets are sent
+ * in passthrough mode with only the cast type flagged.
+ */
+static inline void
+qeth_fill_header(struct qeth_card *card, struct qeth_hdr *hdr,
+		struct sk_buff *skb, int ipv, int cast_type)
+{
+	const void *dest6;
+	u32 dest4;
+
+	hdr->id = 1;
+	hdr->ext_flags = 0;
+
+	QETH_DBF_TEXT(trace, 6, "fillhdr");
+#ifdef CONFIG_QETH_VLAN
+	/*
+	 * Record the VLAN tag before the address fields are written.
+	 * v6 uses passthrough, v4 sets the tag in the QDIO header.
+	 */
+	if (card->vlangrp && vlan_tx_tag_present(skb)) {
+		if (ipv == 4)
+			hdr->ext_flags = QETH_EXT_HDR_VLAN_FRAME;
+		else
+			hdr->ext_flags = QETH_EXT_HDR_INCLUDE_VLAN_TAG;
+		hdr->vlan_id = vlan_tx_tag_get(skb);
+	}
+#endif /* CONFIG_QETH_VLAN */
+	hdr->length = skb->len - sizeof(struct qeth_hdr);
+
+	switch (ipv) {
+	case 4:
+		hdr->flags = qeth_get_qeth_hdr_flags4(cast_type);
+		/* the IPv4 address occupies the last 4 of the 16 bytes */
+		memset(hdr->dest_addr, 0, 12);
+		if ((skb->dst) && (skb->dst->neighbour))
+			dest4 = *((u32 *) skb->dst->neighbour->primary_key);
+		else
+			dest4 = skb->nh.iph->daddr;
+		*((u32 *) (&hdr->dest_addr[12])) = dest4;
+		break;
+	case 6:
+		hdr->flags = qeth_get_qeth_hdr_flags6(cast_type);
+		if ((skb->dst) && (skb->dst->neighbour))
+			dest6 = skb->dst->neighbour->primary_key;
+		else
+			dest6 = &skb->nh.ipv6h->daddr;
+		memcpy(hdr->dest_addr, dest6, 16);
+		break;
+	default:
+		/* passthrough: compare the frame's first 6 bytes with bcast */
+		if (!memcmp(skb->data + sizeof(struct qeth_hdr),
+			    skb->dev->broadcast, 6))
+			hdr->flags = QETH_CAST_BROADCAST | QETH_HDR_PASSTHRU;
+		else if (cast_type == RTN_MULTICAST)
+			hdr->flags = QETH_CAST_MULTICAST | QETH_HDR_PASSTHRU;
+		else
+			hdr->flags = QETH_CAST_UNICAST | QETH_HDR_PASSTHRU;
+		break;
+	}
+}
+
+/*
+ * Describe the packet starting at data (skb->len bytes) in the buffer
+ * elements of buf, one element per page the data touches, and queue the
+ * skb on the buffer's skb list with an extra reference.
+ *
+ * Without packing the buffer is marked PRIMED right away; with packing
+ * only once all of its elements are in use.  Always returns 0.
+ */
+static inline int
+qeth_fill_buffer(struct qeth_qdio_out_q *queue, struct qeth_qdio_out_buffer *buf,
+		 char *data, struct sk_buff *skb)
+{
+	struct qdio_buffer *sbal = buf->buffer;
+	int remaining = skb->len;
+	int is_first = 1;
+	int chunk;
+	int idx;
+
+	QETH_DBF_TEXT(trace, 6, "qdfillbf");
+
+	atomic_inc(&skb->users);
+	skb_queue_tail(&buf->skb_list, skb);
+	for (idx = buf->next_element_to_fill; remaining > 0; idx++) {
+		/* never let one element span a page boundary */
+		chunk = PAGE_SIZE - ((unsigned long) data % PAGE_SIZE);
+		if (chunk > remaining)
+			chunk = remaining;
+		remaining -= chunk;
+		sbal->element[idx].addr = data;
+		sbal->element[idx].length = chunk;
+		if (remaining)
+			sbal->element[idx].flags = is_first ?
+				SBAL_FLAGS_FIRST_FRAG : SBAL_FLAGS_MIDDLE_FRAG;
+		else
+			sbal->element[idx].flags = is_first ?
+				0 : SBAL_FLAGS_LAST_FRAG;
+		data += chunk;
+		is_first = 0;
+	}
+	buf->next_element_to_fill = idx;
+
+	if (queue->do_pack) {
+		QETH_DBF_TEXT(trace, 6, "fillbfpa");
+#ifdef CONFIG_QETH_PERF_STATS
+		queue->card->perf_stats.skbs_sent_pack++;
+#endif
+		/* a completely filled packing buffer is flushed as well */
+		if (buf->next_element_to_fill >=
+				QETH_MAX_BUFFER_ELEMENTS(queue->card))
+			buf->state = QETH_QDIO_BUF_PRIMED;
+	} else {
+		QETH_DBF_TEXT(trace, 6, "fillbfnp");
+		/* set state to PRIMED -> will be flushed */
+		buf->state = QETH_QDIO_BUF_PRIMED;
+	}
+	return 0;
+}
+
+/*
+ * Prepend and fill the qeth header of skb, put the packet into the next
+ * output buffer of queue and schedule the queue's tasklet.
+ *
+ * Returns 0 on success, the error of qeth_prepare_skb() or
+ * qeth_fill_buffer(), or -EINVAL if the packet needs more buffer
+ * elements than QETH_MAX_BUFFER_ELEMENTS(card).
+ */
+static inline int
+qeth_do_send_packet(struct qeth_card *card, struct sk_buff *skb,
+		    struct qeth_qdio_out_q *queue, int ipv,
+		    int cast_type)
+{
+	struct qeth_hdr *hdr;
+	struct qeth_qdio_out_buffer *buffer;
+	int elements_needed;
+	int rc;
+
+	QETH_DBF_TEXT(trace, 6, "dosndpkt");
+
+	/* may replace the local skb pointer with a reallocated skb */
+	if ((rc = qeth_prepare_skb(card, &skb, &hdr, ipv))){
+		QETH_DBF_TEXT_(trace, 4, "1err%d", rc);
+		return rc;
+	}
+	qeth_fill_header(card, hdr, skb, ipv, cast_type);
+	/* one element, plus one per page boundary crossed from hdr on */
+	elements_needed = 1 + (((((unsigned long) hdr) % PAGE_SIZE) + skb->len)
+				>> PAGE_SHIFT);
+	if (elements_needed > QETH_MAX_BUFFER_ELEMENTS(card)){
+		PRINT_ERR("qeth_do_send_packet: invalid size of "
+				 "IP packet. Discarded.");
+		return -EINVAL;
+	}
+
+	/* queue->lock is held while the buffer state is inspected/changed */
+	spin_lock(&queue->lock);
+	/* check if we need to switch packing state of this queue */
+	if (card->info.type != QETH_CARD_TYPE_IQD)
+		qeth_switch_packing_state(queue);
+	buffer = &queue->bufs[queue->next_buf_to_fill];
+	BUG_ON(buffer->state == QETH_QDIO_BUF_PRIMED);
+	if (queue->do_pack){
+		/* does packet fit in current buffer? */
+		if((QETH_MAX_BUFFER_ELEMENTS(card) - buffer->next_element_to_fill)
+				< elements_needed){
+			/* ... no -> set state PRIMED */
+			buffer->state = QETH_QDIO_BUF_PRIMED;
+			atomic_inc(&queue->used_buffers);
+			/* advance to the following buffer, wrapping around */
+			queue->next_buf_to_fill =
+				(queue->next_buf_to_fill + 1) %
+				QDIO_MAX_BUFFERS_PER_Q;
+			buffer = &queue->bufs[queue->next_buf_to_fill];
+		}
+	}
+
+	rc = qeth_fill_buffer(queue, buffer, (char *)hdr, skb);
+	if (rc) {
+		PRINT_WARN("qeth_do_send_packet: error during "
+			      "qeth_fill_buffer.");
+		card->stats.tx_dropped++;
+		spin_unlock(&queue->lock);
+		return rc;
+	}
+	if (buffer->state == QETH_QDIO_BUF_PRIMED){
+		/* next time fill the next buffer */
+		atomic_inc(&queue->used_buffers);
+		queue->next_buf_to_fill = (queue->next_buf_to_fill + 1) %
+			QDIO_MAX_BUFFERS_PER_Q;
+	}
+	spin_unlock(&queue->lock);
+
+	/* scheduled after the lock is dropped */
+	tasklet_schedule(&queue->tasklet);
+
+	return rc;
+}
+
+/*
+ * Classify skb (IP version, cast type), choose its outbound queue and
+ * hand it to qeth_do_send_packet().
+ *
+ * Returns -EBUSY (and counts a dropped packet) when the queue has fewer
+ * than two free buffers, otherwise the result of qeth_do_send_packet().
+ * The tx counters are only updated on success.
+ */
+static inline int
+qeth_send_packet(struct qeth_card *card, struct sk_buff *skb)
+{
+	struct qeth_qdio_out_q *queue;
+	int cast_type;
+	int ipv;
+	int rc;
+
+	QETH_DBF_TEXT(trace, 6, "sendpkt");
+
+	ipv = qeth_get_ip_version(skb);
+	cast_type = qeth_get_cast_type(card, skb);
+	queue = card->qdio.out_qs
+		[qeth_get_priority_queue(card, skb, ipv, cast_type)];
+
+	/* do we have empty buffers? */
+	if (atomic_read(&queue->used_buffers) >= QDIO_MAX_BUFFERS_PER_Q - 1) {
+		rc = -EBUSY;
+		card->stats.tx_dropped++;
+		QETH_DBF_TEXT_(trace, 4, "1err%d", rc);
+		return rc;
+	}
+
+	rc = qeth_do_send_packet(card, skb, queue, ipv, cast_type);
+	if (rc)
+		return rc;
+
+	card->stats.tx_packets++;
+	card->stats.tx_bytes += skb->len;
+#ifdef CONFIG_QETH_PERF_STATS
+	card->perf_stats.outbound_time += qeth_get_micros() -
+		card->perf_stats.outbound_start_time;
+#endif
+	return 0;
+}
+
+/*
+ * Emulate a read of MII register regnum.
+ *
+ * Only BMCR, BMSR, PHYSID1/2, ADVERTISE, LPA and RERRCOUNTER yield a
+ * value.  Every other register (EXPANSION, DCOUNTER, FCSCOUNTER,
+ * NWAYTEST, SREVISION, RESV1, LBRERROR, PHYADDR, RESV2, TPISTATUS,
+ * NCONFIG and unknown numbers) reads as 0.  phy_id is not evaluated.
+ */
+static int
+qeth_mdio_read(struct net_device *dev, int phy_id, int regnum)
+{
+	struct qeth_card *card = (struct qeth_card *) dev->priv;
+	int mac_hi;
+
+	switch (regnum) {
+	case MII_BMCR: /* Basic mode control register */
+		if (card->info.link_type != QETH_LINK_TYPE_GBIT_ETH)
+			return BMCR_FULLDPLX | BMCR_SPEED100;
+		return BMCR_FULLDPLX;
+	case MII_BMSR: /* Basic mode status register */
+		return BMSR_ERCAP | BMSR_ANEGCOMPLETE | BMSR_LSTATUS |
+		       BMSR_10HALF | BMSR_10FULL | BMSR_100HALF |
+		       BMSR_100FULL | BMSR_100BASE4;
+	case MII_PHYSID1: /* PHYS ID 1 */
+		mac_hi = (dev->dev_addr[0] << 16) |
+			 (dev->dev_addr[1] << 8) | dev->dev_addr[2];
+		return (mac_hi >> 5) & 0xFFFF;
+	case MII_PHYSID2: /* PHYS ID 2 */
+		return (dev->dev_addr[2] << 10) & 0xFFFF;
+	case MII_ADVERTISE: /* Advertisement control reg */
+		return ADVERTISE_ALL;
+	case MII_LPA: /* Link partner ability reg */
+		return LPA_10HALF | LPA_10FULL | LPA_100HALF | LPA_100FULL |
+		       LPA_100BASE4 | LPA_LPACK;
+	case MII_RERRCOUNTER: /* rx error counter */
+		return card->stats.rx_errors;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Emulate a write to MII register regnum.  No register is writable at
+ * present, so the value is discarded whatever regnum is; the switch is
+ * kept as the place to add handling for individual registers.
+ */
+static void
+qeth_mdio_write(struct net_device *dev, int phy_id, int regnum, int value)
+{
+	switch (regnum) {
+	case MII_BMCR:        case MII_BMSR:
+	case MII_PHYSID1:     case MII_PHYSID2:
+	case MII_ADVERTISE:   case MII_LPA:
+	case MII_EXPANSION:   case MII_DCOUNTER:
+	case MII_FCSCOUNTER:  case MII_NWAYTEST:
+	case MII_RERRCOUNTER: case MII_SREVISION:
+	case MII_RESV1:       case MII_LBRERROR:
+	case MII_PHYADDR:     case MII_RESV2:
+	case MII_TPISTATUS:   case MII_NCONFIG:
+	default:
+		break;
+	}
+}
+
+/*
+ * Convert an ARP assist return code into an errno and a description.
+ *
+ * @rc: in/out; known QETH_IPA_ARP_RC_* codes are replaced by a negative
+ *      errno value, unknown codes are left untouched.
+ *
+ * Returns a constant string describing the error.
+ */
+static inline const char *
+qeth_arp_get_error_cause(int *rc)
+{
+	const char *cause;
+
+	switch (*rc) {
+	case QETH_IPA_ARP_RC_FAILED:
+		*rc = -EIO;
+		cause = "operation failed";
+		break;
+	case QETH_IPA_ARP_RC_NOTSUPP:
+		*rc = -EOPNOTSUPP;
+		cause = "operation not supported";
+		break;
+	case QETH_IPA_ARP_RC_OUT_OF_RANGE:
+		*rc = -EINVAL;
+		cause = "argument out of range";
+		break;
+	case QETH_IPA_ARP_RC_Q_NOTSUPP:
+		*rc = -EOPNOTSUPP;
+		cause = "query operation not supported";
+		break;
+	case QETH_IPA_ARP_RC_Q_NO_DATA:
+		*rc = -ENOENT;
+		cause = "no query data available";
+		break;
+	default:
+		cause = "unknown error";
+		break;
+	}
+	return cause;
+}
+
+static int
+qeth_send_simple_setassparms(struct qeth_card *, enum qeth_ipa_funcs,
+			     __u16, long);
+
+/*
+ * Set the number of ARP cache entries on the card.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for GuestLAN devices or cards
+ * without ARP processing support, otherwise the command's return code
+ * after translation by qeth_arp_get_error_cause().
+ */
+static int
+qeth_arp_set_no_entries(struct qeth_card *card, int no_entries)
+{
+	int ipa_rc;
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"arpstnoe");
+
+	/* TODO: really not supported by GuestLAN? */
+	if (card->info.guestlan)
+		return -EOPNOTSUPP;
+	if (!qeth_is_supported(card,IPA_ARP_PROCESSING)) {
+		PRINT_WARN("ARP processing not supported "
+			   "on %s!\n", card->info.if_name);
+		return -EOPNOTSUPP;
+	}
+
+	rc = qeth_send_simple_setassparms(card, IPA_ARP_PROCESSING,
+					  IPA_CMD_ASS_ARP_SET_NO_ENTRIES,
+					  no_entries);
+	if (!rc)
+		return 0;
+
+	/* keep the raw code for the message, rc is turned into an errno */
+	ipa_rc = rc;
+	PRINT_WARN("Could not set number of ARP entries on %s: "
+		   "%s (0x%x)\n",
+		   card->info.if_name, qeth_arp_get_error_cause(&rc),
+		   ipa_rc);
+	return rc;
+}
+
+/*
+ * Reply callback for the ARP query command.
+ *
+ * @card:  card the reply arrived on (not used here)
+ * @reply: reply->param is the struct qeth_arp_query_info set up by the
+ *         requester; entries are accumulated in its udata buffer
+ * @data:  address of the received struct qeth_ipa_arp_cmd
+ *
+ * Returns 1 while seq_no is below number_of_replies (presumably asking
+ * the caller to wait for further reply parts -- confirm against the
+ * reply dispatcher) and 0 otherwise, including all error cases.
+ */
+static int
+qeth_arp_query_cb(struct qeth_card *card, struct qeth_reply *reply,
+		  unsigned long data)
+{
+	struct qeth_ipa_arp_cmd *cmd;
+	struct qeth_arp_query_data *qdata;
+	struct qeth_arp_query_info *qinfo;
+	int entry_size;
+	int i;
+
+	QETH_DBF_TEXT(trace,4,"arpquecb");
+
+	qinfo = (struct qeth_arp_query_info *) reply->param;
+	cmd = (struct qeth_ipa_arp_cmd *) data;
+	if (cmd->ihdr.return_code) {
+		QETH_DBF_TEXT_(trace,4,"qaer1%i", cmd->ihdr.return_code);
+		return 0;
+	}
+	if (cmd->shdr.return_code) {
+		/* propagate the sub-header error through the IPA header */
+		cmd->ihdr.return_code = cmd->shdr.return_code;
+		QETH_DBF_TEXT_(trace,4,"qaer2%i", cmd->ihdr.return_code);
+		return 0;
+	}
+	qdata = &cmd->data.query_arp;
+	/* reply_bits selects the layout (and thus size) of each entry */
+	switch(qdata->reply_bits){
+	case 5:
+		entry_size = sizeof(struct qeth_arp_qi_entry5);
+		break;
+	case 7:
+		entry_size = sizeof(struct qeth_arp_qi_entry7);
+		break;
+	default:
+		/* tr is the same as eth -> entry7 */
+		entry_size = sizeof(struct qeth_arp_qi_entry7);
+		break;
+	}
+	/* check if there is enough room in userspace */
+	if ((qinfo->udata_len - qinfo->udata_offset) <
+			qdata->no_entries * entry_size){
+		QETH_DBF_TEXT_(trace, 4, "qaer3%i", -ENOMEM);
+		cmd->ihdr.return_code = -ENOMEM;
+		goto out_error;
+	}
+	QETH_DBF_TEXT_(trace, 4, "anore%i", cmd->shdr.number_of_replies);
+	QETH_DBF_TEXT_(trace, 4, "aseqn%i", cmd->shdr.seq_no);
+	QETH_DBF_TEXT_(trace, 4, "anoen%i", qdata->no_entries);
+	/* append this reply's entries behind those already collected */
+	for (i = 0; i < qdata->no_entries; ++i){
+		memcpy(qinfo->udata + qinfo->udata_offset,
+		       qdata->data + i*entry_size, entry_size);
+		qinfo->no_entries++;
+		qinfo->udata_offset += entry_size;
+	}
+	/* check if all replies received ... */
+	if (cmd->shdr.seq_no < cmd->shdr.number_of_replies)
+		return 1;
+	/* last part: store the entry count and reply_bits in the header */
+	memcpy(qinfo->udata, &qinfo->no_entries, 4);
+	memcpy(qinfo->udata + QETH_QARP_MASK_OFFSET,&qdata->reply_bits,2);
+	return 0;
+out_error:
+	/* report zero entries in the first 4 bytes of the buffer */
+	i = 0;
+	memcpy(qinfo->udata, &i, 4);
+	return 0;
+}
+
+static struct qeth_cmd_buffer *
+qeth_get_ipacmd_buffer(struct qeth_card *, enum qeth_ipa_cmds,
+		       enum qeth_prot_versions);
+
+/*
+ * Get a SETASSPARMS command buffer prepared for an ARP assist command.
+ *
+ * The IPA PDU header is copied in with its length fields adjusted for
+ * data_len bytes of payload, and the ARP sub-header is initialised with
+ * cmd_code.  The payload itself is not written here.
+ */
+struct qeth_cmd_buffer *
+qeth_get_ipa_arp_cmd_buffer(struct qeth_card *card, u16 cmd_code,
+			    u32 data_len, enum qeth_prot_versions proto)
+{
+	struct qeth_cmd_buffer *iob;
+	struct qeth_ipa_arp_cmd *cmd;
+	u16 total_len;
+	u16 pdu_len;
+
+	QETH_DBF_TEXT(trace,4,"getarpcm");
+	iob = qeth_get_ipacmd_buffer(card, IPA_CMD_SETASSPARMS, proto);
+
+	/* adjust sizes in IPA_PDU_HEADER */
+	pdu_len = (u32) QETH_ARP_CMD_BASE_LEN + data_len;
+	total_len = (u32) IPA_PDU_HEADER_SIZE + QETH_ARP_CMD_BASE_LEN +
+		    data_len;
+	memcpy(iob->data, IPA_PDU_HEADER, IPA_PDU_HEADER_SIZE);
+	memcpy(QETH_IPA_PDU_LEN_TOTAL(iob->data), &total_len, 2);
+	memcpy(QETH_IPA_PDU_LEN_PDU1(iob->data), &pdu_len, 2);
+	memcpy(QETH_IPA_PDU_LEN_PDU2(iob->data), &pdu_len, 2);
+	memcpy(QETH_IPA_PDU_LEN_PDU3(iob->data), &pdu_len, 2);
+
+	cmd = (struct qeth_ipa_arp_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	cmd->shdr.assist_no = IPA_ARP_PROCESSING;
+	cmd->shdr.command_code = cmd_code;
+	cmd->shdr.length = 8 + data_len;
+	cmd->shdr.return_code = 0;
+	cmd->shdr.seq_no = 0;
+
+	return iob;
+}
+
+/*
+ * Copy data_len bytes of payload and the ULP connection token into the
+ * prepared ARP command buffer and send it as control data.
+ *
+ * The payload is copied before sending, so the caller's data buffer is
+ * not referenced afterwards.  Returns the result of
+ * qeth_send_control_data().
+ */
+static int
+qeth_send_ipa_arp_cmd(struct qeth_card *card, struct qeth_cmd_buffer *iob,
+		      char *data, int data_len,
+		      int (*reply_cb)
+		      (struct qeth_card *,struct qeth_reply*, unsigned long),
+		      void *reply_param)
+{
+	QETH_DBF_TEXT(trace,4,"sendarp");
+
+	memcpy(QETH_IPA_ARP_DATA_POS(iob->data), data, data_len);
+	memcpy(QETH_IPA_CMD_DEST_ADDR(iob->data),
+	       &card->token.ulp_connection_r, QETH_MPC_TOKEN_LENGTH);
+
+	return qeth_send_control_data(card,
+				      IPA_PDU_HEADER_SIZE +
+				      QETH_ARP_CMD_BASE_LEN + data_len,
+				      iob, reply_cb, reply_param);
+}
+
+/*
+ * Query the card's ARP cache and copy the result to user space.
+ *
+ * @card:  card to query
+ * @udata: user buffer; its first 4 bytes hold the buffer size on entry.
+ *         On success the whole buffer is overwritten with the result,
+ *         on a command error only the first 4 bytes are written back.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if ARP processing is unavailable,
+ * -EINVAL if the user buffer cannot hold the reply header, -ENOMEM on
+ * allocation failure, -EFAULT if user memory cannot be accessed, or the
+ * command's return code translated by qeth_arp_get_error_cause().
+ */
+static int
+qeth_arp_query(struct qeth_card *card, char *udata)
+{
+	struct qeth_cmd_buffer *iob;
+	struct qeth_arp_query_data *qdata;
+	struct qeth_arp_query_info qinfo = {0, };
+	int tmp;
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"arpquery");
+
+	/* TODO: really not supported by GuestLAN? */
+	if (card->info.guestlan)
+		return -EOPNOTSUPP;
+	if (!qeth_is_supported(card,IPA_ARP_PROCESSING)) {
+		PRINT_WARN("ARP processing not supported "
+			   "on %s!\n", card->info.if_name);
+		return -EOPNOTSUPP;
+	}
+	/* get size of userspace mem area */
+	if (copy_from_user(&qinfo.udata_len, udata, 4))
+		return -EFAULT;
+	/*
+	 * The reply callback stores a header in front of the entries and
+	 * computes free space as udata_len - udata_offset, so a buffer
+	 * shorter than the header would be overrun.
+	 * NOTE(review): assumes the header (entry count and mask) ends at
+	 * QETH_QARP_ENTRIES_OFFSET -- confirm against the definitions.
+	 */
+	if (qinfo.udata_len < QETH_QARP_ENTRIES_OFFSET)
+		return -EINVAL;
+	if (!(qinfo.udata = kmalloc(qinfo.udata_len, GFP_KERNEL)))
+		return -ENOMEM;
+	memset(qinfo.udata, 0, qinfo.udata_len);
+	qinfo.udata_offset = QETH_QARP_ENTRIES_OFFSET;
+	/* alloc mem area for the actual query */
+	if (!(qdata = kmalloc(sizeof(struct qeth_arp_query_data),
+			      GFP_KERNEL))){
+		kfree(qinfo.udata);
+		return -ENOMEM;
+	}
+	memset(qdata, 0, sizeof(struct qeth_arp_query_data));
+	iob = qeth_get_ipa_arp_cmd_buffer(card, IPA_CMD_ASS_ARP_QUERY_INFO,
+					  sizeof(struct qeth_arp_query_data),
+					  QETH_PROT_IPV4);
+	rc = qeth_send_ipa_arp_cmd(card, iob,
+				   (char *) qdata,
+				   sizeof(struct qeth_arp_query_data),
+				   qeth_arp_query_cb,
+				   (void *)&qinfo);
+	/* the request was copied into iob before it was sent */
+	kfree(qdata);
+	if (rc) {
+		tmp = rc;
+		PRINT_WARN("Error while querying ARP cache on %s: %s (0x%x)\n",
+			   card->info.if_name, qeth_arp_get_error_cause(&rc),
+			   tmp);
+		if (copy_to_user(udata, qinfo.udata, 4))
+			rc = -EFAULT;
+	} else {
+		if (copy_to_user(udata, qinfo.udata, qinfo.udata_len))
+			rc = -EFAULT;
+	}
+	kfree(qinfo.udata);
+	return rc;
+}
+
+static int
+qeth_default_setassparms_cb(struct qeth_card *, struct qeth_reply *,
+			    unsigned long);
+
+static struct qeth_cmd_buffer *
+qeth_get_setassparms_cmd(struct qeth_card *, enum qeth_ipa_funcs,
+			 __u16, __u16, enum qeth_prot_versions);
+
+static int
+qeth_send_setassparms(struct qeth_card *, struct qeth_cmd_buffer *,
+		      __u16, long,
+		      int (*reply_cb)
+		      (struct qeth_card *, struct qeth_reply *, unsigned long),
+		      void *reply_param);
+
+/*
+ * Add a static entry to the card's ARP cache.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for GuestLAN devices or cards
+ * without ARP processing support, otherwise the command's return code
+ * after translation by qeth_arp_get_error_cause().
+ */
+static int
+qeth_arp_add_entry(struct qeth_card *card, struct qeth_arp_cache_entry *entry)
+{
+	struct qeth_cmd_buffer *iob;
+	char addr_str[16];
+	int ipa_rc;
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"arpadent");
+
+	/* TODO: really not supported by GuestLAN? */
+	if (card->info.guestlan)
+		return -EOPNOTSUPP;
+	if (!qeth_is_supported(card,IPA_ARP_PROCESSING)) {
+		PRINT_WARN("ARP processing not supported "
+			   "on %s!\n", card->info.if_name);
+		return -EOPNOTSUPP;
+	}
+
+	iob = qeth_get_setassparms_cmd(card, IPA_ARP_PROCESSING,
+				       IPA_CMD_ASS_ARP_ADD_ENTRY,
+				       sizeof(struct qeth_arp_cache_entry),
+				       QETH_PROT_IPV4);
+	rc = qeth_send_setassparms(card, iob,
+				   sizeof(struct qeth_arp_cache_entry),
+				   (unsigned long) entry,
+				   qeth_default_setassparms_cb, NULL);
+	if (!rc)
+		return 0;
+
+	/* keep the raw code for the message, rc is turned into an errno */
+	ipa_rc = rc;
+	qeth_ipaddr4_to_string((u8 *)entry->ipaddr, addr_str);
+	PRINT_WARN("Could not add ARP entry for address %s on %s: "
+		   "%s (0x%x)\n",
+		   addr_str, card->info.if_name,
+		   qeth_arp_get_error_cause(&rc), ipa_rc);
+	return rc;
+}
+
+/*
+ * Remove an entry from the card's ARP cache.  Only the first 12 bytes
+ * of the entry are sent to the card.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for GuestLAN devices or cards
+ * without ARP processing support, otherwise the command's return code
+ * after translation by qeth_arp_get_error_cause().
+ */
+static int
+qeth_arp_remove_entry(struct qeth_card *card, struct qeth_arp_cache_entry *entry)
+{
+	struct qeth_cmd_buffer *iob;
+	char key[16] = {0, };
+	char addr_str[16] = {0, };
+	int ipa_rc;
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"arprment");
+
+	/* TODO: really not supported by GuestLAN? */
+	if (card->info.guestlan)
+		return -EOPNOTSUPP;
+	if (!qeth_is_supported(card,IPA_ARP_PROCESSING)) {
+		PRINT_WARN("ARP processing not supported "
+			   "on %s!\n", card->info.if_name);
+		return -EOPNOTSUPP;
+	}
+
+	memcpy(key, entry, 12);
+	iob = qeth_get_setassparms_cmd(card, IPA_ARP_PROCESSING,
+				       IPA_CMD_ASS_ARP_REMOVE_ENTRY,
+				       12,
+				       QETH_PROT_IPV4);
+	rc = qeth_send_setassparms(card, iob,
+				   12, (unsigned long)key,
+				   qeth_default_setassparms_cb, NULL);
+	if (!rc)
+		return 0;
+
+	/* keep the raw code for the message, rc is turned into an errno */
+	ipa_rc = rc;
+	qeth_ipaddr4_to_string((u8 *)entry->ipaddr, addr_str);
+	PRINT_WARN("Could not delete ARP entry for address %s on %s: "
+		   "%s (0x%x)\n",
+		   addr_str, card->info.if_name,
+		   qeth_arp_get_error_cause(&rc), ipa_rc);
+	return rc;
+}
+
+/*
+ * Flush the card's ARP cache.
+ *
+ * Returns 0 on success, -EOPNOTSUPP for GuestLAN devices or cards
+ * without ARP processing support, otherwise the command's return code
+ * after translation by qeth_arp_get_error_cause().
+ */
+static int
+qeth_arp_flush_cache(struct qeth_card *card)
+{
+	int ipa_rc;
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"arpflush");
+
+	/* TODO: really not supported by GuestLAN? */
+	if (card->info.guestlan)
+		return -EOPNOTSUPP;
+	if (!qeth_is_supported(card,IPA_ARP_PROCESSING)) {
+		PRINT_WARN("ARP processing not supported "
+			   "on %s!\n", card->info.if_name);
+		return -EOPNOTSUPP;
+	}
+
+	rc = qeth_send_simple_setassparms(card, IPA_ARP_PROCESSING,
+					  IPA_CMD_ASS_ARP_FLUSH_CACHE, 0);
+	if (!rc)
+		return 0;
+
+	/* keep the raw code for the message, rc is turned into an errno */
+	ipa_rc = rc;
+	PRINT_WARN("Could not flush ARP cache on %s: %s (0x%x)\n",
+		   card->info.if_name, qeth_arp_get_error_cause(&rc),
+		   ipa_rc);
+	return rc;
+}
+
+/*
+ * ioctl handler of the qeth network device.
+ *
+ * Handles the private ARP ioctls (each reachable both through its
+ * SIOCDEVPRIVATE+n number and its SIOC_QETH_* name) and the MII ioctls.
+ * All ARP ioctls require CAP_NET_ADMIN.
+ *
+ * Returns 0 on success, -ENODEV if there is no card or the card is not
+ * in one of the UP states, -EPERM without the required capability,
+ * -EFAULT if the user data cannot be read, -EINVAL for a non-zero PHY id
+ * and -EOPNOTSUPP for SIOCSMIIREG and unknown commands; otherwise the
+ * result of the called ARP helper.
+ */
+static int
+qeth_do_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
+{
+	struct qeth_card *card = (struct qeth_card *)dev->priv;
+	struct qeth_arp_cache_entry arp_entry;
+	struct mii_ioctl_data *mii_data;
+	int rc = 0;
+
+	if (!card)
+		return -ENODEV;
+
+	if ((card->state != CARD_STATE_UP_LAN_ONLINE) &&
+	    (card->state != CARD_STATE_UP_LAN_OFFLINE))
+		return -ENODEV;
+
+	switch (cmd){
+	case SIOCDEVPRIVATE:
+	case SIOC_QETH_ARP_SET_NO_ENTRIES:
+		if (!capable(CAP_NET_ADMIN)){
+			rc = -EPERM;
+			break;
+		}
+		rc = qeth_arp_set_no_entries(card, rq->ifr_ifru.ifru_ivalue);
+		break;
+	case SIOCDEVPRIVATE+1:
+	case SIOC_QETH_ARP_QUERY_INFO:
+		if (!capable(CAP_NET_ADMIN)){
+			rc = -EPERM;
+			break;
+		}
+		rc = qeth_arp_query(card, rq->ifr_ifru.ifru_data);
+		break;
+	case SIOCDEVPRIVATE+2:
+	case SIOC_QETH_ARP_ADD_ENTRY:
+		if (!capable(CAP_NET_ADMIN)){
+			rc = -EPERM;
+			break;
+		}
+		/* the entry is copied in from the user pointer in the ifreq */
+		if (copy_from_user(&arp_entry, rq->ifr_ifru.ifru_data,
+				   sizeof(struct qeth_arp_cache_entry)))
+			rc = -EFAULT;
+		else
+			rc = qeth_arp_add_entry(card, &arp_entry);
+		break;
+	case SIOCDEVPRIVATE+3:
+	case SIOC_QETH_ARP_REMOVE_ENTRY:
+		if (!capable(CAP_NET_ADMIN)){
+			rc = -EPERM;
+			break;
+		}
+		if (copy_from_user(&arp_entry, rq->ifr_ifru.ifru_data,
+				   sizeof(struct qeth_arp_cache_entry)))
+			rc = -EFAULT;
+		else
+			rc = qeth_arp_remove_entry(card, &arp_entry);
+		break;
+	case SIOCDEVPRIVATE+4:
+	case SIOC_QETH_ARP_FLUSH_CACHE:
+		if (!capable(CAP_NET_ADMIN)){
+			rc = -EPERM;
+			break;
+		}
+		rc = qeth_arp_flush_cache(card);
+		break;
+	/* the next two commands are accepted but do nothing (rc stays 0) */
+	case SIOCDEVPRIVATE+5:
+	case SIOC_QETH_ADP_SET_SNMP_CONTROL:
+		break;
+	case SIOCDEVPRIVATE+6:
+	case SIOC_QETH_GET_CARD_TYPE:
+		break;
+	case SIOCGMIIPHY:
+		/* the MII data is overlaid on the ifreq union itself */
+		mii_data = (struct mii_ioctl_data *) &rq->ifr_ifru.ifru_data;
+		mii_data->phy_id = 0;
+		break;
+	case SIOCGMIIREG:
+		mii_data = (struct mii_ioctl_data *) &rq->ifr_ifru.ifru_data;
+		if (mii_data->phy_id != 0)
+			rc = -EINVAL;
+		else
+			mii_data->val_out = qeth_mdio_read(dev,mii_data->phy_id,
+							   mii_data->reg_num);
+		break;
+	case SIOCSMIIREG:
+		rc = -EOPNOTSUPP;
+		break;
+		/*
+		 * Everything below up to the next break is unreachable on
+		 * purpose while qeth_mdio_write() is a no-op.
+		 */
+		/* TODO: remove return if qeth_mdio_write does something */
+		if (!capable(CAP_NET_ADMIN)){
+			rc = -EPERM;
+			break;
+		}
+		mii_data = (struct mii_ioctl_data *) &rq->ifr_ifru.ifru_data;
+		if (mii_data->phy_id != 0)
+			rc = -EINVAL;
+		else
+			qeth_mdio_write(dev, mii_data->phy_id, mii_data->reg_num,
+					mii_data->val_in);
+		break;
+	default:
+		rc = -EOPNOTSUPP;
+	}
+	return rc;
+}
+
+/*
+ * Return the interface statistics kept in the card structure that
+ * dev->priv points to.
+ */
+static struct net_device_stats *
+qeth_get_stats(struct net_device *dev)
+{
+	struct qeth_card *card = (struct qeth_card *) (dev->priv);
+
+	QETH_DBF_TEXT(trace,5,"getstat");
+
+	return &card->stats;
+}
+
+/*
+ * Change the MTU of the interface.
+ *
+ * The new value must lie in 64..65535.  Unless the card supports IP
+ * fragmentation it must additionally pass qeth_mtu_is_valid().
+ * Returns 0 on success and -EINVAL otherwise.
+ */
+static int
+qeth_change_mtu(struct net_device *dev, int new_mtu)
+{
+	struct qeth_card *card = (struct qeth_card *) (dev->priv);
+	char dbf_text[15];
+
+	QETH_DBF_TEXT(trace,4,"chgmtu");
+	sprintf(dbf_text, "%8x", new_mtu);
+	QETH_DBF_TEXT(trace,4,dbf_text);
+
+	if ((new_mtu < 64) || (new_mtu > 65535))
+		return -EINVAL;
+	if (!(qeth_is_supported(card,IPA_IP_FRAGMENTATION) ||
+	      qeth_mtu_is_valid(card, new_mtu)))
+		return -EINVAL;
+
+	dev->mtu = new_mtu;
+	return 0;
+}
+
+#ifdef CONFIG_QETH_VLAN
+/*
+ * Remember the VLAN group registered for this device.  The pointer is
+ * stored under card->vlanlock.
+ */
+static void
+qeth_vlan_rx_register(struct net_device *dev, struct vlan_group *grp)
+{
+	struct qeth_card *card = (struct qeth_card *) dev->priv;
+
+	QETH_DBF_TEXT(trace,4,"vlanreg");
+
+	spin_lock_irq(&card->vlanlock);
+	card->vlangrp = grp;
+	spin_unlock_irq(&card->vlanlock);
+}
+
+/*
+ * Drop the VLAN device with id vid from the card's VLAN group (if a
+ * group is registered) and request a run of the multicast thread.
+ */
+static void
+qeth_vlan_rx_kill_vid(struct net_device *dev, unsigned short vid)
+{
+	struct qeth_card *card = (struct qeth_card *) dev->priv;
+
+	QETH_DBF_TEXT(trace,4,"vlkilvid");
+
+	spin_lock_irq(&card->vlanlock);
+	if (card->vlangrp)
+		card->vlangrp->vlan_devices[vid] = NULL;
+	spin_unlock_irq(&card->vlanlock);
+
+	/* delete mc addresses for this vlan dev */
+	qeth_set_thread_start_bit(card, QETH_SET_MC_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+}
+#endif
+
+/*
+ * Neighbour setup hook.  Nothing needs to be adjusted, so neither
+ * argument is touched and success is reported unconditionally.
+ */
+static int
+qeth_neigh_setup(struct net_device *dev, struct neigh_parms *np)
+{
+	return 0;
+}
+
+#ifdef CONFIG_QETH_IPV6
+/*
+ * Build an 8 byte interface identifier for dev in eui.
+ *
+ * The first and last three bytes come from the device address, the two
+ * bytes in between from dev->dev_id.  Only ARPHRD_ETHER, ARPHRD_FDDI
+ * and ARPHRD_IEEE802_TR devices with an ETH_ALEN sized address are
+ * handled.  Returns 0 on success and -1 otherwise.
+ */
+int
+qeth_ipv6_generate_eui64(u8 * eui, struct net_device *dev)
+{
+	if ((dev->type != ARPHRD_ETHER) &&
+	    (dev->type != ARPHRD_FDDI) &&
+	    (dev->type != ARPHRD_IEEE802_TR))
+		return -1;
+	if (dev->addr_len != ETH_ALEN)
+		return -1;
+
+	memcpy(eui, dev->dev_addr, 3);
+	eui[3] = (dev->dev_id >> 8) & 0xff;
+	eui[4] = dev->dev_id & 0xff;
+	memcpy(eui + 5, dev->dev_addr + 3, 3);
+	return 0;
+}
+#endif
+
+/*
+ * Map the IPv4 multicast address ipm to a link layer multicast address
+ * in mac, using the token ring mapping for ARPHRD_IEEE802_TR devices
+ * and the ethernet mapping for all others.
+ */
+static void
+qeth_get_mac_for_ipm(__u32 ipm, char *mac, struct net_device *dev)
+{
+	if (dev->type != ARPHRD_IEEE802_TR) {
+		ip_eth_mc_map(ipm, mac);
+		return;
+	}
+	ip_tr_mc_map(ipm, mac);
+}
+
+/*
+ * Allocate (GFP_ATOMIC) and initialise an address entry for protocol
+ * prot.  The entry is zeroed, which leaves is_multicast, users,
+ * set_flags and del_flags at 0; only type and proto are set explicitly.
+ *
+ * Returns the new entry, or NULL if the allocation failed.
+ */
+static struct qeth_ipaddr *
+qeth_get_addr_buffer(enum qeth_prot_versions prot)
+{
+	struct qeth_ipaddr *entry;
+
+	entry = kmalloc(sizeof(struct qeth_ipaddr), GFP_ATOMIC);
+	if (!entry) {
+		PRINT_WARN("Not enough memory to add address\n");
+		return NULL;
+	}
+	memset(entry, 0, sizeof(struct qeth_ipaddr));
+	entry->type = QETH_IP_TYPE_NORMAL;
+	entry->proto = prot;
+	return entry;
+}
+
+/*
+ * Queue a removal request for every multicast address on the card's
+ * IP list.
+ *
+ * For each multicast entry a copy with negated user count is handed to
+ * __qeth_insert_ip_todo(); the copy is freed again when that function
+ * returns 0.  The list is walked under card->ip_lock with interrupts
+ * disabled.  Entries for which no copy can be allocated are skipped.
+ */
+static void
+qeth_delete_mc_addresses(struct qeth_card *card)
+{
+	struct qeth_ipaddr *ipm, *iptodo;
+	unsigned long flags;
+
+	QETH_DBF_TEXT(trace,4,"delmc");
+	spin_lock_irqsave(&card->ip_lock, flags);
+	list_for_each_entry(ipm, &card->ip_list, entry){
+		if (!ipm->is_multicast)
+			continue;
+		iptodo = qeth_get_addr_buffer(ipm->proto);
+		/* the atomic allocation may fail; do not copy into NULL */
+		if (!iptodo)
+			continue;
+		memcpy(iptodo, ipm, sizeof(struct qeth_ipaddr));
+		iptodo->users = iptodo->users * -1;
+		if (!__qeth_insert_ip_todo(card, iptodo, 0))
+			kfree(iptodo);
+	}
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+}
+
+/*
+ * Register every IPv4 multicast address on in4_dev's mc_list with the
+ * card.  Addresses for which no entry can be allocated are skipped; an
+ * entry is freed again when qeth_add_ip() returns 0.
+ */
+static inline void
+qeth_add_mc(struct qeth_card *card, struct in_device *in4_dev)
+{
+	struct ip_mc_list *mc = in4_dev->mc_list;
+	struct qeth_ipaddr *entry;
+	char mac[MAX_ADDR_LEN];
+
+	QETH_DBF_TEXT(trace,4,"addmc");
+	while (mc) {
+		qeth_get_mac_for_ipm(mc->multiaddr, mac, in4_dev->dev);
+		entry = qeth_get_addr_buffer(QETH_PROT_IPV4);
+		if (entry) {
+			entry->u.a4.addr = mc->multiaddr;
+			memcpy(entry->mac,mac,OSA_ADDR_LEN);
+			entry->is_multicast = 1;
+			if (!qeth_add_ip(card,entry))
+				kfree(entry);
+		}
+		mc = mc->next;
+	}
+}
+
+/*
+ * Register the IPv4 multicast addresses of all VLAN devices of the
+ * card's VLAN group that are up.  Does nothing without
+ * CONFIG_QETH_VLAN, without IPA_FULL_VLAN support or without a
+ * registered VLAN group.
+ */
+static inline void
+qeth_add_vlan_mc(struct qeth_card *card)
+{
+#ifdef CONFIG_QETH_VLAN
+	struct in_device *in_dev;
+	struct vlan_group *vg;
+	int vid;
+
+	QETH_DBF_TEXT(trace,4,"addmcvl");
+	if (!qeth_is_supported(card,IPA_FULL_VLAN))
+		return;
+	if (card->vlangrp == NULL)
+		return;
+
+	vg = card->vlangrp;
+	for (vid = 0; vid < VLAN_GROUP_ARRAY_LEN; vid++) {
+		if (vg->vlan_devices[vid] == NULL)
+			continue;
+		if (!(vg->vlan_devices[vid]->flags & IFF_UP))
+			continue;
+		in_dev = in_dev_get(vg->vlan_devices[vid]);
+		if (in_dev) {
+			read_lock(&in_dev->lock);
+			qeth_add_mc(card,in_dev);
+			read_unlock(&in_dev->lock);
+			in_dev_put(in_dev);
+		}
+	}
+#endif
+}
+
+/*
+ * Register the IPv4 multicast addresses of the card's own device and of
+ * its VLAN devices.  Both are processed while the read lock of the
+ * card device's in_device is held.
+ */
+static void
+qeth_add_multicast_ipv4(struct qeth_card *card)
+{
+	struct in_device *in4_dev;
+
+	QETH_DBF_TEXT(trace,4,"chkmcv4");
+	in4_dev = in_dev_get(card->dev);
+	if (!in4_dev)
+		return;
+
+	read_lock(&in4_dev->lock);
+	qeth_add_mc(card, in4_dev);
+	qeth_add_vlan_mc(card);
+	read_unlock(&in4_dev->lock);
+
+	in_dev_put(in4_dev);
+}
+
+#ifdef CONFIG_QETH_IPV6
+/*
+ * Register every IPv6 multicast address on in6_dev's mc_list with the
+ * card.  Addresses for which no entry can be allocated are skipped; an
+ * entry is freed again when qeth_add_ip() returns 0.
+ */
+static inline void
+qeth_add_mc6(struct qeth_card *card, struct inet6_dev *in6_dev)
+{
+	struct ifmcaddr6 *mc = in6_dev->mc_list;
+	struct qeth_ipaddr *entry;
+	char mac[MAX_ADDR_LEN];
+
+	QETH_DBF_TEXT(trace,4,"addmc6");
+	while (mc != NULL) {
+		ndisc_mc_map(&mc->mca_addr, mac, in6_dev->dev, 0);
+		entry = qeth_get_addr_buffer(QETH_PROT_IPV6);
+		if (entry) {
+			entry->is_multicast = 1;
+			memcpy(entry->mac,mac,OSA_ADDR_LEN);
+			memcpy(&entry->u.a6.addr,&mc->mca_addr.s6_addr,
+			       sizeof(struct in6_addr));
+			if (!qeth_add_ip(card,entry))
+				kfree(entry);
+		}
+		mc = mc->next;
+	}
+}
+
+/*
+ * Register the IPv6 multicast addresses of all VLAN devices of the
+ * card's VLAN group that are up.  Does nothing without
+ * CONFIG_QETH_VLAN, without IPA_FULL_VLAN support or without a
+ * registered VLAN group.
+ */
+static inline void
+qeth_add_vlan_mc6(struct qeth_card *card)
+{
+#ifdef CONFIG_QETH_VLAN
+	struct inet6_dev *in_dev;
+	struct vlan_group *vg;
+	int vid;
+
+	QETH_DBF_TEXT(trace,4,"admc6vl");
+	if (!qeth_is_supported(card,IPA_FULL_VLAN))
+		return;
+	if (card->vlangrp == NULL)
+		return;
+
+	vg = card->vlangrp;
+	for (vid = 0; vid < VLAN_GROUP_ARRAY_LEN; vid++) {
+		if (vg->vlan_devices[vid] == NULL)
+			continue;
+		if (!(vg->vlan_devices[vid]->flags & IFF_UP))
+			continue;
+		in_dev = in6_dev_get(vg->vlan_devices[vid]);
+		if (in_dev) {
+			read_lock(&in_dev->lock);
+			qeth_add_mc6(card,in_dev);
+			read_unlock(&in_dev->lock);
+			in6_dev_put(in_dev);
+		}
+	}
+#endif /* CONFIG_QETH_VLAN */
+}
+
+/*
+ * Register the IPv6 multicast addresses of the card's own device and of
+ * its VLAN devices, provided the card supports IPA_IPV6.  Both are
+ * processed while the read lock of the card device's inet6_dev is held.
+ */
+static void
+qeth_add_multicast_ipv6(struct qeth_card *card)
+{
+	struct inet6_dev *in6_dev;
+
+	QETH_DBF_TEXT(trace,4,"chkmcv6");
+	if (!qeth_is_supported(card, IPA_IPV6))
+		return;
+
+	in6_dev = in6_dev_get(card->dev);
+	if (!in6_dev)
+		return;
+
+	read_lock(&in6_dev->lock);
+	qeth_add_mc6(card, in6_dev);
+	qeth_add_vlan_mc6(card);
+	read_unlock(&in6_dev->lock);
+
+	in6_dev_put(in6_dev);
+}
+#endif /* CONFIG_QETH_IPV6 */
+
+/**
+ * Request an update of the multicast addresses set on the card.
+ *
+ * Nothing is sent to the card here: the QETH_SET_MC_THREAD start bit
+ * is set and the card's kernel_thread_starter work is scheduled.
+ */
+static void
+qeth_set_multicast_list(struct net_device *dev)
+{
+	struct qeth_card *card = (struct qeth_card *) dev->priv;
+
+	QETH_DBF_TEXT(trace,3,"setmulti");
+
+	qeth_set_thread_start_bit(card, QETH_SET_MC_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+}
+
+/*
+ * Clear the IPA command structure and initialise its header for the
+ * given command and protocol version, using the card's current IPA
+ * sequence number, link type and port number.
+ */
+static void
+qeth_fill_ipacmd_header(struct qeth_card *card, struct qeth_ipa_cmd *cmd,
+			__u8 command, enum qeth_prot_versions prot)
+{
+	memset(cmd, 0, sizeof (struct qeth_ipa_cmd));
+
+	/* what is requested */
+	cmd->hdr.command = command;
+	cmd->hdr.prot_version = prot;
+	cmd->hdr.initiator = IPA_CMD_INITIATOR_HOST;
+	cmd->hdr.seqno = card->seqno.ipa;
+
+	/* which adapter and port it is addressed to */
+	cmd->hdr.adapter_type = qeth_get_ipa_adp_type(card->info.link_type);
+	cmd->hdr.rel_adapter_no = (__u8) card->info.portno;
+
+	/* fixed values */
+	cmd->hdr.prim_version_no = 1;
+	cmd->hdr.param_count = 1;
+	cmd->hdr.ipa_supported = 0;
+	cmd->hdr.ipa_enabled = 0;
+}
+
+/*
+ * Wait for a free buffer on the card's write channel and initialise the
+ * IPA command header located IPA_PDU_HEADER_SIZE bytes into its data.
+ *
+ * Returns the command buffer.
+ */
+static struct qeth_cmd_buffer *
+qeth_get_ipacmd_buffer(struct qeth_card *card, enum qeth_ipa_cmds ipacmd,
+		       enum qeth_prot_versions prot)
+{
+	struct qeth_cmd_buffer *iob = qeth_wait_for_buffer(&card->write);
+
+	qeth_fill_ipacmd_header(card,
+		(struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE),
+		ipacmd, prot);
+	return iob;
+}
+
+/*
+ * Send the multicast set/delete command ipacmd for addr to the card.
+ * The command carries the multicast MAC and the IPv4 or IPv6 address,
+ * depending on addr->proto.
+ *
+ * Returns the result of qeth_send_ipa_cmd().
+ */
+static int
+qeth_send_setdelmc(struct qeth_card *card, struct qeth_ipaddr *addr, int ipacmd)
+{
+	struct qeth_cmd_buffer *iob;
+	struct qeth_ipa_cmd *cmd;
+
+	QETH_DBF_TEXT(trace,4,"setdelmc");
+
+	iob = qeth_get_ipacmd_buffer(card, ipacmd, addr->proto);
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	memcpy(&cmd->data.setdelipm.mac,addr->mac, OSA_ADDR_LEN);
+	if (addr->proto != QETH_PROT_IPV6)
+		memcpy(&cmd->data.setdelipm.ip4, &addr->u.a4.addr,4);
+	else
+		memcpy(cmd->data.setdelipm.ip6, &addr->u.a6.addr,
+		       sizeof(struct in6_addr));
+
+	return qeth_send_ipa_cmd(card, iob, NULL, NULL);
+}
+/* Expand prefix length @len into a 16-byte mask of leading one bits. */
+static inline void
+qeth_fill_netmask(u8 *netmask, unsigned int len)
+{
+	int byte, bits;
+
+	for (byte = 0; byte < 16; byte++) {
+		bits = len - (byte * 8);	/* prefix bits left for this byte */
+		if (bits >= 8)
+			netmask[byte] = 0xff;
+		else
+			netmask[byte] = (bits > 0) ? (u8)(0xFF00 >> bits) : 0;
+	}
+}
+/* Build and send IPA command @ipacmd with @addr's IP, netmask and @flags. */
+static int
+qeth_send_setdelip(struct qeth_card *card, struct qeth_ipaddr *addr,
+		   int ipacmd, unsigned int flags)
+{
+	int rc;
+	struct qeth_cmd_buffer *iob;
+	struct qeth_ipa_cmd *cmd;
+	__u8 netmask[16];	/* IPv6 only: mask built from the prefix length */
+
+	QETH_DBF_TEXT(trace,4,"setdelip");
+	QETH_DBF_TEXT_(trace,4,"flags%02X", flags);
+
+	iob = qeth_get_ipacmd_buffer(card, ipacmd, addr->proto);
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	if (addr->proto == QETH_PROT_IPV6) {
+		memcpy(cmd->data.setdelip6.ip_addr, &addr->u.a6.addr,
+		       sizeof(struct in6_addr));
+		qeth_fill_netmask(netmask,addr->u.a6.pfxlen);
+		memcpy(cmd->data.setdelip6.mask, netmask,
+		       sizeof(struct in6_addr));
+		cmd->data.setdelip6.flags = flags;
+	} else {	/* every other protocol value uses the IPv4 layout */
+		memcpy(cmd->data.setdelip4.ip_addr, &addr->u.a4.addr, 4);
+		memcpy(cmd->data.setdelip4.mask, &addr->u.a4.mask, 4);
+		cmd->data.setdelip4.flags = flags;
+	}
+	/* no reply callback: the caller only sees qeth_send_ipa_cmd()'s result */
+	rc = qeth_send_ipa_cmd(card, iob, NULL, NULL);
+
+	return rc;
+}
+/* Register @addr on the card (SETIPM or SETIP), trying up to three times. */
+static int
+qeth_register_addr_entry(struct qeth_card *card, struct qeth_ipaddr *addr)
+{
+	//char buf[50];
+	int rc;
+	int cnt = 3;	/* maximum number of attempts */
+
+	if (addr->proto == QETH_PROT_IPV4) {
+		QETH_DBF_TEXT(trace, 2,"setaddr4");
+		QETH_DBF_HEX(trace, 4, &addr->u.a4.addr, sizeof(int));
+	} else if (addr->proto == QETH_PROT_IPV6) {
+		QETH_DBF_TEXT(trace, 2, "setaddr6");
+		QETH_DBF_HEX(trace,4,&addr->u.a6.addr,4);	/* bytes 0-3 */
+		QETH_DBF_HEX(trace,4,((char *)&addr->u.a6.addr)+4,4); /* 4-7 */
+	} else {	/* unknown protocol: dump the whole entry */
+		QETH_DBF_TEXT(trace, 2, "setaddr?");
+		QETH_DBF_HEX(trace, 4, addr, sizeof(struct qeth_ipaddr));
+	}
+	do {
+		if (addr->is_multicast)
+			rc =  qeth_send_setdelmc(card, addr, IPA_CMD_SETIPM);
+		else
+			rc = qeth_send_setdelip(card, addr, IPA_CMD_SETIP,
+					addr->set_flags);
+		if (rc)	/* traced once per failed attempt */
+			QETH_DBF_TEXT(trace, 2, "failed");
+	} while ((--cnt > 0) && rc);
+	if (rc){	/* all attempts failed; rc is from the last one */
+		QETH_DBF_TEXT(trace, 2, "FAILED");
+		/* TODO: re-activate this warning as soon as we have a
+		 * clean micro code
+		qeth_ipaddr_to_string(addr->proto, (u8 *)&addr->u, buf);
+		PRINT_WARN("Could not register IP address %s (rc=%x)\n",
+			   buf, rc);
+		*/
+	}
+	return rc;
+}
+/* Remove @addr from the card with DELIPM or DELIP; a single attempt is made. */
+static int
+qeth_deregister_addr_entry(struct qeth_card *card, struct qeth_ipaddr *addr)
+{
+	//char buf[50];
+	int rc;
+
+	if (addr->proto == QETH_PROT_IPV4) {
+		QETH_DBF_TEXT(trace, 2,"deladdr4");
+		QETH_DBF_HEX(trace, 2, &addr->u.a4.addr, sizeof(int));
+	} else if (addr->proto == QETH_PROT_IPV6) {
+		QETH_DBF_TEXT(trace, 2, "deladdr6");
+		QETH_DBF_HEX(trace, 2, &addr->u.a6.addr,
+			     sizeof(struct in6_addr));
+	} else {	/* unknown protocol: dump the whole entry */
+		QETH_DBF_TEXT(trace, 2, "deladdr?");
+		QETH_DBF_HEX(trace, 2, addr, sizeof(struct qeth_ipaddr));
+	}
+	if (addr->is_multicast)
+		rc = qeth_send_setdelmc(card, addr, IPA_CMD_DELIPM);
+	else
+		rc = qeth_send_setdelip(card, addr, IPA_CMD_DELIP,
+					addr->del_flags);
+	if (rc) {	/* only traced; the caller gets rc unchanged */
+		QETH_DBF_TEXT(trace, 2, "failed");
+		/* TODO: re-activate this warning as soon as we have a
+		 * clean micro code
+		qeth_ipaddr_to_string(addr->proto, (u8 *)&addr->u, buf);
+		PRINT_WARN("Could not deregister IP address %s (rc=%x)\n",
+			   buf, rc);
+		*/
+	}
+	return rc;
+}
+/* net_device init hook: install qeth callbacks and card-derived settings. */
+static int
+qeth_netdev_init(struct net_device *dev)
+{
+	struct qeth_card *card;
+
+	card = (struct qeth_card *) dev->priv;
+
+	QETH_DBF_TEXT(trace,3,"initdev");
+	/* driver entry points */
+	dev->tx_timeout = &qeth_tx_timeout;
+	dev->watchdog_timeo = QETH_TX_TIMEOUT;
+	dev->open = qeth_open;
+	dev->stop = qeth_stop;
+	dev->hard_start_xmit = qeth_hard_start_xmit;
+	dev->do_ioctl = qeth_do_ioctl;
+	dev->get_stats = qeth_get_stats;
+	dev->change_mtu = qeth_change_mtu;
+	dev->neigh_setup = qeth_neigh_setup;
+	dev->set_multicast_list = qeth_set_multicast_list;
+#ifdef CONFIG_QETH_VLAN
+	dev->vlan_rx_register = qeth_vlan_rx_register;
+	dev->vlan_rx_kill_vid = qeth_vlan_rx_kill_vid;
+#endif /* CONFIG_QETH_VLAN */
+	if (qeth_get_netdev_flags(card->info.type) & IFF_NOARP) {
+		dev->rebuild_header = NULL;	/* NOARP: no header hooks */
+		dev->hard_header = NULL;
+		dev->header_cache_update = NULL;
+		dev->hard_header_cache = NULL;
+	}
+#ifdef CONFIG_QETH_IPV6
+	/*IPv6 address autoconfiguration stuff*/
+	card->dev->dev_id = card->info.unique_id & 0xffff;
+	if (!(card->info.unique_id & UNIQUE_ID_NOT_BY_CARD))
+		card->dev->generate_eui64 = qeth_ipv6_generate_eui64;
+	/* NOTE(review): card->dev is used here, dev elsewhere - same device? */
+
+#endif
+	dev->hard_header_parse = NULL;
+	dev->set_mac_address = NULL;
+	dev->flags |= qeth_get_netdev_flags(card->info.type);
+	if ((card->options.fake_broadcast) ||
+	    (card->info.broadcast_capable))
+		dev->flags |= IFF_BROADCAST;
+	/* header length, address length and MTU come from card info/options */
+	dev->hard_header_len =
+		qeth_get_hlen(card->info.link_type) + card->options.add_hhlen;
+	dev->addr_len = OSA_ADDR_LEN;
+	dev->mtu = card->info.initial_mtu;
+
+	SET_MODULE_OWNER(dev);
+	return 0;	/* this hook has no failure path */
+}
+
+/**
+ * hardsetup card: clear QDIO, IDX-activate the read and write channels,
+ * initialize MPC and, on the first call, allocate the net_device.
+ */
+static int
+qeth_hardsetup_card(struct qeth_card *card)
+{
+	int retries = 3;	/* one retry budget shared by all steps below */
+	int rc;
+
+	QETH_DBF_TEXT(setup, 2, "hrdsetup");
+
+retry:
+	if (retries < 3){	/* not the first pass: cycle all three devices */
+		PRINT_WARN("Retrying to do IDX activates.\n");
+		ccw_device_set_offline(CARD_DDEV(card));
+		ccw_device_set_offline(CARD_WDEV(card));
+		ccw_device_set_offline(CARD_RDEV(card));
+		ccw_device_set_online(CARD_RDEV(card));
+		ccw_device_set_online(CARD_WDEV(card));
+		ccw_device_set_online(CARD_DDEV(card));
+	}
+	rc = qeth_qdio_clear_card(card,card->info.type==QETH_CARD_TYPE_OSAE);
+	if (rc == -ERESTARTSYS) {	/* returned as is, never retried */
+		QETH_DBF_TEXT(setup, 2, "break1");
+		return rc;
+	} else if (rc) {
+		QETH_DBF_TEXT_(setup, 2, "1err%d", rc);
+		if (--retries < 0)
+			goto out;
+		else
+			goto retry;
+	}
+	if ((rc = qeth_get_unitaddr(card))){	/* no retry, no PRINT_ERR */
+		QETH_DBF_TEXT_(setup, 2, "2err%d", rc);
+		return rc;
+	}
+	qeth_init_tokens(card);
+	rc = qeth_idx_activate_channel(&card->read, qeth_idx_read_cb);
+	if (rc == -ERESTARTSYS) {
+		QETH_DBF_TEXT(setup, 2, "break2");
+		return rc;
+	} else if (rc) {
+		QETH_DBF_TEXT_(setup, 2, "3err%d", rc);
+		if (--retries < 0)
+			goto out;
+		else
+			goto retry;
+	}
+	rc = qeth_idx_activate_channel(&card->write, qeth_idx_write_cb);
+	if (rc == -ERESTARTSYS) {
+		QETH_DBF_TEXT(setup, 2, "break3");
+		return rc;
+	} else if (rc) {
+		QETH_DBF_TEXT_(setup, 2, "4err%d", rc);
+		if (--retries < 0)
+			goto out;
+		else
+			goto retry;
+	}
+	if ((rc = qeth_mpc_initialize(card))){	/* failure here is final */
+		QETH_DBF_TEXT_(setup, 2, "5err%d", rc);
+		goto out;
+	}
+	/* at first set_online allocate netdev */
+	if (!card->dev){
+		card->dev = qeth_get_netdevice(card->info.type,
+					       card->info.link_type);
+		if (!card->dev){
+			qeth_qdio_clear_card(card, card->info.type ==
+					     QETH_CARD_TYPE_OSAE);
+			rc = -ENODEV;
+			QETH_DBF_TEXT_(setup, 2, "6err%d", rc);
+			goto out;
+		}
+		qeth_set_device_name(card);
+		card->dev->priv = card;
+		card->dev->type = qeth_get_arphdr_type(card->info.type,
+						       card->info.link_type);
+		card->dev->init = qeth_netdev_init;
+	}
+	return 0;
+out:	/* reached when retries are used up or a step failed for good */
+	PRINT_ERR("Initialization in hardsetup failed! rc=%d\n", rc);
+	return rc;
+}
+
+/* Build a SETADAPTERPARMS (IPv4) command buffer for sub-command @command. */
+static struct qeth_cmd_buffer *
+qeth_get_adapter_cmd(struct qeth_card *card, __u32 command)
+{
+	struct qeth_ipa_cmd *cmd;
+	struct qeth_cmd_buffer *iob = qeth_get_ipacmd_buffer(card,
+				IPA_CMD_SETADAPTERPARMS, QETH_PROT_IPV4);
+
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	cmd->data.setadapterparms.command_code = command;
+	cmd->data.setadapterparms.cmdlength =
+				sizeof(struct qeth_ipacmd_setadpparms);
+	/* every command built here is marked as frame 1 of 1 */
+	cmd->data.setadapterparms.frames_used_total = 1;
+	cmd->data.setadapterparms.frame_seq_no = 1;
+	return iob;
+}
+/* SETASSPARMS reply callback: merge return codes, record enabled assists. */
+static int
+qeth_default_setassparms_cb(struct qeth_card *card, struct qeth_reply *reply,
+			    unsigned long data)
+{
+	struct qeth_ipa_cmd *cmd;
+
+	QETH_DBF_TEXT(trace,4,"defadpcb");
+	/* NOTE(review): the tag above is also used by the setadapterparms cb */
+	cmd = (struct qeth_ipa_cmd *) data;
+	if (cmd->hdr.return_code == 0){	/* header itself reports success */
+		cmd->hdr.return_code = cmd->data.setassparms.hdr.return_code;
+		if (cmd->hdr.prot_version == QETH_PROT_IPV4)
+			card->options.ipa4.enabled_funcs = cmd->hdr.ipa_enabled;
+#ifdef CONFIG_QETH_IPV6
+		if (cmd->hdr.prot_version == QETH_PROT_IPV6)
+			card->options.ipa6.enabled_funcs = cmd->hdr.ipa_enabled;
+#endif /* CONFIG_QETH_IPV6 */
+	}
+	if (cmd->data.setassparms.hdr.assist_no == IPA_INBOUND_CHECKSUM &&
+	    cmd->data.setassparms.hdr.command_code == IPA_CMD_ASS_START) {
+		card->info.csum_mask = cmd->data.setassparms.data.flags_32bit;
+		QETH_DBF_TEXT_(trace, 3, "csum:%d", card->info.csum_mask);
+	}	/* stored regardless of the return codes checked above */
+	return 0;
+}
+
+/* Reply callback: fold the setadapterparms return code into hdr.return_code. */
+static int
+qeth_default_setadapterparms_cb(struct qeth_card *card,
+				struct qeth_reply *reply,
+				unsigned long data)
+{
+	struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data;
+
+	QETH_DBF_TEXT(trace,4,"defadpcb");
+	/* a non-zero header code is kept as is */
+	if (!cmd->hdr.return_code)
+		cmd->hdr.return_code = cmd->data.setadapterparms.return_code;
+	return 0;
+}
+
+/* QUERY_COMMANDS_SUPPORTED reply callback: record link type and commands. */
+static int
+qeth_query_setadapterparms_cb(struct qeth_card *card, struct qeth_reply *reply,
+			      unsigned long data)
+{
+	struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data;
+
+	QETH_DBF_TEXT(trace,3,"quyadpcb");
+	card->options.adp.supported_funcs =
+		cmd->data.setadapterparms.data.query_cmds_supp.supported_cmds;
+	/* link_type is only replaced when one of the low 7 lan_type bits is set */
+	if (cmd->data.setadapterparms.data.query_cmds_supp.lan_type & 0x7f)
+		card->info.link_type =
+		      cmd->data.setadapterparms.data.query_cmds_supp.lan_type;
+	return qeth_default_setadapterparms_cb(card, reply, data);
+}
+
+/* Ask the card which SETADAPTERPARMS commands it supports. */
+static int
+qeth_query_setadapterparms(struct qeth_card *card)
+{
+	struct qeth_cmd_buffer *iob;
+
+	QETH_DBF_TEXT(trace,3,"queryadp");
+	iob = qeth_get_adapter_cmd(card,IPA_SETADP_QUERY_COMMANDS_SUPPORTED);
+	return qeth_send_ipa_cmd(card, iob, qeth_query_setadapterparms_cb,
+				 NULL);
+}
+
+/* ALTER_MAC_ADDRESS reply callback: adopt the card's MAC only on success. */
+static int
+qeth_setadpparms_change_macaddr_cb(struct qeth_card *card,
+				   struct qeth_reply *reply, unsigned long data)
+{
+	struct qeth_ipa_cmd *cmd = (struct qeth_ipa_cmd *) data;
+
+	QETH_DBF_TEXT(trace,4,"chgmaccb");
+	qeth_default_setadapterparms_cb(card, reply, data);
+	if (cmd->hdr.return_code == 0)	/* failed reply: keep dev_addr */
+		memcpy(card->dev->dev_addr,
+		       &cmd->data.setadapterparms.data.change_addr.addr,
+		       OSA_ADDR_LEN);
+	return 0;
+}
+
+/* Send ALTER_MAC_ADDRESS / CHANGE_ADDR_READ_MAC, seeded with dev->dev_addr. */
+static int
+qeth_setadpparms_change_macaddr(struct qeth_card *card)
+{
+	struct qeth_cmd_buffer *iob;
+	struct qeth_ipa_cmd *cmd;
+
+	QETH_DBF_TEXT(trace,4,"chgmac");
+
+	iob = qeth_get_adapter_cmd(card,IPA_SETADP_ALTER_MAC_ADDRESS);
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	cmd->data.setadapterparms.data.change_addr.addr_size = OSA_ADDR_LEN;
+	cmd->data.setadapterparms.data.change_addr.cmd = CHANGE_ADDR_READ_MAC;
+	memcpy(&cmd->data.setadapterparms.data.change_addr.addr,
+	       card->dev->dev_addr, OSA_ADDR_LEN);
+	/* the reply is processed in qeth_setadpparms_change_macaddr_cb() */
+	return qeth_send_ipa_cmd(card, iob, qeth_setadpparms_change_macaddr_cb,
+				 NULL);
+}
+
+/* Send adapter-parameter @command with @mode as its data. */
+static int
+qeth_send_setadp_mode(struct qeth_card *card, __u32 command, __u32 mode)
+{
+	struct qeth_cmd_buffer *iob;
+	struct qeth_ipa_cmd *cmd;
+
+	QETH_DBF_TEXT(trace,4,"adpmode");
+
+	iob = qeth_get_adapter_cmd(card, command);
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	cmd->data.setadapterparms.data.mode = mode;
+	/* default callback folds the command rc into the header rc */
+	return qeth_send_ipa_cmd(card, iob, qeth_default_setadapterparms_cb,
+				 NULL);
+}
+/* Apply the broadcast_mode and macaddr_mode options via SETADAPTERPARMS. */
+static inline int
+qeth_setadapter_hstr(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(trace,4,"adphstr");
+	/* both modes are gated on SET_BROADCAST_MODE support only */
+	if (qeth_adp_supported(card,IPA_SETADP_SET_BROADCAST_MODE)) {
+		rc = qeth_send_setadp_mode(card, IPA_SETADP_SET_BROADCAST_MODE,
+					   card->options.broadcast_mode);
+		if (rc)	/* warn only; rc is overwritten by the next command */
+			PRINT_WARN("couldn't set broadcast mode on "
+				   "device %s: x%x\n",
+				   CARD_BUS_ID(card), rc);
+		rc = qeth_send_setadp_mode(card, IPA_SETADP_ALTER_MAC_ADDRESS,
+					   card->options.macaddr_mode);
+		if (rc)
+			PRINT_WARN("couldn't set macaddr mode on "
+				   "device %s: x%x\n", CARD_BUS_ID(card), rc);
+		return rc;	/* result of the macaddr mode command only */
+	}
+	if (card->options.broadcast_mode == QETH_TR_BROADCAST_LOCAL)
+		PRINT_WARN("set adapter parameters not available "
+			   "to set broadcast mode, using ALLRINGS "
+			   "on device %s:\n", CARD_BUS_ID(card));
+	if (card->options.macaddr_mode == QETH_TR_MACADDR_CANONICAL)
+		PRINT_WARN("set adapter parameters not available "
+			   "to set macaddr mode, using NONCANONICAL "
+			   "on device %s:\n", CARD_BUS_ID(card));
+	return 0;	/* unsupported is not treated as an error */
+}
+/* Query and apply adapter parameters; without card support this returns 0. */
+static int
+qeth_setadapter_parms(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(setup, 2, "setadprm");
+
+	if (!qeth_is_supported(card, IPA_SETADAPTERPARMS)){
+		PRINT_WARN("set adapter parameters not supported "
+			   "on device %s.\n",
+			   CARD_BUS_ID(card));
+		QETH_DBF_TEXT(setup, 2, " notsupp");
+		return 0;
+	}
+	rc = qeth_query_setadapterparms(card);
+	if (rc) {	/* without the query result nothing else is attempted */
+		PRINT_WARN("couldn't set adapter parameters on device %s: "
+			   "x%x\n", CARD_BUS_ID(card), rc);
+		return rc;
+	}
+	if (qeth_adp_supported(card,IPA_SETADP_ALTER_MAC_ADDRESS)) {
+		rc = qeth_setadpparms_change_macaddr(card);
+		if (rc)	/* warned about, but setup continues */
+			PRINT_WARN("couldn't get MAC address on "
+				   "device %s: x%x\n",
+				   CARD_BUS_ID(card), rc);
+	}
+	/* for the two link types below rc is replaced by the hstr result */
+	if ((card->info.link_type == QETH_LINK_TYPE_HSTR) ||
+	    (card->info.link_type == QETH_LINK_TYPE_LANE_TR))
+		rc = qeth_setadapter_hstr(card);
+
+	return rc;
+}
+
+
+/* Send IPA command @ipacmd for @prot with no data beyond the IPA header. */
+static int
+qeth_send_startstoplan(struct qeth_card *card, enum qeth_ipa_cmds ipacmd,
+		       enum qeth_prot_versions prot)
+{
+	struct qeth_cmd_buffer *iob;
+
+	/* header-only command, sent without a reply callback */
+	iob = qeth_get_ipacmd_buffer(card,ipacmd,prot);
+
+	return qeth_send_ipa_cmd(card, iob, NULL, NULL);
+}
+
+/*
+ * Send STARTLAN for protocol @prot; the IPA result is returned unchanged.
+ */
+static int
+qeth_send_startlan(struct qeth_card *card, enum qeth_prot_versions prot)
+{
+	QETH_DBF_TEXT_(setup, 2, "strtlan%i", prot);
+
+	return qeth_send_startstoplan(card, IPA_CMD_STARTLAN, prot);
+}
+
+/*
+ * Send STOPLAN; it is always issued as an IPv4 command.
+ */
+static int
+qeth_send_stoplan(struct qeth_card *card)
+{
+	/*
+	 * TODO: according to the IPA format document page 14,
+	 * TCP/IP (we!) never issue a STOPLAN
+	 * is this right ?!?
+	 */
+	QETH_DBF_TEXT(trace, 2, "stoplan");
+
+	return qeth_send_startstoplan(card, IPA_CMD_STOPLAN, QETH_PROT_IPV4);
+}
+/* QIPASSIST reply callback: store the supported and enabled assist masks. */
+static int
+qeth_query_ipassists_cb(struct qeth_card *card, struct qeth_reply *reply,
+			unsigned long data)
+{
+	struct qeth_ipa_cmd *cmd;
+
+	QETH_DBF_TEXT(setup, 2, "qipasscb");
+
+	cmd = (struct qeth_ipa_cmd *) data;
+	if (cmd->hdr.prot_version == QETH_PROT_IPV4) {
+		card->options.ipa4.supported_funcs = cmd->hdr.ipa_supported;
+		card->options.ipa4.enabled_funcs = cmd->hdr.ipa_enabled;
+	} else {	/* any non-IPv4 reply; ignored without CONFIG_QETH_IPV6 */
+#ifdef CONFIG_QETH_IPV6
+		card->options.ipa6.supported_funcs = cmd->hdr.ipa_supported;
+		card->options.ipa6.enabled_funcs = cmd->hdr.ipa_enabled;
+#endif
+	}
+	return 0;	/* hdr.return_code is not inspected here */
+}
+
+/* Query the card's IP assists for protocol @prot. */
+static int
+qeth_query_ipassists(struct qeth_card *card, enum qeth_prot_versions prot)
+{
+	struct qeth_cmd_buffer *iob;
+
+	QETH_DBF_TEXT_(setup, 2, "qipassi%i", prot);
+
+	iob = qeth_get_ipacmd_buffer(card,IPA_CMD_QIPASSIST,prot);
+	/* supported/enabled masks are recorded by qeth_query_ipassists_cb() */
+	return qeth_send_ipa_cmd(card, iob, qeth_query_ipassists_cb, NULL);
+}
+
+/* Build a SETASSPARMS command for assist @ipa_func with @len data bytes. */
+static struct qeth_cmd_buffer *
+qeth_get_setassparms_cmd(struct qeth_card *card, enum qeth_ipa_funcs ipa_func,
+			 __u16 cmd_code, __u16 len,
+			 enum qeth_prot_versions prot)
+{
+	struct qeth_ipa_cmd *cmd;
+	struct qeth_cmd_buffer *iob;
+
+	QETH_DBF_TEXT(trace,4,"getasscm");
+	iob = qeth_get_ipacmd_buffer(card,IPA_CMD_SETASSPARMS,prot);
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+
+	cmd->data.setassparms.hdr.assist_no = ipa_func;
+	cmd->data.setassparms.hdr.command_code = cmd_code;
+	cmd->data.setassparms.hdr.length = 8 + len;	/* fixed 8 plus data */
+	cmd->data.setassparms.hdr.return_code = 0;
+	cmd->data.setassparms.hdr.seq_no = 0;
+	return iob;
+}
+
+/*
+ * Store @data in a prepared SETASSPARMS command and send it: up to
+ * sizeof(__u32) bytes go in by value, longer data is copied from (void *)data.
+ */
+static int
+qeth_send_setassparms(struct qeth_card *card, struct qeth_cmd_buffer *iob,
+		      __u16 len, long data,
+		      int (*reply_cb)
+		      (struct qeth_card *,struct qeth_reply *,unsigned long),
+		      void *reply_param)
+{
+	struct qeth_ipa_cmd *cmd;
+
+	QETH_DBF_TEXT(trace,4,"sendassp");
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	if (len > sizeof(__u32))
+		memcpy(&cmd->data.setassparms.data, (void *) data, len);
+	else
+		cmd->data.setassparms.data.flags_32bit = (__u32) data;
+	return qeth_send_ipa_cmd(card, iob, reply_cb, reply_param);
+}
+
+#ifdef CONFIG_QETH_IPV6
+/* Send a SETASSPARMS command without parameter data for @ipa_func (IPv6). */
+static int
+qeth_send_simple_setassparms_ipv6(struct qeth_card *card,
+				  enum qeth_ipa_funcs ipa_func, __u16 cmd_code)
+{
+	struct qeth_cmd_buffer *iob;
+
+	QETH_DBF_TEXT(trace,4,"simassp6");
+
+	/* length 0 and data 0: no parameter data */
+	iob = qeth_get_setassparms_cmd(card, ipa_func, cmd_code,
+				       0, QETH_PROT_IPV6);
+	return qeth_send_setassparms(card, iob, 0, 0,
+				     qeth_default_setassparms_cb, NULL);
+}
+#endif
+
+/*
+ * Send an IPv4 SETASSPARMS command carrying at most one 32-bit data word.
+ */
+static int
+qeth_send_simple_setassparms(struct qeth_card *card,
+			     enum qeth_ipa_funcs ipa_func,
+			     __u16 cmd_code, long data)
+{
+	/* a zero @data is sent as "no data" (length 0) */
+	int length = data ? sizeof(__u32) : 0;
+	struct qeth_cmd_buffer *iob;
+
+	QETH_DBF_TEXT(trace,4,"simassp4");
+	iob = qeth_get_setassparms_cmd(card, ipa_func, cmd_code,
+				       length, QETH_PROT_IPV4);
+	return qeth_send_setassparms(card, iob, length, data,
+				     qeth_default_setassparms_cb, NULL);
+}
+/* Start the ARP processing assist; a card without it is not an error (0). */
+static inline int
+qeth_start_ipa_arp_processing(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"ipaarp");
+
+	if (!qeth_is_supported(card,IPA_ARP_PROCESSING)) {
+		PRINT_WARN("ARP processing not supported "
+			   "on %s!\n", card->info.if_name);
+		return 0;	/* unlike most other assists, not -EOPNOTSUPP */
+	}
+	rc = qeth_send_simple_setassparms(card,IPA_ARP_PROCESSING,
+					  IPA_CMD_ASS_START, 0);
+	if (rc) {
+		PRINT_WARN("Could not start ARP processing "
+			   "assist on %s: 0x%x\n",
+			   card->info.if_name, rc);
+	}
+	return rc;
+}
+/* Start the IP fragmentation assist; -EOPNOTSUPP if the card lacks it. */
+static int
+qeth_start_ipa_ip_fragmentation(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"ipaipfrg");
+
+	if (!qeth_is_supported(card, IPA_IP_FRAGMENTATION)) {
+		PRINT_INFO("IP fragmentation not supported on %s\n",
+			   card->info.if_name);
+		return  -EOPNOTSUPP;
+	}
+	/* started without parameter data */
+	rc = qeth_send_simple_setassparms(card, IPA_IP_FRAGMENTATION,
+					  IPA_CMD_ASS_START, 0);
+	if (rc) {
+		PRINT_WARN("Could not start IP fragmentation "
+			   "assist on %s: 0x%x\n",
+			   card->info.if_name, rc);
+	} else
+		PRINT_INFO("IP fragmentation enabled \n");
+	return rc;
+}
+/* Start the inbound source MAC assist; needs options.fake_ll and support. */
+static int
+qeth_start_ipa_source_mac(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"stsrcmac");
+
+	if (!card->options.fake_ll)	/* option off: silently skipped */
+		return -EOPNOTSUPP;
+
+	if (!qeth_is_supported(card, IPA_SOURCE_MAC)) {
+		PRINT_INFO("Inbound source address not "
+			   "supported on %s\n", card->info.if_name);
+		return -EOPNOTSUPP;
+	}
+	/* started without parameter data */
+	rc = qeth_send_simple_setassparms(card, IPA_SOURCE_MAC,
+					  IPA_CMD_ASS_START, 0);
+	if (rc)
+		PRINT_WARN("Could not start inbound source "
+			   "assist on %s: 0x%x\n",
+			   card->info.if_name, rc);
+	return rc;
+}
+/* Start the VLAN assist (CONFIG_QETH_VLAN only) and set HW VLAN features. */
+static int
+qeth_start_ipa_vlan(struct qeth_card *card)
+{
+	int rc = 0;	/* stays 0 without CONFIG_QETH_VLAN */
+
+	QETH_DBF_TEXT(trace,3,"strtvlan");
+
+#ifdef CONFIG_QETH_VLAN
+	if (!qeth_is_supported(card, IPA_FULL_VLAN)) {
+		PRINT_WARN("VLAN not supported on %s\n", card->info.if_name);
+		return -EOPNOTSUPP;
+	}
+	/* NOTE(review): support is tested for FULL_VLAN, VLAN_PRIO is started */
+	rc = qeth_send_simple_setassparms(card, IPA_VLAN_PRIO,
+					  IPA_CMD_ASS_START,0);
+	if (rc) {
+		PRINT_WARN("Could not start vlan "
+			   "assist on %s: 0x%x\n",
+			   card->info.if_name, rc);
+	} else {
+		PRINT_INFO("VLAN enabled \n");
+		card->dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX;
+	}
+#endif /* QETH_VLAN */
+	return rc;
+}
+/* Start the multicast assist and set IFF_MULTICAST on the device on success. */
+static int
+qeth_start_ipa_multicast(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"stmcast");
+
+	if (!qeth_is_supported(card, IPA_MULTICASTING)) {
+		PRINT_WARN("Multicast not supported on %s\n",
+			   card->info.if_name);
+		return -EOPNOTSUPP;
+	}
+	/* started without parameter data */
+	rc = qeth_send_simple_setassparms(card, IPA_MULTICASTING,
+					  IPA_CMD_ASS_START,0);
+	if (rc) {
+		PRINT_WARN("Could not start multicast "
+			   "assist on %s: rc=%i\n",
+			   card->info.if_name, rc);
+	} else {
+		PRINT_INFO("Multicast enabled\n");
+		card->dev->flags |= IFF_MULTICAST;
+	}
+	return rc;
+}
+
+#ifdef CONFIG_QETH_IPV6
+static int
+qeth_softsetup_ipv6(struct qeth_card *card)
+{
+	int rc;
+	/* IPv6 STARTLAN, assist query, then the IPv6 and passthru assists */
+	QETH_DBF_TEXT(trace,3,"softipv6");
+
+	netif_stop_queue(card->dev);
+	rc = qeth_send_startlan(card, QETH_PROT_IPV6);
+	if (rc) {	/* NOTE: the queue stays stopped on this path */
+		PRINT_ERR("IPv6 startlan failed on %s\n",
+			  card->info.if_name);
+		return rc;
+	}
+	netif_wake_queue(card->dev);
+	rc = qeth_query_ipassists(card,QETH_PROT_IPV6);
+	if (rc) {
+		PRINT_ERR("IPv6 query ipassist failed on %s\n",
+			  card->info.if_name);
+		return rc;
+	}
+	rc = qeth_send_simple_setassparms(card, IPA_IPV6,	/* as IPv4 cmd */
+					  IPA_CMD_ASS_START, 3);
+	if (rc) {
+		PRINT_WARN("IPv6 start assist (version 4) failed "
+			   "on %s: 0x%x\n",
+			   card->info.if_name, rc);
+		return rc;
+	}
+	rc = qeth_send_simple_setassparms_ipv6(card, IPA_IPV6,	/* as IPv6 */
+					       IPA_CMD_ASS_START);
+	if (rc) {
+		PRINT_WARN("IPV6 start assist (version 6) failed  "
+			   "on %s: 0x%x\n",
+			   card->info.if_name, rc);
+		return rc;
+	}
+	rc = qeth_send_simple_setassparms_ipv6(card, IPA_PASSTHRU,
+					       IPA_CMD_ASS_START);
+	if (rc) {
+		PRINT_WARN("Could not enable passthrough "
+			   "on %s: 0x%x\n",
+			   card->info.if_name, rc);
+		return rc;
+	}
+	PRINT_INFO("IPV6 enabled \n");
+	return 0;
+}
+
+#endif
+/* Run the IPv6 soft setup if built in and supported; else report success. */
+static int
+qeth_start_ipa_ipv6(struct qeth_card *card)
+{
+	int rc = 0;	/* stays 0 without CONFIG_QETH_IPV6 */
+#ifdef CONFIG_QETH_IPV6
+	QETH_DBF_TEXT(trace,3,"strtipv6");
+
+	if (!qeth_is_supported(card, IPA_IPV6)) {
+		PRINT_WARN("IPv6 not supported on %s\n",
+			   card->info.if_name);
+		return 0;	/* missing IPv6 support is not an error */
+	}
+	rc = qeth_softsetup_ipv6(card);
+#endif /* CONFIG_QETH_IPV6 */
+	return rc ;
+}
+/* Start and configure the filtering assist, then flag the device broadcast. */
+static int
+qeth_start_ipa_broadcast(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"stbrdcst");
+	if (!qeth_is_supported(card, IPA_FILTERING)) {
+		PRINT_WARN("Broadcast not supported on %s\n",
+			   card->info.if_name);
+		return -EOPNOTSUPP;
+	}
+	rc = qeth_send_simple_setassparms(card, IPA_FILTERING,
+					  IPA_CMD_ASS_START, 0);
+	if (rc) {
+		PRINT_WARN("Could not enable broadcasting "
+			   "on %s: 0x%x\n",
+			   card->info.if_name, rc);
+		return rc;
+	}
+	/* second step: CONFIGURE with data word 1 */
+	rc = qeth_send_simple_setassparms(card, IPA_FILTERING,
+					  IPA_CMD_ASS_CONFIGURE, 1);
+	if (rc) {
+		PRINT_WARN("Could not set up broadcast filtering on %s: 0x%x\n",
+			   card->info.if_name, rc);
+		return rc;
+	}
+	PRINT_INFO("Broadcast enabled \n");
+	card->dev->flags |= IFF_BROADCAST;
+	card->info.broadcast_capable = 1;	/* read by qeth_netdev_init() */
+	return 0;
+}
+/* Start, then enable, the inbound checksum assist using info.csum_mask. */
+static int
+qeth_send_checksum_command(struct qeth_card *card)
+{
+	int rc;
+
+	rc = qeth_send_simple_setassparms(card, IPA_INBOUND_CHECKSUM,
+					  IPA_CMD_ASS_START, 0);
+	if (rc) {
+		PRINT_WARN("Starting Inbound HW Checksumming failed on %s: "
+			   "0x%x,\ncontinuing using Inbound SW Checksumming\n",
+			   card->info.if_name, rc);
+		return rc;
+	}
+	rc = qeth_send_simple_setassparms(card, IPA_INBOUND_CHECKSUM,
+					  IPA_CMD_ASS_ENABLE,
+					  card->info.csum_mask); /* set by START cb */
+	if (rc) {
+		PRINT_WARN("Enabling Inbound HW Checksumming failed on %s: "
+			   "0x%x,\ncontinuing using Inbound SW Checksumming\n",
+			   card->info.if_name, rc);
+		return rc;
+	}
+	return 0;
+}
+/* Enable inbound HW checksumming unless options select none or SW. */
+static int
+qeth_start_ipa_checksum(struct qeth_card *card)
+{
+	int rc = 0;
+
+	QETH_DBF_TEXT(trace,3,"strtcsum");
+
+	if (card->options.checksum_type == NO_CHECKSUMMING) {
+		PRINT_WARN("Using no checksumming on %s.\n",
+			   card->info.if_name);
+		return 0;
+	}
+	if (card->options.checksum_type == SW_CHECKSUMMING) {
+		PRINT_WARN("Using SW checksumming on %s.\n",
+			   card->info.if_name);
+		return 0;
+	}
+	if (!qeth_is_supported(card, IPA_INBOUND_CHECKSUM)) {
+		PRINT_WARN("Inbound HW Checksumming not "
+			   "supported on %s,\ncontinuing "
+			   "using Inbound SW Checksumming\n",
+			   card->info.if_name);
+		card->options.checksum_type = SW_CHECKSUMMING;
+		return 0;
+	}
+	rc = qeth_send_checksum_command(card);
+	if (!rc) {	/* NOTE(review): on failure checksum_type is left as is */
+		PRINT_INFO("HW Checksumming (inbound) enabled \n");
+	}
+	return rc;
+}
+
+/*
+static inline void
+qeth_print_ipassist_status(struct qeth_card *card)
+{
+	char buf[255];
+	int offset = 0;
+
+	offset += sprintf(buf, "IPAssist options of %s: ", card->info.if_name);
+	if (qeth_is_enabled(card, IPA_ARP_PROCESSING))
+		offset += sprintf(buf+offset, "ARP ");
+	if (qeth_is_enabled(card, IPA_IP_FRAGMENTATION))
+		offset += sprintf(buf+offset, "IP_FRAG");
+	if (qeth_is_enabled(card, IPA_SOURCE_MAC))
+		offset += sprintf(buf+offset, "SRC_MAC");
+	if (qeth_is_enabled(card, IPA_FULL_VLAN))
+		offset += sprintf(buf+offset, "VLAN");
+	if (qeth_is_enabled(card, IPA_VLAN_PRIO))
+		offset += sprintf(buf+offset, "VLAN_PRIO");
+}
+*/
+/* Start every IP assist in a fixed order; individual results are ignored. */
+static int
+qeth_start_ipassists(struct qeth_card *card)
+{
+	QETH_DBF_TEXT(trace,3,"strtipas");
+	qeth_start_ipa_arp_processing(card);	/* go on even if it fails */
+	qeth_start_ipa_ip_fragmentation(card); 	/* go on */
+	qeth_start_ipa_source_mac(card);	/* go on */
+	qeth_start_ipa_vlan(card);		/* go on */
+	qeth_start_ipa_multicast(card);		/* go on */
+	qeth_start_ipa_ipv6(card);		/* go on */
+	qeth_start_ipa_broadcast(card);		/* go on */
+	qeth_start_ipa_checksum(card);		/* go on */
+	return 0;	/* always success */
+}
+
+/*
+ * Send SETRTG with routing type @type for protocol @prot and return the
+ * result of the IPA command.
+ */
+static int
+qeth_send_setrouting(struct qeth_card *card, enum qeth_routing_types type,
+		     enum qeth_prot_versions prot)
+{
+	struct qeth_cmd_buffer *iob;
+	struct qeth_ipa_cmd *cmd;
+
+	QETH_DBF_TEXT(trace,4,"setroutg");
+	iob = qeth_get_ipacmd_buffer(card, IPA_CMD_SETRTG, prot);
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	cmd->data.setrtg.type = type;
+	return qeth_send_ipa_cmd(card, iob, NULL, NULL);
+}
+/* Push options.route4.type to the card; on failure fall back to NO_ROUTER. */
+int
+qeth_setrouting_v4(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(trace,3,"setrtg4");
+
+	if (card->options.route4.type == NO_ROUTER)	/* nothing to send */
+		return 0;
+
+	rc = qeth_send_setrouting(card, card->options.route4.type,
+				  QETH_PROT_IPV4);
+	if (rc) {
+ 		card->options.route4.type = NO_ROUTER;
+		PRINT_WARN("Error (0x%04x) while setting routing type on %s. "
+			   "Type set to 'no router'.\n",
+			   rc, card->info.if_name);
+	}
+	return rc;
+}
+/* IPv6 counterpart of qeth_setrouting_v4(); returns 0 without CONFIG_QETH_IPV6. */
+int
+qeth_setrouting_v6(struct qeth_card *card)
+{
+	int rc = 0;
+
+	QETH_DBF_TEXT(trace,3,"setrtg6");
+#ifdef CONFIG_QETH_IPV6
+	/* skip NO_ROUTER, and MULTICAST_ROUTER on OSAE without MC router support */
+	if ((card->options.route6.type == NO_ROUTER) ||
+	    ((card->info.type == QETH_CARD_TYPE_OSAE) &&
+	     (card->options.route6.type == MULTICAST_ROUTER) &&
+	     !qeth_is_supported6(card,IPA_OSA_MC_ROUTER)))
+		return 0;
+	rc = qeth_send_setrouting(card, card->options.route6.type,
+				  QETH_PROT_IPV6);
+	if (rc) {
+	 	card->options.route6.type = NO_ROUTER;
+		PRINT_WARN("Error (0x%04x) while setting routing type on %s. "
+			   "Type set to 'no router'.\n",
+			   rc, card->info.if_name);
+	}
+#endif /* CONFIG_QETH_IPV6 */
+	return rc;
+}
+
+/*
+ * softsetup card: STARTLAN (IPv4), adapter parameters, IP assists, routing.
+ * Only a STARTLAN failure other than 0xe080 aborts; later errors are traced.
+ */
+static int
+qeth_softsetup_card(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(setup, 2, "softsetp");
+	if ((rc = qeth_send_startlan(card, QETH_PROT_IPV4))){
+		QETH_DBF_TEXT_(setup, 2, "1err%d", rc);
+		if (rc == 0xe080){	/* handled as "LAN offline" */
+			PRINT_WARN("LAN on card %s is offline! "
+				   "Continuing softsetup.\n",
+				   CARD_BUS_ID(card));
+			card->lan_online = 0;
+		} else
+			return rc;
+	} else
+		card->lan_online = 1;
+	if ((rc = qeth_setadapter_parms(card)))
+		QETH_DBF_TEXT_(setup, 2, "2err%d", rc);
+	if ((rc = qeth_start_ipassists(card)))
+		QETH_DBF_TEXT_(setup, 2, "3err%d", rc);
+	if ((rc = qeth_setrouting_v4(card)))
+		QETH_DBF_TEXT_(setup, 2, "4err%d", rc);
+	if ((rc = qeth_setrouting_v6(card)))
+		QETH_DBF_TEXT_(setup, 2, "5err%d", rc);
+	netif_stop_queue(card->dev);
+	return 0;	/* errors after STARTLAN are not reported to the caller */
+}
+
+#ifdef CONFIG_QETH_IPV6
+static int
+qeth_get_unique_id_cb(struct qeth_card *card, struct qeth_reply *reply,
+		      unsigned long data)
+{
+	struct qeth_ipa_cmd *cmd;
+	/* CREATE_ADDR reply: keep the id from the card or fall back to default */
+	cmd = (struct qeth_ipa_cmd *) data;
+	if (cmd->hdr.return_code == 0)	/* id = 16 bits at unique_id[6] */
+		card->info.unique_id = *((__u16 *)
+				&cmd->data.create_destroy_addr.unique_id[6]);
+	else {
+		card->info.unique_id =  UNIQUE_ID_IF_CREATE_ADDR_FAILED |
+					UNIQUE_ID_NOT_BY_CARD;
+		PRINT_WARN("couldn't get a unique id from the card on device "
+			   "%s (result=x%x), using default id. ipv6 "
+			   "autoconfig on other lpars may lead to duplicate "
+			   "ip addresses. please use manually "
+			   "configured ones.\n",
+			   CARD_BUS_ID(card), cmd->hdr.return_code);
+	}
+	return 0;
+}
+#endif
+/* Send DESTROY_ADDR with the card-assigned unique id (IPv6 builds only). */
+static int
+qeth_put_unique_id(struct qeth_card *card)
+{
+
+	int rc = 0;
+#ifdef CONFIG_QETH_IPV6
+	struct qeth_cmd_buffer *iob;
+	struct qeth_ipa_cmd *cmd;
+
+	QETH_DBF_TEXT(trace,2,"puniqeid");
+	/* an id that did not come from the card is not sent */
+	if ((card->info.unique_id & UNIQUE_ID_NOT_BY_CARD) ==
+	    	UNIQUE_ID_NOT_BY_CARD)
+		return -1;
+	iob = qeth_get_ipacmd_buffer(card, IPA_CMD_DESTROY_ADDR,
+				     QETH_PROT_IPV6);
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	*((__u16 *) &cmd->data.create_destroy_addr.unique_id[6]) =
+		            card->info.unique_id;
+	memcpy(&cmd->data.create_destroy_addr.unique_id[0],
+	       card->dev->dev_addr, OSA_ADDR_LEN);
+	rc = qeth_send_ipa_cmd(card, iob, NULL, NULL);
+#else	/* no IPv6: only reset the id to the default, rc stays 0 */
+	card->info.unique_id =  UNIQUE_ID_IF_CREATE_ADDR_FAILED |
+				UNIQUE_ID_NOT_BY_CARD;
+#endif
+	return rc;
+}
+
+/**
+ * Clear IP List
+ *
+ * Frees every entry on card->ip_tbd_list, then empties card->ip_list.
+ * @clean:   also deregister each ip_list entry on the card
+ * @recover: keep non-multicast entries by moving them to ip_tbd_list
+ *           instead of freeing them
+ *
+ * card->ip_lock is dropped around the deregister call and re-taken before
+ * the lists are touched again, so the list head is re-read on every pass
+ * and the lock is held exactly once when the final unlock runs.
+ */
+static void
+qeth_clear_ip_list(struct qeth_card *card, int clean, int recover)
+{
+	struct qeth_ipaddr *addr, *tmp;
+	unsigned long flags;
+
+	QETH_DBF_TEXT(trace,4,"clearip");
+	spin_lock_irqsave(&card->ip_lock, flags);
+	/* clear todo list */
+	list_for_each_entry_safe(addr, tmp, &card->ip_tbd_list, entry){
+		list_del(&addr->entry);
+		kfree(addr);
+	}
+
+	while (!list_empty(&card->ip_list)) {
+		addr = list_entry(card->ip_list.next,
+				  struct qeth_ipaddr, entry);
+		list_del_init(&addr->entry);
+		if (clean) {
+			/* addr is already unlinked from ip_list here */
+			spin_unlock_irqrestore(&card->ip_lock, flags);
+			qeth_deregister_addr_entry(card, addr);
+			spin_lock_irqsave(&card->ip_lock, flags);
+		}
+		if (!recover || addr->is_multicast)
+			kfree(addr);
+		else
+			list_add_tail(&addr->entry, &card->ip_tbd_list);
+	}
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+}
+/* Replace the allowed-thread mask, optionally prune start bits, wake waiters. */
+static void
+qeth_set_allowed_threads(struct qeth_card *card, unsigned long threads,
+			 int clear_start_mask)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&card->thread_mask_lock, flags);
+	card->thread_allowed_mask = threads;
+	if (clear_start_mask)	/* drop start bits that are no longer allowed */
+		card->thread_start_mask &= threads;
+	spin_unlock_irqrestore(&card->thread_mask_lock, flags);
+	wake_up(&card->wait_q);
+}
+/* Return the bits of @threads that are set in card->thread_running_mask. */
+static inline int
+qeth_threads_running(struct qeth_card *card, unsigned long threads)
+{
+	unsigned long flags;
+	int rc = 0;	/* NOTE(review): mask is unsigned long, result is int */
+
+	spin_lock_irqsave(&card->thread_mask_lock, flags);
+	rc = (card->thread_running_mask & threads);
+	spin_unlock_irqrestore(&card->thread_mask_lock, flags);
+	return rc;
+}
+/* Sleep interruptibly on card->wait_q until none of @threads is running. */
+static int
+qeth_wait_for_threads(struct qeth_card *card, unsigned long threads)
+{
+	return wait_event_interruptible(card->wait_q,
+			qeth_threads_running(card, threads) == 0);
+}
+/* Take the card down step by step from its current state to CARD_STATE_DOWN. */
+static int
+qeth_stop_card(struct qeth_card *card)
+{
+	int recover_flag = 0;	/* set when the card was up before the stop */
+	int rc = 0;
+
+	QETH_DBF_TEXT(setup ,2,"stopcard");
+	QETH_DBF_HEX(setup, 2, &card, sizeof(void *));
+	/* forbid all threads, then wait for all but the recovery thread */
+	qeth_set_allowed_threads(card, 0, 1);
+	if (qeth_wait_for_threads(card, ~QETH_RECOVER_THREAD))
+		return -ERESTARTSYS;
+	if (card->read.state == CH_STATE_UP &&
+	    card->write.state == CH_STATE_UP &&
+	    ((card->state == CARD_STATE_UP_LAN_ONLINE) ||
+	     (card->state == CARD_STATE_UP_LAN_OFFLINE))) {
+		recover_flag = 1;
+		rtnl_lock();
+		dev_close(card->dev);
+		rtnl_unlock();
+		if (!card->use_hard_stop)	/* hard stop sends no commands */
+			if ((rc = qeth_send_stoplan(card)))
+				QETH_DBF_TEXT_(setup, 2, "1err%d", rc);
+		card->state = CARD_STATE_SOFTSETUP;
+	}
+	if (card->state == CARD_STATE_SOFTSETUP) {
+		qeth_clear_ip_list(card, !card->use_hard_stop, recover_flag);
+		qeth_clear_ipacmd_list(card);
+		card->state = CARD_STATE_HARDSETUP;
+	}
+	if (card->state == CARD_STATE_HARDSETUP) {
+		if (!card->use_hard_stop)
+			if ((rc = qeth_put_unique_id(card)))
+				QETH_DBF_TEXT_(setup, 2, "2err%d", rc);
+		qeth_qdio_clear_card(card, 0);
+		qeth_clear_qdio_buffers(card);
+		qeth_clear_working_pool_list(card);
+		card->state = CARD_STATE_DOWN;
+	}
+	if (card->state == CARD_STATE_DOWN) {
+		qeth_clear_cmd_buffers(&card->read);
+		qeth_clear_cmd_buffers(&card->write);
+	}
+	card->use_hard_stop = 0;	/* one-shot flag */
+	return rc;	/* rc of the last failing command, teardown went on */
+}
+
+
+static int
+qeth_get_unique_id(struct qeth_card *card)
+{
+	int rc = 0;
+#ifdef CONFIG_QETH_IPV6
+	struct qeth_cmd_buffer *iob;
+	struct qeth_ipa_cmd *cmd;
+
+	QETH_DBF_TEXT(setup, 2, "guniqeid");
+
+	if (!qeth_is_supported(card,IPA_IPV6)) {
+		card->info.unique_id =  UNIQUE_ID_IF_CREATE_ADDR_FAILED |
+					UNIQUE_ID_NOT_BY_CARD;
+		return 0;
+	}
+
+	iob = qeth_get_ipacmd_buffer(card, IPA_CMD_CREATE_ADDR,
+				     QETH_PROT_IPV6);
+	cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+	*((__u16 *) &cmd->data.create_destroy_addr.unique_id[6]) =
+		            card->info.unique_id;
+
+	rc = qeth_send_ipa_cmd(card, iob, qeth_get_unique_id_cb, NULL);
+#else
+	card->info.unique_id =  UNIQUE_ID_IF_CREATE_ADDR_FAILED |
+				UNIQUE_ID_NOT_BY_CARD;
+#endif
+	return rc;
+}
+/* Print the card's identification line, including its translated portname. */
+static void
+qeth_print_status_with_portname(struct qeth_card *card)
+{
+	char dbf_text[15] = {0, };
+	int i;
+
+	/* only 8 bytes are translated and printed below, so copy no more */
+	snprintf(dbf_text, sizeof(dbf_text), "%.8s", card->info.portname + 1);
+	for (i = 0; i < 8; i++)
+		dbf_text[i] = (char) _ebcasc[(__u8) dbf_text[i]];
+	dbf_text[8] = 0;
+	printk("qeth: Device %s/%s/%s is a%s card%s%s%s\n"
+	       "with link type %s (portname: %s)\n",
+	       CARD_RDEV_ID(card),
+	       CARD_WDEV_ID(card),
+	       CARD_DDEV_ID(card),
+	       qeth_get_cardname(card),
+	       (card->info.mcl_level[0]) ? " (level: " : "",
+	       (card->info.mcl_level[0]) ? card->info.mcl_level : "",
+	       (card->info.mcl_level[0]) ? ")" : "",
+	       qeth_get_cardname_short(card),
+	       dbf_text);
+}
+
+static void
+qeth_print_status_no_portname(struct qeth_card *card)
+{
+	if (card->info.portname[0])
+		printk("qeth: Device %s/%s/%s is a%s "
+		       "card%s%s%s\nwith link type %s "
+		       "(no portname needed by interface).\n",
+		       CARD_RDEV_ID(card),
+		       CARD_WDEV_ID(card),
+		       CARD_DDEV_ID(card),
+		       qeth_get_cardname(card),
+		       (card->info.mcl_level[0]) ? " (level: " : "",
+		       (card->info.mcl_level[0]) ? card->info.mcl_level : "",
+		       (card->info.mcl_level[0]) ? ")" : "",
+		       qeth_get_cardname_short(card));
+	else
+		printk("qeth: Device %s/%s/%s is a%s "
+		       "card%s%s%s\nwith link type %s.\n",
+		       CARD_RDEV_ID(card),
+		       CARD_WDEV_ID(card),
+		       CARD_DDEV_ID(card),
+		       qeth_get_cardname(card),
+		       (card->info.mcl_level[0]) ? " (level: " : "",
+		       (card->info.mcl_level[0]) ? card->info.mcl_level : "",
+		       (card->info.mcl_level[0]) ? ")" : "",
+		       qeth_get_cardname_short(card));
+}
+
+static void
+qeth_print_status_message(struct qeth_card *card)
+{
+	switch (card->info.type) {
+	case QETH_CARD_TYPE_OSAE:
+		/* VM will use a non-zero first character
+		 * to indicate a HiperSockets like reporting
+		 * of the level OSA sets the first character to zero
+		 * */
+		if (!card->info.mcl_level[0]) {
+			sprintf(card->info.mcl_level,"%02x%02x",
+				card->info.mcl_level[2],
+				card->info.mcl_level[3]);
+
+			card->info.mcl_level[QETH_MCL_LENGTH] = 0;
+			break;
+		}
+		/* fallthrough */
+	case QETH_CARD_TYPE_IQD:
+		card->info.mcl_level[0] = (char) _ebcasc[(__u8)
+			card->info.mcl_level[0]];
+		card->info.mcl_level[1] = (char) _ebcasc[(__u8)
+			card->info.mcl_level[1]];
+		card->info.mcl_level[2] = (char) _ebcasc[(__u8)
+			card->info.mcl_level[2]];
+		card->info.mcl_level[3] = (char) _ebcasc[(__u8)
+			card->info.mcl_level[3]];
+		card->info.mcl_level[QETH_MCL_LENGTH] = 0;
+		break;
+	default:
+		memset(&card->info.mcl_level[0], 0, QETH_MCL_LENGTH + 1);
+	}
+	if (card->info.portname_required)
+		qeth_print_status_with_portname(card);
+	else
+		qeth_print_status_no_portname(card);
+}
+
+static int
+qeth_register_netdev(struct qeth_card *card)
+{
+	int rc;
+
+	QETH_DBF_TEXT(setup, 3, "regnetd");
+	if (card->dev->reg_state != NETREG_UNINITIALIZED)
+		return 0;
+	/* sysfs magic */
+	SET_NETDEV_DEV(card->dev, &card->gdev->dev);
+	rc = register_netdev(card->dev);
+	if (!rc)
+		strcpy(card->info.if_name, card->dev->name);
+
+	return rc;
+}
+
+static void
+qeth_start_again(struct qeth_card *card)
+{
+	QETH_DBF_TEXT(setup ,2, "startag");
+
+	rtnl_lock();
+	dev_open(card->dev);
+	rtnl_unlock();
+	qeth_set_thread_start_bit(card, QETH_SET_MC_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+}
+
+/* Bring the three ccw devices online and run hard/soft setup of the card. */
+static int
+qeth_set_online(struct ccwgroup_device *gdev)
+{
+	struct qeth_card *card = gdev->dev.driver_data;
+	int rc = 0;
+	enum qeth_card_states recover_flag;
+
+	BUG_ON(!card);
+	QETH_DBF_TEXT(setup ,2, "setonlin");
+	QETH_DBF_HEX(setup, 2, &card, sizeof(void *));
+
+	qeth_set_allowed_threads(card, QETH_RECOVER_THREAD, 1);
+	if (qeth_wait_for_threads(card, ~QETH_RECOVER_THREAD)){
+		PRINT_WARN("set_online of card %s interrupted by user!\n",
+			   CARD_BUS_ID(card));
+		return -ERESTARTSYS;
+	}
+
+	recover_flag = card->state;
+	if ((rc = ccw_device_set_online(CARD_RDEV(card))) ||
+	    (rc = ccw_device_set_online(CARD_WDEV(card))) ||
+	    (rc = ccw_device_set_online(CARD_DDEV(card)))){
+		QETH_DBF_TEXT_(setup, 2, "1err%d", rc);	/* failing call's rc */
+		return -EIO;
+	}
+
+	if ((rc = qeth_hardsetup_card(card))){
+		QETH_DBF_TEXT_(setup, 2, "2err%d", rc);
+		goto out_remove;
+	}
+	card->state = CARD_STATE_HARDSETUP;
+
+	if ((rc = qeth_query_ipassists(card,QETH_PROT_IPV4))){
+		QETH_DBF_TEXT_(setup, 2, "3err%d", rc); /*TODO: rc !=0*/
+	} else
+		rc = qeth_get_unique_id(card);
+
+	if (rc) {
+		QETH_DBF_TEXT_(setup, 2, "4err%d", rc);
+		goto out_remove;
+	}
+	qeth_print_status_message(card);
+	if ((rc = qeth_register_netdev(card))){
+		QETH_DBF_TEXT_(setup, 2, "5err%d", rc);
+		goto out_remove;
+	}
+	if ((rc = qeth_softsetup_card(card))){
+		QETH_DBF_TEXT_(setup, 2, "6err%d", rc);
+		goto out_remove;
+	}
+	card->state = CARD_STATE_SOFTSETUP;
+
+	if ((rc = qeth_init_qdio_queues(card))){
+		QETH_DBF_TEXT_(setup, 2, "7err%d", rc);
+		goto out_remove;
+	}
+/*maybe it was set offline without ifconfig down
+ * we can also use this state for recovery purposes*/
+	qeth_set_allowed_threads(card, 0xffffffff, 0);
+	if (recover_flag == CARD_STATE_RECOVER)
+		qeth_start_again(card);
+
+	return 0;
+out_remove:
+	card->use_hard_stop = 1;	/* skip the soft-stop commands */
+	qeth_stop_card(card);
+	ccw_device_set_offline(CARD_DDEV(card));
+	ccw_device_set_offline(CARD_WDEV(card));
+	ccw_device_set_offline(CARD_RDEV(card));
+	if (recover_flag == CARD_STATE_RECOVER)
+		card->state = CARD_STATE_RECOVER;
+	else
+		card->state = CARD_STATE_DOWN;
+	return -ENODEV;
+}
+
+static struct ccw_device_id qeth_ids[] = {
+	{CCW_DEVICE(0x1731, 0x01), .driver_info = QETH_CARD_TYPE_OSAE},
+	{CCW_DEVICE(0x1731, 0x05), .driver_info = QETH_CARD_TYPE_IQD},
+	{},	/* empty last entry */
+};
+MODULE_DEVICE_TABLE(ccw, qeth_ids);
+
+struct device *qeth_root_dev = NULL;
+
+struct ccwgroup_driver qeth_ccwgroup_driver = {
+	.owner = THIS_MODULE,
+	.name = "qeth",
+	.driver_id = 0xD8C5E3C8,
+	.probe = qeth_probe_device,
+	.remove = qeth_remove_device,
+	.set_online = qeth_set_online,
+	.set_offline = qeth_set_offline,
+};
+
+struct ccw_driver qeth_ccw_driver = {
+	.name = "qeth",
+	.ids = qeth_ids,
+	.probe = ccwgroup_probe_ccwdev,
+	.remove = ccwgroup_remove_ccwdev,
+};
+
+
+static void
+qeth_unregister_dbf_views(void)
+{
+	if (qeth_dbf_setup)
+		debug_unregister(qeth_dbf_setup);
+	if (qeth_dbf_qerr)
+		debug_unregister(qeth_dbf_qerr);
+	if (qeth_dbf_sense)
+		debug_unregister(qeth_dbf_sense);
+	if (qeth_dbf_misc)
+		debug_unregister(qeth_dbf_misc);
+	if (qeth_dbf_data)
+		debug_unregister(qeth_dbf_data);
+	if (qeth_dbf_control)
+		debug_unregister(qeth_dbf_control);
+	if (qeth_dbf_trace)
+		debug_unregister(qeth_dbf_trace);
+}
+static int
+qeth_register_dbf_views(void)
+{
+	qeth_dbf_setup = debug_register(QETH_DBF_SETUP_NAME,
+					QETH_DBF_SETUP_INDEX,
+					QETH_DBF_SETUP_NR_AREAS,
+					QETH_DBF_SETUP_LEN);
+	qeth_dbf_misc = debug_register(QETH_DBF_MISC_NAME,
+				       QETH_DBF_MISC_INDEX,
+				       QETH_DBF_MISC_NR_AREAS,
+				       QETH_DBF_MISC_LEN);
+	qeth_dbf_data = debug_register(QETH_DBF_DATA_NAME,
+				       QETH_DBF_DATA_INDEX,
+				       QETH_DBF_DATA_NR_AREAS,
+				       QETH_DBF_DATA_LEN);
+	qeth_dbf_control = debug_register(QETH_DBF_CONTROL_NAME,
+					  QETH_DBF_CONTROL_INDEX,
+					  QETH_DBF_CONTROL_NR_AREAS,
+					  QETH_DBF_CONTROL_LEN);
+	qeth_dbf_sense = debug_register(QETH_DBF_SENSE_NAME,
+					QETH_DBF_SENSE_INDEX,
+					QETH_DBF_SENSE_NR_AREAS,
+					QETH_DBF_SENSE_LEN);
+	qeth_dbf_qerr = debug_register(QETH_DBF_QERR_NAME,
+				       QETH_DBF_QERR_INDEX,
+				       QETH_DBF_QERR_NR_AREAS,
+				       QETH_DBF_QERR_LEN);
+	qeth_dbf_trace = debug_register(QETH_DBF_TRACE_NAME,
+					QETH_DBF_TRACE_INDEX,
+					QETH_DBF_TRACE_NR_AREAS,
+					QETH_DBF_TRACE_LEN);
+
+	if ((qeth_dbf_setup == NULL) || (qeth_dbf_misc == NULL)    ||
+	    (qeth_dbf_data == NULL)  || (qeth_dbf_control == NULL) ||
+	    (qeth_dbf_sense == NULL) || (qeth_dbf_qerr == NULL)    ||
+	    (qeth_dbf_trace == NULL)) {
+		qeth_unregister_dbf_views();
+		return -ENOMEM;
+	}
+	debug_register_view(qeth_dbf_setup, &debug_hex_ascii_view);
+	debug_set_level(qeth_dbf_setup, QETH_DBF_SETUP_LEVEL);
+
+	debug_register_view(qeth_dbf_misc, &debug_hex_ascii_view);
+	debug_set_level(qeth_dbf_misc, QETH_DBF_MISC_LEVEL);
+
+	debug_register_view(qeth_dbf_data, &debug_hex_ascii_view);
+	debug_set_level(qeth_dbf_data, QETH_DBF_DATA_LEVEL);
+
+	debug_register_view(qeth_dbf_control, &debug_hex_ascii_view);
+	debug_set_level(qeth_dbf_control, QETH_DBF_CONTROL_LEVEL);
+
+	debug_register_view(qeth_dbf_sense, &debug_hex_ascii_view);
+	debug_set_level(qeth_dbf_sense, QETH_DBF_SENSE_LEVEL);
+
+	debug_register_view(qeth_dbf_qerr, &debug_hex_ascii_view);
+	debug_set_level(qeth_dbf_qerr, QETH_DBF_QERR_LEVEL);
+
+	debug_register_view(qeth_dbf_trace, &debug_hex_ascii_view);
+	debug_set_level(qeth_dbf_trace, QETH_DBF_TRACE_LEVEL);
+
+	return 0;
+}
+
+#ifdef CONFIG_QETH_IPV6
+extern struct neigh_table arp_tbl;
+static struct neigh_ops *arp_direct_ops;
+static int (*qeth_old_arp_constructor) (struct neighbour *);
+
+static struct neigh_ops arp_direct_ops_template = {
+	.family = AF_INET,
+	.destructor = NULL,
+	.solicit = NULL,
+	.error_report = NULL,
+	.output = dev_queue_xmit,
+	.connected_output = dev_queue_xmit,
+	.hh_output = dev_queue_xmit,
+	.queue_xmit = dev_queue_xmit
+};
+
+static int
+qeth_arp_constructor(struct neighbour *neigh)
+{
+	struct net_device *dev = neigh->dev;
+	struct in_device *in_dev = in_dev_get(dev);
+
+	if (in_dev == NULL)
+		return -EINVAL;
+	if (!qeth_verify_dev(dev)) {
+		in_dev_put(in_dev);
+		return qeth_old_arp_constructor(neigh);
+	}
+
+	neigh->type = inet_addr_type(*(u32 *) neigh->primary_key);
+	if (in_dev->arp_parms)
+		neigh->parms = in_dev->arp_parms;
+	in_dev_put(in_dev);
+	neigh->nud_state = NUD_NOARP;
+	neigh->ops = arp_direct_ops;
+	neigh->output = neigh->ops->queue_xmit;
+	return 0;
+}
+#endif  /*CONFIG_QETH_IPV6*/
+
+/*
+ * IP address takeover related functions
+ */
+static void
+qeth_clear_ipato_list(struct qeth_card *card)
+{
+	struct qeth_ipato_entry *ipatoe, *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&card->ip_lock, flags);
+	list_for_each_entry_safe(ipatoe, tmp, &card->ipato.entries, entry) {
+		list_del(&ipatoe->entry);
+		kfree(ipatoe);
+	}
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+}
+
+int
+qeth_add_ipato_entry(struct qeth_card *card, struct qeth_ipato_entry *new)
+{
+	struct qeth_ipato_entry *ipatoe;
+	unsigned long flags;
+	int rc = 0;
+
+	QETH_DBF_TEXT(trace, 2, "addipato");
+	spin_lock_irqsave(&card->ip_lock, flags);
+	list_for_each_entry(ipatoe, &card->ipato.entries, entry){
+		if (ipatoe->proto != new->proto)
+			continue;
+		if (!memcmp(ipatoe->addr, new->addr,
+			    (ipatoe->proto == QETH_PROT_IPV4)? 4:16) &&
+		    (ipatoe->mask_bits == new->mask_bits)){
+			PRINT_WARN("ipato entry already exists!\n");
+			rc = -EEXIST;
+			break;
+		}
+	}
+	if (!rc) {
+		list_add_tail(&new->entry, &card->ipato.entries);
+	}
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+	return rc;
+}
+
+void
+qeth_del_ipato_entry(struct qeth_card *card, enum qeth_prot_versions proto,
+		     u8 *addr, int mask_bits)
+{
+	struct qeth_ipato_entry *ipatoe, *tmp;
+	unsigned long flags;
+
+	QETH_DBF_TEXT(trace, 2, "delipato");
+	spin_lock_irqsave(&card->ip_lock, flags);
+	list_for_each_entry_safe(ipatoe, tmp, &card->ipato.entries, entry){
+		if (ipatoe->proto != proto)
+			continue;
+		if (!memcmp(ipatoe->addr, addr,
+			    (proto == QETH_PROT_IPV4)? 4:16) &&
+		    (ipatoe->mask_bits == mask_bits)){
+			list_del(&ipatoe->entry);
+			kfree(ipatoe);
+		}
+	}
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+}
+
+static inline void
+qeth_convert_addr_to_bits(u8 *addr, u8 *bits, int len)
+{
+	int i, j;
+	u8 octet;
+
+	for (i = 0; i < len; ++i){
+		octet = addr[i];
+		for (j = 7; j >= 0; --j){
+			bits[i*8 + j] = octet & 1;
+			octet >>= 1;
+		}
+	}
+}
+
+static int
+qeth_is_addr_covered_by_ipato(struct qeth_card *card, struct qeth_ipaddr *addr)
+{
+	struct qeth_ipato_entry *ipatoe;
+	u8 addr_bits[128] = {0, };
+	u8 ipatoe_bits[128] = {0, };
+	int rc = 0;
+
+	if (!card->ipato.enabled)
+		return 0;
+
+	qeth_convert_addr_to_bits((u8 *) &addr->u, addr_bits,
+				  (addr->proto == QETH_PROT_IPV4)? 4:16);
+	list_for_each_entry(ipatoe, &card->ipato.entries, entry){
+		if (addr->proto != ipatoe->proto)
+			continue;
+		qeth_convert_addr_to_bits(ipatoe->addr, ipatoe_bits,
+					  (ipatoe->proto==QETH_PROT_IPV4) ?
+					  4:16);
+		if (addr->proto == QETH_PROT_IPV4)
+			rc = !memcmp(addr_bits, ipatoe_bits,
+				     min(32, ipatoe->mask_bits));
+		else
+			rc = !memcmp(addr_bits, ipatoe_bits,
+				     min(128, ipatoe->mask_bits));
+		if (rc)
+			break;
+	}
+	/* invert? */
+	if ((addr->proto == QETH_PROT_IPV4) && card->ipato.invert4)
+		rc = !rc;
+	else if ((addr->proto == QETH_PROT_IPV6) && card->ipato.invert6)
+		rc = !rc;
+
+	return rc;
+}
+
+/* VIPA related functions */
+
+/* Queue a VIPA for the card; returns 0, -ENOMEM or -EEXIST. */
+int
+qeth_add_vipa(struct qeth_card *card, enum qeth_prot_versions proto,
+	      const u8 *addr)
+{
+	struct qeth_ipaddr *ipaddr;
+	unsigned long flags;
+	int rc = 0;
+
+	ipaddr = qeth_get_addr_buffer(proto);
+	if (!ipaddr)
+		return -ENOMEM;
+	if (proto == QETH_PROT_IPV4){
+		QETH_DBF_TEXT(trace, 2, "addvipa4");
+		memcpy(&ipaddr->u.a4.addr, addr, 4);
+		ipaddr->u.a4.mask = 0;
+#ifdef CONFIG_QETH_IPV6
+	} else if (proto == QETH_PROT_IPV6){
+		QETH_DBF_TEXT(trace, 2, "addvipa6");
+		memcpy(&ipaddr->u.a6.addr, addr, 16);
+		ipaddr->u.a6.pfxlen = 0;
+#endif
+	}
+	ipaddr->type = QETH_IP_TYPE_VIPA;
+	ipaddr->set_flags = QETH_IPA_SETIP_VIPA_FLAG;
+	ipaddr->del_flags = QETH_IPA_DELIP_VIPA_FLAG;
+	spin_lock_irqsave(&card->ip_lock, flags);
+	if (__qeth_address_exists_in_list(&card->ip_list, ipaddr, 0) ||
+	    __qeth_address_exists_in_list(&card->ip_tbd_list, ipaddr, 0))
+		rc = -EEXIST;
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+	if (rc){
+		PRINT_WARN("Cannot add VIPA. Address already exists!\n");
+		kfree(ipaddr);	/* was never handed to qeth_add_ip() */
+		return rc;
+	}
+	if (!qeth_add_ip(card, ipaddr))
+		kfree(ipaddr);
+	qeth_set_thread_start_bit(card, QETH_SET_IP_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+	return rc;
+}
+
+void
+qeth_del_vipa(struct qeth_card *card, enum qeth_prot_versions proto,
+	      const u8 *addr)
+{
+	struct qeth_ipaddr *ipaddr;
+
+	ipaddr = qeth_get_addr_buffer(proto);
+	if (ipaddr){
+		if (proto == QETH_PROT_IPV4){
+			QETH_DBF_TEXT(trace, 2, "delvipa4");
+			memcpy(&ipaddr->u.a4.addr, addr, 4);
+			ipaddr->u.a4.mask = 0;
+#ifdef CONFIG_QETH_IPV6
+		} else if (proto == QETH_PROT_IPV6){
+			QETH_DBF_TEXT(trace, 2, "delvipa6");
+			memcpy(&ipaddr->u.a6.addr, addr, 16);
+			ipaddr->u.a6.pfxlen = 0;
+#endif
+		}
+		ipaddr->type = QETH_IP_TYPE_VIPA;
+	} else
+		return;
+	if (!qeth_delete_ip(card, ipaddr))
+		kfree(ipaddr);
+	qeth_set_thread_start_bit(card, QETH_SET_IP_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+}
+
+/* proxy ARP related functions */
+
+/* Queue an RXIP address for the card; returns 0, -ENOMEM or -EEXIST. */
+int
+qeth_add_rxip(struct qeth_card *card, enum qeth_prot_versions proto,
+	      const u8 *addr)
+{
+	struct qeth_ipaddr *ipaddr;
+	unsigned long flags;
+	int rc = 0;
+
+	ipaddr = qeth_get_addr_buffer(proto);
+	if (!ipaddr)
+		return -ENOMEM;
+	if (proto == QETH_PROT_IPV4){
+		QETH_DBF_TEXT(trace, 2, "addrxip4");
+		memcpy(&ipaddr->u.a4.addr, addr, 4);
+		ipaddr->u.a4.mask = 0;
+#ifdef CONFIG_QETH_IPV6
+	} else if (proto == QETH_PROT_IPV6){
+		QETH_DBF_TEXT(trace, 2, "addrxip6");
+		memcpy(&ipaddr->u.a6.addr, addr, 16);
+		ipaddr->u.a6.pfxlen = 0;
+#endif
+	}
+	ipaddr->type = QETH_IP_TYPE_RXIP;
+	ipaddr->set_flags = QETH_IPA_SETIP_TAKEOVER_FLAG;
+	ipaddr->del_flags = 0;
+	spin_lock_irqsave(&card->ip_lock, flags);
+	if (__qeth_address_exists_in_list(&card->ip_list, ipaddr, 0) ||
+	    __qeth_address_exists_in_list(&card->ip_tbd_list, ipaddr, 0))
+		rc = -EEXIST;
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+	if (rc){
+		PRINT_WARN("Cannot add RXIP. Address already exists!\n");
+		kfree(ipaddr);	/* was never handed to qeth_add_ip() */
+		return rc;
+	}
+	if (!qeth_add_ip(card, ipaddr))
+		kfree(ipaddr);
+	qeth_set_thread_start_bit(card, QETH_SET_IP_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+	return 0;
+}
+
+/* Queue the removal of an RXIP address; silently gives up without memory. */
+void
+qeth_del_rxip(struct qeth_card *card, enum qeth_prot_versions proto,
+	      const u8 *addr)
+{
+	struct qeth_ipaddr *ipaddr;
+
+	ipaddr = qeth_get_addr_buffer(proto);
+	if (!ipaddr)
+		return;
+	if (proto == QETH_PROT_IPV4){
+		QETH_DBF_TEXT(trace, 2, "delrxip4");
+		memcpy(&ipaddr->u.a4.addr, addr, 4);
+		ipaddr->u.a4.mask = 0;
+#ifdef CONFIG_QETH_IPV6
+	} else if (proto == QETH_PROT_IPV6){
+		QETH_DBF_TEXT(trace, 2, "delrxip6");
+		memcpy(&ipaddr->u.a6.addr, addr, 16);
+		ipaddr->u.a6.pfxlen = 0;
+#endif
+	}
+	ipaddr->type = QETH_IP_TYPE_RXIP;
+	if (!qeth_delete_ip(card, ipaddr))
+		kfree(ipaddr);
+	qeth_set_thread_start_bit(card, QETH_SET_IP_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+}
+
+/**
+ * IPv4 address notifier: NETDEV_UP/NETDEV_DOWN add/delete the address.
+ */
+static int
+qeth_ip_event(struct notifier_block *this,
+	      unsigned long event,void *ptr)
+{
+	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+	struct net_device *dev =(struct net_device *) ifa->ifa_dev->dev;
+	struct qeth_ipaddr *addr;
+	struct qeth_card *card;
+
+	QETH_DBF_TEXT(trace,3,"ipevent");
+	card = qeth_get_card_from_dev(dev);
+	if (!card)
+		return NOTIFY_DONE;
+
+	addr = qeth_get_addr_buffer(QETH_PROT_IPV4);
+	if (addr != NULL) {
+		addr->u.a4.addr = ifa->ifa_address;
+		addr->u.a4.mask = ifa->ifa_mask;
+		addr->type = QETH_IP_TYPE_NORMAL;
+	}
+	switch(event) {
+	case NETDEV_UP:
+		/* a 0 return is treated as "buffer not taken over": free it */
+		if (addr && !qeth_add_ip(card, addr))
+			kfree(addr);
+		break;
+	case NETDEV_DOWN:
+		/* same rule for qeth_delete_ip() */
+		if (addr && !qeth_delete_ip(card, addr))
+			kfree(addr);
+		break;
+	default:
+		/* event not handled here: drop the unused buffer (may be NULL) */
+		kfree(addr);
+		break;
+	}
+	qeth_set_thread_start_bit(card, QETH_SET_IP_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block qeth_ip_notifier = {
+	qeth_ip_event,
+	0
+};
+
+#ifdef CONFIG_QETH_IPV6
+/**
+ * IPv6 address notifier: NETDEV_UP/NETDEV_DOWN add/delete the address.
+ */
+static int
+qeth_ip6_event(struct notifier_block *this,
+	      unsigned long event,void *ptr)
+{
+
+	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+	struct net_device *dev = (struct net_device *)ifa->idev->dev;
+	struct qeth_ipaddr *addr;
+	struct qeth_card *card;
+
+	QETH_DBF_TEXT(trace,3,"ip6event");
+
+	card = qeth_get_card_from_dev(dev);
+	if (!card)
+		return NOTIFY_DONE;
+	if (!qeth_is_supported(card, IPA_IPV6))
+		return NOTIFY_DONE;
+
+	addr = qeth_get_addr_buffer(QETH_PROT_IPV6);
+	if (addr != NULL) {
+		memcpy(&addr->u.a6.addr, &ifa->addr, sizeof(struct in6_addr));
+		addr->u.a6.pfxlen = ifa->prefix_len;
+		addr->type = QETH_IP_TYPE_NORMAL;
+	}
+	switch(event) {
+	case NETDEV_UP:
+		/* a 0 return is treated as "buffer not taken over": free it */
+		if (addr && !qeth_add_ip(card, addr))
+			kfree(addr);
+		break;
+	case NETDEV_DOWN:
+		/* same rule for qeth_delete_ip() */
+		if (addr && !qeth_delete_ip(card, addr))
+			kfree(addr);
+		break;
+	default:
+		/* event not handled here: drop the unused buffer (may be NULL) */
+		kfree(addr);
+		break;
+	}
+	qeth_set_thread_start_bit(card, QETH_SET_IP_THREAD);
+	schedule_work(&card->kernel_thread_starter);
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block qeth_ip6_notifier = {
+	qeth_ip6_event,
+	0
+};
+#endif
+
+static int
+qeth_reboot_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+
+	struct device *entry;
+	struct qeth_card *card;
+
+	down_read(&qeth_ccwgroup_driver.driver.bus->subsys.rwsem);
+	       list_for_each_entry(entry, &qeth_ccwgroup_driver.driver.devices,
+			           driver_list) {
+	               card = (struct qeth_card *) entry->driver_data;
+		       qeth_clear_ip_list(card, 0, 0);
+		       qeth_qdio_clear_card(card, 0);
+	       }
+	up_read(&qeth_ccwgroup_driver.driver.bus->subsys.rwsem);
+	return NOTIFY_DONE;
+}
+
+
+static struct notifier_block qeth_reboot_notifier = {
+	qeth_reboot_event,
+	0
+};
+
+static int
+qeth_register_notifiers(void)
+{
+        int r;
+
+	QETH_DBF_TEXT(trace,5,"regnotif");
+	if ((r = register_reboot_notifier(&qeth_reboot_notifier)))
+		return r;
+	if ((r = register_inetaddr_notifier(&qeth_ip_notifier)))
+		goto out_reboot;
+#ifdef CONFIG_QETH_IPV6
+	if ((r = register_inet6addr_notifier(&qeth_ip6_notifier)))
+		goto out_ipv4;
+#endif
+	return 0;
+
+#ifdef CONFIG_QETH_IPV6
+out_ipv4:
+	unregister_inetaddr_notifier(&qeth_ip_notifier);
+#endif
+out_reboot:
+	unregister_reboot_notifier(&qeth_reboot_notifier);
+	return r;
+}
+
+/**
+ * unregister all event notifiers
+ */
+static void
+qeth_unregister_notifiers(void)
+{
+
+	QETH_DBF_TEXT(trace,5,"unregnot");
+	BUG_ON(unregister_reboot_notifier(&qeth_reboot_notifier));
+	BUG_ON(unregister_inetaddr_notifier(&qeth_ip_notifier));
+#ifdef CONFIG_QETH_IPV6
+	BUG_ON(unregister_inet6addr_notifier(&qeth_ip6_notifier));
+#endif /* QETH_IPV6 */
+
+}
+
+#ifdef CONFIG_QETH_IPV6
+/* Hook qeth_arp_constructor into arp_tbl, saving the previous constructor. */
+static int
+qeth_ipv6_init(void)
+{
+	/* allocate before hooking in: a failure must leave arp_tbl untouched */
+	arp_direct_ops = kmalloc(sizeof(struct neigh_ops), GFP_KERNEL);
+	if (!arp_direct_ops)
+		return -ENOMEM;
+	memcpy(arp_direct_ops, &arp_direct_ops_template,
+	       sizeof(struct neigh_ops));
+
+	write_lock(&arp_tbl.lock);
+	qeth_old_arp_constructor = arp_tbl.constructor;
+	arp_tbl.constructor = qeth_arp_constructor;
+	write_unlock(&arp_tbl.lock);
+
+	return 0;
+}
+
+static void
+qeth_ipv6_uninit(void)
+{
+	write_lock(&arp_tbl.lock);
+	arp_tbl.constructor = qeth_old_arp_constructor;
+	write_unlock(&arp_tbl.lock);
+	kfree(arp_direct_ops);
+}
+#endif /* CONFIG_QETH_IPV6 */
+
+static void
+qeth_sysfs_unregister(void)
+{
+	qeth_remove_driver_attributes();
+	ccw_driver_unregister(&qeth_ccw_driver);
+	ccwgroup_driver_unregister(&qeth_ccwgroup_driver);
+	s390_root_dev_unregister(qeth_root_dev);
+}
+/* Register both drivers, the driver attributes and the "qeth" root device.
+ * NOTE(review): a failing step returns without undoing the earlier ones -
+ * confirm that the caller's cleanup copes with a partial registration. */
+static int
+qeth_sysfs_register(void)
+{
+	int rc=0;
+
+	rc = ccwgroup_driver_register(&qeth_ccwgroup_driver);
+	if (rc)
+		return rc;
+	rc = ccw_driver_register(&qeth_ccw_driver);
+	if (rc)
+	 	return rc;
+	rc = qeth_create_driver_attributes();
+	if (rc)
+		return rc;
+	qeth_root_dev = s390_root_dev_register("qeth");
+	if (IS_ERR(qeth_root_dev)) {
+		rc = PTR_ERR(qeth_root_dev);
+		return rc;
+	}
+	return 0;
+}
+
+/***
+ * module init: returns 0, or the error code of the step that failed
+ */
+static int __init
+qeth_init(void)
+{
+	int rc=0;
+
+	qeth_eyecatcher();
+	printk(KERN_INFO "qeth: loading %s\n",version);
+
+	INIT_LIST_HEAD(&qeth_card_list.list);
+	rwlock_init(&qeth_card_list.rwlock);
+
+	atomic_set(&qeth_hsi_count, 0);
+	if ((rc = qeth_register_dbf_views()))
+		goto out_err;
+	if ((rc = qeth_sysfs_register()))
+		goto out_sysfs;
+
+#ifdef CONFIG_QETH_IPV6
+	if ((rc = qeth_ipv6_init())) {
+		PRINT_ERR("Out of memory during ipv6 init.\n");
+		goto out_sysfs;
+	}
+#endif /* QETH_IPV6 */
+	if ((rc = qeth_register_notifiers()))
+		goto out_ipv6;
+	if ((rc = qeth_create_procfs_entries()))
+		goto out_notifiers;
+
+	return 0;
+
+out_notifiers:
+	qeth_unregister_notifiers();
+out_ipv6:
+#ifdef CONFIG_QETH_IPV6
+	qeth_ipv6_uninit();
+#endif /* QETH_IPV6 */
+out_sysfs:
+	qeth_sysfs_unregister();
+	qeth_unregister_dbf_views();
+out_err:
+	PRINT_ERR("Initialization failed\n");
+	return rc;	/* non-zero here, so the failed load is reported */
+}
+
+static void
+__exit qeth_exit(void)
+{
+	struct qeth_card *card, *tmp;
+	unsigned long flags;
+
+	QETH_DBF_TEXT(trace,1, "cleanup.");
+
+	/*
+	 * We would not need to clean up our devices here, because the
+	 * common device layer calls qeth_remove_device for each device
+	 * as soon as we unregister our driver (done in qeth_sysfs_unregister).
+	 * But we do cleanup here so we can do a "soft" shutdown of our cards.
+	 * qeth_remove_device called by the common device layer would otherwise
+	 * do a "hard" shutdown (card->use_hard_stop is set to one in
+	 * qeth_remove_device).
+	 */
+again:
+	read_lock_irqsave(&qeth_card_list.rwlock, flags);
+	list_for_each_entry_safe(card, tmp, &qeth_card_list.list, list){
+		read_unlock_irqrestore(&qeth_card_list.rwlock, flags);
+		qeth_set_offline(card->gdev);
+		qeth_remove_device(card->gdev);
+		goto again;	/* lock was dropped above: rescan from the head */
+	}
+	read_unlock_irqrestore(&qeth_card_list.rwlock, flags);
+#ifdef CONFIG_QETH_IPV6
+	qeth_ipv6_uninit();
+#endif
+	qeth_unregister_notifiers();
+	qeth_remove_procfs_entries();
+	qeth_sysfs_unregister();
+	qeth_unregister_dbf_views();
+	printk("qeth: removed\n");
+}
+
+EXPORT_SYMBOL(qeth_eyecatcher);
+module_init(qeth_init);
+module_exit(qeth_exit);
+MODULE_AUTHOR("Frank Pavlic <pavlic@de.ibm.com>");
+MODULE_DESCRIPTION("Linux on zSeries OSA Express and HiperSockets support\n" \
+		                      "Copyright 2000,2003 IBM Corporation\n");
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/s390/net/qeth_mpc.c b/drivers/s390/net/qeth_mpc.c
index 7adce027b9e4..bef90ca150f6 100644
--- a/drivers/s390/net/qeth_mpc.c
+++ b/drivers/s390/net/qeth_mpc.c
@@ -4,7 +4,8 @@
  * Linux on zSeries OSA Express and HiperSockets support
  *
  * Copyright 2000,2003 IBM Corporation
- * Author(s): Utz Bacher <utz.bacher@de.ibm.com>
+ * Author(s): Frank Pavlic <pavlic@de.ibm.com>
+ * 	      Thomas Spatzier <tspat@de.ibm.com>
  *
  */
 #include <asm/cio.h>
@@ -126,16 +127,22 @@ unsigned char DM_ACT[]={
 unsigned char IPA_PDU_HEADER[]={
 	0x00,0xe0,0x00,0x00, 0x77,0x77,0x77,0x77,
 	0x00,0x00,0x00,0x14, 0x00,0x00,
-		(IPA_PDU_HEADER_SIZE+sizeof(struct ipa_cmd))/256,
-		(IPA_PDU_HEADER_SIZE+sizeof(struct ipa_cmd))%256,
+		(IPA_PDU_HEADER_SIZE+sizeof(struct qeth_ipa_cmd))/256,
+		(IPA_PDU_HEADER_SIZE+sizeof(struct qeth_ipa_cmd))%256,
 	0x10,0x00,0x00,0x01,
 	0x00,0x00,0x00,0x00,
 	0xc1,0x03,0x00,0x01, 0x00,0x00,0x00,0x00,
-	0x00,0x00,0x00,0x00, 0x00,0x24,0x00,sizeof(struct ipa_cmd),
-	0x00,0x00,sizeof(struct ipa_cmd),0x05, 0x77,0x77,0x77,0x77,
+	0x00,0x00,0x00,0x00, 0x00,0x24,
+		sizeof(struct qeth_ipa_cmd)/256,
+		sizeof(struct qeth_ipa_cmd)%256,
+	0x00,
+		sizeof(struct qeth_ipa_cmd)/256,
+		sizeof(struct qeth_ipa_cmd)%256,0x05, 0x77,0x77,0x77,0x77,
 	0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
-	0x01,0x00,sizeof(struct ipa_cmd)/256,sizeof(struct ipa_cmd)%256,
-				0x00,0x00,0x00,0x40,
+	0x01,0x00,
+		sizeof(struct qeth_ipa_cmd)/256,
+		sizeof(struct qeth_ipa_cmd)%256,
+	0x00,0x00,0x00,0x40,
 };
 
 unsigned char WRITE_CCW[]={
@@ -158,4 +165,3 @@ unsigned char READ_CCW[]={
 
 
-
diff --git a/drivers/s390/net/qeth_mpc.h b/drivers/s390/net/qeth_mpc.h
index 5998604695b6..b1cfca838541 100644
--- a/drivers/s390/net/qeth_mpc.h
+++ b/drivers/s390/net/qeth_mpc.h
@@ -5,346 +5,279 @@
  *
  * Copyright 2000,2003 IBM Corporation
  * Author(s): Utz Bacher <utz.bacher@de.ibm.com>
+ *            Thomas Spatzier <tspat@de.ibm.com>
+ *            Frank Pavlic <pavlic@de.ibm.com>
  *
  */
 #ifndef __QETH_MPC_H__
 #define __QETH_MPC_H__
 
-#define VERSION_QETH_MPC_H "$Revision: 1.18 $"
+#include <asm/qeth.h>
 
-#define QETH_IPA_TIMEOUT (card->ipa_timeout)
-#define QETH_MPC_TIMEOUT 2000
-#define QETH_ADDR_TIMEOUT 1000
+#define VERSION_QETH_MPC_H "$Revision: 1.27 $"
 
-#define QETH_SETIP_RETRIES 2
-
-#define IDX_ACTIVATE_SIZE 0x22
-#define CM_ENABLE_SIZE 0x63
-#define CM_SETUP_SIZE 0x64
-#define ULP_ENABLE_SIZE 0x6b
-#define ULP_SETUP_SIZE 0x6c
-#define DM_ACT_SIZE 0x55
+#define IPA_PDU_HEADER_SIZE	0x40
+#define QETH_IPA_PDU_LEN_TOTAL(buffer) ((buffer)+0x0e)
+#define QETH_IPA_PDU_LEN_PDU1(buffer) ((buffer)+0x26)
+#define QETH_IPA_PDU_LEN_PDU2(buffer) ((buffer)+0x2a)
+#define QETH_IPA_PDU_LEN_PDU3(buffer) ((buffer)+0x3a)
 
-#define QETH_MPC_TOKEN_LENGTH 4
-#define QETH_SEQ_NO_LENGTH 4
-#define QETH_IPA_SEQ_NO_LENGTH 2
+extern unsigned char IPA_PDU_HEADER[];
+#define QETH_IPA_CMD_DEST_ADDR(buffer) ((buffer)+0x2c)
 
-#define QETH_TRANSPORT_HEADER_SEQ_NO(buffer) (buffer+4)
-#define QETH_PDU_HEADER_SEQ_NO(buffer) (buffer+0x1c)
-#define QETH_PDU_HEADER_ACK_SEQ_NO(buffer) (buffer+0x20)
+#define IPA_CMD_LENGTH	(IPA_PDU_HEADER_SIZE + sizeof(struct qeth_ipa_cmd))
 
-extern unsigned char IDX_ACTIVATE_READ[];
+#define QETH_SEQ_NO_LENGTH 	4
+#define QETH_MPC_TOKEN_LENGTH 	4
+#define QETH_MCL_LENGTH		4
+#define OSA_ADDR_LEN		6
 
-extern unsigned char IDX_ACTIVATE_WRITE[];
+#define QETH_TIMEOUT 		(10 * HZ)
+#define QETH_IDX_COMMAND_SEQNO 	(-1)
+#define SR_INFO_LEN		16
 
-#define QETH_IDX_ACT_ISSUER_RM_TOKEN(buffer) (buffer+0x0c)
-#define QETH_IDX_NO_PORTNAME_REQUIRED(buffer) ((buffer)[0x0b]&0x80)
-#define QETH_IDX_ACT_FUNC_LEVEL(buffer) (buffer+0x10)
-#define QETH_IDX_ACT_DATASET_NAME(buffer) (buffer+0x16)
-#define QETH_IDX_ACT_QDIO_DEV_CUA(buffer) (buffer+0x1e)
-#define QETH_IDX_ACT_QDIO_DEV_REALADDR(buffer) (buffer+0x20)
+#define QETH_CLEAR_CHANNEL_PARM	(-10)
+#define QETH_HALT_CHANNEL_PARM	(-11)
 
-#define QETH_IS_IDX_ACT_POS_REPLY(buffer) (((buffer)[0x08]&3)==2)
+/*****************************************************************************/
+/* IP Assist related definitions                                             */
+/*****************************************************************************/
+#define IPA_CMD_INITIATOR_HOST  0x00
+#define IPA_CMD_INITIATOR_HYDRA 0x01
+#define IPA_CMD_PRIM_VERSION_NO 0x01
 
-#define QETH_IDX_REPLY_LEVEL(buffer) (buffer+0x12)
-#define QETH_MCL_LENGTH 4
+enum qeth_card_types {
+	QETH_CARD_TYPE_UNKNOWN = 0,
+	QETH_CARD_TYPE_OSAE    = 10,
+	QETH_CARD_TYPE_IQD     = 1234,
+};
 
-extern unsigned char CM_ENABLE[];
+#define QETH_MPC_DIFINFO_LEN_INDICATES_LINK_TYPE 0x18
+/* only the first two bytes are looked at in qeth_get_cardname_short */
+enum qeth_link_types {
+	QETH_LINK_TYPE_FAST_ETH     = 0x01,
+	QETH_LINK_TYPE_HSTR         = 0x02,
+	QETH_LINK_TYPE_GBIT_ETH     = 0x03,
+	QETH_LINK_TYPE_10GBIT_ETH   = 0x10,
+	QETH_LINK_TYPE_LANE_ETH100  = 0x81,
+	QETH_LINK_TYPE_LANE_TR      = 0x82,
+	QETH_LINK_TYPE_LANE_ETH1000 = 0x83,
+	QETH_LINK_TYPE_LANE         = 0x88,
+	QETH_LINK_TYPE_ATM_NATIVE   = 0x90,
+};
 
-#define QETH_CM_ENABLE_ISSUER_RM_TOKEN(buffer) (buffer+0x2c)
-#define QETH_CM_ENABLE_FILTER_TOKEN(buffer) (buffer+0x53)
-#define QETH_CM_ENABLE_USER_DATA(buffer) (buffer+0x5b)
+enum qeth_tr_macaddr_modes {
+	QETH_TR_MACADDR_NONCANONICAL = 0,
+	QETH_TR_MACADDR_CANONICAL    = 1,
+};
 
-#define QETH_CM_ENABLE_RESP_FILTER_TOKEN(buffer) (PDU_ENCAPSULATION(buffer)+ \
-						  0x13)
+enum qeth_tr_broadcast_modes {
+	QETH_TR_BROADCAST_ALLRINGS = 0,
+	QETH_TR_BROADCAST_LOCAL    = 1,
+};
 
-extern unsigned char CM_SETUP[];
+/* these values match CHECKSUM_* in include/linux/skbuff.h */
+enum qeth_checksum_types {
+	SW_CHECKSUMMING = 0, /* TODO: set to bit flag used in IPA Command */
+	HW_CHECKSUMMING = 1,
+	NO_CHECKSUMMING = 2,
+};
+#define QETH_CHECKSUM_DEFAULT SW_CHECKSUMMING
 
-#define QETH_CM_SETUP_DEST_ADDR(buffer) (buffer+0x2c)
-#define QETH_CM_SETUP_CONNECTION_TOKEN(buffer) (buffer+0x51)
-#define QETH_CM_SETUP_FILTER_TOKEN(buffer) (buffer+0x5a)
+/*
+ * Routing stuff
+ */
+#define RESET_ROUTING_FLAG 0x10 /* indicate that routing type shall be set */
+enum qeth_routing_types {
+	NO_ROUTER           = 0, /* TODO: set to bit flag used in IPA Command */
+	PRIMARY_ROUTER      = 1,
+	SECONDARY_ROUTER    = 2,
+	MULTICAST_ROUTER    = 3,
+	PRIMARY_CONNECTOR   = 4,
+	SECONDARY_CONNECTOR = 5,
+};
 
-#define QETH_CM_SETUP_RESP_DEST_ADDR(buffer) (PDU_ENCAPSULATION(buffer)+ \
-					      0x1a)
 
-extern unsigned char ULP_ENABLE[];
+/* IPA Commands */
+enum qeth_ipa_cmds {
+	IPA_CMD_STARTLAN              = 0x01,
+	IPA_CMD_STOPLAN               = 0x02,
+	IPA_CMD_SETIP                 = 0xb1,
+	IPA_CMD_DELIP                 = 0xb7,
+	IPA_CMD_QIPASSIST             = 0xb2,
+	IPA_CMD_SETASSPARMS           = 0xb3,
+	IPA_CMD_SETIPM                = 0xb4,
+	IPA_CMD_DELIPM                = 0xb5,
+	IPA_CMD_SETRTG                = 0xb6,
+	IPA_CMD_SETADAPTERPARMS       = 0xb8,
+	IPA_CMD_IPFRAME               = 0xb9,
+	IPA_CMD_ADD_ADDR_ENTRY        = 0xc1,
+	IPA_CMD_DELETE_ADDR_ENTRY     = 0xc2,
+	IPA_CMD_CREATE_ADDR           = 0xc3,
+	IPA_CMD_DESTROY_ADDR          = 0xc4,
+	IPA_CMD_REGISTER_LOCAL_ADDR   = 0xd1,
+	IPA_CMD_UNREGISTER_LOCAL_ADDR = 0xd2,
+};
 
-#define QETH_ULP_ENABLE_LINKNUM(buffer) (buffer+0x61)
-#define QETH_ULP_ENABLE_DEST_ADDR(buffer) (buffer+0x2c)
-#define QETH_ULP_ENABLE_FILTER_TOKEN(buffer) (buffer+0x53)
-#define QETH_ULP_ENABLE_PORTNAME_AND_LL(buffer) (buffer+0x62)
+enum qeth_ip_ass_cmds {
+	IPA_CMD_ASS_START	= 0x0001,
+	IPA_CMD_ASS_STOP	= 0x0002,
+	IPA_CMD_ASS_CONFIGURE 	= 0x0003,
+	IPA_CMD_ASS_ENABLE 	= 0x0004,
+};
 
-#define QETH_ULP_ENABLE_RESP_FILTER_TOKEN(buffer) (PDU_ENCAPSULATION(buffer)+ \
-						   0x13)
-#define QETH_ULP_ENABLE_RESP_MAX_MTU(buffer) (PDU_ENCAPSULATION(buffer)+ 0x1f)
-#define QETH_ULP_ENABLE_RESP_DIFINFO_LEN(buffer) (PDU_ENCAPSULATION(buffer)+ \
-					  	  0x17)
-#define QETH_ULP_ENABLE_RESP_LINK_TYPE(buffer) (PDU_ENCAPSULATION(buffer)+ \
-						0x2b)
+enum qeth_arp_process_subcmds {
+	IPA_CMD_ASS_ARP_SET_NO_ENTRIES 	= 0x0003,
+	IPA_CMD_ASS_ARP_QUERY_CACHE 	= 0x0004,
+	IPA_CMD_ASS_ARP_ADD_ENTRY 	= 0x0005,
+	IPA_CMD_ASS_ARP_REMOVE_ENTRY 	= 0x0006,
+	IPA_CMD_ASS_ARP_FLUSH_CACHE 	= 0x0007,
+	IPA_CMD_ASS_ARP_QUERY_INFO 	= 0x0104,
+	IPA_CMD_ASS_ARP_QUERY_STATS 	= 0x0204,
+};
 
-extern unsigned char ULP_SETUP[];
+/* Return Codes for IPA Commands */
+enum qeth_ipa_return_codes {
+	IPA_RC_SUCCESS             = 0x0000,
+	IPA_RC_NOTSUPP             = 0x0001,
+	IPA_RC_NO_ACCESS           = 0x0002,
+	IPA_RC_FAILED              = 0x0003,
+	IPA_RC_DATA_MISMATCH       = 0xe001,
+	IPA_RC_INVALID_LAN_TYPE    = 0xe003,
+	IPA_RC_INVALID_LAN_NO      = 0xe004,
+	IPA_RC_IPADDR_ALREADY_REG  = 0xe005,
+	IPA_RC_IPADDR_TABLE_FULL   = 0xe006,
+	IPA_RC_IPADDR_ALREADY_USED = 0xe00a,
+	IPA_RC_ASSNO_NOT_SUPP      = 0xe00d,
+	IPA_RC_ASSCMD_START_FAILED = 0xe00e,
+	IPA_RC_ASSCMD_PART_SUCCESS = 0xe00f,
+	IPA_RC_IPADDR_NOT_DEFINED  = 0xe010,
+	IPA_RC_LAN_OFFLINE         = 0xe080,
+};
 
-#define QETH_ULP_SETUP_DEST_ADDR(buffer) (buffer+0x2c)
-#define QETH_ULP_SETUP_CONNECTION_TOKEN(buffer) (buffer+0x51)
-#define QETH_ULP_SETUP_FILTER_TOKEN(buffer) (buffer+0x5a)
-#define QETH_ULP_SETUP_CUA(buffer) (buffer+0x68)
-#define QETH_ULP_SETUP_REAL_DEVADDR(buffer) (buffer+0x6a)
+/* IPA function flags; each flag marks availability of respective function */
+enum qeth_ipa_funcs {
+	IPA_ARP_PROCESSING      = 0x00000001L,
+	IPA_INBOUND_CHECKSUM    = 0x00000002L,
+	IPA_OUTBOUND_CHECKSUM   = 0x00000004L,
+	IPA_IP_FRAGMENTATION    = 0x00000008L,
+	IPA_FILTERING           = 0x00000010L,
+	IPA_IPV6                = 0x00000020L,
+	IPA_MULTICASTING        = 0x00000040L,
+	IPA_IP_REASSEMBLY       = 0x00000080L,
+	IPA_QUERY_ARP_COUNTERS  = 0x00000100L,
+	IPA_QUERY_ARP_ADDR_INFO = 0x00000200L,
+	IPA_SETADAPTERPARMS     = 0x00000400L,
+	IPA_VLAN_PRIO           = 0x00000800L,
+	IPA_PASSTHRU            = 0x00001000L,
+	IPA_FULL_VLAN           = 0x00004000L,
+	IPA_SOURCE_MAC          = 0x00010000L,
+	IPA_OSA_MC_ROUTER       = 0x00020000L,
+};
 
-#define QETH_ULP_SETUP_RESP_CONNECTION_TOKEN(buffer) (PDU_ENCAPSULATION \
-						      (buffer)+0x1a)
+/* SETIP/DELIP IPA Command: ***************************************************/
+enum qeth_ipa_setdelip_flags {
+	QETH_IPA_SETDELIP_DEFAULT          = 0x00L, /* default */
+	QETH_IPA_SETIP_VIPA_FLAG           = 0x01L, /* no grat. ARP */
+	QETH_IPA_SETIP_TAKEOVER_FLAG       = 0x02L, /* nofail on grat. ARP */
+	QETH_IPA_DELIP_ADDR_2_B_TAKEN_OVER = 0x20L,
+	QETH_IPA_DELIP_VIPA_FLAG           = 0x40L,
+	QETH_IPA_DELIP_ADDR_NEEDS_SETIP    = 0x80L,
+};
 
-extern unsigned char DM_ACT[];
+/* SETADAPTER IPA Command: ****************************************************/
+enum qeth_ipa_setadp_cmd {
+	IPA_SETADP_QUERY_COMMANDS_SUPPORTED	= 0x01,
+	IPA_SETADP_ALTER_MAC_ADDRESS 		= 0x02,
+	IPA_SETADP_ADD_DELETE_GROUP_ADDRESS 	= 0x04,
+	IPA_SETADP_ADD_DELETE_FUNCTIONAL_ADDR 	= 0x08,
+	IPA_SETADP_SET_ADDRESSING_MODE 		= 0x10,
+	IPA_SETADP_SET_CONFIG_PARMS 		= 0x20,
+	IPA_SETADP_SET_CONFIG_PARMS_EXTENDED 	= 0x40,
+	IPA_SETADP_SET_BROADCAST_MODE 		= 0x80,
+	IPA_SETADP_SEND_OSA_MESSAGE 		= 0x0100,
+	IPA_SETADP_SET_SNMP_CONTROL 		= 0x0200,
+	IPA_SETADP_READ_SNMP_PARMS 		= 0x0400,
+	IPA_SETADP_WRITE_SNMP_PARMS 		= 0x0800,
+	IPA_SETADP_QUERY_CARD_INFO 		= 0x1000,
+};
+enum qeth_ipa_mac_ops {
+	CHANGE_ADDR_READ_MAC 		= 0,
+	CHANGE_ADDR_REPLACE_MAC 	= 1,
+	CHANGE_ADDR_ADD_MAC 		= 2,
+	CHANGE_ADDR_DEL_MAC 		= 4,
+	CHANGE_ADDR_RESET_MAC 		= 8,
+};
+enum qeth_ipa_addr_ops {
+	CHANGE_ADDR_READ_ADDR 		= 0,
+	CHANGE_ADDR_ADD_ADDR 		= 1,
+	CHANGE_ADDR_DEL_ADDR 		= 2,
+	CHANGE_ADDR_FLUSH_ADDR_TABLE 	= 4,
 
-#define QETH_DM_ACT_DEST_ADDR(buffer) (buffer+0x2c)
-#define QETH_DM_ACT_CONNECTION_TOKEN(buffer) (buffer+0x51)
 
-#define IPA_CMD_STARTLAN 0x01
-#define IPA_CMD_STOPLAN 0x02
-#define IPA_CMD_SETIP 0xb1
-#define IPA_CMD_DELIP 0xb7
-#define IPA_CMD_QIPASSIST 0xb2
-#define IPA_CMD_SETASSPARMS 0xb3
-#define IPA_CMD_SETIPM 0xb4
-#define IPA_CMD_DELIPM 0xb5
-#define IPA_CMD_SETRTG 0xb6
-#define IPA_CMD_SETADAPTERPARMS 0xb8
-#define IPA_CMD_ADD_ADDR_ENTRY 0xc1
-#define IPA_CMD_DELETE_ADDR_ENTRY 0xc2
-#define IPA_CMD_CREATE_ADDR 0xc3
-#define IPA_CMD_DESTROY_ADDR 0xc4
-#define IPA_CMD_REGISTER_LOCAL_ADDR 0xd1
-#define IPA_CMD_UNREGISTER_LOCAL_ADDR 0xd2
-
-#define INITIATOR_HOST 0
-#define INITIATOR_HYDRA 1
-
-#define PRIM_VERSION_IPA 1
-
-#define PROT_VERSION_SNA 1 /* hahaha */
-#define PROT_VERSION_IPv4 4
-#define PROT_VERSION_IPv6 6
-
-#define OSA_ADDR_LEN 6
-#define IPA_SETADAPTERPARMS_IP_VERSION PROT_VERSION_IPv4
-#define SR_INFO_LEN 16
-
-#define IPA_ARP_PROCESSING 0x00000001L
-#define IPA_INBOUND_CHECKSUM 0x00000002L
-#define IPA_OUTBOUND_CHECKSUM 0x00000004L
-#define IPA_IP_FRAGMENTATION 0x00000008L
-#define IPA_FILTERING 0x00000010L
-#define IPA_IPv6 0x00000020L
-#define IPA_MULTICASTING 0x00000040L
-#define IPA_IP_REASSEMBLY 0x00000080L
-#define IPA_QUERY_ARP_COUNTERS 0x00000100L
-#define IPA_QUERY_ARP_ADDR_INFO 0x00000200L
-#define IPA_SETADAPTERPARMS 0x00000400L
-#define IPA_VLAN_PRIO 0x00000800L
-#define IPA_PASSTHRU 0x00001000L
-#define IPA_FULL_VLAN 0x00004000L
-#define IPA_SOURCE_MAC_AVAIL 0x00010000L
-#define IPA_OSA_MC_ROUTER_AVAIL 0x00020000L
-
-#define IPA_SETADP_QUERY_COMMANDS_SUPPORTED 0x01
-#define IPA_SETADP_ALTER_MAC_ADDRESS 0x02
-#define IPA_SETADP_ADD_DELETE_GROUP_ADDRESS 0x04
-#define IPA_SETADP_ADD_DELETE_FUNCTIONAL_ADDR 0x08
-#define IPA_SETADP_SET_ADDRESSING_MODE 0x10
-#define IPA_SETADP_SET_CONFIG_PARMS 0x20
-#define IPA_SETADP_SET_CONFIG_PARMS_EXTENDED 0x40
-#define IPA_SETADP_SET_BROADCAST_MODE 0x80
-#define IPA_SETADP_SEND_OSA_MESSAGE 0x0100
-#define IPA_SETADP_SET_SNMP_CONTROL 0x0200
-#define IPA_SETADP_READ_SNMP_PARMS 0x0400
-#define IPA_SETADP_WRITE_SNMP_PARMS 0x0800
-#define IPA_SETADP_QUERY_CARD_INFO 0x1000
-
-#define CHANGE_ADDR_READ_MAC 0
-#define CHANGE_ADDR_REPLACE_MAC 1
-#define CHANGE_ADDR_ADD_MAC 2
-#define CHANGE_ADDR_DEL_MAC 4
-#define CHANGE_ADDR_RESET_MAC 8
-#define CHANGE_ADDR_READ_ADDR 0
-#define CHANGE_ADDR_ADD_ADDR 1
-#define CHANGE_ADDR_DEL_ADDR 2
-#define CHANGE_ADDR_FLUSH_ADDR_TABLE 4
- 
-/* we assumed, that the card is named card */
-#define qeth_is_supported(str) (card->ipa_supported&str)
-#define qeth_is_supported6(str) (card->ipa6_supported&str)
-#define qeth_is_adp_supported(str) (card->adp_supported&str)
-
-/* the same for all assist parms: */
-#define IPA_CMD_ASS_START 0x0001
-#define IPA_CMD_ASS_STOP 0x0002
-
-#define IPA_CMD_ASS_CONFIGURE 0x0003
-#define IPA_CMD_ASS_ENABLE 0x0004
-
-#define IPA_CMD_ASS_ARP_SET_NO_ENTRIES 0x0003
-#define IPA_CMD_ASS_ARP_QUERY_CACHE 0x0004
-#define IPA_CMD_ASS_ARP_ADD_ENTRY 0x0005
-#define IPA_CMD_ASS_ARP_REMOVE_ENTRY 0x0006
-#define IPA_CMD_ASS_ARP_FLUSH_CACHE 0x0007
-#define IPA_CMD_ASS_ARP_QUERY_INFO 0x0104
-#define IPA_CMD_ASS_ARP_QUERY_STATS 0x0204
-
-#define IPA_CHECKSUM_DEFAULT_ENABLE_MASK 0x001a
-
-#define IPA_CMD_ASS_FILTER_SET_TYPES 0x0003
-
-#define IPA_CMD_ASS_IPv6_SET_FUNCTIONS 0x0003
-
-#define IPA_REPLY_SUCCESS 0
-#define IPA_REPLY_FAILED 1
-#define IPA_REPLY_OPNOTSUPP 2
-#define IPA_REPLY_OPNOTSUPP2 4
-#define IPA_REPLY_NOINFO 8
-
-#define IPA_SETIP_FLAGS 0
-#define IPA_SETIP_VIPA_FLAGS 1
-#define IPA_SETIP_TAKEOVER_FLAGS 2
-
-#define VIPA_2_B_ADDED 0
-#define VIPA_ESTABLISHED 1
-#define VIPA_2_B_REMOVED 2
-
-#define IPA_DELIP_FLAGS 0
-
-#define IPA_SETADP_CMDSIZE 40
-
-struct ipa_setadp_cmd {
-	__u32 supp_hw_cmds;
-	__u32 reserved1;
-	__u16 cmdlength;
-	__u16 reserved2;
-	__u32 command_code;
+};
+/* (SET)DELIP(M) IPA stuff ***************************************************/
+struct qeth_ipacmd_setdelip4 {
+	__u8   ip_addr[4];
+	__u8   mask[4];
+	__u32  flags;
+} __attribute__ ((packed));
+
+struct qeth_ipacmd_setdelip6 {
+	__u8   ip_addr[16];
+	__u8   mask[16];
+	__u32  flags;
+} __attribute__ ((packed));
+
+struct qeth_ipacmd_setdelipm {
+	__u8 mac[6];
+	__u8 padding[2];
+	__u8 ip6[12];
+	__u8 ip4[4];
+} __attribute__ ((packed));
+
+struct qeth_ipacmd_setassparms_hdr {
+	__u32 assist_no;
+	__u16 length;
+	__u16 command_code;
 	__u16 return_code;
-	__u8 frames_used_total;
-	__u8 frame_seq_no;
-	__u32 reserved3;
+	__u8 number_of_replies;
+	__u8 seq_no;
+} __attribute__((packed));
+
+/* SETASSPARMS IPA Command: */
+struct qeth_ipacmd_setassparms {
+	struct qeth_ipacmd_setassparms_hdr hdr;
 	union {
-		struct {
-			__u32 no_lantypes_supp;
-			__u8 lan_type;
-			__u8 reserved1[3];
-			__u32 supported_cmds;
-			__u8 reserved2[8];
-		} query_cmds_supp;
-		struct {
-			__u32 cmd;
-			__u32 addr_size;
-			__u32 no_macs;
-			__u8 addr[OSA_ADDR_LEN];
-		} change_addr;
-		__u32 mode;
+		__u32 flags_32bit;
+		struct qeth_arp_cache_entry add_arp_entry;
+		__u8 ip[16];
 	} data;
+} __attribute__ ((packed));
+
+
+/* SETRTG IPA Command:    ****************************************************/
+struct qeth_set_routing {
+	__u8 type;
 };
 
-struct ipa_cmd{
-	__u8 command;
-	__u8 initiator;
-	__u16 seq_no;
-	__u16 return_code;
-	__u8 adapter_type;
-	__u8 rel_adapter_no;
-	__u8 prim_version_no;
-	__u8 param_count;
-	__u16 prot_version;
-	__u32 ipa_supported;
-	__u32 ipa_enabled;
-	union {
-		struct {
-			__u8 ip[4];
-			__u8 netmask[4];
-			__u32 flags;
-		} setdelip4;
-		struct {
-			__u8 ip[16];
-			__u8 netmask[16];
-			__u32 flags;
-		} setdelip6;
-		struct {
-			__u32 assist_no;
-			__u16 length;
-			__u16 command_code;
-			__u16 return_code;
-			__u8 number_of_replies;
-			__u8 seq_no;
-			union {
-				__u32 flags_32bit;
-				struct {
-					__u8 mac[6];
-					__u8 reserved[2];
-					__u8 ip[16];
-					__u8 reserved2[32];
-				} add_arp_entry;
-				__u8 ip[16];
-			} data;
-		} setassparms;
-		struct {
-			__u8 mac[6];
-			__u8 padding[2];
-			__u8 ip6[12];
-			__u8 ip4_6[4];
-		} setdelipm;
-		struct {
-			__u8 type;
-		} setrtg;
-		struct ipa_setadp_cmd setadapterparms;
-		struct {
-			__u32 command;
-#define ADDR_FRAME_TYPE_DIX 1
-#define ADDR_FRAME_TYPE_802_3 2
-#define ADDR_FRAME_TYPE_TR_WITHOUT_SR 0x10
-#define ADDR_FRAME_TYPE_TR_WITH_SR 0x20
-			__u32 frame_type;
-			__u32 cmd_flags;
-			__u8 ip_addr[16];
-			__u32 tag_field;
-			__u8 mac_addr[6];
-			__u8 reserved[10];
-			__u32 sr_len;
-			__u8 sr_info[SR_INFO_LEN];
-		} add_addr_entry;
-		struct {
-			__u32 command;
-			__u32 cmd_flags;
-			__u8 ip_addr[16];
-			__u32 tag_field;
-		} delete_addr_entry;
-		struct {
-			__u8 unique_id[8];
-		} create_destroy_addr;
-	} data;
-}__attribute__ ((packed));
-
-#define QETH_IOC_MAGIC 0x22
-/* these don't really have 'unsigned long' arguments but were defined that way */
-#define QETH_IOCPROC_OSAEINTERFACES _IOWR(QETH_IOC_MAGIC, 1, unsigned long)
-#define QETH_IOCPROC_INTERFACECHANGES _IOWR(QETH_IOC_MAGIC, 2, unsigned long)
-
-#define SNMP_QUERY_CARD_INFO 0x00000002L
-#define SNMP_REGISETER_MIB   0x00000004L
-#define SNMP_GET_OID         0x00000010L
-#define SNMP_SET_OID         0x00000011L
-#define SNMP_GET_NEXT_OID    0x00000012L
-#define SNMP_QUERY_ALERTS    0x00000020L
-#define SNMP_SET_TRAP        0x00000021L
-
-
-#define ARP_DATA_SIZE 3968
-#define ARP_FLUSH -3
-#define ARP_RETURNCODE_NOARPDATA -2
-#define ARP_RETURNCODE_ERROR -1
-#define ARP_RETURNCODE_SUCCESS 0
-#define ARP_RETURNCODE_LASTREPLY 1
-
-#define SNMP_BASE_CMDLENGTH 44
-#define SNMP_SETADP_CMDLENGTH 16
-#define SNMP_REQUEST_DATA_OFFSET 16
-
-struct snmp_ipa_setadp_cmd {
+/* SETADAPTERPARMS IPA Command:    *******************************************/
+struct qeth_query_cmds_supp {
+	__u32 no_lantypes_supp;
+	__u8 lan_type;
+	__u8 reserved1[3];
+	__u32 supported_cmds;
+	__u8 reserved2[8];
+} __attribute__ ((packed));
+
+struct qeth_change_addr {
+	__u32 cmd;
+	__u32 addr_size;
+	__u32 no_macs;
+	__u8 addr[OSA_ADDR_LEN];
+} __attribute__ ((packed));
+
+struct qeth_ipacmd_setadpparms {
 	__u32 supp_hw_cmds;
 	__u32 reserved1;
 	__u16 cmdlength;
@@ -354,91 +287,198 @@ struct snmp_ipa_setadp_cmd {
 	__u8 frames_used_total;
 	__u8 frame_seq_no;
 	__u32 reserved3;
-	__u8 snmp_token[16];
 	union {
-		struct {
-			__u32 snmp_request;
-			__u32 snmp_interface;
-			__u32 snmp_returncode;
-			__u32 snmp_firmwarelevel;
-			__u32 snmp_seqno;
-			__u8 snmp_data[ARP_DATA_SIZE];
-		} snmp_subcommand;
+		struct qeth_query_cmds_supp query_cmds_supp;
+		struct qeth_change_addr change_addr;
+		__u32 mode;
 	} data;
-}__attribute__ ((packed));
+} __attribute__ ((packed));
+
+/* IPFRAME IPA Command:    ***************************************************/
+/* TODO: define in analogy to the commands defined above */
+
+/* ADD_ADDR_ENTRY IPA Command:    ********************************************/
+/* TODO: define in analogy to the commands defined above */
+
+/* DELETE_ADDR_ENTRY IPA Command:    *****************************************/
+/* TODO: define in analogy to the commands defined above */
+
+/* CREATE_ADDR IPA Command:    ***********************************************/
+struct qeth_create_destroy_address {
+	__u8 unique_id[8];
+} __attribute__ ((packed));
+
+/* REGISTER_LOCAL_ADDR IPA Command:    ***************************************/
+/* TODO: define in analogy to the commands defined above */
+
+/* UNREGISTER_LOCAL_ADDR IPA Command:    *************************************/
+/* TODO: define in analogy to the commands defined above */
+
+/* Header for each IPA command */
+struct qeth_ipacmd_hdr {
+	__u8   command;
+	__u8   initiator;
+	__u16  seqno;
+	__u16  return_code;
+	__u8   adapter_type;
+	__u8   rel_adapter_no;
+	__u8   prim_version_no;
+	__u8   param_count;
+	__u16  prot_version;
+	__u32  ipa_supported;
+	__u32  ipa_enabled;
+} __attribute__ ((packed));
+
+/* The IPA command itself */
+struct qeth_ipa_cmd {
+	struct qeth_ipacmd_hdr hdr;
+	union {
+		struct qeth_ipacmd_setdelip4   	setdelip4;
+		struct qeth_ipacmd_setdelip6   	setdelip6;
+		struct qeth_ipacmd_setdelipm	setdelipm;
+		struct qeth_ipacmd_setassparms 	setassparms;
+		struct qeth_create_destroy_address create_destroy_addr;
+		struct qeth_ipacmd_setadpparms 	setadapterparms;
+		struct qeth_set_routing setrtg;
+	} data;
+} __attribute__ ((packed));
 
+/*
+ * Special command for ARP processing.
+ * It is not included in the setassparms command above, because we would
+ * otherwise get problems with the size of struct qeth_ipacmd_setassparms.
+ */
+enum qeth_ipa_arp_return_codes {
+	QETH_IPA_ARP_RC_SUCCESS      = 0x0000,
+	QETH_IPA_ARP_RC_FAILED       = 0x0001,
+	QETH_IPA_ARP_RC_NOTSUPP      = 0x0002,
+	QETH_IPA_ARP_RC_OUT_OF_RANGE = 0x0003,
+	QETH_IPA_ARP_RC_Q_NOTSUPP    = 0x0004,
+	QETH_IPA_ARP_RC_Q_NO_DATA    = 0x0008,
+};
 
-struct arp_cmd {
-	__u8 command;
-	__u8 initiator;
-	__u16 seq_no;
-	__u16 return_code;
-	__u8 adapter_type;
-	__u8 rel_adapter_no;
-	__u8 prim_version_no;
-	__u8 param_count;
-	__u16 prot_version;
-	__u32 ipa_supported;
-	__u32 ipa_enabled;
+#define QETH_QARP_DATA_SIZE 3968
+struct qeth_arp_query_data {
+	__u16 request_bits;
+	__u16 reply_bits;
+	__u32 no_entries;
+	char data[QETH_QARP_DATA_SIZE];
+} __attribute__((packed));
+
+/* used as parameter for arp_query reply */
+struct qeth_arp_query_info {
+	__u32 udata_len;
+	__u32 udata_offset;
+	__u32 no_entries;
+	char *udata;
+};
+
+#define IPA_ARP_CMD_LEN (IPA_PDU_HEADER_SIZE+sizeof(struct qeth_ipa_arp_cmd))
+#define QETH_ARP_CMD_BASE_LEN (sizeof(struct qeth_ipacmd_hdr) + \
+			       sizeof(struct qeth_ipacmd_setassparms_hdr))
+#define QETH_IPA_ARP_DATA_POS(buffer) ((buffer) + IPA_PDU_HEADER_SIZE + \
+				       QETH_ARP_CMD_BASE_LEN)
+struct qeth_ipa_arp_cmd {
+	struct qeth_ipacmd_hdr ihdr;
+	struct qeth_ipacmd_setassparms_hdr shdr;
 	union {
-		struct {
-			__u32 assist_no;
-			__u16 length;
-			__u16 command_code;
-			__u16 return_code;
-			__u8 number_of_replies;
-			__u8 seq_no;
-			union {
-				struct {
-					__u16 tcpip_requestbitmask;
-					__u16 osa_setbitmask;
-					__u32 number_of_entries;
-					__u8 arp_data[ARP_DATA_SIZE];
-				} queryarp_data;
-			} data;
-		} setassparms;
-                struct snmp_ipa_setadp_cmd setadapterparms; 
+		struct qeth_arp_query_data query_arp;
 	} data;
-}__attribute__ ((packed));
+} __attribute__((packed));
 
 
+/* Helper functions */
+#define IS_IPA_REPLY(cmd) ((cmd)->hdr.initiator == IPA_CMD_INITIATOR_HOST)
 
-#define IPA_PDU_HEADER_SIZE 0x40
-#define QETH_IPA_PDU_LEN_TOTAL(buffer) (buffer+0x0e)
-#define QETH_IPA_PDU_LEN_PDU1(buffer) (buffer+0x26)
-#define QETH_IPA_PDU_LEN_PDU2(buffer) (buffer+0x2a)
-#define QETH_IPA_PDU_LEN_PDU3(buffer) (buffer+0x3a)
+/*****************************************************************************/
+/* END OF   IP Assist related definitions                                    */
+/*****************************************************************************/
 
-extern unsigned char IPA_PDU_HEADER[];
-
-#define QETH_IPA_CMD_DEST_ADDR(buffer) (buffer+0x2c)
 
-#define PDU_ENCAPSULATION(buffer) \
-	(buffer+ \
-	 *(buffer+ (*(buffer+0x0b))+ *(buffer+*(buffer+0x0b)+0x11) +0x07))
+extern unsigned char WRITE_CCW[];
+extern unsigned char READ_CCW[];
 
-#define IS_IPA(buffer) ((buffer) && ( *(buffer+ ((*(buffer+0x0b))+4) )==0xc1) )
+extern unsigned char CM_ENABLE[];
+#define CM_ENABLE_SIZE 0x63
+#define QETH_CM_ENABLE_ISSUER_RM_TOKEN(buffer) ((buffer)+0x2c)
+#define QETH_CM_ENABLE_FILTER_TOKEN(buffer) ((buffer)+0x53)
+#define QETH_CM_ENABLE_USER_DATA(buffer) ((buffer)+0x5b)
 
-#define IS_IPA_REPLY(buffer) ( (buffer) && ( (*(PDU_ENCAPSULATION(buffer)+1))==INITIATOR_HOST ) )
+#define QETH_CM_ENABLE_RESP_FILTER_TOKEN(buffer) \
+		(PDU_ENCAPSULATION(buffer)+ 0x13)
 
-#define CCW_NOP_CMD 0x03
-#define CCW_NOP_COUNT 1
 
-extern unsigned char WRITE_CCW[];
+extern unsigned char CM_SETUP[];
+#define CM_SETUP_SIZE 0x64
+#define QETH_CM_SETUP_DEST_ADDR(buffer) ((buffer)+0x2c)
+#define QETH_CM_SETUP_CONNECTION_TOKEN(buffer) ((buffer)+0x51)
+#define QETH_CM_SETUP_FILTER_TOKEN(buffer) ((buffer)+0x5a)
 
-extern unsigned char READ_CCW[];
+#define QETH_CM_SETUP_RESP_DEST_ADDR(buffer) \
+		(PDU_ENCAPSULATION(buffer) + 0x1a)
 
-#endif /* __QETH_MPC_H__ */
+extern unsigned char ULP_ENABLE[];
+#define ULP_ENABLE_SIZE 0x6b
+#define QETH_ULP_ENABLE_LINKNUM(buffer) ((buffer)+0x61)
+#define QETH_ULP_ENABLE_DEST_ADDR(buffer) ((buffer)+0x2c)
+#define QETH_ULP_ENABLE_FILTER_TOKEN(buffer) ((buffer)+0x53)
+#define QETH_ULP_ENABLE_PORTNAME_AND_LL(buffer) ((buffer)+0x62)
+#define QETH_ULP_ENABLE_RESP_FILTER_TOKEN(buffer) \
+		(PDU_ENCAPSULATION(buffer) + 0x13)
+#define QETH_ULP_ENABLE_RESP_MAX_MTU(buffer) \
+		(PDU_ENCAPSULATION(buffer)+ 0x1f)
+#define QETH_ULP_ENABLE_RESP_DIFINFO_LEN(buffer) \
+		(PDU_ENCAPSULATION(buffer) + 0x17)
+#define QETH_ULP_ENABLE_RESP_LINK_TYPE(buffer) \
+		(PDU_ENCAPSULATION(buffer)+ 0x2b)
 
+extern unsigned char ULP_SETUP[];
+#define ULP_SETUP_SIZE 0x6c
+#define QETH_ULP_SETUP_DEST_ADDR(buffer) ((buffer)+0x2c)
+#define QETH_ULP_SETUP_CONNECTION_TOKEN(buffer) ((buffer)+0x51)
+#define QETH_ULP_SETUP_FILTER_TOKEN(buffer) ((buffer)+0x5a)
+#define QETH_ULP_SETUP_CUA(buffer) ((buffer)+0x68)
+#define QETH_ULP_SETUP_REAL_DEVADDR(buffer) ((buffer)+0x6a)
 
+#define QETH_ULP_SETUP_RESP_CONNECTION_TOKEN(buffer) \
+		(PDU_ENCAPSULATION(buffer)+0x1a)
 
 
+extern unsigned char DM_ACT[];
+#define DM_ACT_SIZE 0x55
+#define QETH_DM_ACT_DEST_ADDR(buffer) ((buffer)+0x2c)
+#define QETH_DM_ACT_CONNECTION_TOKEN(buffer) ((buffer)+0x51)
 
 
+#define QETH_TRANSPORT_HEADER_SEQ_NO(buffer) ((buffer)+4)
+#define QETH_PDU_HEADER_SEQ_NO(buffer) ((buffer)+0x1c)
+#define QETH_PDU_HEADER_ACK_SEQ_NO(buffer) ((buffer)+0x20)
 
+extern unsigned char IDX_ACTIVATE_READ[];
+extern unsigned char IDX_ACTIVATE_WRITE[];
 
+#define IDX_ACTIVATE_SIZE 	0x22
+#define QETH_IDX_ACT_ISSUER_RM_TOKEN(buffer) ((buffer)+0x0c)
+#define QETH_IDX_NO_PORTNAME_REQUIRED(buffer) ((buffer)[0x0b]&0x80)
+#define QETH_IDX_ACT_FUNC_LEVEL(buffer) ((buffer)+0x10)
+#define QETH_IDX_ACT_DATASET_NAME(buffer) ((buffer)+0x16)
+#define QETH_IDX_ACT_QDIO_DEV_CUA(buffer) ((buffer)+0x1e)
+#define QETH_IDX_ACT_QDIO_DEV_REALADDR(buffer) ((buffer)+0x20)
+#define QETH_IS_IDX_ACT_POS_REPLY(buffer) (((buffer)[0x08]&3)==2)
+#define QETH_IDX_REPLY_LEVEL(buffer) ((buffer)+0x12)
 
+#define PDU_ENCAPSULATION(buffer) \
+	((buffer) + *((buffer) + (*((buffer)+0x0b)) + \
+	 *((buffer) + *((buffer)+0x0b)+0x11) +0x07))
 
+#define IS_IPA(buffer) \
+	((buffer) && \
+	 ( *((buffer) + ((*((buffer)+0x0b))+4) )==0xc1) )
 
+#define ADDR_FRAME_TYPE_DIX 1
+#define ADDR_FRAME_TYPE_802_3 2
+#define ADDR_FRAME_TYPE_TR_WITHOUT_SR 0x10
+#define ADDR_FRAME_TYPE_TR_WITH_SR 0x20
 
+#endif
diff --git a/drivers/s390/net/qeth_proc.c b/drivers/s390/net/qeth_proc.c
new file mode 100644
index 000000000000..a031e0e7f383
--- /dev/null
+++ b/drivers/s390/net/qeth_proc.c
@@ -0,0 +1,468 @@
+/*
+ *
+ * linux/drivers/s390/net/qeth_proc.c ($Revision: 1.5 $)
+ *
+ * Linux on zSeries OSA Express and HiperSockets support
+ * This file contains code related to procfs.
+ *
+ * Copyright 2000,2003 IBM Corporation
+ *
+ * Author(s): Thomas Spatzier <tspat@de.ibm.com>
+ *
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/list.h>
+#include <linux/rwsem.h>
+
+#include "qeth.h"
+#include "qeth_mpc.h"
+#include "qeth_fs.h"
+
+/***** /proc/qeth *****/
+/* name of the proc entry; it is created with a NULL parent */
+#define QETH_PROCFILE_NAME "qeth"
+/* result of create_proc_entry(); stays NULL when creation failed */
+static struct proc_dir_entry *qeth_procfile;
+
+/*
+ * seq_file start callback for /proc/qeth.
+ *
+ * Takes the bus subsystem rwsem for reading (released again in
+ * qeth_procfile_seq_stop) and maps *offset to an iterator: position 0 is
+ * the table header (SEQ_START_TOKEN), position k > 0 is the k-th entry of
+ * the driver's device list. Returns NULL when *offset lies beyond the last
+ * device.
+ */
+static void *
+qeth_procfile_seq_start(struct seq_file *s, loff_t *offset)
+{
+	struct list_head *entry;
+	int pos = 1;
+
+	down_read(&qeth_ccwgroup_driver.driver.bus->subsys.rwsem);
+
+	if (!*offset)
+		return SEQ_START_TOKEN;
+
+	/* device positions are 1-based because position 0 is the header */
+	list_for_each(entry, &qeth_ccwgroup_driver.driver.devices) {
+		if (pos == *offset)
+			return entry;
+		pos++;
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file stop callback for /proc/qeth: releases the read lock taken in
+ * qeth_procfile_seq_start. Neither argument is used.
+ */
+static void
+qeth_procfile_seq_stop(struct seq_file *s, void *it)
+{
+	struct rw_semaphore *sem;
+
+	sem = &qeth_ccwgroup_driver.driver.bus->subsys.rwsem;
+	up_read(sem);
+}
+
+/*
+ * seq_file next callback for /proc/qeth.
+ *
+ * Steps from the header token to the first device, or from one device to
+ * its successor on the driver's device list. *offset is advanced only when
+ * a successor exists; NULL is returned at the end of the list (or right
+ * after the header when the list is empty).
+ */
+static void *
+qeth_procfile_seq_next(struct seq_file *s, void *it, loff_t *offset)
+{
+	struct list_head *head = &qeth_ccwgroup_driver.driver.devices;
+	struct list_head *cur;
+
+	/* the header token sits "before" the first device, like the head */
+	cur = (it == SEQ_START_TOKEN) ? head : (struct list_head *)it;
+	if (cur->next == head)
+		return NULL;
+
+	(*offset)++;
+	return cur->next;
+}
+
+/*
+ * Short routing label for the /proc/qeth table.
+ *
+ * ipv == 4 selects the card's route4 option; any other value selects
+ * route6, which yields "n/a" when CONFIG_QETH_IPV6 is not set. Routing
+ * types not listed below are reported as "unk".
+ */
+static inline const char *
+qeth_get_router_str(struct qeth_card *card, int ipv)
+{
+	int type = 0;
+
+	if (ipv == 4) {
+		type = card->options.route4.type;
+	} else {
+#ifdef CONFIG_QETH_IPV6
+		type = card->options.route6.type;
+#else
+		return "n/a";
+#endif /* CONFIG_QETH_IPV6 */
+	}
+
+	switch (type) {
+	case PRIMARY_ROUTER:
+		return "pri";
+	case SECONDARY_ROUTER:
+		return "sec";
+	case MULTICAST_ROUTER:
+		return "mc";
+	case PRIMARY_CONNECTOR:
+		return "p.c";
+	case SECONDARY_CONNECTOR:
+		return "s.c";
+	default:
+		break;
+	}
+
+	/* NO_ROUTER is tested last, exactly as in the former if-chain */
+	return (type == NO_ROUTER) ? "no" : "unk";
+}
+
+/*
+ * seq_file show callback for /proc/qeth.
+ *
+ * For SEQ_START_TOKEN it prints the two header lines of the table. For any
+ * other iterator it treats 'it' as the driver_list member of a struct
+ * device, takes the card from device->driver_data and prints one row:
+ * the three device ids, CHPID, interface name, card type and port number,
+ * followed either by the option columns (when card->lan_online is set) or
+ * by a "LAN OFFLINE" marker. Always returns 0.
+ *
+ * NOTE(review): card is dereferenced without a NULL check here, unlike
+ * the sysfs handlers in qeth_sys.c -- confirm driver_data is always set
+ * for devices on the driver list.
+ */
+static int
+qeth_procfile_seq_show(struct seq_file *s, void *it)
+{
+	struct device *device;
+	struct qeth_card *card;
+	char tmp[12]; /* for qeth_get_prioq_str */
+
+	if (it == SEQ_START_TOKEN){
+		/* column widths match the field widths of the row formats below */
+		seq_printf(s, "devices                    CHPID interface  "
+		              "cardtype       port chksum prio-q'ing rtr4 "
+			      "rtr6 fsz   cnt\n");
+		seq_printf(s, "-------------------------- ----- ---------- "
+			      "-------------- ---- ------ ---------- ---- "
+			      "---- ----- -----\n");
+	} else {
+		device = list_entry(it, struct device, driver_list);
+		card = device->driver_data;
+		seq_printf(s, "%s/%s/%s x%02X   %-10s %-14s %-4i ",
+				CARD_RDEV_ID(card),
+				CARD_WDEV_ID(card),
+				CARD_DDEV_ID(card),
+				card->info.chpid,
+				card->info.if_name,
+				qeth_get_cardname_short(card),
+				card->info.portno);
+		if (card->lan_online)
+			seq_printf(s, "%-6s %-10s %-4s %-4s %-5s %-5i\n",
+					qeth_get_checksum_str(card),
+					qeth_get_prioq_str(card, tmp),
+					qeth_get_router_str(card, 4),
+					qeth_get_router_str(card, 6),
+					qeth_get_bufsize_str(card),
+					card->qdio.in_buf_pool.buf_count);
+		else
+			seq_printf(s, "  +++ LAN OFFLINE +++\n");
+	}
+	return 0;
+}
+
+/* iterator callbacks used by seq_file for /proc/qeth */
+static struct seq_operations qeth_procfile_seq_ops = {
+	.start = qeth_procfile_seq_start,
+	.stop  = qeth_procfile_seq_stop,
+	.next  = qeth_procfile_seq_next,
+	.show  = qeth_procfile_seq_show,
+};
+
+/* open handler: binds the file to the seq_file iterator above */
+static int
+qeth_procfile_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &qeth_procfile_seq_ops);
+}
+
+/* file operations installed on the /proc/qeth entry; everything except
+ * open is the generic seq_file implementation */
+static struct file_operations qeth_procfile_fops = {
+	.owner   = THIS_MODULE,
+	.open    = qeth_procfile_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+/***** /proc/qeth_perf *****/
+/* name of the proc entry; it is only created with CONFIG_QETH_PERF_STATS */
+#define QETH_PERF_PROCFILE_NAME "qeth_perf"
+/* declared unconditionally; stays NULL when the entry is not created */
+static struct proc_dir_entry *qeth_perf_procfile;
+
+#ifdef CONFIG_QETH_PERF_STATS
+
+/*
+ * seq_file start callback for /proc/qeth_perf.
+ *
+ * Takes the bus subsystem rwsem for reading (released in the stop
+ * callback) and returns the device list entry at zero-based position
+ * *offset, or NULL when the list has fewer entries.
+ */
+static void *
+qeth_perf_procfile_seq_start(struct seq_file *s, loff_t *offset)
+{
+	struct list_head *entry;
+	int idx = 0;
+
+	down_read(&qeth_ccwgroup_driver.driver.bus->subsys.rwsem);
+
+	list_for_each(entry, &qeth_ccwgroup_driver.driver.devices) {
+		if (idx++ == *offset)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file stop callback for /proc/qeth_perf: releases the read lock taken
+ * in the start callback. Neither argument is used.
+ */
+static void
+qeth_perf_procfile_seq_stop(struct seq_file *s, void *it)
+{
+	struct rw_semaphore *sem;
+
+	sem = &qeth_ccwgroup_driver.driver.bus->subsys.rwsem;
+	up_read(sem);
+}
+
+/*
+ * seq_file next callback for /proc/qeth_perf: returns the list entry after
+ * 'it' and advances *offset, or returns NULL (leaving *offset untouched)
+ * when 'it' is the last entry of the driver's device list.
+ */
+static void *
+qeth_perf_procfile_seq_next(struct seq_file *s, void *it, loff_t *offset)
+{
+	struct list_head *following = ((struct list_head *)it)->next;
+
+	if (following != &qeth_ccwgroup_driver.driver.devices) {
+		(*offset)++;
+		return following;
+	}
+
+	return NULL; /* end of list reached */
+}
+
+/*
+ * seq_file show callback for /proc/qeth_perf.
+ *
+ * Treats 'it' as the driver_list member of a struct device, takes the card
+ * from device->driver_data and prints its counters from card->stats and
+ * card->perf_stats, the number of used buffers of up to four outbound
+ * queues, and the packing watermarks. Always returns 0.
+ */
+static int
+qeth_perf_procfile_seq_show(struct seq_file *s, void *it)
+{
+	struct device *device;
+	struct qeth_card *card;
+
+	device = list_entry(it, struct device, driver_list);
+	card = device->driver_data;
+	seq_printf(s, "For card with devnos %s/%s/%s (%s):\n",
+			CARD_RDEV_ID(card),
+			CARD_WDEV_ID(card),
+			CARD_DDEV_ID(card),
+			card->info.if_name
+		  );
+	seq_printf(s, "  Skb's/buffers received                 : %li/%i\n"
+		      "  Skb's/buffers sent                     : %li/%i\n\n",
+		        card->stats.rx_packets, card->perf_stats.bufs_rec,
+		        card->stats.tx_packets, card->perf_stats.bufs_sent
+		  );
+	/* "without packing" is derived: total minus the packed share */
+	seq_printf(s, "  Skb's/buffers sent without packing     : %li/%i\n"
+		      "  Skb's/buffers sent with packing        : %i/%i\n\n",
+		   card->stats.tx_packets - card->perf_stats.skbs_sent_pack,
+		   card->perf_stats.bufs_sent - card->perf_stats.bufs_sent_pack,
+		   card->perf_stats.skbs_sent_pack,
+		   card->perf_stats.bufs_sent_pack
+		  );
+	/* queues beyond no_out_queues are not dereferenced; 0 is printed */
+	seq_printf(s, "  Packing state changes no pkg.->packing : %i/%i\n"
+		      "  Current buffer usage (outbound q's)    : "
+		      "%i/%i/%i/%i\n\n",
+		        card->perf_stats.sc_dp_p, card->perf_stats.sc_p_dp,
+			atomic_read(&card->qdio.out_qs[0]->used_buffers),
+			(card->qdio.no_out_queues > 1)?
+				atomic_read(&card->qdio.out_qs[1]->used_buffers)
+				: 0,
+			(card->qdio.no_out_queues > 2)?
+				atomic_read(&card->qdio.out_qs[2]->used_buffers)
+				: 0,
+			(card->qdio.no_out_queues > 3)?
+				atomic_read(&card->qdio.out_qs[3]->used_buffers)
+				: 0
+		  );
+	seq_printf(s, "  Inbound time (in us)                   : %i\n"
+		      "  Inbound cnt                            : %i\n"
+		      "  Outbound time (in us, incl QDIO)       : %i\n"
+		      "  Outbound cnt                           : %i\n"
+		      "  Watermarks L/H                         : %i/%i\n\n",
+		        card->perf_stats.inbound_time,
+			card->perf_stats.inbound_cnt,
+			card->perf_stats.outbound_time,
+			card->perf_stats.outbound_cnt,
+			QETH_LOW_WATERMARK_PACK, QETH_HIGH_WATERMARK_PACK
+		  );
+
+	return 0;
+}
+
+/* iterator callbacks used by seq_file for /proc/qeth_perf */
+static struct seq_operations qeth_perf_procfile_seq_ops = {
+	.start = qeth_perf_procfile_seq_start,
+	.stop  = qeth_perf_procfile_seq_stop,
+	.next  = qeth_perf_procfile_seq_next,
+	.show  = qeth_perf_procfile_seq_show,
+};
+
+/* open handler: binds the file to the seq_file iterator above */
+static int
+qeth_perf_procfile_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &qeth_perf_procfile_seq_ops);
+}
+
+/* file operations installed on the /proc/qeth_perf entry; everything
+ * except open is the generic seq_file implementation */
+static struct file_operations qeth_perf_procfile_fops = {
+	.owner   = THIS_MODULE,
+	.open    = qeth_perf_procfile_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+/* with perf stats the success test is the entry pointer itself */
+#define qeth_perf_procfile_created qeth_perf_procfile
+#else
+/* without perf stats no entry is created, so the test is always true */
+#define qeth_perf_procfile_created 1
+#endif /* CONFIG_QETH_PERF_STATS */
+
+/***** /proc/qeth_ipa_takeover *****/
+/* name of the proc entry; it is created with a NULL parent */
+#define QETH_IPATO_PROCFILE_NAME "qeth_ipa_takeover"
+/* result of create_proc_entry(); stays NULL when creation failed */
+static struct proc_dir_entry *qeth_ipato_procfile;
+
+/*
+ * seq_file start callback for /proc/qeth_ipa_takeover.
+ *
+ * Takes the bus subsystem rwsem for reading (released in the stop
+ * callback) and returns the device list entry at zero-based position
+ * *offset, or NULL when the list has fewer entries.
+ *
+ * TODO: finish this -- maybe SEQ_START_TOKEN can be returned for offset 0
+ * to output the driver settings, and the settings of the respective card
+ * otherwise.
+ */
+static void *
+qeth_ipato_procfile_seq_start(struct seq_file *s, loff_t *offset)
+{
+	struct list_head *entry;
+	int idx = 0;
+
+	down_read(&qeth_ccwgroup_driver.driver.bus->subsys.rwsem);
+
+	list_for_each(entry, &qeth_ccwgroup_driver.driver.devices) {
+		if (idx++ == *offset)
+			return entry;
+	}
+
+	return NULL;
+}
+
+/*
+ * seq_file stop callback for /proc/qeth_ipa_takeover: releases the read
+ * lock taken in the start callback. Neither argument is used.
+ */
+static void
+qeth_ipato_procfile_seq_stop(struct seq_file *s, void *it)
+{
+	struct rw_semaphore *sem;
+
+	sem = &qeth_ccwgroup_driver.driver.bus->subsys.rwsem;
+	up_read(sem);
+}
+
+/*
+ * seq_file next callback for /proc/qeth_ipa_takeover: returns the list
+ * entry after 'it' and advances *offset, or returns NULL (leaving *offset
+ * untouched) when 'it' is the last entry of the driver's device list.
+ *
+ * TODO: finish this -- maybe SEQ_START_TOKEN can be returned for offset 0
+ * to output the driver settings, and the settings of the respective card
+ * otherwise.
+ */
+static void *
+qeth_ipato_procfile_seq_next(struct seq_file *s, void *it, loff_t *offset)
+{
+	struct list_head *following = ((struct list_head *)it)->next;
+
+	if (following != &qeth_ccwgroup_driver.driver.devices) {
+		(*offset)++;
+		return following;
+	}
+
+	return NULL; /* end of list reached */
+}
+
+/*
+ * seq_file show callback for /proc/qeth_ipa_takeover.
+ *
+ * Unfinished stub: it looks up the device and its card but prints nothing,
+ * so the file currently reads as empty. Always returns 0.
+ */
+static int
+qeth_ipato_procfile_seq_show(struct seq_file *s, void *it)
+{
+	struct device *device;
+	struct qeth_card *card;
+
+	/* TODO: finish this */
+	/*
+	 * maybe SEQ_START_TOKEN can be returned for offset 0
+	 * output driver settings then;
+	 * else output setting for respective card
+	 */
+	device = list_entry(it, struct device, driver_list);
+	/* card is fetched but not used yet */
+	card = device->driver_data;
+
+	return 0;
+}
+
+/* iterator callbacks used by seq_file for /proc/qeth_ipa_takeover */
+static struct seq_operations qeth_ipato_procfile_seq_ops = {
+	.start = qeth_ipato_procfile_seq_start,
+	.stop  = qeth_ipato_procfile_seq_stop,
+	.next  = qeth_ipato_procfile_seq_next,
+	.show  = qeth_ipato_procfile_seq_show,
+};
+
+/* open handler: binds the file to the seq_file iterator above */
+static int
+qeth_ipato_procfile_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &qeth_ipato_procfile_seq_ops);
+}
+
+/* file operations installed on the /proc/qeth_ipa_takeover entry;
+ * everything except open is the generic seq_file implementation */
+static struct file_operations qeth_ipato_procfile_fops = {
+	.owner   = THIS_MODULE,
+	.open    = qeth_ipato_procfile_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+/*
+ * Create the driver's proc entries: /proc/qeth, /proc/qeth_ipa_takeover
+ * and, with CONFIG_QETH_PERF_STATS, /proc/qeth_perf. Each entry is a
+ * read-only regular file (mode 0444) served through seq_file.
+ *
+ * Returns 0 when every entry was created, -ENOMEM otherwise. On failure
+ * the entries that were created are removed again and their pointers are
+ * reset to NULL, so no entry referring to this module's file operations
+ * is left behind and a later qeth_remove_procfs_entries() is a no-op.
+ */
+int __init
+qeth_create_procfs_entries(void)
+{
+	qeth_procfile = create_proc_entry(QETH_PROCFILE_NAME,
+					   S_IFREG | 0444, NULL);
+	if (qeth_procfile)
+		qeth_procfile->proc_fops = &qeth_procfile_fops;
+
+#ifdef CONFIG_QETH_PERF_STATS
+	qeth_perf_procfile = create_proc_entry(QETH_PERF_PROCFILE_NAME,
+					   S_IFREG | 0444, NULL);
+	if (qeth_perf_procfile)
+		qeth_perf_procfile->proc_fops = &qeth_perf_procfile_fops;
+#endif /* CONFIG_QETH_PERF_STATS */
+
+	qeth_ipato_procfile = create_proc_entry(QETH_IPATO_PROCFILE_NAME,
+					   S_IFREG | 0444, NULL);
+	if (qeth_ipato_procfile)
+		qeth_ipato_procfile->proc_fops = &qeth_ipato_procfile_fops;
+
+	if (qeth_procfile &&
+	    qeth_ipato_procfile &&
+	    qeth_perf_procfile_created)
+		return 0;
+
+	/* partial failure: take back whatever was created */
+	if (qeth_procfile) {
+		remove_proc_entry(QETH_PROCFILE_NAME, NULL);
+		qeth_procfile = NULL;
+	}
+	if (qeth_perf_procfile) {
+		remove_proc_entry(QETH_PERF_PROCFILE_NAME, NULL);
+		qeth_perf_procfile = NULL;
+	}
+	if (qeth_ipato_procfile) {
+		remove_proc_entry(QETH_IPATO_PROCFILE_NAME, NULL);
+		qeth_ipato_procfile = NULL;
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Remove every proc entry whose pointer is non-NULL, i.e. every entry
+ * that qeth_create_procfs_entries managed to create. The pointers
+ * themselves are left unchanged.
+ */
+void __exit
+qeth_remove_procfs_entries(void)
+{
+	if (qeth_procfile != NULL) {
+		remove_proc_entry(QETH_PROCFILE_NAME, NULL);
+	}
+	if (qeth_perf_procfile != NULL) {
+		remove_proc_entry(QETH_PERF_PROCFILE_NAME, NULL);
+	}
+	if (qeth_ipato_procfile != NULL) {
+		remove_proc_entry(QETH_IPATO_PROCFILE_NAME, NULL);
+	}
+}
+
+
+/* ONLY FOR DEVELOPMENT: the disabled code below builds this file as a module of its own */
+/*
+static void
+qeth_create_sysfs_entries(void)
+{
+	struct device *dev;
+
+	down_read(&qeth_ccwgroup_driver.driver.bus->subsys.rwsem);
+
+	list_for_each_entry(dev, &qeth_ccwgroup_driver.driver.devices,
+			driver_list)
+		qeth_create_device_attributes(dev);
+
+	up_read(&qeth_ccwgroup_driver.driver.bus->subsys.rwsem);
+}
+
+static void
+qeth_remove_sysfs_entries(void)
+{
+	struct device *dev;
+
+	down_read(&qeth_ccwgroup_driver.driver.bus->subsys.rwsem);
+
+	list_for_each_entry(dev, &qeth_ccwgroup_driver.driver.devices,
+			driver_list)
+		qeth_remove_device_attributes(dev);
+
+	up_read(&qeth_ccwgroup_driver.driver.bus->subsys.rwsem);
+}
+
+static int __init
+qeth_fs_init(void)
+{
+	printk(KERN_INFO "qeth_fs_init\n");
+	qeth_create_procfs_entries();
+	qeth_create_sysfs_entries();
+
+	return 0;
+}
+
+static void __exit
+qeth_fs_exit(void)
+{
+	printk(KERN_INFO "qeth_fs_exit\n");
+	qeth_remove_procfs_entries();
+	qeth_remove_sysfs_entries();
+}
+
+
+module_init(qeth_fs_init);
+module_exit(qeth_fs_exit);
+
+MODULE_LICENSE("GPL");
+*/
diff --git a/drivers/s390/net/qeth_sys.c b/drivers/s390/net/qeth_sys.c
new file mode 100644
index 000000000000..468eaa193e56
--- /dev/null
+++ b/drivers/s390/net/qeth_sys.c
@@ -0,0 +1,1479 @@
+/*
+ *
+ * linux/drivers/s390/net/qeth_sys.c ($Revision: 1.19 $)
+ *
+ * Linux on zSeries OSA Express and HiperSockets support
+ * This file contains code related to sysfs.
+ *
+ * Copyright 2000,2003 IBM Corporation
+ *
+ * Author(s): Thomas Spatzier <tspat@de.ibm.com>
+ * 	      Frank Pavlic <pavlic@de.ibm.com>
+ *
+ */
+#include <linux/list.h>
+#include <linux/rwsem.h>
+
+#include <asm/ebcdic.h>
+
+#include "qeth.h"
+#include "qeth_mpc.h"
+#include "qeth_fs.h"
+
+/*****************************************************************************/
+/*                                                                           */
+/*          /sys-fs stuff UNDER DEVELOPMENT !!!                              */
+/*                                                                           */
+/*****************************************************************************/
+//low/high watermark
+
+/*
+ * sysfs show handler for "state": writes a one-line name for card->state
+ * into buf and returns its length, or -EINVAL when the device has no card
+ * attached. States not listed below are reported as "UNKNOWN".
+ */
+static ssize_t
+qeth_dev_state_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+	const char *line;
+
+	if (!card)
+		return -EINVAL;
+
+	switch (card->state) {
+	case CARD_STATE_DOWN:
+		line = "DOWN\n";
+		break;
+	case CARD_STATE_HARDSETUP:
+		line = "HARDSETUP\n";
+		break;
+	case CARD_STATE_SOFTSETUP:
+		line = "SOFTSETUP\n";
+		break;
+	case CARD_STATE_UP_LAN_OFFLINE:
+		line = "UP (LAN OFFLINE)\n";
+		break;
+	case CARD_STATE_UP_LAN_ONLINE:
+		line = "UP (LAN ONLINE)\n";
+		break;
+	case CARD_STATE_RECOVER:
+		line = "RECOVER\n";
+		break;
+	default:
+		line = "UNKNOWN\n";
+		break;
+	}
+
+	return sprintf(buf, "%s", line);
+}
+
+static DEVICE_ATTR(state, 0444, qeth_dev_state_show, NULL);
+
+/*
+ * sysfs show handler for "chpid": prints card->info.chpid as at least two
+ * upper-case hex digits. Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_chpid_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	return card ? sprintf(buf, "%02X\n", card->info.chpid) : -EINVAL;
+}
+
+static DEVICE_ATTR(chpid, 0444, qeth_dev_chpid_show, NULL);
+
+/*
+ * sysfs show handler for "if_name": prints card->info.if_name followed by
+ * a newline. Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_if_name_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	return card ? sprintf(buf, "%s\n", card->info.if_name) : -EINVAL;
+}
+
+static DEVICE_ATTR(if_name, 0444, qeth_dev_if_name_show, NULL);
+
+/*
+ * sysfs show handler for "card_type": prints the string returned by
+ * qeth_get_cardname_short(). Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_card_type_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (card == NULL)
+		return -EINVAL;
+	return sprintf(buf, "%s\n", qeth_get_cardname_short(card));
+}
+
+static DEVICE_ATTR(card_type, 0444, qeth_dev_card_type_show, NULL);
+
+/*
+ * sysfs show handler for "portno": prints card->info.portno in decimal.
+ * Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_portno_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	return card ? sprintf(buf, "%i\n", card->info.portno) : -EINVAL;
+}
+
+/*
+ * sysfs store handler for "portno".
+ *
+ * Parses buf as a hexadecimal number and stores it in card->info.portno.
+ * Returns count on success, -EINVAL when no card is attached, when buf
+ * does not start with a number or when the value exceeds MAX_PORTNO, and
+ * -EPERM unless the card is in state DOWN or RECOVER.
+ */
+static ssize_t
+qeth_dev_portno_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+	unsigned int portno;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	portno = simple_strtoul(buf, &tmp, 16);
+	/* no digit consumed: do not silently treat garbage as port 0 */
+	if (tmp == buf) {
+		PRINT_WARN("portno: no number given\n");
+		return -EINVAL;
+	}
+	/* portno is unsigned, so only the upper bound needs checking */
+	if (portno > MAX_PORTNO){
+		PRINT_WARN("portno 0x%X is out of range\n", portno);
+		return -EINVAL;
+	}
+
+	card->info.portno = portno;
+	return count;
+}
+
+static DEVICE_ATTR(portno, 0644, qeth_dev_portno_show, qeth_dev_portno_store);
+
+/*
+ * sysfs show handler for "portname".
+ *
+ * When the card requires a portname, the 8 bytes following the first byte
+ * of card->info.portname are copied, converted with EBCASC and printed;
+ * otherwise a fixed notice is printed. Returns -EINVAL when no card is
+ * attached.
+ */
+static ssize_t
+qeth_dev_portname_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+	char name[9] = {0, };	/* 8 name bytes plus terminator */
+
+	if (!card)
+		return -EINVAL;
+
+	if (!card->info.portname_required)
+		return sprintf(buf, "no portname required\n");
+
+	memcpy(name, card->info.portname + 1, 8);
+	EBCASC(name, 8);
+	return sprintf(buf, "%s\n", name);
+}
+
+/*
+ * sysfs store handler for "portname".
+ *
+ * Takes the text up to the first newline as the new port name (2 to 8
+ * characters), stores its length in card->info.portname[0] and the name,
+ * converted with ASCEBC, in the following 8 bytes.
+ * Returns count on success, -EINVAL when no card is attached or the name
+ * length is out of range, and -EPERM unless the card is in state DOWN or
+ * RECOVER.
+ */
+static ssize_t
+qeth_dev_portname_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+	size_t len;
+	int i;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	tmp = strsep((char **) &buf, "\n");
+	len = strlen(tmp);
+	if ((len > 8) || (len < 2))
+		return -EINVAL;
+
+	card->info.portname[0] = len;
+	/* for beauty reasons */
+	for (i = 1; i < 9; i++)
+		card->info.portname[i] = ' ';
+	/*
+	 * Copy exactly len bytes. strcpy() would put its terminator at index
+	 * 9 for an 8 character name, one past the last byte (index 8) of the
+	 * name field as it is used everywhere else in this file.
+	 */
+	memcpy(card->info.portname + 1, tmp, len);
+	/* shorter names keep their terminator so "show" output is unchanged */
+	if (len < 8)
+		card->info.portname[1 + len] = '\0';
+	ASCEBC(card->info.portname + 1, 8);
+
+	return count;
+}
+
+static DEVICE_ATTR(portname, 0644, qeth_dev_portname_show,
+		qeth_dev_portname_store);
+
+/*
+ * sysfs show handler for "checksumming": prints the string returned by
+ * qeth_get_checksum_str() followed by " checksumming". Returns -EINVAL
+ * when no card is attached.
+ */
+static ssize_t
+qeth_dev_checksum_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (card == NULL)
+		return -EINVAL;
+	return sprintf(buf, "%s checksumming\n", qeth_get_checksum_str(card));
+}
+
+/*
+ * sysfs store handler for "checksumming".
+ *
+ * Accepts "sw_checksumming", "hw_checksumming" or "no_checksumming" (text
+ * up to the first newline) and sets card->options.checksum_type
+ * accordingly. Returns count on success, -EINVAL for a missing card or an
+ * unknown keyword, and -EPERM unless the card is in state DOWN or RECOVER.
+ */
+static ssize_t
+qeth_dev_checksum_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	tmp = strsep((char **) &buf, "\n");
+
+	if (strcmp(tmp, "sw_checksumming") == 0) {
+		card->options.checksum_type = SW_CHECKSUMMING;
+		return count;
+	}
+	if (strcmp(tmp, "hw_checksumming") == 0) {
+		card->options.checksum_type = HW_CHECKSUMMING;
+		return count;
+	}
+	if (strcmp(tmp, "no_checksumming") == 0) {
+		card->options.checksum_type = NO_CHECKSUMMING;
+		return count;
+	}
+
+	PRINT_WARN("Unknown checksumming type '%s'\n", tmp);
+	return -EINVAL;
+}
+
+static DEVICE_ATTR(checksumming, 0644, qeth_dev_checksum_show,
+		qeth_dev_checksum_store);
+
+/*
+ * sysfs show handler for "priority_queueing": prints "by precedence" or
+ * "by type of service" for the two priority modes, and otherwise the
+ * default outbound queue that is always used. Returns -EINVAL when no card
+ * is attached.
+ */
+static ssize_t
+qeth_dev_prioqing_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	if (card->qdio.do_prio_queueing == QETH_PRIO_Q_ING_PREC)
+		return sprintf(buf, "%s\n", "by precedence");
+	if (card->qdio.do_prio_queueing == QETH_PRIO_Q_ING_TOS)
+		return sprintf(buf, "%s\n", "by type of service");
+
+	return sprintf(buf, "always queue %i\n",
+		       card->qdio.default_out_queue);
+}
+
+/*
+ * sysfs store handler for "priority_queueing".
+ *
+ * Accepted keywords (text up to the first newline):
+ *   prio_queueing_prec, prio_queueing_tos  - select a priority mode
+ *   no_prio_queueing:0 .. no_prio_queueing:3 - no priority queueing,
+ *                                            always use the given queue
+ *   no_prio_queueing                       - same, using QETH_DEFAULT_QUEUE
+ * Returns count on success, -EINVAL for a missing card or an unknown
+ * keyword, and -EPERM unless the card is in state DOWN or RECOVER.
+ */
+static ssize_t
+qeth_dev_prioqing_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	tmp = strsep((char **) &buf, "\n");
+
+	/* the two priority modes leave default_out_queue untouched */
+	if (strcmp(tmp, "prio_queueing_prec") == 0) {
+		card->qdio.do_prio_queueing = QETH_PRIO_Q_ING_PREC;
+		return count;
+	}
+	if (strcmp(tmp, "prio_queueing_tos") == 0) {
+		card->qdio.do_prio_queueing = QETH_PRIO_Q_ING_TOS;
+		return count;
+	}
+	if (strcmp(tmp, "no_prio_queueing:0") == 0) {
+		card->qdio.do_prio_queueing = QETH_NO_PRIO_QUEUEING;
+		card->qdio.default_out_queue = 0;
+		return count;
+	}
+	if (strcmp(tmp, "no_prio_queueing:1") == 0) {
+		card->qdio.do_prio_queueing = QETH_NO_PRIO_QUEUEING;
+		card->qdio.default_out_queue = 1;
+		return count;
+	}
+	if (strcmp(tmp, "no_prio_queueing:2") == 0) {
+		card->qdio.do_prio_queueing = QETH_NO_PRIO_QUEUEING;
+		card->qdio.default_out_queue = 2;
+		return count;
+	}
+	if (strcmp(tmp, "no_prio_queueing:3") == 0) {
+		card->qdio.do_prio_queueing = QETH_NO_PRIO_QUEUEING;
+		card->qdio.default_out_queue = 3;
+		return count;
+	}
+	if (strcmp(tmp, "no_prio_queueing") == 0) {
+		card->qdio.do_prio_queueing = QETH_NO_PRIO_QUEUEING;
+		card->qdio.default_out_queue = QETH_DEFAULT_QUEUE;
+		return count;
+	}
+
+	PRINT_WARN("Unknown queueing type '%s'\n", tmp);
+	return -EINVAL;
+}
+
+static DEVICE_ATTR(priority_queueing, 0644, qeth_dev_prioqing_show,
+		qeth_dev_prioqing_store);
+
+/*
+ * sysfs show handler for "buffer_count": prints the inbound buffer pool's
+ * buf_count in decimal. Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_bufcnt_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	return card ? sprintf(buf, "%i\n", card->qdio.in_buf_pool.buf_count)
+		    : -EINVAL;
+}
+
+/*
+ * sysfs store handler for "buffer_count".
+ *
+ * Parses buf as a decimal number -- the same base the show handler prints
+ * in, so that a value read from the file can be written back unchanged --
+ * clamps it to [QETH_IN_BUF_COUNT_MIN, QETH_IN_BUF_COUNT_MAX] and stores
+ * it as the inbound buffer pool's buf_count.
+ * Returns count on success, -EINVAL when no card is attached or buf does
+ * not start with a number, and -EPERM unless the card is in state DOWN or
+ * RECOVER.
+ */
+static ssize_t
+qeth_dev_bufcnt_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+	unsigned int cnt;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	cnt = simple_strtoul(buf, &tmp, 10);
+	/* no digit consumed: reject instead of clamping garbage to the min */
+	if (tmp == buf) {
+		PRINT_WARN("buffer_count: no number given\n");
+		return -EINVAL;
+	}
+	cnt = (cnt < QETH_IN_BUF_COUNT_MIN) ? QETH_IN_BUF_COUNT_MIN :
+		((cnt > QETH_IN_BUF_COUNT_MAX) ? QETH_IN_BUF_COUNT_MAX : cnt);
+	card->qdio.in_buf_pool.buf_count = cnt;
+	/* TODO: steal/add buffers from/to a running card's buffer pool (?) */
+
+	return count;
+}
+
+static DEVICE_ATTR(buffer_count, 0644, qeth_dev_bufcnt_show,
+		qeth_dev_bufcnt_store);
+
+/*
+ * Common part of the route4/route6 show handlers: prints a long name for
+ * route->type. Every type not listed below, including NO_ROUTER, is
+ * printed as "no".
+ */
+static inline ssize_t
+qeth_dev_route_show(struct qeth_routing_info *route, char *buf)
+{
+	const char *role;
+
+	switch (route->type) {
+	case PRIMARY_ROUTER:
+		role = "primary router";
+		break;
+	case SECONDARY_ROUTER:
+		role = "secondary router";
+		break;
+	case MULTICAST_ROUTER:
+		role = "multicast router";
+		break;
+	case PRIMARY_CONNECTOR:
+		role = "primary connector";
+		break;
+	case SECONDARY_CONNECTOR:
+		role = "secondary connector";
+		break;
+	default:
+		role = "no";
+		break;
+	}
+
+	return sprintf(buf, "%s\n", role);
+}
+
+/*
+ * sysfs show handler for "route4": prints the IPv4 routing type via
+ * qeth_dev_route_show(). Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_route4_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (card == NULL)
+		return -EINVAL;
+	return qeth_dev_route_show(&card->options.route4, buf);
+}
+
+/*
+ * Common part of the route4/route6 store handlers.
+ *
+ * Takes the text up to the first newline as keyword and updates
+ * route->type:
+ *   "no_router" is accepted for every card type;
+ *   for QETH_CARD_TYPE_IQD cards: "primary_connector",
+ *   "secondary_connector" and "multicast_router";
+ *   for all other cards: "primary_router", "secondary_router" and, only
+ *   if the card supports IPA_OSA_MC_ROUTER for prot, "multicast_router".
+ * When the type actually changed, the new setting is pushed to the card
+ * with qeth_setrouting_v4/_v6 depending on prot.
+ *
+ * Returns count on success, or -EINVAL (route->type unchanged) when the
+ * keyword is not valid for this card.
+ */
+static inline ssize_t
+qeth_dev_route_store(struct qeth_card *card, struct qeth_routing_info *route,
+		enum qeth_prot_versions prot, const char *buf, size_t count)
+{
+	/* remembered to detect whether the setting really changes */
+	enum qeth_routing_types old_route_type = route->type;
+	char *tmp;
+	int rc;
+
+	tmp = strsep((char **) &buf, "\n");
+
+	if (!strcmp(tmp, "no_router")){
+		route->type = NO_ROUTER;
+		goto check_reset;
+	}
+
+	if (card->info.type == QETH_CARD_TYPE_IQD) {
+		if (!strcmp(tmp, "primary_connector")) {
+			route->type = PRIMARY_CONNECTOR;
+		} else if (!strcmp(tmp, "secondary_connector")) {
+			route->type = SECONDARY_CONNECTOR;
+		} else if (!strcmp(tmp, "multicast_router")) {
+			route->type = MULTICAST_ROUTER;
+		} else
+			goto out_inval;
+	} else {
+		if (!strcmp(tmp, "primary_router")) {
+			route->type = PRIMARY_ROUTER;
+		} else if (!strcmp(tmp, "secondary_router")) {
+			route->type = SECONDARY_ROUTER;
+		} else if (!strcmp(tmp, "multicast_router")) {
+			if (qeth_is_ipafunc_supported(card, prot,
+						      IPA_OSA_MC_ROUTER))
+				route->type = MULTICAST_ROUTER;
+			else
+				goto out_inval;
+		} else
+			goto out_inval;
+	}
+check_reset:
+	if (old_route_type != route->type){
+		/*
+		 * NOTE(review): rc is assigned but never examined, so a
+		 * failing qeth_setrouting_*() call is not reported to the
+		 * writer and route->type keeps the new value.
+		 */
+		if (prot == QETH_PROT_IPV4)
+			rc = qeth_setrouting_v4(card);
+		else if (prot == QETH_PROT_IPV6)
+			rc = qeth_setrouting_v6(card);
+	}
+	return count;
+out_inval:
+	PRINT_WARN("Routing type '%s' not supported for interface %s.\n"
+		   "Router status not changed.\n",
+		   tmp, card->info.if_name);
+	return -EINVAL;
+}
+
+/*
+ * sysfs store handler for "route4": hands the input to
+ * qeth_dev_route_store() for the card's IPv4 routing option. Returns
+ * -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_route4_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (card == NULL)
+		return -EINVAL;
+
+	return qeth_dev_route_store(card, &card->options.route4,
+				    QETH_PROT_IPV4, buf, count);
+}
+
+static DEVICE_ATTR(route4, 0644, qeth_dev_route4_show, qeth_dev_route4_store);
+
+#ifdef CONFIG_QETH_IPV6
+/*
+ * sysfs show handler for "route6": prints "n/a" when the card does not
+ * support IPA_IPV6, otherwise the IPv6 routing type via
+ * qeth_dev_route_show(). Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_route6_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (card == NULL)
+		return -EINVAL;
+
+	if (qeth_is_supported(card, IPA_IPV6))
+		return qeth_dev_route_show(&card->options.route6, buf);
+
+	return sprintf(buf, "%s\n", "n/a");
+}
+
+/*
+ * sysfs store handler for "route6": hands the input to
+ * qeth_dev_route_store() for the card's IPv6 routing option.
+ * Returns -EINVAL when no card is attached and -ENOTSUPP (after a
+ * warning) when the card does not support IPA_IPV6.
+ */
+static ssize_t
+qeth_dev_route6_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (card == NULL)
+		return -EINVAL;
+
+	if (qeth_is_supported(card, IPA_IPV6))
+		return qeth_dev_route_store(card, &card->options.route6,
+					    QETH_PROT_IPV6, buf, count);
+
+	PRINT_WARN("IPv6 not supported for interface %s.\n"
+		   "Routing status no changed.\n",
+		   card->info.if_name);
+	return -ENOTSUPP;
+}
+
+static DEVICE_ATTR(route6, 0644, qeth_dev_route6_show, qeth_dev_route6_store);
+#endif
+
+/*
+ * sysfs show handler for "add_hhlen": prints card->options.add_hhlen in
+ * decimal. Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_add_hhlen_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	return card ? sprintf(buf, "%i\n", card->options.add_hhlen) : -EINVAL;
+}
+
+/*
+ * sysfs store handler for "add_hhlen".
+ *
+ * Parses buf as a decimal number -- the same base the show handler prints
+ * in, so that a value read from the file can be written back unchanged --
+ * and stores it in card->options.add_hhlen.
+ * Returns count on success, -EINVAL when no card is attached, when buf
+ * does not start with a number or when the value is outside
+ * [0, MAX_ADD_HHLEN], and -EPERM unless the card is in state DOWN or
+ * RECOVER.
+ */
+static ssize_t
+qeth_dev_add_hhlen_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+	int i;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	i = simple_strtoul(buf, &tmp, 10);
+	/* no digit consumed: do not silently treat garbage as 0 */
+	if (tmp == buf) {
+		PRINT_WARN("add_hhlen: no number given\n");
+		return -EINVAL;
+	}
+	/* i < 0 catches values too large for an int */
+	if ((i < 0) || (i > MAX_ADD_HHLEN)) {
+		PRINT_WARN("add_hhlen out of range\n");
+		return -EINVAL;
+	}
+	card->options.add_hhlen = i;
+
+	return count;
+}
+
+static DEVICE_ATTR(add_hhlen, 0644, qeth_dev_add_hhlen_show,
+		   qeth_dev_add_hhlen_store);
+
+/*
+ * sysfs show handler for "fake_ll": prints 1 when card->options.fake_ll
+ * is non-zero, else 0. Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_fake_ll_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+	int enabled;
+
+	if (!card)
+		return -EINVAL;
+
+	enabled = card->options.fake_ll ? 1 : 0;
+	return sprintf(buf, "%i\n", enabled);
+}
+
+/*
+ * sysfs store handler for "fake_ll": accepts the numbers 0 and 1 and
+ * stores them in card->options.fake_ll.
+ * Returns count on success, -EINVAL for a missing card or any other
+ * number, and -EPERM unless the card is in state DOWN or RECOVER.
+ */
+static ssize_t
+qeth_dev_fake_ll_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+	int i;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	i = simple_strtoul(buf, &tmp, 16);
+	if ((i != 0) && (i != 1)) {
+		PRINT_WARN("fake_ll: write 0 or 1 to this file!\n");
+		return -EINVAL;
+	}
+
+	card->options.fake_ll = i;
+	return count;
+}
+
+static DEVICE_ATTR(fake_ll, 0644, qeth_dev_fake_ll_show,
+		   qeth_dev_fake_ll_store);
+
+/*
+ * sysfs show handler for "fake_broadcast": prints 1 when
+ * card->options.fake_broadcast is non-zero, else 0. Returns -EINVAL when
+ * no card is attached.
+ */
+static ssize_t
+qeth_dev_fake_broadcast_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+	int enabled;
+
+	if (!card)
+		return -EINVAL;
+
+	enabled = card->options.fake_broadcast ? 1 : 0;
+	return sprintf(buf, "%i\n", enabled);
+}
+
+/*
+ * sysfs store handler for "fake_broadcast": accepts the numbers 0 and 1
+ * and stores them in card->options.fake_broadcast.
+ * Returns count on success, -EINVAL for a missing card or any other
+ * number, and -EPERM unless the card is in state DOWN or RECOVER.
+ */
+static ssize_t
+qeth_dev_fake_broadcast_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+	int i;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	i = simple_strtoul(buf, &tmp, 16);
+	if ((i != 0) && (i != 1)) {
+		PRINT_WARN("fake_broadcast: write 0 or 1 to this file!\n");
+		return -EINVAL;
+	}
+
+	card->options.fake_broadcast = i;
+	return count;
+}
+
+static DEVICE_ATTR(fake_broadcast, 0644, qeth_dev_fake_broadcast_show,
+		   qeth_dev_fake_broadcast_store);
+
+/*
+ * sysfs store handler for the write-only "recover" attribute: writing the
+ * number 1 schedules a recovery of the card; any other number is accepted
+ * and ignored.
+ * Returns count, -EINVAL when no card is attached, or -EPERM unless the
+ * card is in one of the two UP states.
+ */
+static ssize_t
+qeth_dev_recover_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+	int request;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_UP_LAN_ONLINE) &&
+	    (card->state != CARD_STATE_UP_LAN_OFFLINE))
+		return -EPERM;
+
+	request = simple_strtoul(buf, &tmp, 16);
+	if (request == 1) {
+		qeth_schedule_recovery(card);
+	}
+
+	return count;
+}
+
+static DEVICE_ATTR(recover, 0200, NULL, qeth_dev_recover_store);
+
+/* TODO */
+/*
+ * sysfs show handler for "broadcast_mode": prints "n/a" unless the link
+ * type is HSTR or LANE_TR, otherwise "all rings" or "local" depending on
+ * card->options.broadcast_mode. Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_broadcast_mode_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+	const char *mode;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->info.link_type != QETH_LINK_TYPE_HSTR) &&
+	    (card->info.link_type != QETH_LINK_TYPE_LANE_TR))
+		return sprintf(buf, "n/a\n");
+
+	if (card->options.broadcast_mode == QETH_TR_BROADCAST_ALLRINGS)
+		mode = "all rings";
+	else
+		mode = "local";
+
+	return sprintf(buf, "%s\n", mode);
+}
+
+/* TODO */
+/*
+ * sysfs store handler for "broadcast_mode": accepts "local" or
+ * "all_rings" (text up to the first newline) on HSTR / LANE_TR links and
+ * sets card->options.broadcast_mode accordingly.
+ * Returns count on success, -EINVAL for a missing card, another link type
+ * or an unknown keyword, and -EPERM unless the card is in state DOWN or
+ * RECOVER.
+ */
+static ssize_t
+qeth_dev_broadcast_mode_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	if ((card->info.link_type != QETH_LINK_TYPE_HSTR) &&
+	    (card->info.link_type != QETH_LINK_TYPE_LANE_TR)) {
+		PRINT_WARN("Device is not a tokenring device!\n");
+		return -EINVAL;
+	}
+
+	tmp = strsep((char **) &buf, "\n");
+
+	if (strcmp(tmp, "local") == 0) {
+		card->options.broadcast_mode = QETH_TR_BROADCAST_LOCAL;
+	} else if (strcmp(tmp, "all_rings") == 0) {
+		card->options.broadcast_mode = QETH_TR_BROADCAST_ALLRINGS;
+	} else {
+		PRINT_WARN("broadcast_mode: invalid mode %s!\n",
+			   tmp);
+		return -EINVAL;
+	}
+
+	return count;
+}
+
+static DEVICE_ATTR(broadcast_mode, 0644, qeth_dev_broadcast_mode_show,
+		   qeth_dev_broadcast_mode_store);
+
+/* TODO */
+/*
+ * sysfs show handler for "canonical_macaddr": prints "n/a" unless the
+ * link type is HSTR or LANE_TR, otherwise 1 when
+ * card->options.macaddr_mode is QETH_TR_MACADDR_CANONICAL and 0 if not.
+ * Returns -EINVAL when no card is attached.
+ */
+static ssize_t
+qeth_dev_canonical_macaddr_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+	int canonical;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->info.link_type != QETH_LINK_TYPE_HSTR) &&
+	    (card->info.link_type != QETH_LINK_TYPE_LANE_TR))
+		return sprintf(buf, "n/a\n");
+
+	canonical = (card->options.macaddr_mode ==
+		     QETH_TR_MACADDR_CANONICAL) ? 1 : 0;
+	return sprintf(buf, "%i\n", canonical);
+}
+
+/* TODO */
+/*
+ * sysfs store handler for "canonical_macaddr": on HSTR / LANE_TR links,
+ * the number 1 selects QETH_TR_MACADDR_CANONICAL and 0 selects
+ * QETH_TR_MACADDR_NONCANONICAL.
+ * Returns count on success, -EINVAL for a missing card, another link type
+ * or any other number, and -EPERM unless the card is in state DOWN or
+ * RECOVER.
+ */
+static ssize_t
+qeth_dev_canonical_macaddr_store(struct device *dev, const char *buf,
+				  size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+	int i;
+
+	if (!card)
+		return -EINVAL;
+
+	if ((card->state != CARD_STATE_DOWN) &&
+	    (card->state != CARD_STATE_RECOVER))
+		return -EPERM;
+
+	if ((card->info.link_type != QETH_LINK_TYPE_HSTR) &&
+	    (card->info.link_type != QETH_LINK_TYPE_LANE_TR)) {
+		PRINT_WARN("Device is not a tokenring device!\n");
+		return -EINVAL;
+	}
+
+	i = simple_strtoul(buf, &tmp, 16);
+	switch (i) {
+	case 0:
+		card->options.macaddr_mode = QETH_TR_MACADDR_NONCANONICAL;
+		break;
+	case 1:
+		card->options.macaddr_mode = QETH_TR_MACADDR_CANONICAL;
+		break;
+	default:
+		PRINT_WARN("canonical_macaddr: write 0 or 1 to this file!\n");
+		return -EINVAL;
+	}
+
+	return count;
+}
+
+static DEVICE_ATTR(canonical_macaddr, 0644, qeth_dev_canonical_macaddr_show,
+		   qeth_dev_canonical_macaddr_store);
+
+/*
+ * NULL-terminated list of the per-device attributes defined above;
+ * route6 is only present with CONFIG_QETH_IPV6.
+ */
+static struct device_attribute * qeth_device_attrs[] = {
+	&dev_attr_state,
+	&dev_attr_chpid,
+	&dev_attr_if_name,
+	&dev_attr_card_type,
+	&dev_attr_portno,
+	&dev_attr_portname,
+	&dev_attr_checksumming,
+	&dev_attr_priority_queueing,
+	&dev_attr_buffer_count,
+	&dev_attr_route4,
+#ifdef CONFIG_QETH_IPV6
+	&dev_attr_route6,
+#endif
+	&dev_attr_add_hhlen,
+	&dev_attr_fake_ll,
+	&dev_attr_fake_broadcast,
+	&dev_attr_recover,
+	&dev_attr_broadcast_mode,
+	&dev_attr_canonical_macaddr,
+	NULL,
+};
+
+/*
+ * Attribute group wrapping the list above.
+ * NOTE(review): the cast reinterprets device_attribute pointers as
+ * attribute pointers, which presumably relies on .attr being the first
+ * member of struct device_attribute -- confirm.
+ */
+static struct attribute_group qeth_device_attr_group = {
+	.attrs = (struct attribute **)qeth_device_attrs,
+};
+
+
+/*
+ * Define a device attribute whose variable name (dev_attr_<_id>) is
+ * chosen independently of the attribute's file name (<_name>). The macro
+ * emits no storage class, so callers prefix it (e.g. with "static").
+ */
+#define QETH_DEVICE_ATTR(_id,_name,_mode,_show,_store)			     \
+struct device_attribute dev_attr_##_id = {				     \
+	.attr = {.name=__stringify(_name), .mode=_mode, .owner=THIS_MODULE },\
+	.show	= _show,						     \
+	.store	= _store,						     \
+};
+
+/*
+ * sysfs show handler for the ipato "enable" attribute: prints 1 when
+ * card->ipato.enabled is non-zero, else 0. Returns -EINVAL when no card
+ * is attached.
+ */
+static ssize_t
+qeth_dev_ipato_enable_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+	int enabled;
+
+	if (!card)
+		return -EINVAL;
+
+	enabled = card->ipato.enabled ? 1 : 0;
+	return sprintf(buf, "%i\n", enabled);
+}
+
+/*
+ * sysfs store handler for the ipato "enable" attribute: accepts "0", "1"
+ * or "toggle" (text up to the first newline) and updates
+ * card->ipato.enabled. Returns count on success and -EINVAL for a missing
+ * card or any other input.
+ */
+static ssize_t
+qeth_dev_ipato_enable_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+
+	if (!card)
+		return -EINVAL;
+
+	tmp = strsep((char **) &buf, "\n");
+
+	if (strcmp(tmp, "toggle") == 0) {
+		card->ipato.enabled = !card->ipato.enabled;
+		return count;
+	}
+	if (strcmp(tmp, "1") == 0) {
+		card->ipato.enabled = 1;
+		return count;
+	}
+	if (strcmp(tmp, "0") == 0) {
+		card->ipato.enabled = 0;
+		return count;
+	}
+
+	PRINT_WARN("ipato_enable: write 0, 1 or 'toggle' to "
+		   "this file\n");
+	return -EINVAL;
+}
+
+static QETH_DEVICE_ATTR(ipato_enable, enable, 0644,
+			qeth_dev_ipato_enable_show,
+			qeth_dev_ipato_enable_store);
+
+static ssize_t
+qeth_dev_ipato_invert4_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return sprintf(buf, "%i\n", card->ipato.invert4? 1:0);
+}
+
+static ssize_t
+qeth_dev_ipato_invert4_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+
+	if (!card)
+		return -EINVAL;
+
+	tmp = strsep((char **) &buf, "\n");
+	if (!strcmp(tmp, "toggle")){
+		card->ipato.invert4 = (card->ipato.invert4)? 0 : 1;
+	} else if (!strcmp(tmp, "1")){
+		card->ipato.invert4 = 1;
+	} else if (!strcmp(tmp, "0")){
+		card->ipato.invert4 = 0;
+	} else {
+		PRINT_WARN("ipato_invert4: write 0, 1 or 'toggle' to "
+			   "this file\n");
+		return -EINVAL;
+	}
+	return count;
+}
+
+static QETH_DEVICE_ATTR(ipato_invert4, invert4, 0644,
+			qeth_dev_ipato_invert4_show,
+			qeth_dev_ipato_invert4_store);
+
+static inline ssize_t
+qeth_dev_ipato_add_show(char *buf, struct qeth_card *card,
+			enum qeth_prot_versions proto)
+{
+	struct qeth_ipato_entry *ipatoe;
+	unsigned long flags;
+	char addr_str[49];
+	int i = 0;
+
+	spin_lock_irqsave(&card->ip_lock, flags);
+	list_for_each_entry(ipatoe, &card->ipato.entries, entry){
+		if (ipatoe->proto != proto)
+			continue;
+		qeth_ipaddr_to_string(proto, ipatoe->addr, addr_str);
+		i += sprintf(buf + i, "%s/%i\n", addr_str, ipatoe->mask_bits);
+	}
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+	i += sprintf(buf + i, "\n");
+
+	return i;
+}
+
+static ssize_t
+qeth_dev_ipato_add4_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_ipato_add_show(buf, card, QETH_PROT_IPV4);
+}
+
+static inline int
+qeth_parse_ipatoe(const char* buf, enum qeth_prot_versions proto,
+		  u8 *addr, int *mask_bits)
+{
+	const char *start = buf, *end;
+	char *tmp;
+	char buffer[49] = {0, };	/* zero-filled: the copy below stays terminated */
+
+	/* Split "<ip addr>/<mask bits>" at the '/'.  The address part is */
+	/* copied into buffer, so it must leave room for the trailing NUL. */
+	end = strchr(start, '/');
+	if (!end || (size_t)(end - start) >= sizeof(buffer)){
+		PRINT_WARN("Invalid format for ipato_addx/delx. "
+			   "Use <ip addr>/<mask bits>\n");
+		return -EINVAL;
+	}
+	strncpy(buffer, start, end - start);
+	if (qeth_string_to_ipaddr(buffer, proto, addr)){
+		PRINT_WARN("Invalid IP address format!\n");
+		return -EINVAL;
+	}
+	/* the mask is the decimal number that follows the '/' */
+	*mask_bits = simple_strtoul(end + 1, &tmp, 10);
+
+	return 0;
+}
+
+static inline ssize_t
+qeth_dev_ipato_add_store(const char *buf, size_t count,
+			 struct qeth_card *card, enum qeth_prot_versions proto)
+{
+	struct qeth_ipato_entry *ipatoe;
+	u8 addr[16];
+	int mask_bits;
+	int rc;
+
+	if ((rc = qeth_parse_ipatoe(buf, proto, addr, &mask_bits)))
+		return rc;
+
+	if (!(ipatoe = kmalloc(sizeof(struct qeth_ipato_entry), GFP_KERNEL))){
+		PRINT_WARN("No memory to allocate ipato entry\n");
+		return -ENOMEM;
+	}
+	memset(ipatoe, 0, sizeof(struct qeth_ipato_entry));
+	ipatoe->proto = proto;
+	memcpy(ipatoe->addr, addr, (proto == QETH_PROT_IPV4)? 4:16);
+	ipatoe->mask_bits = mask_bits;
+
+	if ((rc = qeth_add_ipato_entry(card, ipatoe))){
+		kfree(ipatoe);
+		return rc;
+	}
+
+	return count;
+}
+
+static ssize_t
+qeth_dev_ipato_add4_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_ipato_add_store(buf, count, card, QETH_PROT_IPV4);
+}
+
+static QETH_DEVICE_ATTR(ipato_add4, add4, 0644,
+			qeth_dev_ipato_add4_show,
+			qeth_dev_ipato_add4_store);
+
+static inline ssize_t
+qeth_dev_ipato_del_store(const char *buf, size_t count,
+			 struct qeth_card *card, enum qeth_prot_versions proto)
+{
+	u8 addr[16];
+	int mask_bits;
+	int rc;
+
+	if ((rc = qeth_parse_ipatoe(buf, proto, addr, &mask_bits)))
+		return rc;
+
+	qeth_del_ipato_entry(card, proto, addr, mask_bits);
+
+	return count;
+}
+
+static ssize_t
+qeth_dev_ipato_del4_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_ipato_del_store(buf, count, card, QETH_PROT_IPV4);
+}
+
+static QETH_DEVICE_ATTR(ipato_del4, del4, 0200, NULL,
+			qeth_dev_ipato_del4_store);
+
+#ifdef CONFIG_QETH_IPV6
+static ssize_t
+qeth_dev_ipato_invert6_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return sprintf(buf, "%i\n", card->ipato.invert6? 1:0);
+}
+
+static ssize_t
+qeth_dev_ipato_invert6_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+	char *tmp;
+
+	if (!card)
+		return -EINVAL;
+
+	tmp = strsep((char **) &buf, "\n");
+	if (!strcmp(tmp, "toggle")){
+		card->ipato.invert6 = (card->ipato.invert6)? 0 : 1;
+	} else if (!strcmp(tmp, "1")){
+		card->ipato.invert6 = 1;
+	} else if (!strcmp(tmp, "0")){
+		card->ipato.invert6 = 0;
+	} else {
+		PRINT_WARN("ipato_invert6: write 0, 1 or 'toggle' to "
+			   "this file\n");
+		return -EINVAL;
+	}
+	return count;
+}
+
+static QETH_DEVICE_ATTR(ipato_invert6, invert6, 0644,
+			qeth_dev_ipato_invert6_show,
+			qeth_dev_ipato_invert6_store);
+
+
+static ssize_t
+qeth_dev_ipato_add6_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_ipato_add_show(buf, card, QETH_PROT_IPV6);
+}
+
+static ssize_t
+qeth_dev_ipato_add6_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_ipato_add_store(buf, count, card, QETH_PROT_IPV6);
+}
+
+static QETH_DEVICE_ATTR(ipato_add6, add6, 0644,
+			qeth_dev_ipato_add6_show,
+			qeth_dev_ipato_add6_store);
+
+static ssize_t
+qeth_dev_ipato_del6_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_ipato_del_store(buf, count, card, QETH_PROT_IPV6);
+}
+
+static QETH_DEVICE_ATTR(ipato_del6, del6, 0200, NULL,
+			qeth_dev_ipato_del6_store);
+#endif /* CONFIG_QETH_IPV6 */
+
+static struct device_attribute * qeth_ipato_device_attrs[] = {
+	&dev_attr_ipato_enable,
+	&dev_attr_ipato_invert4,
+	&dev_attr_ipato_add4,
+	&dev_attr_ipato_del4,
+#ifdef CONFIG_QETH_IPV6
+	&dev_attr_ipato_invert6,
+	&dev_attr_ipato_add6,
+	&dev_attr_ipato_del6,
+#endif
+	NULL,
+};
+
+static struct attribute_group qeth_device_ipato_group = {
+	.name = "ipa_takeover",
+	.attrs = (struct attribute **)qeth_ipato_device_attrs,
+};
+
+static inline ssize_t
+qeth_dev_vipa_add_show(char *buf, struct qeth_card *card,
+			enum qeth_prot_versions proto)
+{
+	struct qeth_ipaddr *ipaddr;
+	char addr_str[49];
+	unsigned long flags;
+	int i = 0;
+
+	spin_lock_irqsave(&card->ip_lock, flags);
+	list_for_each_entry(ipaddr, &card->ip_list, entry){
+		if (ipaddr->proto != proto)
+			continue;
+		if (ipaddr->type != QETH_IP_TYPE_VIPA)
+			continue;
+		qeth_ipaddr_to_string(proto, (const u8 *)&ipaddr->u, addr_str);
+		i += sprintf(buf + i, "%s\n", addr_str);
+	}
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+	i += sprintf(buf + i, "\n");
+
+	return i;
+}
+
+static ssize_t
+qeth_dev_vipa_add4_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_vipa_add_show(buf, card, QETH_PROT_IPV4);
+}
+
+static inline int
+qeth_parse_vipae(const char* buf, enum qeth_prot_versions proto,
+		 u8 *addr)
+{
+	if (qeth_string_to_ipaddr(buf, proto, addr)){
+		PRINT_WARN("Invalid IP address format!\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static inline ssize_t
+qeth_dev_vipa_add_store(const char *buf, size_t count,
+			 struct qeth_card *card, enum qeth_prot_versions proto)
+{
+	u8 addr[16] = {0, };
+	int rc;
+
+	if ((rc = qeth_parse_vipae(buf, proto, addr)))
+		return rc;
+
+	if ((rc = qeth_add_vipa(card, proto, addr)))
+		return rc;
+
+	return count;
+}
+
+static ssize_t
+qeth_dev_vipa_add4_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_vipa_add_store(buf, count, card, QETH_PROT_IPV4);
+}
+
+static QETH_DEVICE_ATTR(vipa_add4, add4, 0644,
+			qeth_dev_vipa_add4_show,
+			qeth_dev_vipa_add4_store);
+
+static inline ssize_t
+qeth_dev_vipa_del_store(const char *buf, size_t count,
+			 struct qeth_card *card, enum qeth_prot_versions proto)
+{
+	u8 addr[16];
+	int rc;
+
+	if ((rc = qeth_parse_vipae(buf, proto, addr)))
+		return rc;
+
+	qeth_del_vipa(card, proto, addr);
+
+	return count;
+}
+
+static ssize_t
+qeth_dev_vipa_del4_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_vipa_del_store(buf, count, card, QETH_PROT_IPV4);
+}
+
+static QETH_DEVICE_ATTR(vipa_del4, del4, 0200, NULL,
+			qeth_dev_vipa_del4_store);
+
+#ifdef CONFIG_QETH_IPV6
+static ssize_t
+qeth_dev_vipa_add6_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_vipa_add_show(buf, card, QETH_PROT_IPV6);
+}
+
+static ssize_t
+qeth_dev_vipa_add6_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_vipa_add_store(buf, count, card, QETH_PROT_IPV6);
+}
+
+static QETH_DEVICE_ATTR(vipa_add6, add6, 0644,
+			qeth_dev_vipa_add6_show,
+			qeth_dev_vipa_add6_store);
+
+static ssize_t
+qeth_dev_vipa_del6_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_vipa_del_store(buf, count, card, QETH_PROT_IPV6);
+}
+
+static QETH_DEVICE_ATTR(vipa_del6, del6, 0200, NULL,
+			qeth_dev_vipa_del6_store);
+#endif /* CONFIG_QETH_IPV6 */
+
+static struct device_attribute * qeth_vipa_device_attrs[] = {
+	&dev_attr_vipa_add4,
+	&dev_attr_vipa_del4,
+#ifdef CONFIG_QETH_IPV6
+	&dev_attr_vipa_add6,
+	&dev_attr_vipa_del6,
+#endif
+	NULL,
+};
+
+static struct attribute_group qeth_device_vipa_group = {
+	.name = "vipa",
+	.attrs = (struct attribute **)qeth_vipa_device_attrs,
+};
+
+static inline ssize_t
+qeth_dev_rxip_add_show(char *buf, struct qeth_card *card,
+		       enum qeth_prot_versions proto)
+{
+	struct qeth_ipaddr *ipaddr;
+	char addr_str[49];
+	unsigned long flags;
+	int i = 0;
+
+	spin_lock_irqsave(&card->ip_lock, flags);
+	list_for_each_entry(ipaddr, &card->ip_list, entry){
+		if (ipaddr->proto != proto)
+			continue;
+		if (ipaddr->type != QETH_IP_TYPE_RXIP)
+			continue;
+		qeth_ipaddr_to_string(proto, (const u8 *)&ipaddr->u, addr_str);
+		i += sprintf(buf + i, "%s\n", addr_str);
+	}
+	spin_unlock_irqrestore(&card->ip_lock, flags);
+	i += sprintf(buf + i, "\n");
+
+	return i;
+}
+
+static ssize_t
+qeth_dev_rxip_add4_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_rxip_add_show(buf, card, QETH_PROT_IPV4);
+}
+
+static inline int
+qeth_parse_rxipe(const char* buf, enum qeth_prot_versions proto,
+		 u8 *addr)
+{
+	if (qeth_string_to_ipaddr(buf, proto, addr)){
+		PRINT_WARN("Invalid IP address format!\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static inline ssize_t
+qeth_dev_rxip_add_store(const char *buf, size_t count,
+			struct qeth_card *card, enum qeth_prot_versions proto)
+{
+	u8 addr[16] = {0, };
+	int rc;
+
+	if ((rc = qeth_parse_rxipe(buf, proto, addr)))
+		return rc;
+
+	if ((rc = qeth_add_rxip(card, proto, addr)))
+		return rc;
+
+	return count;
+}
+
+static ssize_t
+qeth_dev_rxip_add4_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_rxip_add_store(buf, count, card, QETH_PROT_IPV4);
+}
+
+static QETH_DEVICE_ATTR(rxip_add4, add4, 0644,
+			qeth_dev_rxip_add4_show,
+			qeth_dev_rxip_add4_store);
+
+static inline ssize_t
+qeth_dev_rxip_del_store(const char *buf, size_t count,
+			struct qeth_card *card, enum qeth_prot_versions proto)
+{
+	u8 addr[16];
+	int rc;
+
+	if ((rc = qeth_parse_rxipe(buf, proto, addr)))
+		return rc;
+
+	qeth_del_rxip(card, proto, addr);
+
+	return count;
+}
+
+static ssize_t
+qeth_dev_rxip_del4_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_rxip_del_store(buf, count, card, QETH_PROT_IPV4);
+}
+
+static QETH_DEVICE_ATTR(rxip_del4, del4, 0200, NULL,
+			qeth_dev_rxip_del4_store);
+
+#ifdef CONFIG_QETH_IPV6
+static ssize_t
+qeth_dev_rxip_add6_show(struct device *dev, char *buf)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_rxip_add_show(buf, card, QETH_PROT_IPV6);
+}
+
+static ssize_t
+qeth_dev_rxip_add6_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_rxip_add_store(buf, count, card, QETH_PROT_IPV6);
+}
+
+static QETH_DEVICE_ATTR(rxip_add6, add6, 0644,
+			qeth_dev_rxip_add6_show,
+			qeth_dev_rxip_add6_store);
+
+static ssize_t
+qeth_dev_rxip_del6_store(struct device *dev, const char *buf, size_t count)
+{
+	struct qeth_card *card = dev->driver_data;
+
+	if (!card)
+		return -EINVAL;
+
+	return qeth_dev_rxip_del_store(buf, count, card, QETH_PROT_IPV6);
+}
+
+static QETH_DEVICE_ATTR(rxip_del6, del6, 0200, NULL,
+			qeth_dev_rxip_del6_store);
+#endif /* CONFIG_QETH_IPV6 */
+
+static struct device_attribute * qeth_rxip_device_attrs[] = {
+	&dev_attr_rxip_add4,
+	&dev_attr_rxip_del4,
+#ifdef CONFIG_QETH_IPV6
+	&dev_attr_rxip_add6,
+	&dev_attr_rxip_del6,
+#endif
+	NULL,
+};
+
+static struct attribute_group qeth_device_rxip_group = {
+	.name = "rxip",
+	.attrs = (struct attribute **)qeth_rxip_device_attrs,
+};
+
+int
+qeth_create_device_attributes(struct device *dev)
+{
+	int ret;
+
+	if ((ret = sysfs_create_group(&dev->kobj, &qeth_device_attr_group)))
+		return ret;
+	if ((ret = sysfs_create_group(&dev->kobj, &qeth_device_ipato_group))){
+		sysfs_remove_group(&dev->kobj, &qeth_device_attr_group);
+		return ret;
+	}
+	if ((ret = sysfs_create_group(&dev->kobj, &qeth_device_vipa_group))){
+		sysfs_remove_group(&dev->kobj, &qeth_device_attr_group);
+		sysfs_remove_group(&dev->kobj, &qeth_device_ipato_group);
+		return ret;
+	}
+	if ((ret = sysfs_create_group(&dev->kobj, &qeth_device_rxip_group))){
+		sysfs_remove_group(&dev->kobj, &qeth_device_attr_group);
+		sysfs_remove_group(&dev->kobj, &qeth_device_ipato_group);
+		sysfs_remove_group(&dev->kobj, &qeth_device_vipa_group);
+	}
+
+	return ret;
+}
+
+void
+qeth_remove_device_attributes(struct device *dev)
+{
+	sysfs_remove_group(&dev->kobj, &qeth_device_attr_group);
+	sysfs_remove_group(&dev->kobj, &qeth_device_ipato_group);
+	sysfs_remove_group(&dev->kobj, &qeth_device_vipa_group);
+	sysfs_remove_group(&dev->kobj, &qeth_device_rxip_group);
+}
+
+/**********************/
+/* DRIVER ATTRIBUTES  */
+/**********************/
+static ssize_t
+qeth_driver_group_store(struct device_driver *ddrv, const char *buf,
+			size_t count)
+{
+	const char *start, *end;
+	char bus_ids[3][BUS_ID_SIZE], *argv[3];
+	int i;
+	int err;
+	/* buf is expected to hold three bus ids: "<id>,<id>,<id>\n" */
+	start = buf;
+	for (i = 0; i < 3; i++) {
+		static const char delim[] = { ',', ',', '\n' };
+		int len;
+		/* ids must end in their delimiter; over-long ids are cut to fit */
+		if (!(end = strchr(start, delim[i])))
+			return -EINVAL;
+		len = min_t(ptrdiff_t, BUS_ID_SIZE - 1, end - start);
+		strncpy(bus_ids[i], start, len);
+		bus_ids[i][len] = '\0';
+		start = end + 1;
+		argv[i] = bus_ids[i];
+	}
+	err = ccwgroup_create(qeth_root_dev, qeth_ccwgroup_driver.driver_id,
+			&qeth_ccw_driver, 3, argv);
+	if (err)
+		return err;
+	else
+		return count;
+}
+
+
+static DRIVER_ATTR(group, 0200, 0, qeth_driver_group_store);
+
+static ssize_t
+qeth_driver_snmp_register_show(struct device_driver *ddrv, char *buf)
+{
+	/* TODO */
+	return 0;
+}
+
+static ssize_t
+qeth_driver_snmp_register_store(struct device_driver *ddrv, const char *buf,
+				size_t count)
+{
+	/* TODO */
+	return count;
+}
+
+static DRIVER_ATTR(snmp_register, 0644, qeth_driver_snmp_register_show,
+		   qeth_driver_snmp_register_store);
+
+int
+qeth_create_driver_attributes(void)
+{
+	struct device_driver *drv = &qeth_ccwgroup_driver.driver;
+	int rc = driver_create_file(drv, &driver_attr_group);
+
+	/* all or nothing: if "snmp_register" fails, take "group" back out */
+	if (!rc && (rc = driver_create_file(drv, &driver_attr_snmp_register)))
+		driver_remove_file(drv, &driver_attr_group);
+	return rc;
+}
+
+void
+qeth_remove_driver_attributes(void)
+{
+	driver_remove_file(&qeth_ccwgroup_driver.driver,
+			&driver_attr_group);
+	driver_remove_file(&qeth_ccwgroup_driver.driver,
+			&driver_attr_snmp_register);
+}
diff --git a/include/asm-s390/qeth.h b/include/asm-s390/qeth.h
new file mode 100644
index 000000000000..8868027f3e5c
--- /dev/null
+++ b/include/asm-s390/qeth.h
@@ -0,0 +1,60 @@
+/*
+ * include/asm-s390/qeth.h
+ *
+ * ioctl definitions for qeth driver
+ *
+ * Copyright (C) 2004 IBM Corporation
+ *
+ * Author(s):	Thomas Spatzier <tspat@de.ibm.com>
+ *
+ */
+#ifndef __ASM_S390_QETH_IOCTL_H__
+#define __ASM_S390_QETH_IOCTL_H__
+#include <linux/ioctl.h>
+
+#define QETH_IOCTL_LETTER 'Q'
+
+#define SIOC_QETH_ARP_SET_NO_ENTRIES	_IOWR(QETH_IOCTL_LETTER, 1, int)
+#define SIOC_QETH_ARP_QUERY_INFO	_IOWR(QETH_IOCTL_LETTER, 2, int)
+#define SIOC_QETH_ARP_ADD_ENTRY		_IOWR(QETH_IOCTL_LETTER, 3, int)
+#define SIOC_QETH_ARP_REMOVE_ENTRY	_IOWR(QETH_IOCTL_LETTER, 4, int)
+#define SIOC_QETH_ARP_FLUSH_CACHE	_IOWR(QETH_IOCTL_LETTER, 5, int)
+#define SIOC_QETH_ADP_SET_SNMP_CONTROL	_IOWR(QETH_IOCTL_LETTER, 6, int)
+#define SIOC_QETH_GET_CARD_TYPE		_IOWR(QETH_IOCTL_LETTER, 7, int)
+
+struct qeth_arp_cache_entry {
+	__u8  macaddr[6];
+	__u8  reserved1[2];
+	__u8  ipaddr[16]; /* for both  IPv4 and IPv6 */
+	__u8  reserved2[32];
+} __attribute__ ((packed));
+
+struct qeth_arp_qi_entry7 {
+	__u8 media_specific[32];
+	__u8 macaddr_type;
+	__u8 ipaddr_type;
+	__u8 macaddr[6];
+	__u8 ipaddr[4];
+} __attribute__((packed));
+
+struct qeth_arp_qi_entry5 {
+	__u8 media_specific[32];
+	__u8 macaddr_type;
+	__u8 ipaddr_type;
+	__u8 ipaddr[4];
+} __attribute__((packed));
+
+/* data sent to user space as result of query arp ioctl */
+#define QETH_QARP_USER_DATA_SIZE 20000
+#define QETH_QARP_MASK_OFFSET    4
+#define QETH_QARP_ENTRIES_OFFSET 6
+struct qeth_arp_query_user_data {
+	union {
+		__u32 data_len;		/* set by user space program */
+		__u32 no_entries;	/* set by kernel */
+	} u;
+	__u16 mask_bits;	/* byte 4 when packed, cf. QETH_QARP_MASK_OFFSET */
+	char *entries;		/* byte 6 when packed, cf. QETH_QARP_ENTRIES_OFFSET */
+} __attribute__((packed));
+
+#endif /* __ASM_S390_QETH_IOCTL_H__ */
-- 
cgit v1.2.3


From 6a435d69de04e96de8001edbd4a3da94eaec56b3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:15:12 -0700
Subject: [PATCH] Add queue congestion callout

From: Miquel van Smoorenburg <miquels@cistron.nl>

The VM and VFS use the address_space_backing_dev_info to track the realtime
status of the device which backs the mapping.  The read_congested and
write_congested fields are used to determine whether a read or write
against that device may block.

We use this infrastructure to

a) allow pdflush to service many queues in parallel (by not getting
   stuck on any particular one) and

b) to avoid undesirable and uncontrolled latencies in places such as
   page reclaim and

c) To avoid blocking in readahead operations

The current code only supports simple disk queues (and I have a patch here
for NFS).  Stacked queues (MD and DM) don't get this information right and
problems were expected.  Efficiency problems have now been noted and it's
time to fix it.

This patch lays down the infrastructure which permits the queue
implementation to get control when someone at a higher level is querying
the queue's congestion state.  So DM (for example) can run around and
examine all the queues which contribute to the higher-level queue.


It also adds bdi_rw_congested() for code in xfs and ext2 that calls both
bdi_read_congested() and bdi_write_congested() in a row, and it was "free"
anyway.
---
 include/linux/backing-dev.h | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 94c93c9c5f66..e34916ddd1d7 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -20,10 +20,14 @@ enum bdi_state {
 	BDI_unused,		/* Available bits start here */
 };
 
+typedef int (congested_fn)(void *, int);
+
 struct backing_dev_info {
 	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
 	unsigned long state;	/* Always use atomic bitops on this */
 	int memory_backed;	/* Cannot clean pages with writepage */
+	congested_fn *congested_fn; /* Function pointer if device is md/dm */
+	void *congested_data;	/* Pointer to aux data for congested func */
 };
 
 extern struct backing_dev_info default_backing_dev_info;
@@ -32,14 +36,27 @@ int writeback_acquire(struct backing_dev_info *bdi);
 int writeback_in_progress(struct backing_dev_info *bdi);
 void writeback_release(struct backing_dev_info *bdi);
 
+static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
+{
+	if (bdi->congested_fn)
+		return bdi->congested_fn(bdi->congested_data, bdi_bits);
+	return (bdi->state & bdi_bits);
+}
+
 static inline int bdi_read_congested(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_read_congested, &bdi->state);
+	return bdi_congested(bdi, 1 << BDI_read_congested);
 }
 
 static inline int bdi_write_congested(struct backing_dev_info *bdi)
 {
-	return test_bit(BDI_write_congested, &bdi->state);
+	return bdi_congested(bdi, 1 << BDI_write_congested);
+}
+
+static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+{
+	return bdi_congested(bdi, (1 << BDI_read_congested)|
+				  (1 << BDI_write_congested));
 }
 
 #endif		/* _LINUX_BACKING_DEV_H */
-- 
cgit v1.2.3


From 6d27f67bf6ee2b9ad0c8814118264bc273d916a1 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:15:51 -0700
Subject: [PATCH] per-backing dev unplugging

From: Jens Axboe <axboe@suse.de>,
      Chris Mason,
      me, others.

The global unplug list causes horrid spinlock contention on many-disk
many-CPU setups - throughput is worse than halved.

The other problem with the global unplugging is of course that it will cause
the unplugging of queues which are unrelated to the I/O upon which the caller
is about to wait.

So what we do to solve these problems is to remove the global unplug and set
up the infrastructure under which the VFS can tell the block layer to unplug
only those queues which are relevant to the page or buffer_head which is
about to be waited upon.

We do this via the very appropriate address_space->backing_dev_info structure.

Most of the complexity is in devicemapper, MD and swapper_space, because for
these backing devices, multiple queues may need to be unplugged to complete a
page/buffer I/O.  In each case we ensure that data structures are in place to
permit us to identify all the lower-level queues which contribute to the
higher-level backing_dev_info.  Each contributing queue is told to unplug in
response to a higher-level unplug.

To simplify things in various places we also introduce the concept of a
"synchronous BIO": it is tagged with BIO_RW_SYNC.  The block layer will
perform an immediate unplug when it sees one of these go past.
---
 drivers/block/ll_rw_blk.c    | 96 +++++++++++++-------------------------------
 drivers/block/loop.c         | 15 ++++++-
 drivers/block/rd.c           |  1 +
 drivers/block/umem.c         |  3 +-
 drivers/md/dm-crypt.c        |  2 +-
 drivers/md/dm-table.c        | 16 ++++++++
 drivers/md/dm.c              | 23 +++++++++--
 drivers/md/dm.h              |  1 +
 drivers/md/md.c              | 32 +++++++++++++--
 drivers/md/raid1.c           |  3 ++
 drivers/md/raid5.c           |  4 +-
 drivers/md/raid6main.c       |  3 +-
 drivers/mtd/devices/blkmtd.c |  6 +--
 fs/buffer.c                  | 12 ++++--
 fs/direct-io.c               |  4 +-
 fs/jfs/jfs_logmgr.c          |  6 +--
 fs/ntfs/compress.c           |  3 +-
 fs/ufs/truncate.c            |  3 +-
 fs/xfs/linux/xfs_buf.c       | 24 ++++-------
 include/linux/backing-dev.h  |  3 ++
 include/linux/bio.h          |  3 ++
 include/linux/blkdev.h       | 23 ++++++++---
 include/linux/fs.h           |  2 +
 include/linux/raid/md.h      |  1 +
 include/linux/raid/md_k.h    | 26 ------------
 include/linux/swap.h         |  3 ++
 kernel/power/disk.c          |  1 -
 kernel/power/pmdisk.c        |  3 +-
 kernel/power/swsusp.c        |  5 ---
 mm/filemap.c                 |  4 +-
 mm/mempool.c                 |  2 -
 mm/nommu.c                   |  5 +++
 mm/readahead.c               |  8 +++-
 mm/shmem.c                   |  1 +
 mm/swap_state.c              |  1 +
 mm/swapfile.c                | 65 +++++++++++++++++++++++++++++-
 36 files changed, 254 insertions(+), 159 deletions(-)

(limited to 'include')

diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index fc4b6c698fcf..209fdef4d986 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -42,12 +42,6 @@ static void blk_unplug_timeout(unsigned long data);
  */
 static kmem_cache_t *request_cachep;
 
-/*
- * plug management
- */
-static LIST_HEAD(blk_plug_list);
-static spinlock_t blk_plug_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-
 static wait_queue_head_t congestion_wqh[2] = {
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
 		__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
@@ -251,8 +245,6 @@ void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
 	 */
 	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 
-	INIT_LIST_HEAD(&q->plug_list);
-
 	blk_queue_activity_fn(q, NULL, NULL);
 }
 
@@ -1104,13 +1096,11 @@ void blk_plug_device(request_queue_t *q)
 	 * don't plug a stopped queue, it must be paired with blk_start_queue()
 	 * which will restart the queueing
 	 */
-	if (!blk_queue_plugged(q)
-	    && !test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags)) {
-		spin_lock(&blk_plug_lock);
-		list_add_tail(&q->plug_list, &blk_plug_list);
+	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
+		return;
+
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
-		spin_unlock(&blk_plug_lock);
-	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1122,15 +1112,12 @@ EXPORT_SYMBOL(blk_plug_device);
 int blk_remove_plug(request_queue_t *q)
 {
 	WARN_ON(!irqs_disabled());
-	if (blk_queue_plugged(q)) {
-		spin_lock(&blk_plug_lock);
-		list_del_init(&q->plug_list);
-		del_timer(&q->unplug_timer);
-		spin_unlock(&blk_plug_lock);
-		return 1;
-	}
 
-	return 0;
+	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+		return 0;
+
+	del_timer(&q->unplug_timer);
+	return 1;
 }
 
 EXPORT_SYMBOL(blk_remove_plug);
@@ -1161,24 +1148,32 @@ static inline void __generic_unplug_device(request_queue_t *q)
  *   Linux uses plugging to build bigger requests queues before letting
  *   the device have at them. If a queue is plugged, the I/O scheduler
  *   is still adding and merging requests on the queue. Once the queue
- *   gets unplugged (either by manually calling this function, or by
- *   calling blk_run_queues()), the request_fn defined for the
- *   queue is invoked and transfers started.
+ *   gets unplugged, the request_fn defined for the queue is invoked and
+ *   transfers started.
  **/
-void generic_unplug_device(void *data)
+void generic_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
-
 	spin_lock_irq(q->queue_lock);
 	__generic_unplug_device(q);
 	spin_unlock_irq(q->queue_lock);
 }
-
 EXPORT_SYMBOL(generic_unplug_device);
 
+static void blk_backing_dev_unplug(struct backing_dev_info *bdi)
+{
+	request_queue_t *q = bdi->unplug_io_data;
+
+	/*
+	 * devices don't necessarily have an ->unplug_fn defined
+	 */
+	if (q->unplug_fn)
+		q->unplug_fn(q);
+}
+
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
+
 	q->unplug_fn(q);
 }
 
@@ -1255,42 +1250,6 @@ void blk_run_queue(struct request_queue *q)
 
 EXPORT_SYMBOL(blk_run_queue);
 
-/**
- * blk_run_queues - fire all plugged queues
- *
- * Description:
- *   Start I/O on all plugged queues known to the block layer. Queues that
- *   are currently stopped are ignored. This is equivalent to the older
- *   tq_disk task queue run.
- **/
-#define blk_plug_entry(entry) list_entry((entry), request_queue_t, plug_list)
-void blk_run_queues(void)
-{
-	LIST_HEAD(local_plug_list);
-
-	spin_lock_irq(&blk_plug_lock);
-
-	/*
-	 * this will happen fairly often
-	 */
-	if (list_empty(&blk_plug_list))
-		goto out;
-
-	list_splice_init(&blk_plug_list, &local_plug_list);
-	
-	while (!list_empty(&local_plug_list)) {
-		request_queue_t *q = blk_plug_entry(local_plug_list.next);
-
-		spin_unlock_irq(&blk_plug_lock);
-		q->unplug_fn(q);
-		spin_lock_irq(&blk_plug_lock);
-	}
-out:
-	spin_unlock_irq(&blk_plug_lock);
-}
-
-EXPORT_SYMBOL(blk_run_queues);
-
 /**
  * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
  * @q:    the request queue to be released
@@ -1390,6 +1349,10 @@ request_queue_t *blk_alloc_queue(int gfp_mask)
 	memset(q, 0, sizeof(*q));
 	init_timer(&q->unplug_timer);
 	atomic_set(&q->refcnt, 1);
+
+	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
+	q->backing_dev_info.unplug_io_data = q;
+
 	return q;
 }
 
@@ -2050,7 +2013,6 @@ long blk_congestion_wait(int rw, long timeout)
 	DEFINE_WAIT(wait);
 	wait_queue_head_t *wqh = &congestion_wqh[rw];
 
-	blk_run_queues();
 	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 	ret = io_schedule_timeout(timeout);
 	finish_wait(wqh, &wait);
@@ -2315,7 +2277,7 @@ out:
 	if (blk_queue_plugged(q)) {
 		int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
 
-		if (nr_queued == q->unplug_thresh)
+		if (nr_queued == q->unplug_thresh || bio_sync(bio))
 			__generic_unplug_device(q);
 	}
 	spin_unlock_irq(q->queue_lock);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f29f72ee30d0..a43c545071cb 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -434,6 +434,17 @@ inactive:
 	goto out;
 }
 
+/*
+ * kick off io on the underlying address space
+ */
+static void loop_unplug(request_queue_t *q)
+{
+	struct loop_device *lo = q->queuedata;
+
+	clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags);
+	blk_run_address_space(lo->lo_backing_file->f_mapping);
+}
+
 struct switch_request {
 	struct file *file;
 	struct completion wait;
@@ -614,7 +625,6 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 {
 	struct file	*file;
 	struct inode	*inode;
-	struct block_device *lo_device = NULL;
 	struct address_space *mapping;
 	unsigned lo_blocksize;
 	int		lo_flags = 0;
@@ -671,7 +681,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
 
 	lo->lo_blocksize = lo_blocksize;
-	lo->lo_device = lo_device;
+	lo->lo_device = bdev;
 	lo->lo_flags = lo_flags;
 	lo->lo_backing_file = file;
 	lo->transfer = NULL;
@@ -688,6 +698,7 @@ static int loop_set_fd(struct loop_device *lo, struct file *lo_file,
 	 */
 	blk_queue_make_request(lo->lo_queue, loop_make_request);
 	lo->lo_queue->queuedata = lo;
+	lo->lo_queue->unplug_fn = loop_unplug;
 
 	set_capacity(disks[lo->lo_number], size);
 	bd_set_size(bdev, size << 9);
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index e626344c9b58..3dd9163a64e2 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -271,6 +271,7 @@ static int rd_ioctl(struct inode *inode, struct file *file,
 static struct backing_dev_info rd_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn = default_unplug_io_fn,
 };
 
 static int rd_open(struct inode *inode, struct file *filp)
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 31cd010f4d56..5a1e349b131d 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -368,9 +368,8 @@ static inline void reset_page(struct mm_page *page)
 	page->biotail = & page->bio;
 }
 
-static void mm_unplug_device(void *data)
+static void mm_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	struct cardinfo *card = q->queuedata;
 	unsigned long flags;
 
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8e1798115e2f..a17b25380fce 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -668,7 +668,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 
 		/* out of memory -> run queues */
 		if (remaining)
-			blk_run_queues();
+			blk_congestion_wait(bio_data_dir(clone), HZ/100);
 	}
 
 	/* drop reference, clones could have returned before we reach this */
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 4aa6c43ffd01..93dc0e6361c0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -885,8 +885,24 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 	return r;
 }
 
+void dm_table_unplug_all(struct dm_table *t)
+{
+	struct list_head *d, *devices = dm_table_get_devices(t);
+
+	for (d = devices->next; d != devices; d = d->next) {
+		struct dm_dev *dd = list_entry(d, struct dm_dev, list);
+		request_queue_t *q = bdev_get_queue(dd->bdev);
+
+		if (q->unplug_fn)
+			q->unplug_fn(q);
+	}
+}
+
 EXPORT_SYMBOL(dm_vcalloc);
 EXPORT_SYMBOL(dm_get_device);
 EXPORT_SYMBOL(dm_put_device);
 EXPORT_SYMBOL(dm_table_event);
 EXPORT_SYMBOL(dm_table_get_mode);
+EXPORT_SYMBOL(dm_table_put);
+EXPORT_SYMBOL(dm_table_get);
+EXPORT_SYMBOL(dm_table_unplug_all);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6dc34c8b4604..542f9cd0acc0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -575,6 +575,17 @@ static int dm_request(request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
+static void dm_unplug_all(request_queue_t *q)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
+}
+
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r;
@@ -672,6 +683,7 @@ static struct mapped_device *alloc_dev(unsigned int minor, int persistent)
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
+	md->queue->unplug_fn = dm_unplug_all;
 
 	md->io_pool = mempool_create(MIN_IOS, mempool_alloc_slab,
 				     mempool_free_slab, _io_cache);
@@ -896,11 +908,17 @@ int dm_suspend(struct mapped_device *md)
 	add_wait_queue(&md->wait, &wait);
 	up_write(&md->lock);
 
+	/* unplug */
+	map = dm_get_table(md);
+	if (map) {
+		dm_table_unplug_all(map);
+		dm_table_put(map);
+	}
+
 	/*
 	 * Then we wait for the already mapped ios to
 	 * complete.
 	 */
-	blk_run_queues();
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
 
@@ -945,10 +963,9 @@ int dm_resume(struct mapped_device *md)
 	def = bio_list_get(&md->deferred);
 	__flush_deferred_io(md, def);
 	up_write(&md->lock);
+	dm_table_unplug_all(map);
 	dm_table_put(map);
 
-	blk_run_queues();
-
 	return 0;
 }
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 780185db38d0..34bf0e7cceb2 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -116,6 +116,7 @@ int dm_table_get_mode(struct dm_table *t);
 void dm_table_suspend_targets(struct dm_table *t);
 void dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+void dm_table_unplug_all(struct dm_table *t);
 
 /*-----------------------------------------------------------------
  * A registry of target types.
diff --git a/drivers/md/md.c b/drivers/md/md.c
index aa6fef11aa4e..72d6a2da5827 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -160,6 +160,30 @@ static int md_fail_request (request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
+void md_unplug_mddev(mddev_t *mddev)
+{
+	struct list_head *tmp;
+	mdk_rdev_t *rdev;
+
+	/*
+	 * this list iteration is done without any locking in md?!
+	 */
+	ITERATE_RDEV(mddev, rdev, tmp) {
+		request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+EXPORT_SYMBOL(md_unplug_mddev);
+
+static void md_unplug_all(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+
+	md_unplug_mddev(mddev);
+}
+
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -335,6 +359,8 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	struct bio_vec vec;
 	struct completion event;
 
+	rw |= (1 << BIO_RW_SYNC);
+
 	bio_init(&bio);
 	bio.bi_io_vec = &vec;
 	vec.bv_page = page;
@@ -349,7 +375,6 @@ static int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	bio.bi_private = &event;
 	bio.bi_end_io = bi_complete;
 	submit_bio(rw, &bio);
-	blk_run_queues();
 	wait_for_completion(&event);
 
 	return test_bit(BIO_UPTODATE, &bio.bi_flags);
@@ -1644,6 +1669,7 @@ static int do_md_run(mddev_t * mddev)
 	 */
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
+	mddev->queue->unplug_fn = md_unplug_all;
 
 	mddev->changed = 1;
 	return 0;
@@ -2718,7 +2744,7 @@ int md_thread(void * arg)
 		run = thread->run;
 		if (run) {
 			run(thread->mddev);
-			blk_run_queues();
+			md_unplug_mddev(thread->mddev);
 		}
 		if (signal_pending(current))
 			flush_signals(current);
@@ -3287,7 +3313,7 @@ static void md_do_sync(mddev_t *mddev)
 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
 			break;
 
-		blk_run_queues();
+		md_unplug_mddev(mddev);
 
 	repeat:
 		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f308d5fe946f..6616cd46c50f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -451,6 +451,7 @@ rb_out:
 
 static void device_barrier(conf_t *conf, sector_t sect)
 {
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);
 	
@@ -478,6 +479,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	conf->nr_pending++;
@@ -644,6 +646,7 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
+	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
 	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
 	spin_unlock_irq(&conf->resync_lock);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b5cc6c4ba6ba..5c9d3fd66913 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -249,6 +249,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 				break;
 			if (!sh) {
 				conf->inactive_blocked = 1;
+				md_unplug_mddev(conf->mddev);
 				wait_event_lock_irq(conf->wait_for_stripe,
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
@@ -1292,9 +1293,8 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
 		}
 	}
 }
-static void raid5_unplug_device(void *data)
+static void raid5_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid5_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 747085a6dac0..131f4a1f34eb 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -1454,9 +1454,8 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
 		}
 	}
 }
-static void raid6_unplug_device(void *data)
+static void raid6_unplug_device(request_queue_t *q)
 {
-	request_queue_t *q = data;
 	mddev_t *mddev = q->queuedata;
 	raid6_conf_t *conf = mddev_to_conf(mddev);
 	unsigned long flags;
diff --git a/drivers/mtd/devices/blkmtd.c b/drivers/mtd/devices/blkmtd.c
index b4b4178943a1..4bd5d3219458 100644
--- a/drivers/mtd/devices/blkmtd.c
+++ b/drivers/mtd/devices/blkmtd.c
@@ -147,8 +147,7 @@ static int blkmtd_readpage(struct blkmtd_dev *dev, struct page *page)
 		bio->bi_private = &event;
 		bio->bi_end_io = bi_read_complete;
 		if(bio_add_page(bio, page, PAGE_SIZE, 0) == PAGE_SIZE) {
-			submit_bio(READ, bio);
-			blk_run_queues();
+			submit_bio(READ_SYNC, bio);
 			wait_for_completion(&event);
 			err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
 			bio_put(bio);
@@ -179,8 +178,7 @@ static int blkmtd_write_out(struct bio *bio)
 	init_completion(&event);
 	bio->bi_private = &event;
 	bio->bi_end_io = bi_write_complete;
-	submit_bio(WRITE, bio);
-	blk_run_queues();
+	submit_bio(WRITE_SYNC, bio);
 	wait_for_completion(&event);
 	DEBUG(3, "submit_bio completed, bi_vcnt = %d\n", bio->bi_vcnt);
 	err = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
diff --git a/fs/buffer.c b/fs/buffer.c
index be9cc963a178..8ab66d0b7548 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -132,7 +132,11 @@ void __wait_on_buffer(struct buffer_head * bh)
 	do {
 		prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
 		if (buffer_locked(bh)) {
-			blk_run_queues();
+			struct block_device *bd;
+			smp_mb();
+			bd = bh->b_bdev;
+			if (bd)
+				blk_run_address_space(bd->bd_inode->i_mapping);
 			io_schedule();
 		}
 	} while (buffer_locked(bh));
@@ -492,7 +496,6 @@ static void free_more_memory(void)
 	pg_data_t *pgdat;
 
 	wakeup_bdflush(1024);
-	blk_run_queues();
 	yield();
 
 	for_each_pgdat(pgdat) {
@@ -2927,7 +2930,10 @@ EXPORT_SYMBOL(try_to_free_buffers);
 
 int block_sync_page(struct page *page)
 {
-	blk_run_queues();
+	struct address_space *mapping;
+	smp_mb();
+	mapping = page->mapping;
+	blk_run_address_space(mapping);
 	return 0;
 }
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index d022a233820f..79534d258f37 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -364,7 +364,7 @@ static struct bio *dio_await_one(struct dio *dio)
 		if (dio->bio_list == NULL) {
 			dio->waiter = current;
 			spin_unlock_irqrestore(&dio->bio_lock, flags);
-			blk_run_queues();
+			blk_run_address_space(dio->inode->i_mapping);
 			io_schedule();
 			spin_lock_irqsave(&dio->bio_lock, flags);
 			dio->waiter = NULL;
@@ -1035,7 +1035,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		if (ret == 0)
 			ret = dio->result;
 		finished_one_bio(dio);		/* This can free the dio */
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 		if (should_wait) {
 			unsigned long flags;
 			/*
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index b72fb4a40adc..b90aa961dd5a 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1975,8 +1975,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 
 	bio->bi_end_io = lbmIODone;
 	bio->bi_private = bp;
-	submit_bio(READ, bio);
-	blk_run_queues();
+	submit_bio(READ_SYNC, bio);
 
 	wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD));
 
@@ -2120,9 +2119,8 @@ static void lbmStartIO(struct lbuf * bp)
 
 	/* check if journaling to disk has been disabled */
 	if (!log->no_integrity) {
-		submit_bio(WRITE, bio);
+		submit_bio(WRITE_SYNC, bio);
 		INCREMENT(lmStat.submitted);
-		blk_run_queues();
 	}
 	else {
 		bio->bi_size = 0;
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index a8618f107ead..68231e909496 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -23,6 +23,7 @@
 
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 
 #include "ntfs.h"
 
@@ -668,7 +669,7 @@ lock_retry_remap:
 					"uptodate! Unplugging the disk queue "
 					"and rescheduling.");
 			get_bh(tbh);
-			blk_run_queues();
+			blk_run_address_space(mapping);
 			schedule();
 			put_bh(tbh);
 			if (unlikely(!buffer_uptodate(tbh)))
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 04e50f696202..b22169e7ba76 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -38,6 +38,7 @@
 #include <linux/string.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 #include <linux/sched.h>
 
 #include "swab.h"
@@ -456,7 +457,7 @@ void ufs_truncate (struct inode * inode)
 			break;
 		if (IS_SYNC(inode) && (inode->i_state & I_DIRTY))
 			ufs_sync_inode (inode);
-		blk_run_queues();
+		blk_run_address_space(inode->i_mapping);
 		yield();
 	}
 	offset = inode->i_size & uspi->s_fshift;
diff --git a/fs/xfs/linux/xfs_buf.c b/fs/xfs/linux/xfs_buf.c
index c5f06aad5234..2d4cf586cf85 100644
--- a/fs/xfs/linux/xfs_buf.c
+++ b/fs/xfs/linux/xfs_buf.c
@@ -1013,7 +1013,7 @@ pagebuf_lock(
 {
 	PB_TRACE(pb, "lock", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_sema);
 	PB_SET_OWNER(pb);
 	PB_TRACE(pb, "locked", 0);
@@ -1109,7 +1109,7 @@ _pagebuf_wait_unpin(
 		if (atomic_read(&pb->pb_pin_count) == 0)
 			break;
 		if (atomic_read(&pb->pb_io_remaining))
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 		schedule();
 	}
 	remove_wait_queue(&pb->pb_waiters, &wait);
@@ -1407,7 +1407,7 @@ submit_io:
 	if (pb->pb_flags & PBF_RUN_QUEUES) {
 		pb->pb_flags &= ~PBF_RUN_QUEUES;
 		if (atomic_read(&pb->pb_io_remaining) > 1)
-			blk_run_queues();
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 	}
 }
 
@@ -1471,7 +1471,7 @@ pagebuf_iowait(
 {
 	PB_TRACE(pb, "iowait", 0);
 	if (atomic_read(&pb->pb_io_remaining))
-		blk_run_queues();
+		blk_run_address_space(pb->pb_target->pbr_mapping);
 	down(&pb->pb_iodonesema);
 	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
 	return pb->pb_error;
@@ -1617,7 +1617,6 @@ STATIC int
 pagebuf_daemon(
 	void			*data)
 {
-	int			count;
 	page_buf_t		*pb;
 	struct list_head	*curr, *next, tmp;
 
@@ -1640,7 +1639,6 @@ pagebuf_daemon(
 
 		spin_lock(&pbd_delwrite_lock);
 
-		count = 0;
 		list_for_each_safe(curr, next, &pbd_delwrite_queue) {
 			pb = list_entry(curr, page_buf_t, pb_list);
 
@@ -1657,7 +1655,6 @@ pagebuf_daemon(
 				pb->pb_flags &= ~PBF_DELWRI;
 				pb->pb_flags |= PBF_WRITE;
 				list_move(&pb->pb_list, &tmp);
-				count++;
 			}
 		}
 
@@ -1667,12 +1664,11 @@ pagebuf_daemon(
 			list_del_init(&pb->pb_list);
 
 			pagebuf_iostrategy(pb);
+			blk_run_address_space(pb->pb_target->pbr_mapping);
 		}
 
 		if (as_list_len > 0)
 			purge_addresses();
-		if (count)
-			blk_run_queues();
 
 		force_flush = 0;
 	} while (pagebuf_daemon_active);
@@ -1689,7 +1685,6 @@ pagebuf_delwri_flush(
 	page_buf_t		*pb;
 	struct list_head	*curr, *next, tmp;
 	int			pincount = 0;
-	int			flush_cnt = 0;
 
 	pagebuf_runall_queues(pagebuf_dataio_workqueue);
 	pagebuf_runall_queues(pagebuf_logio_workqueue);
@@ -1733,14 +1728,8 @@ pagebuf_delwri_flush(
 
 		pagebuf_lock(pb);
 		pagebuf_iostrategy(pb);
-		if (++flush_cnt > 32) {
-			blk_run_queues();
-			flush_cnt = 0;
-		}
 	}
 
-	blk_run_queues();
-
 	while (!list_empty(&tmp)) {
 		pb = list_entry(tmp.next, page_buf_t, pb_list);
 
@@ -1751,6 +1740,9 @@ pagebuf_delwri_flush(
 		pagebuf_rele(pb);
 	}
 
+	if (flags & PBDF_WAIT)
+		blk_run_address_space(target->pbr_mapping);
+
 	if (pinptr)
 		*pinptr = pincount;
 }
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index e34916ddd1d7..00371734995c 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -28,9 +28,12 @@ struct backing_dev_info {
 	int memory_backed;	/* Cannot clean pages with writepage */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
 	void *congested_data;	/* Pointer to aux data for congested func */
+	void (*unplug_io_fn)(struct backing_dev_info *);
+	void *unplug_io_data;
 };
 
 extern struct backing_dev_info default_backing_dev_info;
+void default_unplug_io_fn(struct backing_dev_info *bdi);
 
 int writeback_acquire(struct backing_dev_info *bdi);
 int writeback_in_progress(struct backing_dev_info *bdi);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index c421c46bfbb2..c4dd287dd1c8 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -119,11 +119,13 @@ struct bio {
  * bit 1 -- rw-ahead when set
  * bit 2 -- barrier
  * bit 3 -- fail fast, don't want low level driver retries
+ * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
  */
 #define BIO_RW		0
 #define BIO_RW_AHEAD	1
 #define BIO_RW_BARRIER	2
 #define BIO_RW_FAILFAST	3
+#define BIO_RW_SYNC	4
 
 /*
  * various member access, note that bio_data should of course not be used
@@ -138,6 +140,7 @@ struct bio {
 #define bio_cur_sectors(bio)	(bio_iovec(bio)->bv_len >> 9)
 #define bio_data(bio)		(page_address(bio_page((bio))) + bio_offset((bio)))
 #define bio_barrier(bio)	((bio)->bi_rw & (1 << BIO_RW_BARRIER))
+#define bio_sync(bio)		((bio)->bi_rw & (1 << BIO_RW_SYNC))
 
 /*
  * will die
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1a521e16b398..572f96e6940a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -243,7 +243,7 @@ typedef int (merge_requests_fn) (request_queue_t *, struct request *,
 typedef void (request_fn_proc) (request_queue_t *q);
 typedef int (make_request_fn) (request_queue_t *q, struct bio *bio);
 typedef int (prep_rq_fn) (request_queue_t *, struct request *);
-typedef void (unplug_fn) (void *q);
+typedef void (unplug_fn) (request_queue_t *);
 
 struct bio_vec;
 typedef int (merge_bvec_fn) (request_queue_t *, struct bio *, struct bio_vec *);
@@ -315,8 +315,6 @@ struct request_queue
 	unsigned long		bounce_pfn;
 	int			bounce_gfp;
 
-	struct list_head	plug_list;
-
 	/*
 	 * various queue flags, see QUEUE_* below
 	 */
@@ -370,8 +368,9 @@ struct request_queue
 #define QUEUE_FLAG_WRITEFULL	4	/* read queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_REENTER	6	/* Re-entrancy avoidance */
+#define QUEUE_FLAG_PLUGGED	7	/* queue is plugged */
 
-#define blk_queue_plugged(q)	!list_empty(&(q)->plug_list)
+#define blk_queue_plugged(q)	test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 
@@ -515,7 +514,7 @@ extern int scsi_cmd_ioctl(struct gendisk *, unsigned int, unsigned long);
 extern void blk_start_queue(request_queue_t *q);
 extern void blk_stop_queue(request_queue_t *q);
 extern void __blk_stop_queue(request_queue_t *q);
-extern void blk_run_queue(request_queue_t *q);
+extern void blk_run_queue(request_queue_t *);
 extern void blk_queue_activity_fn(request_queue_t *, activity_fn *, void *);
 extern struct request *blk_rq_map_user(request_queue_t *, int, void __user *, unsigned int);
 extern int blk_rq_unmap_user(struct request *, void __user *, unsigned int);
@@ -526,6 +525,18 @@ static inline request_queue_t *bdev_get_queue(struct block_device *bdev)
 	return bdev->bd_disk->queue;
 }
 
+static inline void blk_run_backing_dev(struct backing_dev_info *bdi)
+{
+	if (bdi && bdi->unplug_io_fn)
+		bdi->unplug_io_fn(bdi);
+}
+
+static inline void blk_run_address_space(struct address_space *mapping)
+{
+	if (mapping)
+		blk_run_backing_dev(mapping->backing_dev_info);
+}
+
 /*
  * end_request() and friends. Must be called with the request queue spinlock
  * acquired. All functions called within end_request() _must_be_ atomic.
@@ -572,7 +583,7 @@ extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bd
 
 extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
-extern void generic_unplug_device(void *);
+extern void generic_unplug_device(request_queue_t *);
 extern long nr_blockdev_pages(void);
 
 int blk_get_queue(request_queue_t *);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 39c893f8aa28..c7f0052b4abd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -83,6 +83,8 @@ extern int leases_enable, dir_notify_enable, lease_break_time;
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
+#define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
+#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
 
 #define SEL_IN		1
 #define SEL_OUT		2
diff --git a/include/linux/raid/md.h b/include/linux/raid/md.h
index 240dc450dcd3..9c06e776cfc2 100644
--- a/include/linux/raid/md.h
+++ b/include/linux/raid/md.h
@@ -76,6 +76,7 @@ extern void md_handle_safemode(mddev_t *mddev);
 extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
 extern void md_sync_acct(mdk_rdev_t *rdev, unsigned long nr_sectors);
 extern void md_error (mddev_t *mddev, mdk_rdev_t *rdev);
+extern void md_unplug_mddev(mddev_t *mddev);
 
 extern void md_print_devices (void);
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index bea64b0fb6c1..42c973c53d04 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -326,7 +326,6 @@ do {									\
 		if (condition)						\
 			break;						\
 		spin_unlock_irq(&lock);					\
-		blk_run_queues();					\
 		schedule();						\
 		spin_lock_irq(&lock);					\
 	}								\
@@ -341,30 +340,5 @@ do {									\
 	__wait_event_lock_irq(wq, condition, lock);			\
 } while (0)
 
-
-#define __wait_disk_event(wq, condition) 				\
-do {									\
-	wait_queue_t __wait;						\
-	init_waitqueue_entry(&__wait, current);				\
-									\
-	add_wait_queue(&wq, &__wait);					\
-	for (;;) {							\
-		set_current_state(TASK_UNINTERRUPTIBLE);		\
-		if (condition)						\
-			break;						\
-		blk_run_queues();					\
-		schedule();						\
-	}								\
-	current->state = TASK_RUNNING;					\
-	remove_wait_queue(&wq, &__wait);				\
-} while (0)
-
-#define wait_disk_event(wq, condition) 					\
-do {									\
-	if (condition)	 						\
-		break;							\
-	__wait_disk_event(wq, condition);				\
-} while (0)
-
 #endif
 
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b000c56803b8..d189090cf63a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -197,6 +197,8 @@ extern int shmem_unuse(swp_entry_t entry, struct page *page);
 #define	SWAP_AGAIN	1
 #define	SWAP_FAIL	2
 
+extern void swap_unplug_io_fn(struct backing_dev_info *);
+
 #ifdef CONFIG_SWAP
 /* linux/mm/page_io.c */
 extern int swap_readpage(struct file *, struct page *);
@@ -232,6 +234,7 @@ extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
+struct backing_dev_info;
 
 extern struct swap_list_t swap_list;
 extern spinlock_t swaplock;
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 7e035a9b42d1..6abcf99b7ada 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -84,7 +84,6 @@ static void free_some_memory(void)
 	while (shrink_all_memory(10000))
 		printk(".");
 	printk("|\n");
-	blk_run_queues();
 }
 
 
diff --git a/kernel/power/pmdisk.c b/kernel/power/pmdisk.c
index d54147214bea..22855abbdd6e 100644
--- a/kernel/power/pmdisk.c
+++ b/kernel/power/pmdisk.c
@@ -859,7 +859,6 @@ static int end_io(struct bio * bio, unsigned int num, int err)
 
 static void wait_io(void)
 {
-	blk_run_queues();
 	while(atomic_read(&io_done))
 		io_schedule();
 }
@@ -898,7 +897,7 @@ static int submit(int rw, pgoff_t page_off, void * page)
 	if (rw == WRITE)
 		bio_set_pages_dirty(bio);
 	start_io();
-	submit_bio(rw,bio);
+	submit_bio(rw | (1 << BIO_RW_SYNC), bio);
 	wait_io();
  Done:
 	bio_put(bio);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 20134ab8e0b2..ae748a467af5 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -707,11 +707,6 @@ int software_suspend(void)
 
 		free_some_memory();
 		
-		/* No need to invalidate any vfsmnt list -- 
-		 * they will be valid after resume, anyway.
-		 */
-		blk_run_queues();
-
 		/* Save state of all device drivers, and stop them. */		   
 		if ((res = device_suspend(4))==0)
 			/* If stopping device drivers worked, we proceed basically into
diff --git a/mm/filemap.c b/mm/filemap.c
index ec1952db8baf..dc2f0992d879 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -119,8 +119,10 @@ void remove_from_page_cache(struct page *page)
 
 static inline int sync_page(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping;
 
+	smp_mb();
+	mapping = page->mapping;
 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 		return mapping->a_ops->sync_page(page);
 	return 0;
diff --git a/mm/mempool.c b/mm/mempool.c
index 756e60ee18d6..da6ad1e12c97 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -234,8 +234,6 @@ repeat_alloc:
 	if (!(gfp_mask & __GFP_WAIT))
 		return NULL;
 
-	blk_run_queues();
-
 	prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
 	mb();
 	if (!pool->curr_nr)
diff --git a/mm/nommu.c b/mm/nommu.c
index c940756b49e5..1432dbab85eb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -18,6 +18,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -571,3 +572,7 @@ unsigned long get_unmapped_area(struct file *file, unsigned long addr,
 void pte_chain_init(void)
 {
 }
+
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi)
+{	/* deliberately empty stub; the argument is ignored */
+}
diff --git a/mm/readahead.c b/mm/readahead.c
index 08a2d9f1051d..71bf2462d097 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -15,11 +15,16 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 
+void default_unplug_io_fn(struct backing_dev_info *bdi)
+{
+}
+EXPORT_SYMBOL(default_unplug_io_fn);
+
 struct backing_dev_info default_backing_dev_info = {
 	.ra_pages	= (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE,
 	.state		= 0,
+	.unplug_io_fn	= default_unplug_io_fn,
 };
-
 EXPORT_SYMBOL_GPL(default_backing_dev_info);
 
 /*
@@ -32,7 +37,6 @@ file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
 	ra->ra_pages = mapping->backing_dev_info->ra_pages;
 	ra->average = ra->ra_pages / 2;
 }
-
 EXPORT_SYMBOL(file_ra_state_init);
 
 /*
diff --git a/mm/shmem.c b/mm/shmem.c
index 4116ea26daf1..345e04cb0f6c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -169,6 +169,7 @@ static struct vm_operations_struct shmem_vm_ops;
 static struct backing_dev_info shmem_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn = default_unplug_io_fn,
 };
 
 LIST_HEAD(shmem_inodes);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 22946f0d9ecf..97f80d20807c 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -19,6 +19,7 @@
 static struct backing_dev_info swap_backing_dev_info = {
 	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
+	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
 extern struct address_space_operations swap_aops;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index e5cebb1800b9..f885e6d17a49 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -23,6 +23,7 @@
 #include <linux/module.h>
 #include <linux/rmap-locking.h>
 #include <linux/security.h>
+#include <linux/backing-dev.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -44,8 +45,64 @@ struct swap_list_t swap_list = {-1, -1};
 
 struct swap_info_struct swap_info[MAX_SWAPFILES];
 
+/*
+ * Array of backing blockdevs, for swap_unplug_io_fn.  We need this because a
+ * bdi's unplug_io_fn can sleep and we cannot hold swap_list_lock while calling
+ * it.  And swap_list_lock cannot be turned into a semaphore.
+ */
+static DECLARE_MUTEX(swap_bdevs_sem);
+static struct block_device *swap_bdevs[MAX_SWAPFILES];
+
 #define SWAPFILE_CLUSTER 256
 
+/*
+ * Caller holds swap_bdevs_sem
+ */
+static void install_swap_bdev(struct block_device *bdev)
+{
+	int i;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		if (swap_bdevs[i] == NULL) {
+			swap_bdevs[i] = bdev;
+			return;
+		}
+	}
+	BUG();
+}
+
+static void remove_swap_bdev(struct block_device *bdev)
+{
+	int i;
+
+	/* Caller holds swap_bdevs_sem.  Drop bdev and close the gap. */
+	for (i = 0; i < MAX_SWAPFILES && swap_bdevs[i] != bdev; i++)
+		;
+	if (i == MAX_SWAPFILES)
+		BUG();
+	/* Source and destination overlap, so memcpy() would be undefined */
+	memmove(&swap_bdevs[i], &swap_bdevs[i + 1],
+		(MAX_SWAPFILES - i - 1) * sizeof(*swap_bdevs));
+	swap_bdevs[MAX_SWAPFILES - 1] = NULL;
+}
+
+void swap_unplug_io_fn(struct backing_dev_info *unused_bdi)
+{
+	int i;
+
+	down(&swap_bdevs_sem);
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		struct block_device *bdev = swap_bdevs[i];
+		struct backing_dev_info *bdi;
+
+		if (bdev == NULL)
+			break;
+		bdi = bdev->bd_inode->i_mapping->backing_dev_info;
+		(*bdi->unplug_io_fn)(bdi);
+	}
+	up(&swap_bdevs_sem);
+}
+
 static inline int scan_swap_map(struct swap_info_struct *si)
 {
 	unsigned long offset;
@@ -1088,6 +1145,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 		swap_list_unlock();
 		goto out_dput;
 	}
+	down(&swap_bdevs_sem);
 	swap_list_lock();
 	swap_device_lock(p);
 	swap_file = p->swap_file;
@@ -1099,6 +1157,8 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	destroy_swap_extents(p);
 	swap_device_unlock(p);
 	swap_list_unlock();
+	remove_swap_bdev(p->bdev);
+	up(&swap_bdevs_sem);
 	vfree(swap_map);
 	if (S_ISBLK(mapping->host->i_mode)) {
 		struct block_device *bdev = I_BDEV(mapping->host);
@@ -1440,6 +1500,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	if (error)
 		goto bad_swap;
 
+	down(&swap_bdevs_sem);
 	swap_list_lock();
 	swap_device_lock(p);
 	p->flags = SWP_ACTIVE;
@@ -1465,6 +1526,8 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
 	}
 	swap_device_unlock(p);
 	swap_list_unlock();
+	install_swap_bdev(p->bdev);
+	up(&swap_bdevs_sem);
 	error = 0;
 	goto out;
 bad_swap:
@@ -1484,7 +1547,7 @@ bad_swap_2:
 	destroy_swap_extents(p);
 	if (swap_map)
 		vfree(swap_map);
-	if (swap_file && !IS_ERR(swap_file))
+	if (swap_file)
 		filp_close(swap_file, NULL);
 out:
 	if (page && !IS_ERR(page)) {
-- 
cgit v1.2.3


From 66db15b4577185624ae95ffe99a66305c8c63ef7 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:16:17 -0700
Subject: [PATCH] unplugging: md update

From: Neil Brown <neilb@cse.unsw.edu.au>

I've made a bunch of changes to the 'md' bits - largely moving the
unplugging into the individual personalities which know more about which
drives are actually in use.
---
 drivers/md/linear.c       | 15 +++++++++++++
 drivers/md/md.c           | 35 +++++------------------------
 drivers/md/multipath.c    | 23 +++++++++++++++++++
 drivers/md/raid0.c        | 17 ++++++++++++++
 drivers/md/raid1.c        | 56 ++++++++++++++++++++++++++++++++++++++---------
 drivers/md/raid5.c        | 36 ++++++++++++++++++++++++++----
 drivers/md/raid6main.c    | 36 +++++++++++++++++++++++++++---
 include/linux/raid/md_k.h |  7 +++---
 8 files changed, 175 insertions(+), 50 deletions(-)

(limited to 'include')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 1198e07e7abe..e0aa017a26b7 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -80,6 +80,20 @@ static int linear_mergeable_bvec(request_queue_t *q, struct bio *bio, struct bio
 	return maxsectors << 9;
 }
 
+static void linear_unplug(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+	linear_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i < mddev->raid_disks; i++) {
+		request_queue_t *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+
+
 static int linear_run (mddev_t *mddev)
 {
 	linear_conf_t *conf;
@@ -185,6 +199,7 @@ static int linear_run (mddev_t *mddev)
 		BUG();
 
 	blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
+	mddev->queue->unplug_fn = linear_unplug;
 	return 0;
 
 out:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 72d6a2da5827..b521ca509b1e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -160,30 +160,6 @@ static int md_fail_request (request_queue_t *q, struct bio *bio)
 	return 0;
 }
 
-void md_unplug_mddev(mddev_t *mddev)
-{
-	struct list_head *tmp;
-	mdk_rdev_t *rdev;
-
-	/*
-	 * this list iteration is done without any locking in md?!
-	 */
-	ITERATE_RDEV(mddev, rdev, tmp) {
-		request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
-
-		if (r_queue->unplug_fn)
-			r_queue->unplug_fn(r_queue);
-	}
-}
-EXPORT_SYMBOL(md_unplug_mddev);
-
-static void md_unplug_all(request_queue_t *q)
-{
-	mddev_t *mddev = q->queuedata;
-
-	md_unplug_mddev(mddev);
-}
-
 static inline mddev_t *mddev_get(mddev_t *mddev)
 {
 	atomic_inc(&mddev->active);
@@ -1669,7 +1645,6 @@ static int do_md_run(mddev_t * mddev)
 	 */
 	mddev->queue->queuedata = mddev;
 	mddev->queue->make_request_fn = mddev->pers->make_request;
-	mddev->queue->unplug_fn = md_unplug_all;
 
 	mddev->changed = 1;
 	return 0;
@@ -2742,10 +2717,9 @@ int md_thread(void * arg)
 		clear_bit(THREAD_WAKEUP, &thread->flags);
 
 		run = thread->run;
-		if (run) {
+		if (run)
 			run(thread->mddev);
-			md_unplug_mddev(thread->mddev);
-		}
+
 		if (signal_pending(current))
 			flush_signals(current);
 	}
@@ -3313,8 +3287,6 @@ static void md_do_sync(mddev_t *mddev)
 		    test_bit(MD_RECOVERY_ERR, &mddev->recovery))
 			break;
 
-		md_unplug_mddev(mddev);
-
 	repeat:
 		if (jiffies >= mark[last_mark] + SYNC_MARK_STEP ) {
 			/* step marks */
@@ -3347,6 +3319,7 @@ static void md_do_sync(mddev_t *mddev)
 		 * about not overloading the IO subsystem. (things like an
 		 * e2fsck being done on the RAID array should execute fast)
 		 */
+		mddev->queue->unplug_fn(mddev->queue);
 		cond_resched();
 
 		currspeed = ((unsigned long)(j-mddev->resync_mark_cnt))/2/((jiffies-mddev->resync_mark)/HZ +1) +1;
@@ -3365,6 +3338,8 @@ static void md_do_sync(mddev_t *mddev)
 	 * this also signals 'finished resyncing' to md_stop
 	 */
  out:
+	mddev->queue->unplug_fn(mddev->queue);
+
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
 	/* tell personality that we are finished */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index bf9980a8b1fd..9114c7c269ed 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -155,6 +155,27 @@ static int multipath_read_balance (multipath_conf_t *conf)
 	return 0;
 }
 
+static void unplug_slaves(mddev_t *mddev)
+{
+	multipath_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->multipaths[i].rdev;
+		if (rdev && !rdev->faulty) {
+			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+			if (r_queue->unplug_fn)
+				r_queue->unplug_fn(r_queue);
+		}
+	}
+}
+static void multipath_unplug(request_queue_t *q)
+{
+	unplug_slaves(q->queuedata);
+}
+
+
 static int multipath_make_request (request_queue_t *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
@@ -419,6 +440,8 @@ static int multipath_run (mddev_t *mddev)
 	}
 	memset(conf->multipaths, 0, sizeof(struct multipath_info)*mddev->raid_disks);
 
+	mddev->queue->unplug_fn = multipath_unplug;
+
 	conf->working_disks = 0;
 	ITERATE_RDEV(mddev,rdev,tmp) {
 		disk_idx = rdev->raid_disk;
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 3cbf14021820..5f4b8bfefc91 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -25,6 +25,21 @@
 #define MD_DRIVER
 #define MD_PERSONALITY
 
+static void raid0_unplug(request_queue_t *q)
+{
+	mddev_t *mddev = q->queuedata;
+	raid0_conf_t *conf = mddev_to_conf(mddev);
+	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		request_queue_t *r_queue = bdev_get_queue(devlist[i]->bdev);
+
+		if (r_queue->unplug_fn)
+			r_queue->unplug_fn(r_queue);
+	}
+}
+
 static int create_strip_zones (mddev_t *mddev)
 {
 	int i, c, j;
@@ -202,6 +217,8 @@ static int create_strip_zones (mddev_t *mddev)
 			conf->hash_spacing = sz;
 	}
 
+	mddev->queue->unplug_fn = raid0_unplug;
+
 	printk("raid0: done.\n");
 	return 0;
  abort:
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 6616cd46c50f..bcc81ef13a35 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -37,6 +37,9 @@ static mdk_personality_t raid1_personality;
 static spinlock_t retry_list_lock = SPIN_LOCK_UNLOCKED;
 static LIST_HEAD(retry_list_head);
 
+static void unplug_slaves(mddev_t *mddev);
+
+
 static void * r1bio_pool_alloc(int gfp_flags, void *data)
 {
 	mddev_t *mddev = data;
@@ -47,6 +50,8 @@ static void * r1bio_pool_alloc(int gfp_flags, void *data)
 			 gfp_flags);
 	if (r1_bio)
 		memset(r1_bio, 0, sizeof(*r1_bio) + sizeof(struct bio*)*mddev->raid_disks);
+	else
+		unplug_slaves(mddev);
 
 	return r1_bio;
 }
@@ -71,8 +76,10 @@ static void * r1buf_pool_alloc(int gfp_flags, void *data)
 	int i, j;
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, conf->mddev);
-	if (!r1_bio)
+	if (!r1_bio) {
+		unplug_slaves(conf->mddev);
 		return NULL;
+	}
 
 	/*
 	 * Allocate bios : 1 for reading, n-1 for writing
@@ -443,6 +450,29 @@ rb_out:
 	return new_disk;
 }
 
+static void unplug_slaves(mddev_t *mddev)
+{
+	conf_t *conf = mddev_to_conf(mddev);
+	int i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&conf->device_lock, flags);
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->mirrors[i].rdev;
+		if (rdev && !rdev->faulty) {
+			request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
+
+			if (r_queue->unplug_fn)
+				r_queue->unplug_fn(r_queue);
+		}
+	}
+	spin_unlock_irqrestore(&conf->device_lock, flags);
+}
+static void raid1_unplug(request_queue_t *q)
+{
+	unplug_slaves(q->queuedata);
+}
+
 /*
  * Throttle resync depth, so that we can both get proper overlapping of
  * requests, but are still able to handle normal requests quickly.
@@ -451,16 +481,18 @@ rb_out:
 
 static void device_barrier(conf_t *conf, sector_t sect)
 {
-	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume), conf->resync_lock);
+	wait_event_lock_irq(conf->wait_idle, !waitqueue_active(&conf->wait_resume),
+			    conf->resync_lock, unplug_slaves(conf->mddev));
 	
 	if (!conf->barrier++) {
-		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending, conf->resync_lock);
+		wait_event_lock_irq(conf->wait_idle, !conf->nr_pending,
+				    conf->resync_lock, unplug_slaves(conf->mddev));
 		if (conf->nr_pending)
 			BUG();
 	}
-	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH, conf->resync_lock);
+	wait_event_lock_irq(conf->wait_resume, conf->barrier < RESYNC_DEPTH,
+			    conf->resync_lock, unplug_slaves(conf->mddev));
 	conf->next_resync = sect;
 	spin_unlock_irq(&conf->resync_lock);
 }
@@ -479,9 +511,8 @@ static int make_request(request_queue_t *q, struct bio * bio)
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
-	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
+	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock, );
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
 
@@ -646,9 +677,9 @@ static void print_conf(conf_t *conf)
 
 static void close_sync(conf_t *conf)
 {
-	md_unplug_mddev(conf->mddev);
 	spin_lock_irq(&conf->resync_lock);
-	wait_event_lock_irq(conf->wait_resume, !conf->barrier, conf->resync_lock);
+	wait_event_lock_irq(conf->wait_resume, !conf->barrier,
+			    conf->resync_lock, 	unplug_slaves(conf->mddev));
 	spin_unlock_irq(&conf->resync_lock);
 
 	if (conf->barrier) BUG();
@@ -862,6 +893,7 @@ static void raid1d(mddev_t *mddev)
 	struct bio *bio;
 	unsigned long flags;
 	conf_t *conf = mddev_to_conf(mddev);
+	int unplug=0;
 	mdk_rdev_t *rdev;
 
 	md_check_recovery(mddev);
@@ -881,6 +913,7 @@ static void raid1d(mddev_t *mddev)
 		bio = r1_bio->master_bio;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
+			unplug = 1;
 		} else {
 			if (map(mddev, &rdev) == -1) {
 				printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -896,12 +929,14 @@ static void raid1d(mddev_t *mddev)
 				bio->bi_bdev = rdev->bdev;
 				bio->bi_sector = r1_bio->sector + rdev->data_offset;
 				bio->bi_rw = READ;
-
+				unplug = 1;
 				generic_make_request(bio);
 			}
 		}
 	}
 	spin_unlock_irqrestore(&retry_list_lock, flags);
+	if (unplug)
+		unplug_slaves(mddev);
 }
 
 
@@ -1104,6 +1139,7 @@ static int run(mddev_t *mddev)
 			mdname(mddev));
 		goto out_free_conf;
 	}
+	mddev->queue->unplug_fn = raid1_unplug;
 
 
 	ITERATE_RDEV(mddev, rdev, tmp) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5c9d3fd66913..05087b8ae056 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -231,6 +231,8 @@ static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector)
 	return NULL;
 }
 
+static void unplug_slaves(mddev_t *mddev);
+
 static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector,
 					     int pd_idx, int noblock) 
 {
@@ -249,12 +251,13 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector
 				break;
 			if (!sh) {
 				conf->inactive_blocked = 1;
-				md_unplug_mddev(conf->mddev);
 				wait_event_lock_irq(conf->wait_for_stripe,
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
 						     || !conf->inactive_blocked),
-						    conf->device_lock);
+						    conf->device_lock,
+						    unplug_slaves(conf->mddev);
+					);
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, pd_idx);
@@ -1293,6 +1296,25 @@ static inline void raid5_activate_delayed(raid5_conf_t *conf)
 		}
 	}
 }
+
+static void unplug_slaves(mddev_t *mddev)
+{
+	raid5_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			if (bdev) {
+				request_queue_t *r_queue = bdev_get_queue(bdev);
+				if (r_queue && r_queue->unplug_fn)
+					r_queue->unplug_fn(r_queue);
+			}
+		}
+	}
+}
+
 static void raid5_unplug_device(request_queue_t *q)
 {
 	mddev_t *mddev = q->queuedata;
@@ -1306,6 +1328,8 @@ static void raid5_unplug_device(request_queue_t *q)
 	md_wakeup_thread(mddev->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
+
+	unplug_slaves(mddev);
 }
 
 static inline void raid5_plug_device(raid5_conf_t *conf)
@@ -1392,9 +1416,11 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks-1;
 
-	if (sector_nr >= mddev->size <<1)
-		/* just being told to finish up .. nothing to do */
+	if (sector_nr >= mddev->size <<1) {
+		/* just being told to finish up .. nothing much to do */
+		unplug_slaves(mddev);
 		return 0;
+	}
 
 	x = sector_nr;
 	chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1474,6 +1500,8 @@ static void raid5d (mddev_t *mddev)
 
 	spin_unlock_irq(&conf->device_lock);
 
+	unplug_slaves(mddev);
+
 	PRINTK("--- raid5d inactive\n");
 }
 
diff --git a/drivers/md/raid6main.c b/drivers/md/raid6main.c
index 131f4a1f34eb..99d08d67342f 100644
--- a/drivers/md/raid6main.c
+++ b/drivers/md/raid6main.c
@@ -250,6 +250,8 @@ static struct stripe_head *__find_stripe(raid6_conf_t *conf, sector_t sector)
 	return NULL;
 }
 
+static void unplug_slaves(mddev_t *mddev);
+
 static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector,
 					     int pd_idx, int noblock)
 {
@@ -272,7 +274,9 @@ static struct stripe_head *get_active_stripe(raid6_conf_t *conf, sector_t sector
 						    !list_empty(&conf->inactive_list) &&
 						    (atomic_read(&conf->active_stripes) < (NR_STRIPES *3/4)
 						     || !conf->inactive_blocked),
-						    conf->device_lock);
+						    conf->device_lock,
+						    unplug_slaves(conf->mddev);
+					);
 				conf->inactive_blocked = 0;
 			} else
 				init_stripe(sh, sector, pd_idx);
@@ -1454,6 +1458,26 @@ static inline void raid6_activate_delayed(raid6_conf_t *conf)
 		}
 	}
 }
+
+static void unplug_slaves(mddev_t *mddev)
+{
+	/* NOTE(review): visible callers drop device_lock before calling this - confirm */
+	raid6_conf_t *conf = mddev_to_conf(mddev);
+	int i;
+
+	for (i=0; i<mddev->raid_disks; i++) {
+		mdk_rdev_t *rdev = conf->disks[i].rdev;
+		if (rdev && !rdev->faulty) {
+			struct block_device *bdev = rdev->bdev;
+			if (bdev) {
+				request_queue_t *r_queue = bdev_get_queue(bdev);
+				if (r_queue && r_queue->unplug_fn)
+					r_queue->unplug_fn(r_queue);
+			}
+		}
+	}
+}
+
 static void raid6_unplug_device(request_queue_t *q)
 {
 	mddev_t *mddev = q->queuedata;
@@ -1467,6 +1491,8 @@ static void raid6_unplug_device(request_queue_t *q)
 	md_wakeup_thread(mddev->thread);
 
 	spin_unlock_irqrestore(&conf->device_lock, flags);
+
+	unplug_slaves(mddev);
 }
 
 static inline void raid6_plug_device(raid6_conf_t *conf)
@@ -1553,9 +1579,11 @@ static int sync_request (mddev_t *mddev, sector_t sector_nr, int go_faster)
 	int raid_disks = conf->raid_disks;
 	int data_disks = raid_disks - 2;
 
-	if (sector_nr >= mddev->size <<1)
-		/* just being told to finish up .. nothing to do */
+	if (sector_nr >= mddev->size <<1) {
+		/* just being told to finish up .. nothing much to do */
+		unplug_slaves(mddev);
 		return 0;
+	}
 
 	x = sector_nr;
 	chunk_offset = sector_div(x, sectors_per_chunk);
@@ -1635,6 +1663,8 @@ static void raid6d (mddev_t *mddev)
 
 	spin_unlock_irq(&conf->device_lock);
 
+	unplug_slaves(mddev);
+
 	PRINTK("--- raid6d inactive\n");
 }
 
diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h
index 42c973c53d04..0b6b5e6f34eb 100644
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -315,7 +315,7 @@ typedef struct mdk_thread_s {
 
 #define THREAD_WAKEUP  0
 
-#define __wait_event_lock_irq(wq, condition, lock) 			\
+#define __wait_event_lock_irq(wq, condition, lock, cmd) 		\
 do {									\
 	wait_queue_t __wait;						\
 	init_waitqueue_entry(&__wait, current);				\
@@ -326,6 +326,7 @@ do {									\
 		if (condition)						\
 			break;						\
 		spin_unlock_irq(&lock);					\
+		cmd;							\
 		schedule();						\
 		spin_lock_irq(&lock);					\
 	}								\
@@ -333,11 +334,11 @@ do {									\
 	remove_wait_queue(&wq, &__wait);				\
 } while (0)
 
-#define wait_event_lock_irq(wq, condition, lock) 			\
+#define wait_event_lock_irq(wq, condition, lock, cmd) 			\
 do {									\
 	if (condition)	 						\
 		break;							\
-	__wait_event_lock_irq(wq, condition, lock);			\
+	__wait_event_lock_irq(wq, condition, lock, cmd);		\
 } while (0)
 
 #endif
-- 
cgit v1.2.3


From 1dc841edc41a3014ece92b72013b3b57b0424e6b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:16:32 -0700
Subject: [PATCH] Correct unplugs on nr_queued

From: Jens Axboe <axboe@suse.de>

There's a small discrepancy in when we decide to unplug a queue based on
q->unplug_thresh.  Basically it doesn't work for tagged queues, since
q->rq.count[READ] + q->rq.count[WRITE] is just the number of allocated
requests, not the number of requests stuck in the io scheduler.  We could
just change the nr_queued == to a nr_queued >=, however that is still
suboptimal.

This patch adds accounting for requests that have been dequeued from the io
scheduler, but not freed yet.  These are q->in_flight.  allocated_requests
- q->in_flight == requests_in_scheduler.  So the condition correctly
becomes

	if (requests_in_scheduler == q->unplug_thresh)

instead.  I did a quick round of testing, and for dbench on a SCSI disk the
number of timer induced unplugs was reduced from 13 to 5 :-).  Not a huge
number, but there might be cases where it's more significant.  Either way,
it gets ->unplug_thresh always right, which the old logic didn't.
---
 drivers/block/elevator.c  | 23 +++++++++++++++++++++++
 drivers/block/ll_rw_blk.c |  4 ++--
 include/linux/blkdev.h    |  5 +++++
 3 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index 40377d4a030a..c42fd0ddd75f 100644
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@@ -149,6 +149,13 @@ void elv_merge_requests(request_queue_t *q, struct request *rq,
 
 void elv_requeue_request(request_queue_t *q, struct request *rq)
 {
+	/*
+	 * it already went through dequeue, we need to decrement the
+	 * in_flight count again
+	 */
+	if (blk_account_rq(rq))
+		q->in_flight--;
+
 	/*
 	 * if iosched has an explicit requeue hook, then use that. otherwise
 	 * just put the request at the front of the queue
@@ -232,6 +239,16 @@ void elv_remove_request(request_queue_t *q, struct request *rq)
 {
 	elevator_t *e = &q->elevator;
 
+	/*
+	 * the time frame between a request being removed from the lists
+	 * and it being freed is accounted as io that is in progress at
+	 * the driver side. note that we only account requests that the
+	 * driver has seen (REQ_STARTED set), to avoid false accounting
+	 * for request-request merges
+	 */
+	if (blk_account_rq(rq))
+		q->in_flight++;
+
 	/*
 	 * the main clearing point for q->last_merge is on retrieval of
 	 * request by driver (it calls elv_next_request()), but it _can_
@@ -321,6 +338,12 @@ void elv_completed_request(request_queue_t *q, struct request *rq)
 {
 	elevator_t *e = &q->elevator;
 
+	/*
+	 * request is released from the driver, io must be done
+	 */
+	if (blk_account_rq(rq))
+		q->in_flight--;
+
 	if (e->elevator_completed_req_fn)
 		e->elevator_completed_req_fn(q, rq);
 }
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 209fdef4d986..6b0ff2c5f092 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -2275,9 +2275,9 @@ out:
 		__blk_put_request(q, freereq);
 
 	if (blk_queue_plugged(q)) {
-		int nr_queued = q->rq.count[READ] + q->rq.count[WRITE];
+		int nrq = q->rq.count[READ] + q->rq.count[WRITE] - q->in_flight;
 
-		if (nr_queued == q->unplug_thresh || bio_sync(bio))
+		if (nrq == q->unplug_thresh || bio_sync(bio))
 			__generic_unplug_device(q);
 	}
 	spin_unlock_irq(q->queue_lock);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 572f96e6940a..44c722d4b67b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -348,6 +348,8 @@ struct request_queue
 
 	atomic_t		refcnt;
 
+	unsigned int		in_flight;
+
 	/*
 	 * sg stuff
 	 */
@@ -377,6 +379,9 @@ struct request_queue
 #define blk_fs_request(rq)	((rq)->flags & REQ_CMD)
 #define blk_pc_request(rq)	((rq)->flags & REQ_BLOCK_PC)
 #define blk_noretry_request(rq)	((rq)->flags & REQ_FAILFAST)
+#define blk_rq_started(rq)	((rq)->flags & REQ_STARTED)
+
+#define blk_account_rq(rq)	(blk_rq_started(rq) && blk_fs_request(rq))
 
 #define blk_pm_suspend_request(rq)	((rq)->flags & REQ_PM_SUSPEND)
 #define blk_pm_resume_request(rq)	((rq)->flags & REQ_PM_RESUME)
-- 
cgit v1.2.3


From 3e2ea65d7be031644c6d6a3ed5bd89af5ef2f090 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:16:44 -0700
Subject: [PATCH] CFQ io scheduler

From: Jens Axboe <axboe@suse.de>

CFQ I/O scheduler
---
 drivers/block/Kconfig.iosched |   7 +
 drivers/block/Makefile        |   1 +
 drivers/block/cfq-iosched.c   | 707 ++++++++++++++++++++++++++++++++++++++++++
 drivers/block/ll_rw_blk.c     |   6 +
 include/linux/elevator.h      |   5 +
 5 files changed, 726 insertions(+)
 create mode 100644 drivers/block/cfq-iosched.c

(limited to 'include')

diff --git a/drivers/block/Kconfig.iosched b/drivers/block/Kconfig.iosched
index fa5476571677..d938c5fd130b 100644
--- a/drivers/block/Kconfig.iosched
+++ b/drivers/block/Kconfig.iosched
@@ -27,3 +27,10 @@ config IOSCHED_DEADLINE
 	  a disk at any one time, its behaviour is almost identical to the
 	  anticipatory I/O scheduler and so is a good choice.
 
+config IOSCHED_CFQ
+	bool "CFQ I/O scheduler" if EMBEDDED
+	default y
+	---help---
+	  The CFQ I/O scheduler tries to distribute bandwidth equally
+	  among all processes in the system. It should provide a fair
+	  working environment, suitable for desktop systems.
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index da1ce1b1361f..33b14e84cd8c 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -18,6 +18,7 @@ obj-y	:= elevator.o ll_rw_blk.o ioctl.o genhd.o scsi_ioctl.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
+obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 obj-$(CONFIG_MAC_FLOPPY)	+= swim3.o
 obj-$(CONFIG_BLK_DEV_FD)	+= floppy.o
 obj-$(CONFIG_BLK_DEV_FD98)	+= floppy98.o
diff --git a/drivers/block/cfq-iosched.c b/drivers/block/cfq-iosched.c
new file mode 100644
index 000000000000..11528978acf4
--- /dev/null
+++ b/drivers/block/cfq-iosched.c
@@ -0,0 +1,707 @@
+/*
+ *  linux/drivers/block/cfq-iosched.c
+ *
+ *  CFQ, or complete fairness queueing, disk scheduler.
+ *
+ *  Based on ideas from a previously unfinished io
+ *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
+ *
+ *  Copyright (C) 2003 Jens Axboe <axboe@suse.de>
+ */
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/elevator.h>
+#include <linux/bio.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/hash.h>
+#include <linux/rbtree.h>
+#include <linux/mempool.h>
+
+/*
+ * tunables
+ */
+static int cfq_quantum = 4;
+static int cfq_queued = 8;
+
+#define CFQ_QHASH_SHIFT		6
+#define CFQ_QHASH_ENTRIES	(1 << CFQ_QHASH_SHIFT)
+#define list_entry_qhash(entry)	list_entry((entry), struct cfq_queue, cfq_hash)
+
+#define CFQ_MHASH_SHIFT		8
+#define CFQ_MHASH_BLOCK(sec)	((sec) >> 3)
+#define CFQ_MHASH_ENTRIES	(1 << CFQ_MHASH_SHIFT)
+#define CFQ_MHASH_FN(sec)	(hash_long(CFQ_MHASH_BLOCK((sec)),CFQ_MHASH_SHIFT))
+#define ON_MHASH(crq)		!list_empty(&(crq)->hash)
+#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
+#define list_entry_hash(ptr)	list_entry((ptr), struct cfq_rq, hash)
+
+#define list_entry_cfqq(ptr)	list_entry((ptr), struct cfq_queue, cfq_list)
+
+#define RQ_DATA(rq)		((struct cfq_rq *) (rq)->elevator_private)
+
+static kmem_cache_t *crq_pool;
+static kmem_cache_t *cfq_pool;
+static mempool_t *cfq_mpool;
+
+struct cfq_data {
+	struct list_head rr_list;
+	struct list_head *dispatch;
+	struct list_head *cfq_hash;
+
+	struct list_head *crq_hash;
+
+	unsigned int busy_queues;
+	unsigned int max_queued;
+
+	mempool_t *crq_pool;
+};
+
+struct cfq_queue {
+	struct list_head cfq_hash;
+	struct list_head cfq_list;
+	struct rb_root sort_list;
+	int pid;
+	int queued[2];
+#if 0
+	/*
+	 * with a simple addition like this, we can do io priorities. almost.
+	 * does need a split request free list, too.
+	 */
+	int io_prio
+#endif
+};
+
+struct cfq_rq {
+	struct rb_node rb_node;
+	sector_t rb_key;
+
+	struct request *request;
+
+	struct cfq_queue *cfq_queue;
+
+	struct list_head hash;
+};
+
+static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq);
+static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid);
+static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq);
+
+/*
+ * lots of deadline iosched dupes, can be abstracted later...
+ */
+static inline void __cfq_del_crq_hash(struct cfq_rq *crq)
+{
+	list_del_init(&crq->hash);
+}
+
+static inline void cfq_del_crq_hash(struct cfq_rq *crq)
+{
+	if (ON_MHASH(crq))
+		__cfq_del_crq_hash(crq);
+}
+
+static void cfq_remove_merge_hints(request_queue_t *q, struct cfq_rq *crq)
+{
+	cfq_del_crq_hash(crq);
+
+	if (q->last_merge == crq->request)
+		q->last_merge = NULL;
+}
+
+static inline void cfq_add_crq_hash(struct cfq_data *cfqd, struct cfq_rq *crq)
+{
+	struct request *rq = crq->request;
+
+	BUG_ON(ON_MHASH(crq));
+
+	list_add(&crq->hash, &cfqd->crq_hash[CFQ_MHASH_FN(rq_hash_key(rq))]);
+}
+
+static struct request *cfq_find_rq_hash(struct cfq_data *cfqd, sector_t offset)
+{
+	struct list_head *hash_list = &cfqd->crq_hash[CFQ_MHASH_FN(offset)];
+	struct list_head *entry, *next = hash_list->next;
+
+	while ((entry = next) != hash_list) {
+		struct cfq_rq *crq = list_entry_hash(entry);
+		struct request *__rq = crq->request;
+
+		next = entry->next;
+
+		BUG_ON(!ON_MHASH(crq));
+
+		if (!rq_mergeable(__rq)) {
+			__cfq_del_crq_hash(crq);
+			continue;
+		}
+
+		if (rq_hash_key(__rq) == offset)
+			return __rq;
+	}
+
+	return NULL;
+}
+
+/*
+ * rb tree support functions
+ */
+#define RB_NONE		(2)
+#define RB_EMPTY(node)	((node)->rb_node == NULL)
+#define RB_CLEAR(node)	((node)->rb_color = RB_NONE)
+#define RB_CLEAR_ROOT(root)	((root)->rb_node = NULL)
+#define ON_RB(node)	((node)->rb_color != RB_NONE)
+#define rb_entry_crq(node)	rb_entry((node), struct cfq_rq, rb_node)
+#define rq_rb_key(rq)		(rq)->sector
+
+static inline void cfq_del_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
+{
+	if (ON_RB(&crq->rb_node)) {
+		cfqq->queued[rq_data_dir(crq->request)]--;
+		rb_erase(&crq->rb_node, &cfqq->sort_list);
+		crq->cfq_queue = NULL;
+	}
+}
+
+static struct cfq_rq *
+__cfq_add_crq_rb(struct cfq_queue *cfqq, struct cfq_rq *crq)
+{
+	struct rb_node **p = &cfqq->sort_list.rb_node;
+	struct rb_node *parent = NULL;
+	struct cfq_rq *__crq;
+
+	while (*p) {
+		parent = *p;
+		__crq = rb_entry_crq(parent);
+
+		if (crq->rb_key < __crq->rb_key)
+			p = &(*p)->rb_left;
+		else if (crq->rb_key > __crq->rb_key)
+			p = &(*p)->rb_right;
+		else
+			return __crq;
+	}
+
+	rb_link_node(&crq->rb_node, parent, p);
+	return 0;
+}
+
+static void
+cfq_add_crq_rb(struct cfq_data *cfqd, struct cfq_queue *cfqq,struct cfq_rq *crq)
+{
+	struct request *rq = crq->request;
+	struct cfq_rq *__alias;
+
+	crq->rb_key = rq_rb_key(rq);
+	cfqq->queued[rq_data_dir(rq)]++;
+retry:
+	__alias = __cfq_add_crq_rb(cfqq, crq);
+	if (!__alias) {
+		rb_insert_color(&crq->rb_node, &cfqq->sort_list);
+		crq->cfq_queue = cfqq;
+		return;
+	}
+
+	cfq_del_crq_rb(cfqq, __alias);
+	cfq_dispatch_sort(cfqd->dispatch, __alias);
+	goto retry;
+}
+
+static struct request *
+cfq_find_rq_rb(struct cfq_data *cfqd, sector_t sector)
+{
+	struct cfq_queue *cfqq = cfq_find_cfq_hash(cfqd, current->tgid);
+	struct rb_node *n;
+
+	if (!cfqq)
+		goto out;
+
+	n = cfqq->sort_list.rb_node;
+	while (n) {
+		struct cfq_rq *crq = rb_entry_crq(n);
+
+		if (sector < crq->rb_key)
+			n = n->rb_left;
+		else if (sector > crq->rb_key)
+			n = n->rb_right;
+		else
+			return crq->request;
+	}
+
+out:
+	return NULL;
+}
+
+static void cfq_remove_request(request_queue_t *q, struct request *rq)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = RQ_DATA(rq);
+
+	if (crq) {
+		struct cfq_queue *cfqq = crq->cfq_queue;
+
+		cfq_remove_merge_hints(q, crq);
+		list_del_init(&rq->queuelist);
+
+		if (cfqq) {
+			cfq_del_crq_rb(cfqq, crq);
+
+			if (RB_EMPTY(&cfqq->sort_list))
+				cfq_put_queue(cfqd, cfqq);
+		}
+	}
+}
+
+static int
+cfq_merge(request_queue_t *q, struct request **req, struct bio *bio)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct request *__rq;
+	int ret;
+
+	ret = elv_try_last_merge(q, bio);
+	if (ret != ELEVATOR_NO_MERGE) {
+		__rq = q->last_merge;
+		goto out_insert;
+	}
+
+	__rq = cfq_find_rq_hash(cfqd, bio->bi_sector);
+	if (__rq) {
+		BUG_ON(__rq->sector + __rq->nr_sectors != bio->bi_sector);
+
+		if (elv_rq_merge_ok(__rq, bio)) {
+			ret = ELEVATOR_BACK_MERGE;
+			goto out;
+		}
+	}
+
+	__rq = cfq_find_rq_rb(cfqd, bio->bi_sector + bio_sectors(bio));
+	if (__rq) {
+		if (elv_rq_merge_ok(__rq, bio)) {
+			ret = ELEVATOR_FRONT_MERGE;
+			goto out;
+		}
+	}
+
+	return ELEVATOR_NO_MERGE;
+out:
+	q->last_merge = __rq;
+out_insert:
+	*req = __rq;
+	return ret;
+}
+
+static void cfq_merged_request(request_queue_t *q, struct request *req)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = RQ_DATA(req);
+
+	cfq_del_crq_hash(crq);
+	cfq_add_crq_hash(cfqd, crq);
+
+	if (ON_RB(&crq->rb_node) && (rq_rb_key(req) != crq->rb_key)) {
+		struct cfq_queue *cfqq = crq->cfq_queue;
+
+		cfq_del_crq_rb(cfqq, crq);
+		cfq_add_crq_rb(cfqd, cfqq, crq);
+	}
+
+	q->last_merge = req;
+}
+
+static void
+cfq_merged_requests(request_queue_t *q, struct request *req,
+		    struct request *next)
+{
+	cfq_merged_request(q, req);
+	cfq_remove_request(q, next);
+}
+
+static void cfq_dispatch_sort(struct list_head *head, struct cfq_rq *crq)
+{
+	struct list_head *entry = head;
+	struct request *__rq;
+
+	if (!list_empty(head)) {
+		__rq = list_entry_rq(head->next);
+
+		if (crq->request->sector < __rq->sector) {
+			entry = head->prev;
+			goto link;
+		}
+	}
+
+	while ((entry = entry->prev) != head) {
+		__rq = list_entry_rq(entry);
+
+		if (crq->request->sector <= __rq->sector)
+			break;
+	}
+
+link:
+	list_add_tail(&crq->request->queuelist, entry);
+}
+
+static inline void
+__cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd,
+			struct cfq_queue *cfqq)
+{
+	struct cfq_rq *crq = rb_entry_crq(rb_first(&cfqq->sort_list));
+
+	cfq_del_crq_rb(cfqq, crq);
+	cfq_remove_merge_hints(q, crq);
+	cfq_dispatch_sort(cfqd->dispatch, crq);
+}
+
+static int cfq_dispatch_requests(request_queue_t *q, struct cfq_data *cfqd)
+{
+	struct cfq_queue *cfqq;
+	struct list_head *entry, *tmp;
+	int ret, queued, good_queues;
+
+	if (list_empty(&cfqd->rr_list))
+		return 0;
+
+	queued = ret = 0;
+restart:
+	good_queues = 0;
+	list_for_each_safe(entry, tmp, &cfqd->rr_list) {
+		cfqq = list_entry_cfqq(cfqd->rr_list.next);
+
+		BUG_ON(RB_EMPTY(&cfqq->sort_list));
+
+		__cfq_dispatch_requests(q, cfqd, cfqq);
+
+		if (RB_EMPTY(&cfqq->sort_list))
+			cfq_put_queue(cfqd, cfqq);
+		else
+			good_queues++;
+
+		queued++;
+		ret = 1;
+	}
+
+	if ((queued < cfq_quantum) && good_queues)
+		goto restart;
+
+	return ret;
+}
+
+static struct request *cfq_next_request(request_queue_t *q)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct request *rq;
+
+	if (!list_empty(cfqd->dispatch)) {
+		struct cfq_rq *crq;
+dispatch:
+		rq = list_entry_rq(cfqd->dispatch->next);
+
+		BUG_ON(q->last_merge == rq);
+		crq = RQ_DATA(rq);
+		if (crq)
+			BUG_ON(ON_MHASH(crq));
+
+		return rq;
+	}
+
+	if (cfq_dispatch_requests(q, cfqd))
+		goto dispatch;
+
+	return NULL;
+}
+
+static inline struct cfq_queue *
+__cfq_find_cfq_hash(struct cfq_data *cfqd, int pid, const int hashval)
+{
+	struct list_head *hash_list = &cfqd->cfq_hash[hashval];
+	struct list_head *entry;
+
+	list_for_each(entry, hash_list) {
+		struct cfq_queue *__cfqq = list_entry_qhash(entry);
+
+		if (__cfqq->pid == pid)
+			return __cfqq;
+	}
+
+	return NULL;
+}
+
+static struct cfq_queue *cfq_find_cfq_hash(struct cfq_data *cfqd, int pid)
+{
+	const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
+
+	return __cfq_find_cfq_hash(cfqd, pid, hashval);
+}
+
+static void cfq_put_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+{
+	cfqd->busy_queues--;
+	list_del(&cfqq->cfq_list);
+	list_del(&cfqq->cfq_hash);
+	mempool_free(cfqq, cfq_mpool);
+}
+
+static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, int pid)
+{
+	const int hashval = hash_long(current->tgid, CFQ_QHASH_SHIFT);
+	struct cfq_queue *cfqq = __cfq_find_cfq_hash(cfqd, pid, hashval);
+
+	if (!cfqq) {
+		cfqq = mempool_alloc(cfq_mpool, GFP_NOIO);
+
+		INIT_LIST_HEAD(&cfqq->cfq_hash);
+		INIT_LIST_HEAD(&cfqq->cfq_list);
+		RB_CLEAR_ROOT(&cfqq->sort_list);
+
+		cfqq->pid = pid;
+		cfqq->queued[0] = cfqq->queued[1] = 0;
+		list_add(&cfqq->cfq_hash, &cfqd->cfq_hash[hashval]);
+	}
+
+	return cfqq;
+}
+
+static void cfq_enqueue(struct cfq_data *cfqd, struct cfq_rq *crq)
+{
+	struct cfq_queue *cfqq;
+
+	cfqq = cfq_get_queue(cfqd, current->tgid);
+
+	cfq_add_crq_rb(cfqd, cfqq, crq);
+
+	if (list_empty(&cfqq->cfq_list)) {
+		list_add(&cfqq->cfq_list, &cfqd->rr_list);
+		cfqd->busy_queues++;
+	}
+}
+
+/* Elevator add_req hook: place @rq according to the insert point @where. */
+static void
+cfq_insert_request(request_queue_t *q, struct request *rq, int where)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = RQ_DATA(rq);
+
+	switch (where) {
+	case ELEVATOR_INSERT_BACK:
+		while (cfq_dispatch_requests(q, cfqd))
+			;
+		list_add_tail(&rq->queuelist, cfqd->dispatch);
+		break;
+	case ELEVATOR_INSERT_FRONT:
+		list_add(&rq->queuelist, cfqd->dispatch);
+		break;
+	case ELEVATOR_INSERT_SORT:
+		BUG_ON(!blk_fs_request(rq));
+		cfq_enqueue(cfqd, crq);
+		break;
+	default:
+		printk(KERN_WARNING "%s: bad insert point %d\n",
+		       __FUNCTION__, where);
+		return;
+	}
+	if (rq_mergeable(rq)) {
+		cfq_add_crq_hash(cfqd, crq);
+		/* seed the merge hint only when none is set yet */
+		if (!q->last_merge)
+			q->last_merge = rq;
+	}
+}
+
+/* 1 if both the dispatch list and the rr list of queues are empty, else 0. */
+static int cfq_queue_empty(request_queue_t *q)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+
+	if (!list_empty(cfqd->dispatch))
+		return 0;
+	return list_empty(&cfqd->rr_list) ? 1 : 0;
+}
+
+/* Request just before @rq in its rbtree (rb_prev), or NULL if none. */
+static struct request *
+cfq_former_request(request_queue_t *q, struct request *rq)
+{
+	struct cfq_rq *crq = RQ_DATA(rq);
+	struct rb_node *node = rb_prev(&crq->rb_node);
+
+	if (!node)
+		return NULL;
+	return rb_entry_crq(node)->request;
+}
+
+/* Request just after @rq in its rbtree (rb_next), or NULL if none. */
+static struct request *
+cfq_latter_request(request_queue_t *q, struct request *rq)
+{
+	struct cfq_rq *crq = RQ_DATA(rq);
+	struct rb_node *node = rb_next(&crq->rb_node);
+
+	if (!node)
+		return NULL;
+	return rb_entry_crq(node)->request;
+}
+
+/*
+ * May the current thread group queue another @rw request?  Its share is
+ * (nr_requests - cfq_queued) / busy_queues, raised to 3 if smaller and
+ * otherwise capped at max_queued; returns 0 once queued[rw] exceeds it.
+ */
+static int cfq_may_queue(request_queue_t *q, int rw)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_queue *mine;
+	int share;
+
+	if (!cfqd->busy_queues)
+		return 1;
+	mine = cfq_find_cfq_hash(cfqd, current->tgid);
+	if (!mine)
+		return 1;
+
+	share = (q->nr_requests - cfq_queued) / cfqd->busy_queues;
+	if (share < 3)
+		share = 3;
+	else if (share > cfqd->max_queued)
+		share = cfqd->max_queued;
+	return !(mine->queued[rw] > share);
+}
+
+/* Release the cfq_rq attached to @rq, if any, back to the device's pool. */
+static void cfq_put_request(request_queue_t *q, struct request *rq)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = RQ_DATA(rq);
+
+	if (!crq)
+		return;
+	BUG_ON(q->last_merge == rq);	/* must not still be the merge hint */
+	BUG_ON(ON_MHASH(crq));		/* nor still pass the ON_MHASH test */
+	rq->elevator_private = NULL;
+	mempool_free(crq, cfqd->crq_pool);
+}
+
+/* Attach a fresh cfq_rq to @rq; returns 0, or 1 if the allocation failed. */
+static int cfq_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
+{
+	struct cfq_data *cfqd = q->elevator.elevator_data;
+	struct cfq_rq *crq = mempool_alloc(cfqd->crq_pool, gfp_mask);
+
+	if (!crq)
+		return 1;
+
+	INIT_LIST_HEAD(&crq->hash);
+	RB_CLEAR(&crq->rb_node);
+	crq->cfq_queue = NULL;
+	crq->request = rq;
+	rq->elevator_private = crq;
+	return 0;
+}
+
+static void cfq_exit(request_queue_t *q, elevator_t *e)
+{
+	struct cfq_data *cfqd = e->elevator_data;
+
+	kfree(cfqd->cfq_hash);		/* per-pid queue hash table */
+	kfree(cfqd->crq_hash);		/* crq (request) hash table */
+	mempool_destroy(cfqd->crq_pool);
+	kfree(cfqd);
+	e->elevator_data = NULL;	/* nothing left for @e to point at */
+}
+
+/* Elevator init hook: set up cfq_data for @q; returns 0 or -ENOMEM. */
+static int cfq_init(request_queue_t *q, elevator_t *e)
+{
+	struct cfq_data *cfqd = kmalloc(sizeof(*cfqd), GFP_KERNEL);
+	int i;
+
+	if (!cfqd)
+		return -ENOMEM;
+	memset(cfqd, 0, sizeof(*cfqd));
+	INIT_LIST_HEAD(&cfqd->rr_list);
+
+	cfqd->crq_hash = kmalloc(sizeof(struct list_head) * CFQ_MHASH_ENTRIES,
+				 GFP_KERNEL);
+	if (!cfqd->crq_hash)
+		goto out_crqhash;
+	for (i = 0; i < CFQ_MHASH_ENTRIES; i++)
+		INIT_LIST_HEAD(&cfqd->crq_hash[i]);
+
+	cfqd->cfq_hash = kmalloc(sizeof(struct list_head) * CFQ_QHASH_ENTRIES,
+				 GFP_KERNEL);
+	if (!cfqd->cfq_hash)
+		goto out_cfqhash;
+	for (i = 0; i < CFQ_QHASH_ENTRIES; i++)
+		INIT_LIST_HEAD(&cfqd->cfq_hash[i]);
+
+	cfqd->crq_pool = mempool_create(BLKDEV_MIN_RQ, mempool_alloc_slab,
+					mempool_free_slab, crq_pool);
+	if (!cfqd->crq_pool)
+		goto out_crqpool;
+
+	/*
+	 * Remember the queue's own limit as our per-queue cap, then raise
+	 * nr_requests so that anyone can queue; fairness is applied elsewhere.
+	 */
+	cfqd->max_queued = q->nr_requests;
+	q->nr_requests = 8192;
+	cfqd->dispatch = &q->queue_head;
+	e->elevator_data = cfqd;
+	return 0;
+
+out_crqpool:
+	kfree(cfqd->cfq_hash);
+out_cfqhash:
+	kfree(cfqd->crq_hash);
+out_crqhash:
+	kfree(cfqd);
+	return -ENOMEM;
+}
+
+/* Create the two slab caches and the cfq_queue mempool; panics on failure. */
+static int __init cfq_slab_setup(void)
+{
+	crq_pool = kmem_cache_create("crq_pool", sizeof(struct cfq_rq),
+				     0, 0, NULL, NULL);
+	if (crq_pool == NULL)
+		panic("cfq_iosched: can't init crq pool\n");
+
+	cfq_pool = kmem_cache_create("cfq_pool", sizeof(struct cfq_queue),
+				     0, 0, NULL, NULL);
+	if (cfq_pool == NULL)
+		panic("cfq_iosched: can't init cfq pool\n");
+
+	/* mempool of cfq_queue objects, minimum 64, backed by cfq_pool */
+	cfq_mpool = mempool_create(64, mempool_alloc_slab, mempool_free_slab,
+				   cfq_pool);
+	if (cfq_mpool == NULL)
+		panic("cfq_iosched: can't init cfq mpool\n");
+
+	return 0;
+}
+
+subsys_initcall(cfq_slab_setup);
+/* Elevator operations table for "cfq"; every hook is a cfq_* function. */
+elevator_t iosched_cfq = {
+	.elevator_name =		"cfq",
+	.elevator_merge_fn = 		cfq_merge,
+	.elevator_merged_fn =		cfq_merged_request,
+	.elevator_merge_req_fn =	cfq_merged_requests,
+	.elevator_next_req_fn =		cfq_next_request,
+	.elevator_add_req_fn =		cfq_insert_request,
+	.elevator_remove_req_fn =	cfq_remove_request,
+	.elevator_queue_empty_fn =	cfq_queue_empty,
+	.elevator_former_req_fn =	cfq_former_request,
+	.elevator_latter_req_fn =	cfq_latter_request,
+	.elevator_set_req_fn =		cfq_set_request,
+	.elevator_put_req_fn =		cfq_put_request,
+	.elevator_may_queue_fn =	cfq_may_queue,
+	.elevator_init_fn =		cfq_init,
+	.elevator_exit_fn =		cfq_exit,
+};
+
+EXPORT_SYMBOL(iosched_cfq);
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index 6b0ff2c5f092..5ee752d64f4a 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -1311,6 +1311,8 @@ static elevator_t *chosen_elevator =
 	&iosched_as;
 #elif defined(CONFIG_IOSCHED_DEADLINE)
 	&iosched_deadline;
+#elif defined(CONFIG_IOSCHED_CFQ)
+	&iosched_cfq;
 #elif defined(CONFIG_IOSCHED_NOOP)
 	&elevator_noop;
 #else
@@ -1329,6 +1331,10 @@ static int __init elevator_setup(char *str)
 	if (!strcmp(str, "as"))
 		chosen_elevator = &iosched_as;
 #endif
+#ifdef CONFIG_IOSCHED_CFQ
+	if (!strcmp(str, "cfq"))
+		chosen_elevator = &iosched_cfq;
+#endif
 #ifdef CONFIG_IOSCHED_NOOP
 	if (!strcmp(str, "noop"))
 		chosen_elevator = &elevator_noop;
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ce58f47126c1..27e8183f4776 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -94,6 +94,11 @@ extern elevator_t iosched_deadline;
  */
 extern elevator_t iosched_as;
 
+/*
+ * completely fair queueing I/O scheduler
+ */
+extern elevator_t iosched_cfq;
+
 extern int elevator_init(request_queue_t *, elevator_t *);
 extern void elevator_exit(request_queue_t *);
 extern int elv_rq_merge_ok(struct request *, struct bio *);
-- 
cgit v1.2.3


From 4c4acd2447ef473f23aee53f04518f93840a8693 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:53:50 -0700
Subject: [PATCH] rmap 1 linux/rmap.h

From: Hugh Dickins <hugh@veritas.com>

First of a batch of three rmap patches: this initial batch of three paving
the way for a move to some form of object-based rmap (probably Andrea's, but
drawing from mine too), and making almost no functional change by itself.  A
few days will intervene before the next batch, to give the struct page
changes in the second patch some exposure before proceeding.

rmap 1 create include/linux/rmap.h

Start small: linux/rmap-locking.h has already gathered some declarations
unrelated to locking, and the rest of the rmap declarations were over in
linux/swap.h: gather them all together in linux/rmap.h, and rename the
pte_chain_lock to rmap_lock.
---
 fs/exec.c                    |  2 +-
 include/linux/page-flags.h   |  2 +-
 include/linux/rmap-locking.h | 23 --------------------
 include/linux/rmap.h         | 52 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/swap.h         | 16 --------------
 mm/fremap.c                  |  2 +-
 mm/memory.c                  |  2 +-
 mm/mremap.c                  |  2 +-
 mm/rmap.c                    | 20 ++++++++---------
 mm/swapfile.c                |  2 +-
 mm/vmscan.c                  | 24 ++++++++++----------
 11 files changed, 79 insertions(+), 68 deletions(-)
 delete mode 100644 include/linux/rmap-locking.h
 create mode 100644 include/linux/rmap.h

(limited to 'include')

diff --git a/fs/exec.c b/fs/exec.c
index 5fb9f8f7c38f..1ea7c8d6c898 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -45,7 +45,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgalloc.h>
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index bd6ddb279c55..93f22640b6cb 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -69,7 +69,7 @@
 #define PG_private		12	/* Has something at ->private */
 #define PG_writeback		13	/* Page is under writeback */
 #define PG_nosave		14	/* Used for system suspend/resume */
-#define PG_chainlock		15	/* lock bit for ->pte_chain */
+#define PG_maplock		15	/* Lock bit for rmap to ptes */
 
 #define PG_direct		16	/* ->pte_chain points directly at pte */
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
diff --git a/include/linux/rmap-locking.h b/include/linux/rmap-locking.h
deleted file mode 100644
index cb30ed470cf6..000000000000
--- a/include/linux/rmap-locking.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * include/linux/rmap-locking.h
- *
- * Locking primitives for exclusive access to a page's reverse-mapping
- * pte chain.
- */
-
-#include <linux/slab.h>
-
-struct pte_chain;
-extern kmem_cache_t *pte_chain_cache;
-
-#define pte_chain_lock(page)	bit_spin_lock(PG_chainlock, (unsigned long *)&page->flags)
-#define pte_chain_unlock(page)	bit_spin_unlock(PG_chainlock, (unsigned long *)&page->flags)
-
-struct pte_chain *pte_chain_alloc(int gfp_flags);
-void __pte_chain_free(struct pte_chain *pte_chain);
-
-static inline void pte_chain_free(struct pte_chain *pte_chain)
-{
-	if (pte_chain)
-		__pte_chain_free(pte_chain);
-}
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
new file mode 100644
index 000000000000..5f9b35f2fa65
--- /dev/null
+++ b/include/linux/rmap.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_RMAP_H
+#define _LINUX_RMAP_H
+/*
+ * Declarations for Reverse Mapping functions in mm/rmap.c
+ * Its structures are declared within that file.
+ */
+
+#include <linux/config.h>
+#include <linux/linkage.h>
+/* Per-page bit spinlock: the PG_maplock bit of page->flags. */
+#define rmap_lock(page) \
+	bit_spin_lock(PG_maplock, (unsigned long *)&(page)->flags)
+#define rmap_unlock(page) \
+	bit_spin_unlock(PG_maplock, (unsigned long *)&(page)->flags)
+
+#ifdef CONFIG_MMU
+
+struct pte_chain;
+struct pte_chain *pte_chain_alloc(int gfp_flags);
+void __pte_chain_free(struct pte_chain *pte_chain);
+
+static inline void pte_chain_free(struct pte_chain *pte_chain)
+{
+	if (pte_chain)
+		__pte_chain_free(pte_chain);
+}
+
+struct pte_chain * fastcall
+	page_add_rmap(struct page *, pte_t *, struct pte_chain *);
+void fastcall page_remove_rmap(struct page *, pte_t *);
+
+/*
+ * Called from mm/vmscan.c to handle paging out
+ */
+int fastcall page_referenced(struct page *);
+int fastcall try_to_unmap(struct page *);
+
+#else	/* !CONFIG_MMU */
+/* Stubs: try_to_unmap() always yields SWAP_FAIL (defined below). */
+#define page_referenced(page)	TestClearPageReferenced(page)
+#define try_to_unmap(page)	SWAP_FAIL
+
+#endif	/* CONFIG_MMU */
+
+/*
+ * Return values of try_to_unmap
+ */
+#define SWAP_SUCCESS	0
+#define SWAP_AGAIN	1
+#define SWAP_FAIL	2
+
+#endif	/* _LINUX_RMAP_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d189090cf63a..f911d8afb8a5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -76,7 +76,6 @@ struct reclaim_state {
 #ifdef __KERNEL__
 
 struct address_space;
-struct pte_chain;
 struct sysinfo;
 struct writeback_control;
 struct zone;
@@ -177,26 +176,11 @@ extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 
-/* linux/mm/rmap.c */
 #ifdef CONFIG_MMU
-int FASTCALL(page_referenced(struct page *));
-struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *,
-					struct pte_chain *));
-void FASTCALL(page_remove_rmap(struct page *, pte_t *));
-int FASTCALL(try_to_unmap(struct page *));
-
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
-#else
-#define page_referenced(page)	TestClearPageReferenced(page)
-#define try_to_unmap(page)	SWAP_FAIL
 #endif /* CONFIG_MMU */
 
-/* return values of try_to_unmap */
-#define	SWAP_SUCCESS	0
-#define	SWAP_AGAIN	1
-#define	SWAP_FAIL	2
-
 extern void swap_unplug_io_fn(struct backing_dev_info *);
 
 #ifdef CONFIG_SWAP
diff --git a/mm/fremap.c b/mm/fremap.c
index dacebb172f6f..2c8abe6d1f5a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -12,7 +12,7 @@
 #include <linux/mman.h>
 #include <linux/pagemap.h>
 #include <linux/swapops.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/module.h>
 
 #include <asm/mmu_context.h>
diff --git a/mm/memory.c b/mm/memory.c
index f7f1649b848c..40695793393c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -43,7 +43,7 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/module.h>
 #include <linux/init.h>
 
diff --git a/mm/mremap.c b/mm/mremap.c
index e59e9355055e..c355d4da4afe 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -15,7 +15,7 @@
 #include <linux/swap.h>
 #include <linux/fs.h>
 #include <linux/highmem.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/security.h>
 
 #include <asm/uaccess.h>
diff --git a/mm/rmap.c b/mm/rmap.c
index b960734c8724..3f304d8fd38a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -13,7 +13,7 @@
 
 /*
  * Locking:
- * - the page->pte.chain is protected by the PG_chainlock bit,
+ * - the page->pte.chain is protected by the PG_maplock bit,
  *   which nests within the the mm->page_table_lock,
  *   which nests within the page lock.
  * - because swapout locking is opposite to the locking order
@@ -26,7 +26,7 @@
 #include <linux/swapops.h>
 #include <linux/slab.h>
 #include <linux/init.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/cache.h>
 #include <linux/percpu.h>
 
@@ -108,7 +108,7 @@ pte_chain_encode(struct pte_chain *pte_chain, int idx)
  *
  * Quick test_and_clear_referenced for all mappings to a page,
  * returns the number of processes which referenced the page.
- * Caller needs to hold the pte_chain_lock.
+ * Caller needs to hold the rmap lock.
  *
  * If the page has a single-entry pte_chain, collapse that back to a PageDirect
  * representation.  This way, it's only done under memory pressure.
@@ -175,7 +175,7 @@ page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
 	if (PageReserved(page))
 		return pte_chain;
 
-	pte_chain_lock(page);
+	rmap_lock(page);
 
 	if (page->pte.direct == 0) {
 		page->pte.direct = pte_paddr;
@@ -208,7 +208,7 @@ page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
 	cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr;
 	cur_pte_chain->next_and_idx--;
 out:
-	pte_chain_unlock(page);
+	rmap_unlock(page);
 	return pte_chain;
 }
 
@@ -230,7 +230,7 @@ void fastcall page_remove_rmap(struct page *page, pte_t *ptep)
 	if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
 		return;
 
-	pte_chain_lock(page);
+	rmap_lock(page);
 
 	if (!page_mapped(page))
 		goto out_unlock;	/* remap_page_range() from a driver? */
@@ -276,8 +276,7 @@ out:
 	if (!page_mapped(page))
 		dec_page_state(nr_mapped);
 out_unlock:
-	pte_chain_unlock(page);
-	return;
+	rmap_unlock(page);
 }
 
 /**
@@ -290,10 +289,9 @@ out_unlock:
  * to the locking order used by the page fault path, we use trylocks.
  * Locking:
  *	    page lock			shrink_list(), trylock
- *		pte_chain_lock		shrink_list()
+ *		rmap lock		shrink_list()
  *		    mm->page_table_lock	try_to_unmap_one(), trylock
  */
-static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t));
 static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
 {
 	pte_t *ptep = rmap_ptep_map(paddr);
@@ -376,7 +374,7 @@ out_unlock:
  *
  * Tries to remove all the page table entries which are mapping this
  * page, used in the pageout path.  Caller must hold the page lock
- * and its pte chain lock.  Return values are:
+ * and its rmap lock.  Return values are:
  *
  * SWAP_SUCCESS	- we succeeded in removing all mappings
  * SWAP_AGAIN	- we missed a trylock, try again later
diff --git a/mm/swapfile.c b/mm/swapfile.c
index f885e6d17a49..44e214da0270 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -21,7 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/init.h>
 #include <linux/module.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/security.h>
 #include <linux/backing-dev.h>
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 39e8ed0fcdd6..35fbca1c5168 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -28,7 +28,7 @@
 #include <linux/mm_inline.h>
 #include <linux/pagevec.h>
 #include <linux/backing-dev.h>
-#include <linux/rmap-locking.h>
+#include <linux/rmap.h>
 #include <linux/topology.h>
 #include <linux/cpu.h>
 #include <linux/notifier.h>
@@ -173,7 +173,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask)
 	return 0;
 }
 
-/* Must be called with page's pte_chain_lock held. */
+/* Must be called with page's rmap lock held. */
 static inline int page_mapping_inuse(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
@@ -278,11 +278,11 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 		if (PageWriteback(page))
 			goto keep_locked;
 
-		pte_chain_lock(page);
+		rmap_lock(page);
 		referenced = page_referenced(page);
 		if (referenced && page_mapping_inuse(page)) {
 			/* In active use or really unfreeable.  Activate it. */
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 			goto activate_locked;
 		}
 
@@ -296,10 +296,10 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 		 * XXX: implement swap clustering ?
 		 */
 		if (page_mapped(page) && !mapping && !PagePrivate(page)) {
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 			if (!add_to_swap(page))
 				goto activate_locked;
-			pte_chain_lock(page);
+			rmap_lock(page);
 			mapping = page->mapping;
 		}
 #endif /* CONFIG_SWAP */
@@ -314,16 +314,16 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page)) {
 			case SWAP_FAIL:
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				goto activate_locked;
 			case SWAP_AGAIN:
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				goto keep_locked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
 		}
-		pte_chain_unlock(page);
+		rmap_unlock(page);
 
 		/*
 		 * If the page is dirty, only perform writeback if that write
@@ -657,13 +657,13 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			pte_chain_lock(page);
+			rmap_lock(page);
 			if (page_referenced(page)) {
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 		}
 		/*
 		 * FIXME: need to consider page_count(page) here if/when we
-- 
cgit v1.2.3


From 4875a6018bcc53201ddbf745bff35ed723b468eb Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:54:03 -0700
Subject: [PATCH] rmap 2 anon and swapcache

From: Hugh Dickins <hugh@veritas.com>

Tracking anonymous pages by anon_vma,pgoff or mm,address needs a
pointer,offset pair in struct page: mapping,index the natural choice.  But
swapcache uses those for &swapper_space,swp_entry_t.

It's trivial to separate swapcache from pagecache with radix tree; most of
swapper_space is actually unused, just a fiction to pretend swap like file;
and page->private is a good place to keep swp_entry_t, now that swap never
uses bufferheads.

Define PG_anon bit, page_add_rmap SetPageAnon and put an oopsable address in
page->mapping to test that we're not confused by it.  Define
page_mapping(page) macro to give NULL when PageAnon, whatever may be in
page->mapping.  Define PG_swapcache bit, deduce swapper_space from that in
the few places we need it.

add_to_swap_cache now distinct from add_to_page_cache.  Separating the caches
somewhat simplifies the tmpfs swizzling in swap_state.c, now the page can
briefly be in both caches.

The rmap method remains pte chains, no change to that yet.  But one small
functional difference: the use of PageAnon implies that a page truncated
while still mapped will no longer be found and freed (swapped out) by
try_to_unmap, will only be freed by exit or munmap.  But normally pages are
unmapped by vmtruncate: this should only affect nonlinear mappings, and a
later patch not in this batch will fix that.
---
 fs/buffer.c                |  19 ++----
 include/linux/mm.h         |  38 +++++------
 include/linux/page-flags.h |  17 +++--
 mm/filemap.c               |  25 ++++---
 mm/memory.c                |   4 +-
 mm/page-writeback.c        |  28 ++++++--
 mm/page_alloc.c            |   9 +++
 mm/page_io.c               |  38 ++---------
 mm/rmap.c                  |  50 +++++++++-----
 mm/swap_state.c            | 163 +++++++++++++++++++++++----------------------
 mm/swapfile.c              |  34 ++++++----
 mm/vmscan.c                |  34 +++++-----
 12 files changed, 242 insertions(+), 217 deletions(-)

(limited to 'include')

diff --git a/fs/buffer.c b/fs/buffer.c
index 8ab66d0b7548..99f1ce112ea9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -836,19 +836,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  *
  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
  * address_space though.
- *
- * For now, we treat swapper_space specially.  It doesn't use the normal
- * block a_ops.
  */
 int __set_page_dirty_buffers(struct page *page)
 {
 	struct address_space * const mapping = page->mapping;
-	int ret = 0;
-
-	if (mapping == NULL) {
-		SetPageDirty(page);
-		goto out;
-	}
 
 	spin_lock(&mapping->private_lock);
 	if (page_has_buffers(page)) {
@@ -877,8 +868,7 @@ int __set_page_dirty_buffers(struct page *page)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	}
 	
-out:
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
 
@@ -1577,8 +1567,7 @@ int try_to_release_page(struct page *page, int gfp_mask)
 {
 	struct address_space * const mapping = page->mapping;
 
-	if (!PageLocked(page))
-		BUG();
+	BUG_ON(!PageLocked(page));
 	if (PageWriteback(page))
 		return 0;
 	
@@ -2895,14 +2884,14 @@ int try_to_free_buffers(struct page *page)
 	if (PageWriteback(page))
 		return 0;
 
-	if (mapping == NULL) {		/* swapped-in anon page */
+	if (mapping == NULL) {		/* can this still happen? */
 		ret = drop_buffers(page, &buffers_to_free);
 		goto out;
 	}
 
 	spin_lock(&mapping->private_lock);
 	ret = drop_buffers(page, &buffers_to_free);
-	if (ret && !PageSwapCache(page)) {
+	if (ret) {
 		/*
 		 * If the filesystem writes its buffers by hand (eg ext3)
 		 * then we can have clean buffers against a dirty page.  We
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6d6abe8c656e..796f498658d6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -189,8 +189,11 @@ struct page {
 					 * protected by PG_chainlock */
 		pte_addr_t direct;
 	} pte;
-	unsigned long private;		/* mapping-private opaque data */
-
+	unsigned long private;		/* Mapping-private opaque data:
+					 * usually used for buffer_heads
+					 * if PagePrivate set; used for
+					 * swp_entry_t if PageSwapCache
+					 */
 	/*
 	 * On machines where all RAM is mapped into kernel address space,
 	 * we can simply calculate the virtual address. On machines with
@@ -402,6 +405,19 @@ void page_address_init(void);
 #define page_address_init()  do { } while(0)
 #endif
 
+/*
+ * Return the address_space a page belongs to, or NULL for an anonymous
+ * page: when PageAnon is set, page->mapping is not an address_space.
+ *
+ * Do not confuse this with page_mapped(): "page_mapping" is about the
+ * inode address_space which maps the page from disk, "page_mapped" about
+ * the page being mapped into user virtual address space.
+ */
+static inline struct address_space *page_mapping(struct page *page)
+{
+	return !PageAnon(page) ? page->mapping : NULL;
+}
+
 /*
  * Return true if this page is mapped into pagetables.  Subtle: test pte.direct
  * rather than pte.chain.  Because sometimes pte.direct is 64-bit, and .chain
@@ -471,6 +487,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
+int FASTCALL(set_page_dirty(struct page *page));
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
@@ -497,23 +514,6 @@ struct shrinker;
 extern struct shrinker *set_shrinker(int, shrinker_t);
 extern void remove_shrinker(struct shrinker *shrinker);
 
-/*
- * If the mapping doesn't provide a set_page_dirty a_op, then
- * just fall through and assume that it wants buffer_heads.
- * FIXME: make the method unconditional.
- */
-static inline int set_page_dirty(struct page *page)
-{
-	if (page->mapping) {
-		int (*spd)(struct page *);
-
-		spd = page->mapping->a_ops->set_page_dirty;
-		if (spd)
-			return (*spd)(page);
-	}
-	return __set_page_dirty_buffers(page);
-}
-
 /*
  * On a two-level page table, this ends up being trivial. Thus the
  * inlining and the symmetry break with pte_alloc_map() that does all
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 93f22640b6cb..6959827c9f62 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -75,6 +75,8 @@
 #define PG_mappedtodisk		17	/* Has blocks allocated on-disk */
 #define PG_reclaim		18	/* To be reclaimed asap */
 #define PG_compound		19	/* Part of a compound page */
+#define PG_anon			20	/* Anonymous page: anon_vma in mapping*/
+#define PG_swapcache		21	/* Swap page: swp_entry_t in private */
 
 
 /*
@@ -298,15 +300,16 @@ extern void get_full_page_state(struct page_state *ret);
 #define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
 #define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
 
-/*
- * The PageSwapCache predicate doesn't use a PG_flag at this time,
- * but it may again do so one day.
- */
+#define PageAnon(page)		test_bit(PG_anon, &(page)->flags)
+#define SetPageAnon(page)	set_bit(PG_anon, &(page)->flags)
+#define ClearPageAnon(page)	clear_bit(PG_anon, &(page)->flags)
+
 #ifdef CONFIG_SWAP
-extern struct address_space swapper_space;
-#define PageSwapCache(page) ((page)->mapping == &swapper_space)
+#define PageSwapCache(page)	test_bit(PG_swapcache, &(page)->flags)
+#define SetPageSwapCache(page)	set_bit(PG_swapcache, &(page)->flags)
+#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags)
 #else
-#define PageSwapCache(page) 0
+#define PageSwapCache(page)	0
 #endif
 
 struct page;	/* forward declaration */
diff --git a/mm/filemap.c b/mm/filemap.c
index dc2f0992d879..ca8fc1148296 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -122,9 +122,13 @@ static inline int sync_page(struct page *page)
 	struct address_space *mapping;
 
 	smp_mb();
-	mapping = page->mapping;
-	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
-		return mapping->a_ops->sync_page(page);
+	mapping = page_mapping(page);
+	if (mapping) {
+		if (mapping->a_ops && mapping->a_ops->sync_page)
+			return mapping->a_ops->sync_page(page);
+	} else if (PageSwapCache(page)) {
+		swap_unplug_io_fn(NULL);
+	}
 	return 0;
 }
 
@@ -242,13 +246,9 @@ int filemap_write_and_wait(struct address_space *mapping)
  * This function is used for two things: adding newly allocated pagecache
  * pages and for moving existing anon pages into swapcache.
  *
- * In the case of pagecache pages, the page is new, so we can just run
- * SetPageLocked() against it.  The other page state flags were set by
- * rmqueue()
- *
- * In the case of swapcache, try_to_swap_out() has already locked the page, so
- * SetPageLocked() is ugly-but-OK there too.  The required page state has been
- * set up by swap_out_add_to_swap_cache().
+ * This function is used to add newly allocated pagecache pages:
+ * the page is new, so we can just run SetPageLocked() against it.
+ * The other page state flags were set by rmqueue().
  *
  * This function does not add the page to the LRU.  The caller must do that.
  */
@@ -263,7 +263,10 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
 			SetPageLocked(page);
-			___add_to_page_cache(page, mapping, offset);
+			page->mapping = mapping;
+			page->index = offset;
+			mapping->nrpages++;
+			pagecache_acct(1);
 		} else {
 			page_cache_release(page);
 		}
diff --git a/mm/memory.c b/mm/memory.c
index 40695793393c..95b9b84d8478 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -417,8 +417,8 @@ zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd,
 				if (!PageReserved(page)) {
 					if (pte_dirty(pte))
 						set_page_dirty(page);
-					if (page->mapping && pte_young(pte) &&
-							!PageSwapCache(page))
+					if (pte_young(pte) &&
+							page_mapping(page))
 						mark_page_accessed(page);
 					tlb->freed++;
 					page_remove_rmap(page, ptep);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9cf47af10ccc..22e17333982a 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -579,6 +579,24 @@ int __set_page_dirty_nobuffers(struct page *page)
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 
+/*
+ * Dirty a page.  With no address_space (page_mapping() is NULL) just set
+ * the dirty flag; otherwise use the mapping's set_page_dirty a_op, falling
+ * back to __set_page_dirty_buffers() when the mapping does not supply one.
+ */
+int fastcall set_page_dirty(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (mapping) {
+		int (*spd)(struct page *) = mapping->a_ops->set_page_dirty;
+		return spd ? spd(page) : __set_page_dirty_buffers(page);
+	}
+	SetPageDirty(page);
+	return 0;
+}
+EXPORT_SYMBOL(set_page_dirty);
+
 /*
  * set_page_dirty() is racy if the caller has no reference against
  * page->mapping->host, and if the page is unlocked.  This is because another
@@ -606,7 +624,7 @@ EXPORT_SYMBOL(set_page_dirty_lock);
  */
 int test_clear_page_dirty(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 	unsigned long flags;
 
 	if (mapping) {
@@ -642,7 +660,7 @@ EXPORT_SYMBOL(test_clear_page_dirty);
  */
 int clear_page_dirty_for_io(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 
 	if (mapping) {
 		if (TestClearPageDirty(page)) {
@@ -661,7 +679,7 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
  */
 int __clear_page_dirty(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 
 	if (mapping) {
 		unsigned long flags;
@@ -681,7 +699,7 @@ int __clear_page_dirty(struct page *page)
 
 int test_clear_page_writeback(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 	int ret;
 
 	if (mapping) {
@@ -701,7 +719,7 @@ int test_clear_page_writeback(struct page *page)
 
 int test_set_page_writeback(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page_mapping(page);
 	int ret;
 
 	if (mapping) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4148e94eee13..6b4d5dc0c930 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -84,6 +84,9 @@ static void bad_page(const char *function, struct page *page)
 			1 << PG_lru	|
 			1 << PG_active	|
 			1 << PG_dirty	|
+			1 << PG_maplock |
+			1 << PG_anon    |
+			1 << PG_swapcache |
 			1 << PG_writeback);
 	set_page_count(page, 0);
 	page->mapping = NULL;
@@ -224,6 +227,9 @@ static inline void free_pages_check(const char *function, struct page *page)
 			1 << PG_active	|
 			1 << PG_reclaim	|
 			1 << PG_slab	|
+			1 << PG_maplock |
+			1 << PG_anon    |
+			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(function, page);
 	if (PageDirty(page))
@@ -331,6 +337,9 @@ static void prep_new_page(struct page *page, int order)
 			1 << PG_active	|
 			1 << PG_dirty	|
 			1 << PG_reclaim	|
+			1 << PG_maplock |
+			1 << PG_anon    |
+			1 << PG_swapcache |
 			1 << PG_writeback )))
 		bad_page(__FUNCTION__, page);
 
diff --git a/mm/page_io.c b/mm/page_io.c
index 7ec159ded5ca..dbbc4e5b2e1e 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -16,8 +16,6 @@
 #include <linux/swap.h>
 #include <linux/bio.h>
 #include <linux/swapops.h>
-#include <linux/buffer_head.h>	/* for block_sync_page() */
-#include <linux/mpage.h>
 #include <linux/writeback.h>
 #include <asm/pgtable.h>
 
@@ -32,7 +30,7 @@ get_swap_bio(int gfp_flags, struct page *page, bio_end_io_t end_io)
 		swp_entry_t entry;
 
 		BUG_ON(!PageSwapCache(page));
-		entry.val = page->index;
+		entry.val = page->private;
 		sis = get_swap_info_struct(swp_type(entry));
 
 		bio->bi_sector = map_swap_page(sis, swp_offset(entry)) *
@@ -132,13 +130,6 @@ out:
 	return ret;
 }
 
-struct address_space_operations swap_aops = {
-	.writepage	= swap_writepage,
-	.readpage	= swap_readpage,
-	.sync_page	= block_sync_page,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
-};
-
 #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_PM_DISK)
 
 /*
@@ -148,25 +139,15 @@ struct address_space_operations swap_aops = {
 int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
 {
 	int ret;
+	unsigned long save_private;
 	struct writeback_control swap_wbc = {
 		.sync_mode = WB_SYNC_ALL,
 	};
 
 	lock_page(page);
-
-	BUG_ON(page->mapping);
-	ret = add_to_page_cache(page, &swapper_space,
-				entry.val, GFP_NOIO|__GFP_NOFAIL);
-	if (ret) {
-		unlock_page(page);
-		goto out;
-	}
-
-	/*
-	 * get one more reference to make page non-exclusive so
-	 * remove_exclusive_swap_page won't mess with it.
-	 */
-	page_cache_get(page);
+	SetPageSwapCache(page);
+	save_private = page->private;
+	page->private = entry.val;
 
 	if (rw == READ) {
 		ret = swap_readpage(NULL, page);
@@ -176,15 +157,10 @@ int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
 		wait_on_page_writeback(page);
 	}
 
-	lock_page(page);
-	remove_from_page_cache(page);
-	unlock_page(page);
-	page_cache_release(page);
-	page_cache_release(page);	/* For add_to_page_cache() */
-
+	ClearPageSwapCache(page);
+	page->private = save_private;
 	if (ret == 0 && (!PageUptodate(page) || PageError(page)))
 		ret = -EIO;
-out:
 	return ret;
 }
 #endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 3f304d8fd38a..455b498a9591 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -35,7 +35,18 @@
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
-/* #define DEBUG_RMAP */
+/*
+ * Something oopsable to put for now in the page->mapping
+ * of an anonymous page, to test that it is ignored.
+ */
+#define ANON_MAPPING_DEBUG	((struct address_space *) 0xADB)
+
+static inline void clear_page_anon(struct page *page)
+{
+	BUG_ON(page->mapping != ANON_MAPPING_DEBUG);
+	page->mapping = NULL;
+	ClearPageAnon(page);
+}
 
 /*
  * Shared pages have a chain of pte_chain structures, used to locate
@@ -180,6 +191,10 @@ page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain)
 	if (page->pte.direct == 0) {
 		page->pte.direct = pte_paddr;
 		SetPageDirect(page);
+		if (!page->mapping) {
+			SetPageAnon(page);
+			page->mapping = ANON_MAPPING_DEBUG;
+		}
 		inc_page_state(nr_mapped);
 		goto out;
 	}
@@ -271,10 +286,13 @@ void fastcall page_remove_rmap(struct page *page, pte_t *ptep)
 		}
 	}
 out:
-	if (page->pte.direct == 0 && page_test_and_clear_dirty(page))
-		set_page_dirty(page);
-	if (!page_mapped(page))
+	if (!page_mapped(page)) {
+		if (page_test_and_clear_dirty(page))
+			set_page_dirty(page);
+		if (PageAnon(page))
+			clear_page_anon(page);
 		dec_page_state(nr_mapped);
+	}
 out_unlock:
 	rmap_unlock(page);
 }
@@ -330,12 +348,13 @@ static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
 	flush_cache_page(vma, address);
 	pte = ptep_clear_flush(vma, address, ptep);
 
-	if (PageSwapCache(page)) {
+	if (PageAnon(page)) {
+		swp_entry_t entry = { .val = page->private };
 		/*
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
 		 */
-		swp_entry_t entry = { .val = page->index };
+		BUG_ON(!PageSwapCache(page));
 		swap_duplicate(entry);
 		set_pte(ptep, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*ptep));
@@ -345,6 +364,7 @@ static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
 		 * If a nonlinear mapping then store the file page offset
 		 * in the pte.
 		 */
+		BUG_ON(!page->mapping);
 		pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
 		pgidx += vma->vm_pgoff;
 		pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
@@ -391,20 +411,15 @@ int fastcall try_to_unmap(struct page * page)
 		BUG();
 	if (!PageLocked(page))
 		BUG();
-	/* We need backing store to swap out a page. */
-	if (!page->mapping)
-		BUG();
 
 	if (PageDirect(page)) {
 		ret = try_to_unmap_one(page, page->pte.direct);
 		if (ret == SWAP_SUCCESS) {
-			if (page_test_and_clear_dirty(page))
-				set_page_dirty(page);
 			page->pte.direct = 0;
 			ClearPageDirect(page);
 		}
 		goto out;
-	}		
+	}
 
 	start = page->pte.chain;
 	victim_i = pte_chain_idx(start);
@@ -436,9 +451,6 @@ int fastcall try_to_unmap(struct page * page)
 				} else {
 					start->next_and_idx++;
 				}
-				if (page->pte.direct == 0 &&
-				    page_test_and_clear_dirty(page))
-					set_page_dirty(page);
 				break;
 			case SWAP_AGAIN:
 				/* Skip this pte, remembering status. */
@@ -451,8 +463,14 @@ int fastcall try_to_unmap(struct page * page)
 		}
 	}
 out:
-	if (!page_mapped(page))
+	if (!page_mapped(page)) {
+		if (page_test_and_clear_dirty(page))
+			set_page_dirty(page);
+		if (PageAnon(page))
+			clear_page_anon(page);
 		dec_page_state(nr_mapped);
+		ret = SWAP_SUCCESS;
+	}
 	return ret;
 }
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 97f80d20807c..d76b2d1bcf79 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -16,25 +16,24 @@
 
 #include <asm/pgtable.h>
 
+/*
+ * swapper_space is a fiction, retained to simplify the path through
+ * vmscan's shrink_list.  Only those fields initialized below are used.
+ */
+static struct address_space_operations swap_aops = {
+	.writepage	= swap_writepage,
+};
+
 static struct backing_dev_info swap_backing_dev_info = {
-	.ra_pages	= 0,	/* No readahead */
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
 	.unplug_io_fn	= swap_unplug_io_fn,
 };
 
-extern struct address_space_operations swap_aops;
-
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.tree_lock	= SPIN_LOCK_UNLOCKED,
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
-	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
-	.i_mmap_shared	= LIST_HEAD_INIT(swapper_space.i_mmap_shared),
-	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
-	.truncate_count  = ATOMIC_INIT(0),
-	.private_lock	= SPIN_LOCK_UNLOCKED,
-	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
@@ -56,30 +55,55 @@ void show_swap_cache_info(void)
 		swap_cache_info.noent_race, swap_cache_info.exist_race);
 }
 
+/*
+ * __add_to_swap_cache resembles add_to_page_cache on swapper_space, but sets
+ * SwapCache flag and private, not mapping and index.  Returns 0 or an error.
+ */
+static int __add_to_swap_cache(struct page *page,
+		swp_entry_t entry, int gfp_mask)
+{
+	int error;
+
+	BUG_ON(PageSwapCache(page));
+	BUG_ON(PagePrivate(page));	/* ->private is overwritten below */
+	error = radix_tree_preload(gfp_mask);
+	if (!error) {
+		page_cache_get(page);	/* dropped again if insert fails */
+		spin_lock(&swapper_space.tree_lock);
+		error = radix_tree_insert(&swapper_space.page_tree,
+						entry.val, page);
+		if (!error) {
+			SetPageLocked(page);
+			SetPageSwapCache(page);
+			page->private = entry.val;	/* radix tree index */
+			total_swapcache_pages++;
+			pagecache_acct(1);
+		} else
+			page_cache_release(page);
+		spin_unlock(&swapper_space.tree_lock);
+		radix_tree_preload_end();
+	}
+	return error;
+}
+
 static int add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
 	int error;
 
-	if (page->mapping)
-		BUG();
 	if (!swap_duplicate(entry)) {
 		INC_CACHE_INFO(noent_race);
 		return -ENOENT;
 	}
-	error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL);
+	error = __add_to_swap_cache(page, entry, GFP_KERNEL);
 	/*
 	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
 	 */
-	if (error != 0) {
+	if (error) {
 		swap_free(entry);
 		if (error == -EEXIST)
 			INC_CACHE_INFO(exist_race);
 		return error;
 	}
-	if (!PageLocked(page))
-		BUG();
-	if (!PageSwapCache(page))
-		BUG();
 	INC_CACHE_INFO(add_total);
 	return 0;
 }
@@ -93,7 +117,12 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!PageSwapCache(page));
 	BUG_ON(PageWriteback(page));
-	__remove_from_page_cache(page);
+
+	radix_tree_delete(&swapper_space.page_tree, page->private);
+	page->private = 0;
+	ClearPageSwapCache(page);
+	total_swapcache_pages--;
+	pagecache_acct(-1);
 	INC_CACHE_INFO(del_total);
 }
 
@@ -137,8 +166,7 @@ int add_to_swap(struct page * page)
 		/*
 		 * Add it to the swap cache and mark it dirty
 		 */
-		err = add_to_page_cache(page, &swapper_space,
-					entry.val, GFP_ATOMIC);
+		err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
 
 		if (pf_flags & PF_MEMALLOC)
 			current->flags |= PF_MEMALLOC;
@@ -146,8 +174,7 @@ int add_to_swap(struct page * page)
 		switch (err) {
 		case 0:				/* Success */
 			SetPageUptodate(page);
-			__clear_page_dirty(page);
-			set_page_dirty(page);
+			SetPageDirty(page);
 			INC_CACHE_INFO(add_total);
 			return 1;
 		case -EEXIST:
@@ -173,81 +200,55 @@ void delete_from_swap_cache(struct page *page)
 {
 	swp_entry_t entry;
 
+	BUG_ON(!PageSwapCache(page));
 	BUG_ON(!PageLocked(page));
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
   
-	entry.val = page->index;
+	entry.val = page->private;
 
-	spin_lock_irq(&swapper_space.tree_lock);
+	spin_lock(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
-	spin_unlock_irq(&swapper_space.tree_lock);
+	spin_unlock(&swapper_space.tree_lock);
 
 	swap_free(entry);
 	page_cache_release(page);
 }
 
+/*
+ * Strange swizzling function only for use by shmem_writepage
+ */
 int move_to_swap_cache(struct page *page, swp_entry_t entry)
 {
-	struct address_space *mapping = page->mapping;
-	int err;
-
-	spin_lock_irq(&swapper_space.tree_lock);
-	spin_lock(&mapping->tree_lock);
-
-	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
-	if (!err) {
-		__remove_from_page_cache(page);
-		___add_to_page_cache(page, &swapper_space, entry.val);
-	}
-
-	spin_unlock(&mapping->tree_lock);
-	spin_unlock_irq(&swapper_space.tree_lock);
-
+	int err = __add_to_swap_cache(page, entry, GFP_ATOMIC);
 	if (!err) {
+		remove_from_page_cache(page);
+		page_cache_release(page);	/* pagecache ref */
 		if (!swap_duplicate(entry))
 			BUG();
-		BUG_ON(PageDirty(page));
-		set_page_dirty(page);
+		SetPageDirty(page);
 		INC_CACHE_INFO(add_total);
 	} else if (err == -EEXIST)
 		INC_CACHE_INFO(exist_race);
 	return err;
 }
 
+/*
+ * Strange swizzling function for shmem_getpage (and shmem_unuse)
+ */
 int move_from_swap_cache(struct page *page, unsigned long index,
 		struct address_space *mapping)
 {
-	swp_entry_t entry;
-	int err;
-
-	BUG_ON(!PageLocked(page));
-	BUG_ON(PageWriteback(page));
-	BUG_ON(PagePrivate(page));
-
-	entry.val = page->index;
-
-	spin_lock_irq(&swapper_space.tree_lock);
-	spin_lock(&mapping->tree_lock);
-
-	err = radix_tree_insert(&mapping->page_tree, index, page);
+	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
 	if (!err) {
-		__delete_from_swap_cache(page);
-		___add_to_page_cache(page, mapping, index);
-	}
-
-	spin_unlock(&mapping->tree_lock);
-	spin_unlock_irq(&swapper_space.tree_lock);
-
-	if (!err) {
-		swap_free(entry);
-		__clear_page_dirty(page);
+		delete_from_swap_cache(page);
+		/* shift page from clean_pages to dirty_pages list */
+		ClearPageDirty(page);
 		set_page_dirty(page);
 	}
 	return err;
 }
 
-
 /* 
  * If we are the only user, then try to free up the swap cache. 
  * 
@@ -305,19 +306,17 @@ void free_pages_and_swap_cache(struct page **pages, int nr)
  */
 struct page * lookup_swap_cache(swp_entry_t entry)
 {
-	struct page *found;
+	struct page *page;
 
-	found = find_get_page(&swapper_space, entry.val);
-	/*
-	 * Unsafe to assert PageSwapCache and mapping on page found:
-	 * if SMP nothing prevents swapoff from deleting this page from
-	 * the swap cache at this moment.  find_lock_page would prevent
-	 * that, but no need to change: we _have_ got the right page.
-	 */
-	INC_CACHE_INFO(find_total);
-	if (found)
+	spin_lock(&swapper_space.tree_lock);
+	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+	if (page) {
+		page_cache_get(page);
 		INC_CACHE_INFO(find_success);
-	return found;
+	}
+	spin_unlock(&swapper_space.tree_lock);
+	INC_CACHE_INFO(find_total);
+	return page;
 }
 
 /* 
@@ -335,10 +334,14 @@ struct page * read_swap_cache_async(swp_entry_t entry)
 		/*
 		 * First check the swap cache.  Since this is normally
 		 * called after lookup_swap_cache() failed, re-calling
-		 * that would confuse statistics: use find_get_page()
-		 * directly.
+		 * that would confuse statistics.
 		 */
-		found_page = find_get_page(&swapper_space, entry.val);
+		spin_lock(&swapper_space.tree_lock);
+		found_page = radix_tree_lookup(&swapper_space.page_tree,
+						entry.val);
+		if (found_page)
+			page_cache_get(found_page);
+		spin_unlock(&swapper_space.tree_lock);
 		if (found_page)
 			break;
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 44e214da0270..c3ece5503ddb 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -304,16 +304,16 @@ static int exclusive_swap_page(struct page *page)
 	struct swap_info_struct * p;
 	swp_entry_t entry;
 
-	entry.val = page->index;
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (p) {
 		/* Is the only swap cache user the cache itself? */
 		if (p->swap_map[swp_offset(entry)] == 1) {
-			/* Recheck the page count with the pagecache lock held.. */
-			spin_lock_irq(&swapper_space.tree_lock);
-			if (page_count(page) - !!PagePrivate(page) == 2)
+			/* Recheck the page count with the swapcache lock held.. */
+			spin_lock(&swapper_space.tree_lock);
+			if (page_count(page) == 2)
 				retval = 1;
-			spin_unlock_irq(&swapper_space.tree_lock);
+			spin_unlock(&swapper_space.tree_lock);
 		}
 		swap_info_put(p);
 	}
@@ -372,7 +372,7 @@ int remove_exclusive_swap_page(struct page *page)
 	if (page_count(page) != 2) /* 2: us + cache */
 		return 0;
 
-	entry.val = page->index;
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (!p)
 		return 0;
@@ -380,14 +380,14 @@ int remove_exclusive_swap_page(struct page *page)
 	/* Is the only swap cache user the cache itself? */
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
-		/* Recheck the page count with the pagecache lock held.. */
-		spin_lock_irq(&swapper_space.tree_lock);
+		/* Recheck the page count with the swapcache lock held.. */
+		spin_lock(&swapper_space.tree_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		spin_unlock_irq(&swapper_space.tree_lock);
+		spin_unlock(&swapper_space.tree_lock);
 	}
 	swap_info_put(p);
 
@@ -410,8 +410,14 @@ void free_swap_and_cache(swp_entry_t entry)
 
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, swp_offset(entry)) == 1)
-			page = find_trylock_page(&swapper_space, entry.val);
+		if (swap_entry_free(p, swp_offset(entry)) == 1) {
+			spin_lock(&swapper_space.tree_lock);
+			page = radix_tree_lookup(&swapper_space.page_tree,
+				entry.val);
+			if (page && TestSetPageLocked(page))
+				page = NULL;
+			spin_unlock(&swapper_space.tree_lock);
+		}
 		swap_info_put(p);
 	}
 	if (page) {
@@ -1053,14 +1059,14 @@ int page_queue_congested(struct page *page)
 
 	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
 
-	bdi = page->mapping->backing_dev_info;
 	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page->index };
+		swp_entry_t entry = { .val = page->private };
 		struct swap_info_struct *sis;
 
 		sis = get_swap_info_struct(swp_type(entry));
 		bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
-	}
+	} else
+		bdi = page->mapping->backing_dev_info;
 	return bdi_write_congested(bdi);
 }
 #endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 35fbca1c5168..34151f9aed30 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -176,20 +176,20 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask)
 /* Must be called with page's rmap lock held. */
 static inline int page_mapping_inuse(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping;
 
 	/* Page is in somebody's page tables. */
 	if (page_mapped(page))
 		return 1;
 
-	/* XXX: does this happen ? */
-	if (!mapping)
-		return 0;
-
 	/* Be more reluctant to reclaim swapcache than pagecache */
 	if (PageSwapCache(page))
 		return 1;
 
+	mapping = page_mapping(page);
+	if (!mapping)
+		return 0;
+
 	/* File is mmap'd by somebody. */
 	if (!list_empty(&mapping->i_mmap))
 		return 1;
@@ -233,7 +233,7 @@ static void handle_write_error(struct address_space *mapping,
 				struct page *page, int error)
 {
 	lock_page(page);
-	if (page->mapping == mapping) {
+	if (page_mapping(page) == mapping) {
 		if (error == -ENOSPC)
 			set_bit(AS_ENOSPC, &mapping->flags);
 		else
@@ -286,27 +286,28 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 			goto activate_locked;
 		}
 
-		mapping = page->mapping;
+		mapping = page_mapping(page);
+		may_enter_fs = (gfp_mask & __GFP_FS);
 
 #ifdef CONFIG_SWAP
 		/*
-		 * Anonymous process memory without backing store. Try to
-		 * allocate it some swap space here.
+		 * Anonymous process memory not yet in the swap cache?
+		 * Try to allocate it some swap space here.
 		 *
 		 * XXX: implement swap clustering ?
 		 */
-		if (page_mapped(page) && !mapping && !PagePrivate(page)) {
+		if (PageAnon(page) && !PageSwapCache(page)) {
 			rmap_unlock(page);
 			if (!add_to_swap(page))
 				goto activate_locked;
 			rmap_lock(page);
-			mapping = page->mapping;
+		}
+		if (PageSwapCache(page)) {
+			mapping = &swapper_space;
+			may_enter_fs = (gfp_mask & __GFP_IO);
 		}
 #endif /* CONFIG_SWAP */
 
-		may_enter_fs = (gfp_mask & __GFP_FS) ||
-				(PageSwapCache(page) && (gfp_mask & __GFP_IO));
-
 		/*
 		 * The page is mapped into the page tables of one or more
 		 * processes. Try to unmap it here.
@@ -427,7 +428,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask,
 
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page->index };
+			swp_entry_t swap = { .val = page->private };
 			__delete_from_swap_cache(page);
 			spin_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
@@ -669,8 +670,7 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in,
 		 * FIXME: need to consider page_count(page) here if/when we
 		 * reap orphaned pages via the LRU (Daniel's locking stuff)
 		 */
-		if (total_swap_pages == 0 && !page->mapping &&
-						!PagePrivate(page)) {
+		if (total_swap_pages == 0 && PageAnon(page)) {
 			list_add(&page->lru, &l_active);
 			continue;
 		}
-- 
cgit v1.2.3


From fbf7adfafae19dd118facbbfe011510ba6aa8315 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:54:31 -0700
Subject: [PATCH] rmap 3 arches + mapping_mapped

From: Hugh Dickins <hugh@veritas.com>

Some arches refer to page->mapping for their dcache flushing: use
page_mapping(page) for safety, to avoid confusion on anon pages, which will
store a different pointer there - though in most cases flush_dcache_page is
being applied to pagecache pages.

arm has a useful mapping_mapped macro: move that to generic, and add
mapping_writably_mapped, to avoid explicit list_empty checks on i_mmap and
i_mmap_shared in several places.

It is very tempting to add page_mapped(page) tests, perhaps along with the
mapping_writably_mapped tests in do_generic_mapping_read and
do_shmem_file_read, to cut down on wasted flush_dcache effort; but the
serialization is not obvious, so that is too unsafe to do in a hurry.
---
 arch/arm/mm/fault-armv.c        |  4 ++--
 arch/mips/mm/cache.c            |  9 +++------
 arch/parisc/kernel/cache.c      |  4 ++--
 arch/sparc64/kernel/smp.c       |  8 ++++----
 arch/sparc64/mm/init.c          | 14 ++++++--------
 fs/locks.c                      | 22 ++++++++--------------
 fs/xfs/linux/xfs_vnode.h        |  4 +---
 include/asm-arm/cacheflush.h    | 12 ++++--------
 include/asm-parisc/cacheflush.h |  3 +--
 include/asm-sh/pgalloc.h        |  4 ++--
 include/linux/fs.h              | 20 ++++++++++++++++++++
 mm/filemap.c                    |  2 +-
 mm/shmem.c                      |  2 +-
 mm/vmscan.c                     |  9 ++-------
 14 files changed, 57 insertions(+), 60 deletions(-)

(limited to 'include')

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index 7aa6398abdb0..8c5ad6a4c2c0 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -191,7 +191,7 @@ void __flush_dcache_page(struct page *page)
 
 	__cpuc_flush_dcache_page(page_address(page));
 
-	if (!page->mapping)
+	if (!page_mapping(page))
 		return;
 
 	/*
@@ -292,7 +292,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 	if (!pfn_valid(pfn))
 		return;
 	page = pfn_to_page(pfn);
-	if (page->mapping) {
+	if (page_mapping(page)) {
 		int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags);
 
 		if (dirty)
diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c
index d384be0cb00e..5c9e9855caa8 100644
--- a/arch/mips/mm/cache.c
+++ b/arch/mips/mm/cache.c
@@ -57,16 +57,13 @@ void flush_dcache_page(struct page *page)
 {
 	unsigned long addr;
 
-	if (page->mapping &&
-	    list_empty(&page->mapping->i_mmap) &&
-	    list_empty(&page->mapping->i_mmap_shared)) {
+	if (page_mapping(page) && !mapping_mapped(page->mapping)) {
 		SetPageDcacheDirty(page);
-
 		return;
 	}
 
 	/*
-	 * We could delay the flush for the !page->mapping case too.  But that
+	 * We could delay the flush for the !page_mapping case too.  But that
 	 * case is for exec env/arg pages and those are %99 certainly going to
 	 * get faulted into the tlb (and thus flushed) anyways.
 	 */
@@ -81,7 +78,7 @@ void __update_cache(struct vm_area_struct *vma, unsigned long address,
 	unsigned long pfn, addr;
 
 	pfn = pte_pfn(pte);
-	if (pfn_valid(pfn) && (page = pfn_to_page(pfn), page->mapping) &&
+	if (pfn_valid(pfn) && (page = pfn_to_page(pfn), page_mapping(page)) &&
 	    Page_dcache_dirty(page)) {
 		if (pages_do_alias((unsigned long)page_address(page),
 		                   address & PAGE_MASK)) {
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index a23bb15dc2f8..ac36c927ab5b 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -68,7 +68,7 @@ update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t pte)
 {
 	struct page *page = pte_page(pte);
 
-	if (VALID_PAGE(page) && page->mapping &&
+	if (VALID_PAGE(page) && page_mapping(page) &&
 	    test_bit(PG_dcache_dirty, &page->flags)) {
 
 		flush_kernel_dcache_page(page_address(page));
@@ -234,7 +234,7 @@ void __flush_dcache_page(struct page *page)
 
 	flush_kernel_dcache_page(page_address(page));
 
-	if (!page->mapping)
+	if (!page_mapping(page))
 		return;
 	/* check shared list first if it's not empty...it's usually
 	 * the shortest */
diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c
index ce479585c484..88fe647652f4 100644
--- a/arch/sparc64/kernel/smp.c
+++ b/arch/sparc64/kernel/smp.c
@@ -671,9 +671,9 @@ static __inline__ void __local_flush_dcache_page(struct page *page)
 #if (L1DCACHE_SIZE > PAGE_SIZE)
 	__flush_dcache_page(page->virtual,
 			    ((tlb_type == spitfire) &&
-			     page->mapping != NULL));
+			     page_mapping(page) != NULL));
 #else
-	if (page->mapping != NULL &&
+	if (page_mapping(page) != NULL &&
 	    tlb_type == spitfire)
 		__flush_icache_page(__pa(page->virtual));
 #endif
@@ -694,7 +694,7 @@ void smp_flush_dcache_page_impl(struct page *page, int cpu)
 		if (tlb_type == spitfire) {
 			data0 =
 				((u64)&xcall_flush_dcache_page_spitfire);
-			if (page->mapping != NULL)
+			if (page_mapping(page) != NULL)
 				data0 |= ((u64)1 << 32);
 			spitfire_xcall_deliver(data0,
 					       __pa(page->virtual),
@@ -727,7 +727,7 @@ void flush_dcache_page_all(struct mm_struct *mm, struct page *page)
 		goto flush_self;
 	if (tlb_type == spitfire) {
 		data0 = ((u64)&xcall_flush_dcache_page_spitfire);
-		if (page->mapping != NULL)
+		if (page_mapping(page) != NULL)
 			data0 |= ((u64)1 << 32);
 		spitfire_xcall_deliver(data0,
 				       __pa(page->virtual),
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 690120faf6c8..81e68ee52f8d 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -139,9 +139,9 @@ __inline__ void flush_dcache_page_impl(struct page *page)
 #if (L1DCACHE_SIZE > PAGE_SIZE)
 	__flush_dcache_page(page->virtual,
 			    ((tlb_type == spitfire) &&
-			     page->mapping != NULL));
+			     page_mapping(page) != NULL));
 #else
-	if (page->mapping != NULL &&
+	if (page_mapping(page) != NULL &&
 	    tlb_type == spitfire)
 		__flush_icache_page(__pa(page->virtual));
 #endif
@@ -203,7 +203,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p
 
 	pfn = pte_pfn(pte);
 	if (pfn_valid(pfn) &&
-	    (page = pfn_to_page(pfn), page->mapping) &&
+	    (page = pfn_to_page(pfn), page_mapping(page)) &&
 	    ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) {
 		int cpu = ((pg_flags >> 24) & (NR_CPUS - 1UL));
 
@@ -227,9 +227,7 @@ void flush_dcache_page(struct page *page)
 	int dirty = test_bit(PG_dcache_dirty, &page->flags);
 	int dirty_cpu = dcache_dirty_cpu(page);
 
-	if (page->mapping &&
-	    list_empty(&page->mapping->i_mmap) &&
-	    list_empty(&page->mapping->i_mmap_shared)) {
+	if (page_mapping(page) && !mapping_mapped(page->mapping)) {
 		if (dirty) {
 			if (dirty_cpu == smp_processor_id())
 				return;
@@ -237,7 +235,7 @@ void flush_dcache_page(struct page *page)
 		}
 		set_dcache_dirty(page);
 	} else {
-		/* We could delay the flush for the !page->mapping
+		/* We could delay the flush for the !page_mapping
 		 * case too.  But that case is for exec env/arg
 		 * pages and those are %99 certainly going to get
 		 * faulted into the tlb (and thus flushed) anyways.
@@ -279,7 +277,7 @@ static inline void flush_cache_pte_range(struct mm_struct *mm, pmd_t *pmd, unsig
 			if (!pfn_valid(pfn))
 				continue;
 			page = pfn_to_page(pfn);
-			if (PageReserved(page) || !page->mapping)
+			if (PageReserved(page) || !page_mapping(page))
 				continue;
 			pgaddr = (unsigned long) page_address(page);
 			uaddr = address + offset;
diff --git a/fs/locks.c b/fs/locks.c
index c6a6010a7218..da593493962c 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1453,13 +1453,10 @@ int fcntl_setlk(struct file *filp, unsigned int cmd, struct flock __user *l)
 	 * and shared.
 	 */
 	if (IS_MANDLOCK(inode) &&
-	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
-		struct address_space *mapping = filp->f_mapping;
-
-		if (!list_empty(&mapping->i_mmap_shared)) {
-			error = -EAGAIN;
-			goto out;
-		}
+	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
+	    mapping_writably_mapped(filp->f_mapping)) {
+		error = -EAGAIN;
+		goto out;
 	}
 
 	error = flock_to_posix_lock(filp, file_lock, &flock);
@@ -1591,13 +1588,10 @@ int fcntl_setlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l)
 	 * and shared.
 	 */
 	if (IS_MANDLOCK(inode) &&
-	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) {
-		struct address_space *mapping = filp->f_mapping;
-
-		if (!list_empty(&mapping->i_mmap_shared)) {
-			error = -EAGAIN;
-			goto out;
-		}
+	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
+	    mapping_writably_mapped(filp->f_mapping)) {
+		error = -EAGAIN;
+		goto out;
 	}
 
 	error = flock64_to_posix_lock(filp, file_lock, &flock);
diff --git a/fs/xfs/linux/xfs_vnode.h b/fs/xfs/linux/xfs_vnode.h
index 6736f7aa2b97..af0b65fe5136 100644
--- a/fs/xfs/linux/xfs_vnode.h
+++ b/fs/xfs/linux/xfs_vnode.h
@@ -596,9 +596,7 @@ static __inline__ void vn_flagclr(struct vnode *vp, uint flag)
 /*
  * Some useful predicates.
  */
-#define VN_MAPPED(vp)	\
-	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \
-	(!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared))))
+#define VN_MAPPED(vp)	mapping_mapped(LINVFS_GET_IP(vp)->i_mapping)
 #define VN_CACHED(vp)	(LINVFS_GET_IP(vp)->i_mapping->nrpages)
 #define VN_DIRTY(vp)	mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \
 					PAGECACHE_TAG_DIRTY)
diff --git a/include/asm-arm/cacheflush.h b/include/asm-arm/cacheflush.h
index 6968e8e90c3e..91b16cc3f502 100644
--- a/include/asm-arm/cacheflush.h
+++ b/include/asm-arm/cacheflush.h
@@ -283,23 +283,19 @@ flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr)
  * flush_dcache_page is used when the kernel has written to the page
  * cache page at virtual address page->virtual.
  *
- * If this page isn't mapped (ie, page->mapping = NULL), or it has
- * userspace mappings (page->mapping->i_mmap or page->mapping->i_mmap_shared)
- * then we _must_ always clean + invalidate the dcache entries associated
- * with the kernel mapping.
+ * If this page isn't mapped (ie, page_mapping == NULL), or it might
+ * have userspace mappings, then we _must_ always clean + invalidate
+ * the dcache entries associated with the kernel mapping.
  *
  * Otherwise we can defer the operation, and clean the cache when we are
  * about to change to user space.  This is the same method as used on SPARC64.
  * See update_mmu_cache for the user space part.
  */
-#define mapping_mapped(map)	(!list_empty(&(map)->i_mmap) || \
-				 !list_empty(&(map)->i_mmap_shared))
-
 extern void __flush_dcache_page(struct page *);
 
 static inline void flush_dcache_page(struct page *page)
 {
-	if (page->mapping && !mapping_mapped(page->mapping))
+	if (page_mapping(page) && !mapping_mapped(page->mapping))
 		set_bit(PG_dcache_dirty, &page->flags);
 	else
 		__flush_dcache_page(page);
diff --git a/include/asm-parisc/cacheflush.h b/include/asm-parisc/cacheflush.h
index 52b0c6a96aea..7a77986e3738 100644
--- a/include/asm-parisc/cacheflush.h
+++ b/include/asm-parisc/cacheflush.h
@@ -69,8 +69,7 @@ extern void __flush_dcache_page(struct page *page);
 
 static inline void flush_dcache_page(struct page *page)
 {
-	if (page->mapping && list_empty(&page->mapping->i_mmap) &&
-			list_empty(&page->mapping->i_mmap_shared)) {
+	if (page_mapping(page) && !mapping_mapped(page->mapping)) {
 		set_bit(PG_dcache_dirty, &page->flags);
 	} else {
 		__flush_dcache_page(page);
diff --git a/include/asm-sh/pgalloc.h b/include/asm-sh/pgalloc.h
index bd02728a69d5..4584c9e37a75 100644
--- a/include/asm-sh/pgalloc.h
+++ b/include/asm-sh/pgalloc.h
@@ -101,8 +101,8 @@ static inline pte_t ptep_get_and_clear(pte_t *ptep)
 		unsigned long pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (!page->mapping
-			    || list_empty(&page->mapping->i_mmap_shared))
+			if (!page_mapping(page) ||
+			    !mapping_writably_mapped(page->mapping))
 				__clear_bit(PG_mapped, &page->flags);
 		}
 	}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c7f0052b4abd..3d7c320d675e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -373,6 +373,26 @@ struct block_device {
 
 int mapping_tagged(struct address_space *mapping, int tag);
 
+/*
+ * Might pages of this file be mapped into userspace?
+ */
+static inline int mapping_mapped(struct address_space *mapping)
+{
+	return	!list_empty(&mapping->i_mmap) ||
+		!list_empty(&mapping->i_mmap_shared);
+}
+
+/*
+ * Might pages of this file have been modified in userspace?
+ * Note that i_mmap_shared holds all the VM_SHARED vmas: do_mmap_pgoff
+ * marks vma as VM_SHARED if it is shared, and the file was opened for
+ * writing i.e. vma may be mprotected writable even if now readonly.
+ */
+static inline int mapping_writably_mapped(struct address_space *mapping)
+{
+	return	!list_empty(&mapping->i_mmap_shared);
+}
+
 /*
  * Use sequence counter to get consistent i_size on 32-bit processors.
  */
diff --git a/mm/filemap.c b/mm/filemap.c
index ca8fc1148296..c83a97b5aed7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -660,7 +660,7 @@ page_ok:
 		 * virtual addresses, take care about potential aliasing
 		 * before reading the page on the kernel side.
 		 */
-		if (!list_empty(&mapping->i_mmap_shared))
+		if (mapping_writably_mapped(mapping))
 			flush_dcache_page(page);
 
 		/*
diff --git a/mm/shmem.c b/mm/shmem.c
index 345e04cb0f6c..31001df23c3d 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1340,7 +1340,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
 			 * virtual addresses, take care about potential aliasing
 			 * before reading the page on the kernel side.
 			 */
-			if (!list_empty(&mapping->i_mmap_shared))
+			if (mapping_writably_mapped(mapping))
 				flush_dcache_page(page);
 			/*
 			 * Mark the page accessed if we read the beginning.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 34151f9aed30..55d8feae21a2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -190,13 +190,8 @@ static inline int page_mapping_inuse(struct page *page)
 	if (!mapping)
 		return 0;
 
-	/* File is mmap'd by somebody. */
-	if (!list_empty(&mapping->i_mmap))
-		return 1;
-	if (!list_empty(&mapping->i_mmap_shared))
-		return 1;
-
-	return 0;
+	/* File is mmap'd by somebody? */
+	return mapping_mapped(mapping);
 }
 
 static inline int is_page_cache_freeable(struct page *page)
-- 
cgit v1.2.3


From 66fb1123391ed704997523cb6daea7fae00dcb83 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:54:44 -0700
Subject: [PATCH] rename page_to_nodenum()

From: "Martin J. Bligh" <mbligh@aracnet.com>

I'd prefer we renamed this to page_to_nid() before anyone starts using it.
This fits with the naming convention of everything else (pfn_to_nid, etc).
Nobody uses it right now - I grepped the whole tree.
---
 include/linux/mm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 796f498658d6..d453a0ab62a7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -351,7 +351,7 @@ static inline unsigned long page_zonenum(struct page *page)
 {
 	return (page->flags >> NODEZONE_SHIFT) & (~(~0UL << ZONES_SHIFT));
 }
-static inline unsigned long page_nodenum(struct page *page)
+static inline unsigned long page_to_nid(struct page *page)
 {
 	return (page->flags >> (NODEZONE_SHIFT + ZONES_SHIFT));
 }
-- 
cgit v1.2.3


From 816d607321a4ec72f203d350f93e027e3434d7a4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:56:55 -0700
Subject: [PATCH] i386 probe_roms(): preparation

From: Rene Herman <rene.herman@keyaccess.nl>

The i386 probe_roms() function has a fair number of problems currently:

- When you actually have an adapter ROM in the machine, your video ROM
  disappears.  This is due to the pc9800 subarch merge that split it up in
  probe_video_rom(int roms) and probe_extension_roms(int roms), but expects a
  "roms++" in probe_video_roms() to have an effect outside of that function.

- The majority of VGA adapters these days host a ROM larger than 32K, yet
  the current code hardcodes a 32K ROM.  The VGA BIOS "length" byte is
  normally valid (it in fact needs to be for a regular mainboard BIOS to
  accept it) and I've verified on a few dozen very new to very old VGAs that
  it is.  However, assuming someone actually did not check for the length and
  checksum there for a reason, the safe thing to do here is accept the length
  byte when we also get a valid checksum.

- The current code scans 0xc0000 to 0xdffff for a video ROM while the
  standard PC thing to do (that which the BIOS does) is only scan for a video
  ROM starting between 0xc0000 and 0xc7fff.  This means that on a headless-
  (or BIOS-less monochrome adapter-) box, the first adapter ROM found
  triggers the registration of a 32K "Video ROM" at hardcoded address
  0xc0000, even when _nothing_ is present between 0xc0000 and 0xc7fff.

- The current adapter ROM scan stops at 0xdffff, whether or not an
  extension ROM is present at 0xe0000.  The PC thing to do is scan 0xc8000
  up to 0xdffff if an extension ROM is present, and up to 0xeffff when it's not
  (it's not/hardly ever).

- Adapter ROMs are called "Extension ROM", but the latter term is really
  better reserved for a motherboard extension ROM.

- Currently, the code happily starts scanning through a ROM it just
  registered looking for the next one (just does += 2048, even when that's
  inside the previous ROM) which is at least silly.

Unfortunately, this code is "subarched" between mach-default and
mach-pc9800, meaning the patch got a bit involved. Currently all this
code, and gobs of data, is defined (not just declared) in the header:

   include/asm-i386/mach-{default,pc9800}/mach_resources.h

which isn't nice. That .h really wants to be a .c. The first patch, in
the next message, does not change any code but only undoes the
probe_video_rom / probe_extension_roms split and moves the code to a new
file

   arch/i386/mach-{default,pc9800}/std_resources.c

with a header

   include/asm-i386/std_resources.h

for the prototypes only. The second patch overhauls the code itself for
mach-default. Please see comments on top of that patch for (yet more)
comments. It's tested on various machines, with and without adapter ROMs.

I haven't touched pc9800. Nothing should have changed though. The pc9800
author, as given in the code, is CCed.

Also, x86-64 inherits the probe_roms() code from 2.4, and while it
doesn't have the subarch specific problems, it has all others. I'll
convert it too if this i386 version is deemed desirable.


This patch doesn't change any code, just moves stuff from the
"mach_resources.h" header to a "std_resources.c" subarch specific file, and
introduces a "std_resources.h" header for the prototypes.
---
 arch/i386/kernel/setup.c                       |  21 +--
 arch/i386/mach-default/Makefile                |   2 +-
 arch/i386/mach-default/std_resources.c         | 108 ++++++++++++++
 arch/i386/mach-pc9800/Makefile                 |   2 +-
 arch/i386/mach-pc9800/std_resources.c          | 195 +++++++++++++++++++++++++
 include/asm-i386/mach-default/mach_resources.h | 106 --------------
 include/asm-i386/mach-pc9800/mach_resources.h  | 191 ------------------------
 include/asm-i386/std_resources.h               |  14 ++
 8 files changed, 322 insertions(+), 317 deletions(-)
 create mode 100644 arch/i386/mach-default/std_resources.c
 create mode 100644 arch/i386/mach-pc9800/std_resources.c
 delete mode 100644 include/asm-i386/mach-default/mach_resources.h
 delete mode 100644 include/asm-i386/mach-pc9800/mach_resources.h
 create mode 100644 include/asm-i386/std_resources.h

(limited to 'include')

diff --git a/arch/i386/kernel/setup.c b/arch/i386/kernel/setup.c
index bcc5b24e5fd2..4d193a2b185a 100644
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -47,8 +47,8 @@
 #include <asm/sections.h>
 #include <asm/io_apic.h>
 #include <asm/ist.h>
+#include <asm/std_resources.h>
 #include "setup_arch_pre.h"
-#include "mach_resources.h"
 
 /* This value is set up by the early boot code to point to the value
    immediately after the boot time page tables.  It contains a *physical*
@@ -135,19 +135,6 @@ unsigned char __initdata boot_params[PARAM_SIZE];
 static struct resource code_resource = { "Kernel code", 0x100000, 0 };
 static struct resource data_resource = { "Kernel data", 0, 0 };
 
-static void __init probe_roms(void)
-{
-	int roms = 1;
-
-	request_resource(&iomem_resource, rom_resources+0);
-
-	/* Video ROM is standard at C000:0000 - C7FF:0000, check signature */
-	probe_video_rom(roms);
-
-	/* Extension roms */
-	probe_extension_roms(roms);
-}
-
 static void __init limit_regions(unsigned long long size)
 {
 	unsigned long long current_addr = 0;
@@ -951,19 +938,17 @@ legacy_init_iomem_resources(struct resource *code_resource, struct resource *dat
 static void __init register_memory(unsigned long max_low_pfn)
 {
 	unsigned long low_mem_size;
-	int i;
 
 	if (efi_enabled)
 		efi_initialize_iomem_resources(&code_resource, &data_resource);
 	else
 		legacy_init_iomem_resources(&code_resource, &data_resource);
 
- 	 /* EFI systems may still have VGA */
+	/* EFI systems may still have VGA */
 	request_graphics_resource();
 
 	/* request I/O space for devices used on all i[345]86 PCs */
-	for (i = 0; i < STANDARD_IO_RESOURCES; i++)
-		request_resource(&ioport_resource, standard_io_resources+i);
+	request_standard_io_resources();
 
 	/* Tell the PCI layer not to allocate too close to the RAM area.. */
 	low_mem_size = ((max_low_pfn << PAGE_SHIFT) + 0xfffff) & ~0xfffff;
diff --git a/arch/i386/mach-default/Makefile b/arch/i386/mach-default/Makefile
index e95bb0237921..7fff76564d29 100644
--- a/arch/i386/mach-default/Makefile
+++ b/arch/i386/mach-default/Makefile
@@ -2,4 +2,4 @@
 # Makefile for the linux kernel.
 #
 
-obj-y				:= setup.o topology.o
+obj-y				:= setup.o topology.o std_resources.o
diff --git a/arch/i386/mach-default/std_resources.c b/arch/i386/mach-default/std_resources.c
new file mode 100644
index 000000000000..475d54032ec0
--- /dev/null
+++ b/arch/i386/mach-default/std_resources.c
@@ -0,0 +1,108 @@
+/*
+ *  Machine specific resource allocation for generic.
+ *  Split out from setup.c by Osamu Tomita <tomita@cinet.co.jp>
+ */
+
+#include <linux/ioport.h>
+#include <asm/io.h>
+#include <asm/std_resources.h>
+
+static struct resource standard_io_resources[] = {
+	{ "dma1", 0x00, 0x1f, IORESOURCE_BUSY },
+	{ "pic1", 0x20, 0x21, IORESOURCE_BUSY },
+	{ "timer", 0x40, 0x5f, IORESOURCE_BUSY },
+	{ "keyboard", 0x60, 0x6f, IORESOURCE_BUSY },
+	{ "dma page reg", 0x80, 0x8f, IORESOURCE_BUSY },
+	{ "pic2", 0xa0, 0xa1, IORESOURCE_BUSY },
+	{ "dma2", 0xc0, 0xdf, IORESOURCE_BUSY },
+	{ "fpu", 0xf0, 0xff, IORESOURCE_BUSY }
+};
+
+#define STANDARD_IO_RESOURCES (sizeof(standard_io_resources)/sizeof(struct resource))
+
+static struct resource vram_resource = { "Video RAM area", 0xa0000, 0xbffff, IORESOURCE_BUSY };
+
+/* System ROM resources */
+#define MAXROMS 6
+static struct resource rom_resources[MAXROMS] = {
+	{ "System ROM", 0xF0000, 0xFFFFF, IORESOURCE_BUSY },
+	{ "Video ROM", 0xc0000, 0xc7fff, IORESOURCE_BUSY }
+};
+
+#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
+
+void __init probe_roms(void)
+{
+	unsigned long base;
+	unsigned char *romstart;
+	int roms = 1;
+
+	request_resource(&iomem_resource, rom_resources+0);
+
+	/* Video ROM is standard at C000:0000 - C7FF:0000, check signature */
+	for (base = 0xC0000; base < 0xE0000; base += 2048) {
+		romstart = isa_bus_to_virt(base);
+		if (!romsignature(romstart))
+			continue;
+		request_resource(&iomem_resource, rom_resources + roms);
+		roms++;
+		break;
+	}
+
+	/* Extension roms at C800:0000 - DFFF:0000 */
+	for (base = 0xC8000; base < 0xE0000; base += 2048) {
+		unsigned long length;
+
+		romstart = isa_bus_to_virt(base);
+		if (!romsignature(romstart))
+			continue;
+		length = romstart[2] * 512;
+		if (length) {
+			unsigned int i;
+			unsigned char chksum;
+
+			chksum = 0;
+			for (i = 0; i < length; i++)
+				chksum += romstart[i];
+
+			/* Good checksum? */
+			if (!chksum) {
+				rom_resources[roms].start = base;
+				rom_resources[roms].end = base + length - 1;
+				rom_resources[roms].name = "Extension ROM";
+				rom_resources[roms].flags = IORESOURCE_BUSY;
+
+				request_resource(&iomem_resource, rom_resources + roms);
+				roms++;
+				if (roms >= MAXROMS)
+					return;
+			}
+		}
+	}
+
+	/* Final check for motherboard extension rom at E000:0000 */
+	base = 0xE0000;
+	romstart = isa_bus_to_virt(base);
+
+	if (romsignature(romstart)) {
+		rom_resources[roms].start = base;
+		rom_resources[roms].end = base + 65535;
+		rom_resources[roms].name = "Extension ROM";
+		rom_resources[roms].flags = IORESOURCE_BUSY;
+
+		request_resource(&iomem_resource, rom_resources + roms);
+	}
+}
+
+void __init request_graphics_resource(void)
+{
+	request_resource(&iomem_resource, &vram_resource);
+}
+
+void __init request_standard_io_resources(void)
+{
+	int i;
+
+	for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+		request_resource(&ioport_resource, standard_io_resources+i);
+}
diff --git a/arch/i386/mach-pc9800/Makefile b/arch/i386/mach-pc9800/Makefile
index e95bb0237921..7fff76564d29 100644
--- a/arch/i386/mach-pc9800/Makefile
+++ b/arch/i386/mach-pc9800/Makefile
@@ -2,4 +2,4 @@
 # Makefile for the linux kernel.
 #
 
-obj-y				:= setup.o topology.o
+obj-y				:= setup.o topology.o std_resources.o
diff --git a/arch/i386/mach-pc9800/std_resources.c b/arch/i386/mach-pc9800/std_resources.c
new file mode 100644
index 000000000000..06290bf82956
--- /dev/null
+++ b/arch/i386/mach-pc9800/std_resources.c
@@ -0,0 +1,195 @@
+/*
+ *  Machine specific resource allocation for PC-9800.
+ *  Written by Osamu Tomita <tomita@cinet.co.jp>
+ */
+
+#include <linux/ioport.h>
+#include <asm/io.h>
+#include <asm/std_resources.h>
+
+static char str_pic1[] = "pic1";
+static char str_dma[] = "dma";
+static char str_pic2[] = "pic2";
+static char str_calender_clock[] = "calender clock";
+static char str_system[] = "system";
+static char str_nmi_control[] = "nmi control";
+static char str_kanji_rom[] = "kanji rom";
+static char str_keyboard[] = "keyboard";
+static char str_text_gdc[] = "text gdc";
+static char str_crtc[] = "crtc";
+static char str_timer[] = "timer";
+static char str_graphic_gdc[] = "graphic gdc";
+static char str_dma_ex_bank[] = "dma ex. bank";
+static char str_beep_freq[] = "beep freq.";
+static char str_mouse_pio[] = "mouse pio";
+struct resource standard_io_resources[] = {
+	{ str_pic1, 0x00, 0x00, IORESOURCE_BUSY },
+	{ str_dma, 0x01, 0x01, IORESOURCE_BUSY },
+	{ str_pic1, 0x02, 0x02, IORESOURCE_BUSY },
+	{ str_dma, 0x03, 0x03, IORESOURCE_BUSY },
+	{ str_dma, 0x05, 0x05, IORESOURCE_BUSY },
+	{ str_dma, 0x07, 0x07, IORESOURCE_BUSY },
+	{ str_pic2, 0x08, 0x08, IORESOURCE_BUSY },
+	{ str_dma, 0x09, 0x09, IORESOURCE_BUSY },
+	{ str_pic2, 0x0a, 0x0a, IORESOURCE_BUSY },
+	{ str_dma, 0x0b, 0x0b, IORESOURCE_BUSY },
+	{ str_dma, 0x0d, 0x0d, IORESOURCE_BUSY },
+	{ str_dma, 0x0f, 0x0f, IORESOURCE_BUSY },
+	{ str_dma, 0x11, 0x11, IORESOURCE_BUSY },
+	{ str_dma, 0x13, 0x13, IORESOURCE_BUSY },
+	{ str_dma, 0x15, 0x15, IORESOURCE_BUSY },
+	{ str_dma, 0x17, 0x17, IORESOURCE_BUSY },
+	{ str_dma, 0x19, 0x19, IORESOURCE_BUSY },
+	{ str_dma, 0x1b, 0x1b, IORESOURCE_BUSY },
+	{ str_dma, 0x1d, 0x1d, IORESOURCE_BUSY },
+	{ str_dma, 0x1f, 0x1f, IORESOURCE_BUSY },
+	{ str_calender_clock, 0x20, 0x20, 0 },
+	{ str_dma, 0x21, 0x21, IORESOURCE_BUSY },
+	{ str_calender_clock, 0x22, 0x22, 0 },
+	{ str_dma, 0x23, 0x23, IORESOURCE_BUSY },
+	{ str_dma, 0x25, 0x25, IORESOURCE_BUSY },
+	{ str_dma, 0x27, 0x27, IORESOURCE_BUSY },
+	{ str_dma, 0x29, 0x29, IORESOURCE_BUSY },
+	{ str_dma, 0x2b, 0x2b, IORESOURCE_BUSY },
+	{ str_dma, 0x2d, 0x2d, IORESOURCE_BUSY },
+	{ str_system, 0x31, 0x31, IORESOURCE_BUSY },
+	{ str_system, 0x33, 0x33, IORESOURCE_BUSY },
+	{ str_system, 0x35, 0x35, IORESOURCE_BUSY },
+	{ str_system, 0x37, 0x37, IORESOURCE_BUSY },
+	{ str_nmi_control, 0x50, 0x50, IORESOURCE_BUSY },
+	{ str_nmi_control, 0x52, 0x52, IORESOURCE_BUSY },
+	{ "time stamp", 0x5c, 0x5f, IORESOURCE_BUSY },
+	{ str_kanji_rom, 0xa1, 0xa1, IORESOURCE_BUSY },
+	{ str_kanji_rom, 0xa3, 0xa3, IORESOURCE_BUSY },
+	{ str_kanji_rom, 0xa5, 0xa5, IORESOURCE_BUSY },
+	{ str_kanji_rom, 0xa7, 0xa7, IORESOURCE_BUSY },
+	{ str_kanji_rom, 0xa9, 0xa9, IORESOURCE_BUSY },
+	{ str_keyboard, 0x41, 0x41, IORESOURCE_BUSY },
+	{ str_keyboard, 0x43, 0x43, IORESOURCE_BUSY },
+	{ str_text_gdc, 0x60, 0x60, IORESOURCE_BUSY },
+	{ str_text_gdc, 0x62, 0x62, IORESOURCE_BUSY },
+	{ str_text_gdc, 0x64, 0x64, IORESOURCE_BUSY },
+	{ str_text_gdc, 0x66, 0x66, IORESOURCE_BUSY },
+	{ str_text_gdc, 0x68, 0x68, IORESOURCE_BUSY },
+	{ str_text_gdc, 0x6a, 0x6a, IORESOURCE_BUSY },
+	{ str_text_gdc, 0x6c, 0x6c, IORESOURCE_BUSY },
+	{ str_text_gdc, 0x6e, 0x6e, IORESOURCE_BUSY },
+	{ str_crtc, 0x70, 0x70, IORESOURCE_BUSY },
+	{ str_crtc, 0x72, 0x72, IORESOURCE_BUSY },
+	{ str_crtc, 0x74, 0x74, IORESOURCE_BUSY },
+	{ str_crtc, 0x74, 0x74, IORESOURCE_BUSY },
+	{ str_crtc, 0x76, 0x76, IORESOURCE_BUSY },
+	{ str_crtc, 0x78, 0x78, IORESOURCE_BUSY },
+	{ str_crtc, 0x7a, 0x7a, IORESOURCE_BUSY },
+	{ str_timer, 0x71, 0x71, IORESOURCE_BUSY },
+	{ str_timer, 0x73, 0x73, IORESOURCE_BUSY },
+	{ str_timer, 0x75, 0x75, IORESOURCE_BUSY },
+	{ str_timer, 0x77, 0x77, IORESOURCE_BUSY },
+	{ str_graphic_gdc, 0xa0, 0xa0, IORESOURCE_BUSY },
+	{ str_graphic_gdc, 0xa2, 0xa2, IORESOURCE_BUSY },
+	{ str_graphic_gdc, 0xa4, 0xa4, IORESOURCE_BUSY },
+	{ str_graphic_gdc, 0xa6, 0xa6, IORESOURCE_BUSY },
+	{ "cpu", 0xf0, 0xf7, IORESOURCE_BUSY },
+	{ "fpu", 0xf8, 0xff, IORESOURCE_BUSY },
+	{ str_dma_ex_bank, 0x0e05, 0x0e05, 0 },
+	{ str_dma_ex_bank, 0x0e07, 0x0e07, 0 },
+	{ str_dma_ex_bank, 0x0e09, 0x0e09, 0 },
+	{ str_dma_ex_bank, 0x0e0b, 0x0e0b, 0 },
+	{ str_beep_freq, 0x3fd9, 0x3fd9, IORESOURCE_BUSY },
+	{ str_beep_freq, 0x3fdb, 0x3fdb, IORESOURCE_BUSY },
+	{ str_beep_freq, 0x3fdd, 0x3fdd, IORESOURCE_BUSY },
+	{ str_beep_freq, 0x3fdf, 0x3fdf, IORESOURCE_BUSY },
+	/* All PC-9800 have (exactly) one mouse interface.  */
+	{ str_mouse_pio, 0x7fd9, 0x7fd9, 0 },
+	{ str_mouse_pio, 0x7fdb, 0x7fdb, 0 },
+	{ str_mouse_pio, 0x7fdd, 0x7fdd, 0 },
+	{ str_mouse_pio, 0x7fdf, 0x7fdf, 0 },
+	{ "mouse timer", 0xbfdb, 0xbfdb, 0 },
+	{ "mouse irq", 0x98d7, 0x98d7, 0 },
+};
+
+#define STANDARD_IO_RESOURCES (sizeof(standard_io_resources)/sizeof(struct resource))
+
+static struct resource tvram_resource = { "Text VRAM/CG window", 0xa0000, 0xa4fff, IORESOURCE_BUSY };
+static struct resource gvram_brg_resource = { "Graphic VRAM (B/R/G)", 0xa8000, 0xbffff, IORESOURCE_BUSY };
+static struct resource gvram_e_resource = { "Graphic VRAM (E)", 0xe0000, 0xe7fff, IORESOURCE_BUSY };
+
+/* System ROM resources */
+#define MAXROMS 6
+static struct resource rom_resources[MAXROMS] = {
+	{ "System ROM", 0xe8000, 0xfffff, IORESOURCE_BUSY }
+};
+
+void __init probe_roms(void)
+{
+	int i;
+	__u8 *xrom_id;
+	int roms = 1;
+
+	request_resource(&iomem_resource, rom_resources+0);
+
+	xrom_id = (__u8 *) isa_bus_to_virt(PC9800SCA_XROM_ID + 0x10);
+
+	for (i = 0; i < 16; i++) {
+		if (xrom_id[i] & 0x80) {
+			int j;
+
+			for (j = i + 1; j < 16 && (xrom_id[j] & 0x80); j++)
+				;
+			rom_resources[roms].start = 0x0d0000 + i * 0x001000;
+			rom_resources[roms].end = 0x0d0000 + j * 0x001000 - 1;
+			rom_resources[roms].name = "Extension ROM";
+			rom_resources[roms].flags = IORESOURCE_BUSY;
+
+			request_resource(&iomem_resource,
+					  rom_resources + roms);
+			if (++roms >= MAXROMS)
+				return;
+		}
+	}
+}
+
+void __init request_graphics_resource(void)
+{
+	int i;
+
+	if (PC9800_HIGHRESO_P()) {
+		tvram_resource.start = 0xe0000;
+		tvram_resource.end   = 0xe4fff;
+		gvram_brg_resource.name  = "Graphic VRAM";
+		gvram_brg_resource.start = 0xc0000;
+		gvram_brg_resource.end   = 0xdffff;
+	}
+
+	request_resource(&iomem_resource, &tvram_resource);
+	request_resource(&iomem_resource, &gvram_brg_resource);
+	if (!PC9800_HIGHRESO_P())
+		request_resource(&iomem_resource, &gvram_e_resource);
+
+	if (PC9800_HIGHRESO_P() || PC9800_9821_P()) {
+		static char graphics[] = "graphics";
+		static struct resource graphics_resources[] = {
+			{ graphics, 0x9a0, 0x9a0, 0 },
+			{ graphics, 0x9a2, 0x9a2, 0 },
+			{ graphics, 0x9a4, 0x9a4, 0 },
+			{ graphics, 0x9a6, 0x9a6, 0 },
+			{ graphics, 0x9a8, 0x9a8, 0 },
+			{ graphics, 0x9aa, 0x9aa, 0 },
+			{ graphics, 0x9ac, 0x9ac, 0 },
+			{ graphics, 0x9ae, 0x9ae, 0 },
+		};
+
+#define GRAPHICS_RESOURCES (sizeof(graphics_resources)/sizeof(struct resource))
+
+		for (i = 0; i < GRAPHICS_RESOURCES; i++)
+			request_resource(&ioport_resource, graphics_resources + i);
+	}
+}
+
+void __init request_standard_io_resources(void)
+{
+	int i;
+
+	for (i = 0; i < STANDARD_IO_RESOURCES; i++)
+		request_resource(&ioport_resource, standard_io_resources+i);
+}
diff --git a/include/asm-i386/mach-default/mach_resources.h b/include/asm-i386/mach-default/mach_resources.h
deleted file mode 100644
index b37858d0d0bd..000000000000
--- a/include/asm-i386/mach-default/mach_resources.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- *  include/asm-i386/mach-default/mach_resources.h
- *
- *  Machine specific resource allocation for generic.
- *  Split out from setup.c by Osamu Tomita <tomita@cinet.co.jp>
- */
-#ifndef _MACH_RESOURCES_H
-#define _MACH_RESOURCES_H
-
-struct resource standard_io_resources[] = {
-	{ "dma1", 0x00, 0x1f, IORESOURCE_BUSY },
-	{ "pic1", 0x20, 0x21, IORESOURCE_BUSY },
-	{ "timer", 0x40, 0x5f, IORESOURCE_BUSY },
-	{ "keyboard", 0x60, 0x6f, IORESOURCE_BUSY },
-	{ "dma page reg", 0x80, 0x8f, IORESOURCE_BUSY },
-	{ "pic2", 0xa0, 0xa1, IORESOURCE_BUSY },
-	{ "dma2", 0xc0, 0xdf, IORESOURCE_BUSY },
-	{ "fpu", 0xf0, 0xff, IORESOURCE_BUSY }
-};
-
-#define STANDARD_IO_RESOURCES (sizeof(standard_io_resources)/sizeof(struct resource))
-
-static struct resource vram_resource = { "Video RAM area", 0xa0000, 0xbffff, IORESOURCE_BUSY };
-
-/* System ROM resources */
-#define MAXROMS 6
-static struct resource rom_resources[MAXROMS] = {
-	{ "System ROM", 0xF0000, 0xFFFFF, IORESOURCE_BUSY },
-	{ "Video ROM", 0xc0000, 0xc7fff, IORESOURCE_BUSY }
-};
-
-#define romsignature(x) (*(unsigned short *)(x) == 0xaa55)
-
-static inline void probe_video_rom(int roms)
-{
-	unsigned long base;
-	unsigned char *romstart;
-
-
-	/* Video ROM is standard at C000:0000 - C7FF:0000, check signature */
-	for (base = 0xC0000; base < 0xE0000; base += 2048) {
-		romstart = isa_bus_to_virt(base);
-		if (!romsignature(romstart))
-			continue;
-		request_resource(&iomem_resource, rom_resources + roms);
-		roms++;
-		break;
-	}
-}
-
-static inline void probe_extension_roms(int roms)
-{
-	unsigned long base;
-	unsigned char *romstart;
-
-	/* Extension roms at C800:0000 - DFFF:0000 */
-	for (base = 0xC8000; base < 0xE0000; base += 2048) {
-		unsigned long length;
-
-		romstart = isa_bus_to_virt(base);
-		if (!romsignature(romstart))
-			continue;
-		length = romstart[2] * 512;
-		if (length) {
-			unsigned int i;
-			unsigned char chksum;
-
-			chksum = 0;
-			for (i = 0; i < length; i++)
-				chksum += romstart[i];
-
-			/* Good checksum? */
-			if (!chksum) {
-				rom_resources[roms].start = base;
-				rom_resources[roms].end = base + length - 1;
-				rom_resources[roms].name = "Extension ROM";
-				rom_resources[roms].flags = IORESOURCE_BUSY;
-
-				request_resource(&iomem_resource, rom_resources + roms);
-				roms++;
-				if (roms >= MAXROMS)
-					return;
-			}
-		}
-	}
-
-	/* Final check for motherboard extension rom at E000:0000 */
-	base = 0xE0000;
-	romstart = isa_bus_to_virt(base);
-
-	if (romsignature(romstart)) {
-		rom_resources[roms].start = base;
-		rom_resources[roms].end = base + 65535;
-		rom_resources[roms].name = "Extension ROM";
-		rom_resources[roms].flags = IORESOURCE_BUSY;
-
-		request_resource(&iomem_resource, rom_resources + roms);
-	}
-}
-
-static inline void request_graphics_resource(void)
-{
-	request_resource(&iomem_resource, &vram_resource);
-}
-
-#endif /* !_MACH_RESOURCES_H */
diff --git a/include/asm-i386/mach-pc9800/mach_resources.h b/include/asm-i386/mach-pc9800/mach_resources.h
deleted file mode 100644
index bf1b2c470aec..000000000000
--- a/include/asm-i386/mach-pc9800/mach_resources.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- *  include/asm-i386/mach-pc9800/mach_resources.h
- *
- *  Machine specific resource allocation for PC-9800.
- *  Written by Osamu Tomita <tomita@cinet.co.jp>
- */
-#ifndef _MACH_RESOURCES_H
-#define _MACH_RESOURCES_H
-
-static char str_pic1[] = "pic1";
-static char str_dma[] = "dma";
-static char str_pic2[] = "pic2";
-static char str_calender_clock[] = "calender clock";
-static char str_system[] = "system";
-static char str_nmi_control[] = "nmi control";
-static char str_kanji_rom[] = "kanji rom";
-static char str_keyboard[] = "keyboard";
-static char str_text_gdc[] = "text gdc";
-static char str_crtc[] = "crtc";
-static char str_timer[] = "timer";
-static char str_graphic_gdc[] = "graphic gdc";
-static char str_dma_ex_bank[] = "dma ex. bank";
-static char str_beep_freq[] = "beep freq.";
-static char str_mouse_pio[] = "mouse pio";
-struct resource standard_io_resources[] = {
-	{ str_pic1, 0x00, 0x00, IORESOURCE_BUSY },
-	{ str_dma, 0x01, 0x01, IORESOURCE_BUSY },
-	{ str_pic1, 0x02, 0x02, IORESOURCE_BUSY },
-	{ str_dma, 0x03, 0x03, IORESOURCE_BUSY },
-	{ str_dma, 0x05, 0x05, IORESOURCE_BUSY },
-	{ str_dma, 0x07, 0x07, IORESOURCE_BUSY },
-	{ str_pic2, 0x08, 0x08, IORESOURCE_BUSY },
-	{ str_dma, 0x09, 0x09, IORESOURCE_BUSY },
-	{ str_pic2, 0x0a, 0x0a, IORESOURCE_BUSY },
-	{ str_dma, 0x0b, 0x0b, IORESOURCE_BUSY },
-	{ str_dma, 0x0d, 0x0d, IORESOURCE_BUSY },
-	{ str_dma, 0x0f, 0x0f, IORESOURCE_BUSY },
-	{ str_dma, 0x11, 0x11, IORESOURCE_BUSY },
-	{ str_dma, 0x13, 0x13, IORESOURCE_BUSY },
-	{ str_dma, 0x15, 0x15, IORESOURCE_BUSY },
-	{ str_dma, 0x17, 0x17, IORESOURCE_BUSY },
-	{ str_dma, 0x19, 0x19, IORESOURCE_BUSY },
-	{ str_dma, 0x1b, 0x1b, IORESOURCE_BUSY },
-	{ str_dma, 0x1d, 0x1d, IORESOURCE_BUSY },
-	{ str_dma, 0x1f, 0x1f, IORESOURCE_BUSY },
-	{ str_calender_clock, 0x20, 0x20, 0 },
-	{ str_dma, 0x21, 0x21, IORESOURCE_BUSY },
-	{ str_calender_clock, 0x22, 0x22, 0 },
-	{ str_dma, 0x23, 0x23, IORESOURCE_BUSY },
-	{ str_dma, 0x25, 0x25, IORESOURCE_BUSY },
-	{ str_dma, 0x27, 0x27, IORESOURCE_BUSY },
-	{ str_dma, 0x29, 0x29, IORESOURCE_BUSY },
-	{ str_dma, 0x2b, 0x2b, IORESOURCE_BUSY },
-	{ str_dma, 0x2d, 0x2d, IORESOURCE_BUSY },
-	{ str_system, 0x31, 0x31, IORESOURCE_BUSY },
-	{ str_system, 0x33, 0x33, IORESOURCE_BUSY },
-	{ str_system, 0x35, 0x35, IORESOURCE_BUSY },
-	{ str_system, 0x37, 0x37, IORESOURCE_BUSY },
-	{ str_nmi_control, 0x50, 0x50, IORESOURCE_BUSY },
-	{ str_nmi_control, 0x52, 0x52, IORESOURCE_BUSY },
-	{ "time stamp", 0x5c, 0x5f, IORESOURCE_BUSY },
-	{ str_kanji_rom, 0xa1, 0xa1, IORESOURCE_BUSY },
-	{ str_kanji_rom, 0xa3, 0xa3, IORESOURCE_BUSY },
-	{ str_kanji_rom, 0xa5, 0xa5, IORESOURCE_BUSY },
-	{ str_kanji_rom, 0xa7, 0xa7, IORESOURCE_BUSY },
-	{ str_kanji_rom, 0xa9, 0xa9, IORESOURCE_BUSY },
-	{ str_keyboard, 0x41, 0x41, IORESOURCE_BUSY },
-	{ str_keyboard, 0x43, 0x43, IORESOURCE_BUSY },
-	{ str_text_gdc, 0x60, 0x60, IORESOURCE_BUSY },
-	{ str_text_gdc, 0x62, 0x62, IORESOURCE_BUSY },
-	{ str_text_gdc, 0x64, 0x64, IORESOURCE_BUSY },
-	{ str_text_gdc, 0x66, 0x66, IORESOURCE_BUSY },
-	{ str_text_gdc, 0x68, 0x68, IORESOURCE_BUSY },
-	{ str_text_gdc, 0x6a, 0x6a, IORESOURCE_BUSY },
-	{ str_text_gdc, 0x6c, 0x6c, IORESOURCE_BUSY },
-	{ str_text_gdc, 0x6e, 0x6e, IORESOURCE_BUSY },
-	{ str_crtc, 0x70, 0x70, IORESOURCE_BUSY },
-	{ str_crtc, 0x72, 0x72, IORESOURCE_BUSY },
-	{ str_crtc, 0x74, 0x74, IORESOURCE_BUSY },
-	{ str_crtc, 0x74, 0x74, IORESOURCE_BUSY },
-	{ str_crtc, 0x76, 0x76, IORESOURCE_BUSY },
-	{ str_crtc, 0x78, 0x78, IORESOURCE_BUSY },
-	{ str_crtc, 0x7a, 0x7a, IORESOURCE_BUSY },
-	{ str_timer, 0x71, 0x71, IORESOURCE_BUSY },
-	{ str_timer, 0x73, 0x73, IORESOURCE_BUSY },
-	{ str_timer, 0x75, 0x75, IORESOURCE_BUSY },
-	{ str_timer, 0x77, 0x77, IORESOURCE_BUSY },
-	{ str_graphic_gdc, 0xa0, 0xa0, IORESOURCE_BUSY },
-	{ str_graphic_gdc, 0xa2, 0xa2, IORESOURCE_BUSY },
-	{ str_graphic_gdc, 0xa4, 0xa4, IORESOURCE_BUSY },
-	{ str_graphic_gdc, 0xa6, 0xa6, IORESOURCE_BUSY },
-	{ "cpu", 0xf0, 0xf7, IORESOURCE_BUSY },
-	{ "fpu", 0xf8, 0xff, IORESOURCE_BUSY },
-	{ str_dma_ex_bank, 0x0e05, 0x0e05, 0 },
-	{ str_dma_ex_bank, 0x0e07, 0x0e07, 0 },
-	{ str_dma_ex_bank, 0x0e09, 0x0e09, 0 },
-	{ str_dma_ex_bank, 0x0e0b, 0x0e0b, 0 },
-	{ str_beep_freq, 0x3fd9, 0x3fd9, IORESOURCE_BUSY },
-	{ str_beep_freq, 0x3fdb, 0x3fdb, IORESOURCE_BUSY },
-	{ str_beep_freq, 0x3fdd, 0x3fdd, IORESOURCE_BUSY },
-	{ str_beep_freq, 0x3fdf, 0x3fdf, IORESOURCE_BUSY },
-	/* All PC-9800 have (exactly) one mouse interface.  */
-	{ str_mouse_pio, 0x7fd9, 0x7fd9, 0 },
-	{ str_mouse_pio, 0x7fdb, 0x7fdb, 0 },
-	{ str_mouse_pio, 0x7fdd, 0x7fdd, 0 },
-	{ str_mouse_pio, 0x7fdf, 0x7fdf, 0 },
-	{ "mouse timer", 0xbfdb, 0xbfdb, 0 },
-	{ "mouse irq", 0x98d7, 0x98d7, 0 },
-};
-
-#define STANDARD_IO_RESOURCES (sizeof(standard_io_resources)/sizeof(struct resource))
-
-static struct resource tvram_resource = { "Text VRAM/CG window", 0xa0000, 0xa4fff, IORESOURCE_BUSY };
-static struct resource gvram_brg_resource = { "Graphic VRAM (B/R/G)", 0xa8000, 0xbffff, IORESOURCE_BUSY };
-static struct resource gvram_e_resource = { "Graphic VRAM (E)", 0xe0000, 0xe7fff, IORESOURCE_BUSY };
-
-/* System ROM resources */
-#define MAXROMS 6
-static struct resource rom_resources[MAXROMS] = {
-	{ "System ROM", 0xe8000, 0xfffff, IORESOURCE_BUSY }
-};
-
-static inline void probe_video_rom(int roms)
-{
-	/* PC-9800 has no video ROM */
-}
-
-static inline void probe_extension_roms(int roms)
-{
-	int i;
-	__u8 *xrom_id;
-
-	xrom_id = (__u8 *) isa_bus_to_virt(PC9800SCA_XROM_ID + 0x10);
-
-	for (i = 0; i < 16; i++) {
-		if (xrom_id[i] & 0x80) {
-			int j;
-
-			for (j = i + 1; j < 16 && (xrom_id[j] & 0x80); j++)
-				;
-			rom_resources[roms].start = 0x0d0000 + i * 0x001000;
-			rom_resources[roms].end = 0x0d0000 + j * 0x001000 - 1;
-			rom_resources[roms].name = "Extension ROM";
-			rom_resources[roms].flags = IORESOURCE_BUSY;
-
-			request_resource(&iomem_resource,
-					  rom_resources + roms);
-			if (++roms >= MAXROMS)
-				return;
-		}
-	}
-}
-
-static inline void request_graphics_resource(void)
-{
-	int i;
-
-	if (PC9800_HIGHRESO_P()) {
-		tvram_resource.start = 0xe0000;
-		tvram_resource.end   = 0xe4fff;
-		gvram_brg_resource.name  = "Graphic VRAM";
-		gvram_brg_resource.start = 0xc0000;
-		gvram_brg_resource.end   = 0xdffff;
-	}
-
-	request_resource(&iomem_resource, &tvram_resource);
-	request_resource(&iomem_resource, &gvram_brg_resource);
-	if (!PC9800_HIGHRESO_P())
-		request_resource(&iomem_resource, &gvram_e_resource);
-
-	if (PC9800_HIGHRESO_P() || PC9800_9821_P()) {
-		static char graphics[] = "graphics";
-		static struct resource graphics_resources[] = {
-			{ graphics, 0x9a0, 0x9a0, 0 },
-			{ graphics, 0x9a2, 0x9a2, 0 },
-			{ graphics, 0x9a4, 0x9a4, 0 },
-			{ graphics, 0x9a6, 0x9a6, 0 },
-			{ graphics, 0x9a8, 0x9a8, 0 },
-			{ graphics, 0x9aa, 0x9aa, 0 },
-			{ graphics, 0x9ac, 0x9ac, 0 },
-			{ graphics, 0x9ae, 0x9ae, 0 },
-		};
-
-#define GRAPHICS_RESOURCES (sizeof(graphics_resources)/sizeof(struct resource))
-
-		for (i = 0; i < GRAPHICS_RESOURCES; i++)
-			request_resource(&ioport_resource, graphics_resources + i);
-	}
-}
-
-#endif /* !_MACH_RESOURCES_H */
diff --git a/include/asm-i386/std_resources.h b/include/asm-i386/std_resources.h
new file mode 100644
index 000000000000..53733988dabd
--- /dev/null
+++ b/include/asm-i386/std_resources.h
@@ -0,0 +1,14 @@
+/*
+ * include/asm-i386/std_resources.h
+ */
+
+#ifndef __ASM_I386_STD_RESOURCES_H
+#define __ASM_I386_STD_RESOURCES_H
+
+#include <linux/init.h>
+
+void probe_roms(void) __init;
+void request_graphics_resource(void) __init;
+void request_standard_io_resources(void) __init;
+
+#endif
-- 
cgit v1.2.3


From 69a03dedc92b7968fc9ca5c701e8d2d6c481750d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 00:57:22 -0700
Subject: [PATCH] swsusp update: supports discontingmem/highmem

From: Pavel Machek <pavel@ucw.cz>

Bill Irwin did some work on this.  It makes swsusp behave correctly w.r.t.
discontigmem, and adds highmem handling (very simple-minded, but should work
ok with 1GB).  It now should behave correctly w.r.t.  more than one swap
device, and fixes double restoring of console.
---
 include/linux/suspend.h |   2 +-
 kernel/power/swsusp.c   | 244 ++++++++++++++++++++++++++++++++++--------------
 2 files changed, 173 insertions(+), 73 deletions(-)

(limited to 'include')

diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 810947658d59..7e4409b7c55b 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -24,7 +24,7 @@ typedef struct pbe {
 #define SWAP_FILENAME_MAXLENGTH	32
 
 struct suspend_header {
-	__u32 version_code;
+	u32 version_code;
 	unsigned long num_physpages;
 	char machine[8];
 	char version[20];
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index ae748a467af5..23e577559fd9 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -1,11 +1,11 @@
 /*
- * linux/kernel/suspend.c
+ * linux/kernel/power/swsusp.c
  *
  * This file is to realize architecture-independent
  * machine suspend feature using pretty near only high-level routines
  *
  * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
- * Copyright (C) 1998,2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 1998,2001-2004 Pavel Machek <pavel@suse.cz>
  *
  * This file is released under the GPLv2.
  *
@@ -61,6 +61,7 @@
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
 #include <linux/console.h>
+#include <linux/highmem.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -74,11 +75,6 @@ unsigned char software_suspend_enabled = 0;
 #define NORESUME		1
 #define RESUME_SPECIFIED	2
 
-
-#define __ADDRESS(x)  ((unsigned long) phys_to_virt(x))
-#define ADDRESS(x) __ADDRESS((x) << PAGE_SHIFT)
-#define ADDRESS2(x) __ADDRESS(__pa(x))		/* Needed for x86-64 where some pages are in memory twice */
-
 /* References to section boundaries */
 extern char __nosave_begin, __nosave_end;
 
@@ -105,6 +101,10 @@ unsigned int nr_copy_pages __nosavedata = 0;
    time of suspend, that must be freed. Second is "pagedir_nosave", 
    allocated at time of resume, that travels through memory not to
    collide with anything.
+
+   Warning: this is even more evil than it seems. Pagedirs this file
+   talks about are completely different from page directories used by
+   MMU hardware.
  */
 suspend_pagedir_t *pagedir_nosave __nosavedata = NULL;
 static suspend_pagedir_t *pagedir_save;
@@ -139,15 +139,15 @@ static const char name_resume[] = "Resume Machine: ";
 #define TEST_SWSUSP 0		/* Set to 1 to reboot instead of halt machine after suspension */
 
 #ifdef DEBUG_DEFAULT
-# define PRINTK(f, a...)       printk(f, ## a)
+# define PRINTK(f, a...)	printk(f, ## a)
 #else
-# define PRINTK(f, a...)
+# define PRINTK(f, a...)       	do { } while(0)
 #endif
 
 #ifdef DEBUG_SLOW
 #define MDELAY(a) mdelay(a)
 #else
-#define MDELAY(a)
+#define MDELAY(a) do { } while(0)
 #endif
 
 /*
@@ -225,6 +225,7 @@ static void mark_swapfiles(swp_entry_t prev, int mode)
 static void read_swapfiles(void) /* This is called before saving image */
 {
 	int i, len;
+	char buff[sizeof(resume_file)], *sname;
 	
 	len=strlen(resume_file);
 	root_swap = 0xFFFF;
@@ -243,8 +244,11 @@ static void read_swapfiles(void) /* This is called before saving image */
 					swapfile_used[i] = SWAPFILE_IGNORED;				  
 			} else {
 	  			/* we ignore all swap devices that are not the resume_file */
-				if (1) {
-// FIXME				if(resume_device == swap_info[i].swap_device) {
+				sname = d_path(swap_info[i].swap_file->f_dentry,
+					       swap_info[i].swap_file->f_vfsmnt,
+					       buff,
+					       sizeof(buff));
+				if (!strcmp(sname, resume_file)) {
 					swapfile_used[i] = SWAPFILE_SUSPEND;
 					root_swap = i;
 				} else {
@@ -346,7 +350,7 @@ static int write_suspend_image(void)
 
 	cur = (void *) buffer;
 	if (fill_suspend_header(&cur->sh))
-		panic("\nOut of memory while writing header");
+		BUG();		/* Not a BUG_ON(): we want fill_suspend_header to be called, always */
 		
 	cur->link.next = prev;
 
@@ -362,73 +366,165 @@ static int write_suspend_image(void)
 	return 0;
 }
 
-/* if pagedir_p != NULL it also copies the counted pages */
-static int count_and_copy_data_pages(struct pbe *pagedir_p)
-{
-	int chunk_size;
-	int nr_copy_pages = 0;
-	int pfn;
+struct highmem_page {
+	char *data;
 	struct page *page;
-	
-#ifdef CONFIG_DISCONTIGMEM
-	panic("Discontingmem not supported");
-#else
-	BUG_ON (max_pfn != num_physpages);
-#endif
-	for (pfn = 0; pfn < max_pfn; pfn++) {
+	struct highmem_page *next;
+};
+
+struct highmem_page *highmem_copy = NULL;
+
+static void save_highmem_zone(struct zone *zone)
+{
+	unsigned long zone_pfn;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		struct highmem_page *save;
+		void *kaddr;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		int chunk_size;
+
+		if (!(pfn%200))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
-		if (PageHighMem(page))
-			panic("Swsusp not supported on highmem boxes. Send 1GB of RAM to <pavel@ucw.cz> and try again ;-).");
+		/*
+		 * This condition results from rvmalloc() sans vmalloc_32()
+		 * and architectural memory reservations. This should be
+		 * corrected eventually when the cases giving rise to this
+		 * are better understood.
+		 */
+		if (PageReserved(page)) {
+			printk("highmem reserved page?!\n");
+			BUG();
+		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
+		}
+		save = kmalloc(sizeof(struct highmem_page), GFP_ATOMIC);
+		if (!save)
+			panic("Not enough memory");
+		save->next = highmem_copy;
+		save->page = page;
+		save->data = (void *) get_zeroed_page(GFP_ATOMIC);
+		if (!save->data)
+			panic("Not enough memory");
+		kaddr = kmap_atomic(page, KM_USER0);
+		memcpy(save->data, kaddr, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		highmem_copy = save;
+	}
+}
 
-		if (!PageReserved(page)) {
-			if (PageNosave(page))
-				continue;
+static void save_highmem(void)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (is_highmem(zone))
+			save_highmem_zone(zone);
+	}
+}
 
-			if ((chunk_size=is_head_of_free_region(page))!=0) {
-				pfn += chunk_size - 1;
-				continue;
-			}
-		} else if (PageReserved(page)) {
-			BUG_ON (PageNosave(page));
+static int restore_highmem(void)
+{
+	while (highmem_copy) {
+		struct highmem_page *save = highmem_copy;
+		void *kaddr;
+		highmem_copy = save->next;
+
+		kaddr = kmap_atomic(save->page, KM_USER0);
+		memcpy(kaddr, save->data, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		free_page((long) save->data);
+		kfree(save);
+	}
+	return 0;
+}
 
-			/*
-			 * Just copy whole code segment. Hopefully it is not that big.
-			 */
-			if ((ADDRESS(pfn) >= (unsigned long) ADDRESS2(&__nosave_begin)) && 
-			    (ADDRESS(pfn) <  (unsigned long) ADDRESS2(&__nosave_end))) {
-				PRINTK("[nosave %lx]", ADDRESS(pfn));
-				continue;
-			}
-			/* Hmm, perhaps copying all reserved pages is not too healthy as they may contain 
-			   critical bios data? */
-		} else	BUG();
+static int pfn_is_nosave(unsigned long pfn)
+{
+	unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
+	unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
+	return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
+}
 
-		nr_copy_pages++;
-		if (pagedir_p) {
-			pagedir_p->orig_address = ADDRESS(pfn);
-			copy_page((void *) pagedir_p->address, (void *) pagedir_p->orig_address);
-			pagedir_p++;
+/* if *pagedir_p != NULL it also copies the counted pages */
+static int count_and_copy_zone(struct zone *zone, struct pbe **pagedir_p)
+{
+	unsigned long zone_pfn, chunk_size, nr_copy_pages = 0;
+	struct pbe *pbe = *pagedir_p;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+
+		if (!(pfn%200))
+			printk(".");
+		if (!pfn_valid(pfn))
+			continue;
+		page = pfn_to_page(pfn);
+		BUG_ON(PageReserved(page) && PageNosave(page));
+		if (PageNosave(page))
+			continue;
+		if (PageReserved(page) && pfn_is_nosave(pfn)) {
+			PRINTK("[nosave pfn 0x%lx]", pfn);
+			continue;
 		}
+		if ((chunk_size = is_head_of_free_region(page))) {
+			pfn += chunk_size - 1;
+			zone_pfn += chunk_size - 1;
+			continue;
+		}
+		nr_copy_pages++;
+		if (!pbe)
+			continue;
+		pbe->orig_address = (long) page_address(page);
+		copy_page((void *)pbe->address, (void *)pbe->orig_address);
+		pbe++;
 	}
+	*pagedir_p = pbe;
 	return nr_copy_pages;
 }
 
-static void free_suspend_pagedir(unsigned long this_pagedir)
+static int count_and_copy_data_pages(struct pbe *pagedir_p)
 {
-	struct page *page;
-	int pfn;
-	unsigned long this_pagedir_end = this_pagedir +
-		(PAGE_SIZE << pagedir_order);
+	int nr_copy_pages = 0;
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			nr_copy_pages += count_and_copy_zone(zone, &pagedir_p);
+	}
+	return nr_copy_pages;
+}
 
-	for(pfn = 0; pfn < num_physpages; pfn++) {
+static void free_suspend_pagedir_zone(struct zone *zone, unsigned long pagedir)
+{
+	unsigned long zone_pfn, pagedir_end, pagedir_pfn, pagedir_end_pfn;
+	pagedir_end = pagedir + (PAGE_SIZE << pagedir_order);
+	pagedir_pfn = __pa(pagedir) >> PAGE_SHIFT;
+	pagedir_end_pfn = __pa(pagedir_end) >> PAGE_SHIFT;
+	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
+		struct page *page;
+		unsigned long pfn = zone_pfn + zone->zone_start_pfn;
+		if (!pfn_valid(pfn))
+			continue;
 		page = pfn_to_page(pfn);
 		if (!TestClearPageNosave(page))
 			continue;
+		else if (pfn >= pagedir_pfn && pfn < pagedir_end_pfn)
+			continue;
+		__free_page(page);
+	}
+}
 
-		if (ADDRESS(pfn) >= this_pagedir && ADDRESS(pfn) < this_pagedir_end)
-			continue; /* old pagedir gets freed in one */
-		
-		free_page(ADDRESS(pfn));
+static void free_suspend_pagedir(unsigned long this_pagedir)
+{
+	struct zone *zone;
+	for_each_zone(zone) {
+		if (!is_highmem(zone))
+			free_suspend_pagedir_zone(zone, this_pagedir);
 	}
 	free_pages(this_pagedir, pagedir_order);
 }
@@ -443,7 +539,7 @@ static suspend_pagedir_t *create_suspend_pagedir(int nr_copy_pages)
 	pagedir_order = get_bitmask_order(SUSPEND_PD_PAGES(nr_copy_pages));
 
 	p = pagedir = (suspend_pagedir_t *)__get_free_pages(GFP_ATOMIC | __GFP_COLD, pagedir_order);
-	if(!pagedir)
+	if (!pagedir)
 		return NULL;
 
 	page = virt_to_page(pagedir);
@@ -492,10 +588,12 @@ static int suspend_prepare_image(void)
 	struct sysinfo i;
 	unsigned int nr_needed_pages = 0;
 
-	drain_local_pages();
-
 	pagedir_nosave = NULL;
-	printk( "/critical section: Counting pages to copy" );
+	printk( "/critical section: Handling highmem" );
+	save_highmem();
+
+	printk(", counting pages to copy" );
+	drain_local_pages();
 	nr_copy_pages = count_and_copy_data_pages(NULL);
 	nr_needed_pages = nr_copy_pages + PAGES_FOR_IO;
 	
@@ -603,21 +701,23 @@ asmlinkage void do_magic_resume_2(void)
 
 	PRINTK( "Freeing prev allocated pagedir\n" );
 	free_suspend_pagedir((unsigned long) pagedir_save);
+
+	printk( "Restoring highmem\n" );
+	restore_highmem();
+	printk("done, devices\n");
+
 	device_power_up();
 	spin_unlock_irq(&suspend_pagedir_lock);
 	device_resume();
 
-	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
-	release_console_sem();
-
+	/* Fixme: this is too late; we should do this ASAP to avoid "infinite reboots" problem */
 	PRINTK( "Fixing swap signatures... " );
 	mark_swapfiles(((swp_entry_t) {0}), MARK_SWAP_RESUME);
 	PRINTK( "ok\n" );
 
 #ifdef SUSPEND_CONSOLE
 	acquire_console_sem();
-	update_screen(fg_console);	/* Hmm, is this the problem? */
+	update_screen(fg_console);
 	release_console_sem();
 #endif
 }
-- 
cgit v1.2.3


From 93616c25b89090f28ad3ee509c33d69bddbeb7a8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:01:04 -0700
Subject: [PATCH] remove concatenation with __FUNCTION__ include/*

From: Tony Breeds <tony@bakeyournoodle.com>
---
 include/linux/jbd.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 62c7f363ae74..e9f6c69f79db 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -1012,10 +1012,10 @@ extern int	cleanup_journal_tail(journal_t *);
 /* Debugging code only: */
 
 #define jbd_ENOSYS() \
-do {								      \
-	printk (KERN_ERR "JBD unimplemented function " __FUNCTION__); \
-	current->state = TASK_UNINTERRUPTIBLE;			      \
-	schedule();						      \
+do {								           \
+	printk (KERN_ERR "JBD unimplemented function %s\n", __FUNCTION__); \
+	current->state = TASK_UNINTERRUPTIBLE;			           \
+	schedule();						           \
 } while (1)
 
 /*
-- 
cgit v1.2.3


From dbb1a307d5ba76355030e7bb4200fbef1a659539 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:01:57 -0700
Subject: [PATCH] fix test_and_change_bit comment

From: Paul Jackson <pj@sgi.com>

I've read over the code in each case, built and ran a test case for i386 in
particular, and studied the other uses and definitions of
test_and_change_bit().  Everything I see recommends this change.

- Fix test_and_change_bit() comment: returns old value, not new one.
---
 include/asm-cris/bitops.h   | 2 +-
 include/asm-i386/bitops.h   | 2 +-
 include/asm-ia64/bitops.h   | 2 +-
 include/asm-mips/bitops.h   | 4 ++--
 include/asm-x86_64/bitops.h | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h
index 10de1ccd1e01..f7401036610e 100644
--- a/include/asm-cris/bitops.h
+++ b/include/asm-cris/bitops.h
@@ -169,7 +169,7 @@ extern inline int __test_and_clear_bit(int nr, void *addr)
 	return retval;
 }
 /**
- * test_and_change_bit - Change a bit and return its new value
+ * test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h
index c39edc67d111..114ac7ac3680 100644
--- a/include/asm-i386/bitops.h
+++ b/include/asm-i386/bitops.h
@@ -212,7 +212,7 @@ static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr
 }
 
 /**
- * test_and_change_bit - Change a bit and return its new value
+ * test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h
index 502f51a1a0ee..bd39859c2d2a 100644
--- a/include/asm-ia64/bitops.h
+++ b/include/asm-ia64/bitops.h
@@ -236,7 +236,7 @@ __test_and_clear_bit(int nr, volatile void * addr)
 }
 
 /**
- * test_and_change_bit - Change a bit and return its new value
+ * test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to set
  * @addr: Address to count from
  *
diff --git a/include/asm-mips/bitops.h b/include/asm-mips/bitops.h
index 8a3d1a32505b..cb06e891d0aa 100644
--- a/include/asm-mips/bitops.h
+++ b/include/asm-mips/bitops.h
@@ -296,7 +296,7 @@ static inline int __test_and_clear_bit(unsigned long nr,
 }
 
 /*
- * test_and_change_bit - Change a bit and return its new value
+ * test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
@@ -567,7 +567,7 @@ static inline int __test_and_clear_bit(unsigned long nr,
 }
 
 /*
- * test_and_change_bit - Change a bit and return its new value
+ * test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h
index bb086405d2d4..5ce5dcef4f35 100644
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -204,7 +204,7 @@ static __inline__ int __test_and_change_bit(int nr, volatile void * addr)
 }
 
 /**
- * test_and_change_bit - Change a bit and return its new value
+ * test_and_change_bit - Change a bit and return its old value
  * @nr: Bit to change
  * @addr: Address to count from
  *
-- 
cgit v1.2.3


From 8447ac2688647d261af7a7397a53548a2a1afc13 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:03:29 -0700
Subject: [PATCH] Rename bitmap_clear to bitmap_zero, remove CLEAR_BITMAP

From: Rusty Russell <rusty@rustcorp.com.au>

clear_bit(n, addr) clears the nth bit.
test_and_clear_bit(n, addr) clears the nth bit.
cpu_clear(n, cpumask) clears the nth bit (vs. cpus_clear()).
bitmap_clear(bitmap, n) clears out all the bits up to n.

Moreover, there's a CLEAR_BITMAP() in linux/types.h which bitmap_clear() is
a wrapper for.

Rename bitmap_clear to bitmap_zero, which is harder to confuse (yes, it bit
me), and make everyone use it.
---
 arch/ia64/sn/kernel/sn2/sn2_smp.c   | 2 +-
 drivers/atm/lanai.c                 | 6 +++---
 drivers/ieee1394/ieee1394_types.h   | 2 +-
 drivers/scsi/atari_NCR5380.c        | 4 ++--
 include/asm-generic/cpumask_array.h | 2 +-
 include/asm-i386/mpspec.h           | 2 +-
 include/asm-x86_64/mpspec.h         | 2 +-
 include/linux/bitmap.h              | 4 ++--
 include/linux/types.h               | 2 --
 lib/bitmap.c                        | 2 +-
 mm/page_alloc.c                     | 2 +-
 11 files changed, 14 insertions(+), 16 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index 3cfb3cd74d51..e8bc389edebb 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -91,7 +91,7 @@ sn2_global_tlb_purge (unsigned long start, unsigned long end, unsigned long nbit
 	short			nasids[NR_NODES], nix;
 	DECLARE_BITMAP(nodes_flushed, NR_NODES);
 
-	CLEAR_BITMAP(nodes_flushed, NR_NODES);
+	bitmap_zero(nodes_flushed, NR_NODES);
 
 	i = 0;
 
diff --git a/drivers/atm/lanai.c b/drivers/atm/lanai.c
index 402e78a5ac97..5a7037af6add 100644
--- a/drivers/atm/lanai.c
+++ b/drivers/atm/lanai.c
@@ -1743,7 +1743,7 @@ static void run_service(struct lanai_dev *lanai)
 		read_lock(&vcc_sklist_lock);
 		vci_bitfield_iterate(lanai, lanai->transmit_ready,
 		    iter_transmit);
-		CLEAR_BITMAP(&lanai->transmit_ready, NUM_VCI);
+		bitmap_zero(lanai->transmit_ready, NUM_VCI);
 		read_unlock(&vcc_sklist_lock);
 	}
 }
@@ -2158,8 +2158,8 @@ static int __init lanai_dev_open(struct atm_dev *atmdev)
 	/* Basic device fields */
 	lanai->number = atmdev->number;
 	lanai->num_vci = NUM_VCI;
-	CLEAR_BITMAP(&lanai->backlog_vccs, NUM_VCI);
-	CLEAR_BITMAP(&lanai->transmit_ready, NUM_VCI);
+	bitmap_zero(lanai->backlog_vccs, NUM_VCI);
+	bitmap_zero(lanai->transmit_ready, NUM_VCI);
 	lanai->naal0 = 0;
 #ifdef USE_POWERDOWN
 	lanai->nbound = 0;
diff --git a/drivers/ieee1394/ieee1394_types.h b/drivers/ieee1394/ieee1394_types.h
index 552667142ce1..3165609ec1ec 100644
--- a/drivers/ieee1394/ieee1394_types.h
+++ b/drivers/ieee1394/ieee1394_types.h
@@ -24,7 +24,7 @@ struct hpsb_tlabel_pool {
 
 #define HPSB_TPOOL_INIT(_tp)			\
 do {						\
-	CLEAR_BITMAP((_tp)->pool, 64);		\
+	bitmap_zero((_tp)->pool, 64);		\
 	spin_lock_init(&(_tp)->lock);		\
 	(_tp)->next = 0;			\
 	(_tp)->allocations = 0;			\
diff --git a/drivers/scsi/atari_NCR5380.c b/drivers/scsi/atari_NCR5380.c
index cd8ddb7084a2..5d1e78ebed83 100644
--- a/drivers/scsi/atari_NCR5380.c
+++ b/drivers/scsi/atari_NCR5380.c
@@ -329,7 +329,7 @@ static void __init init_tags( void )
     for( target = 0; target < 8; ++target ) {
 	for( lun = 0; lun < 8; ++lun ) {
 	    ta = &TagAlloc[target][lun];
-	    CLEAR_BITMAP( ta->allocated, MAX_TAGS );
+	    bitmap_zero(ta->allocated, MAX_TAGS);
 	    ta->nr_allocated = 0;
 	    /* At the beginning, assume the maximum queue size we could
 	     * support (MAX_TAGS). This value will be decreased if the target
@@ -438,7 +438,7 @@ static void free_all_tags( void )
     for( target = 0; target < 8; ++target ) {
 	for( lun = 0; lun < 8; ++lun ) {
 	    ta = &TagAlloc[target][lun];
-	    CLEAR_BITMAP( ta->allocated, MAX_TAGS );
+	    bitmap_zero(ta->allocated, MAX_TAGS);
 	    ta->nr_allocated = 0;
 	}
     }
diff --git a/include/asm-generic/cpumask_array.h b/include/asm-generic/cpumask_array.h
index bd5c49133c6c..c7e2db29dc53 100644
--- a/include/asm-generic/cpumask_array.h
+++ b/include/asm-generic/cpumask_array.h
@@ -16,7 +16,7 @@
 
 #define cpus_and(dst,src1,src2)	bitmap_and((dst).mask,(src1).mask, (src2).mask, NR_CPUS)
 #define cpus_or(dst,src1,src2)	bitmap_or((dst).mask, (src1).mask, (src2).mask, NR_CPUS)
-#define cpus_clear(map)		bitmap_clear((map).mask, NR_CPUS)
+#define cpus_clear(map)		bitmap_zero((map).mask, NR_CPUS)
 #define cpus_complement(map)	bitmap_complement((map).mask, NR_CPUS)
 #define cpus_equal(map1, map2)	bitmap_equal((map1).mask, (map2).mask, NR_CPUS)
 #define cpus_empty(map)		bitmap_empty(map.mask, NR_CPUS)
diff --git a/include/asm-i386/mpspec.h b/include/asm-i386/mpspec.h
index 78bd12b7ae42..b376b093749c 100644
--- a/include/asm-i386/mpspec.h
+++ b/include/asm-i386/mpspec.h
@@ -52,7 +52,7 @@ typedef struct physid_mask physid_mask_t;
 
 #define physids_and(dst, src1, src2)		bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
 #define physids_or(dst, src1, src2)		bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-#define physids_clear(map)			bitmap_clear((map).mask, MAX_APICS)
+#define physids_clear(map)			bitmap_zero((map).mask, MAX_APICS)
 #define physids_complement(map)			bitmap_complement((map).mask, MAX_APICS)
 #define physids_empty(map)			bitmap_empty((map).mask, MAX_APICS)
 #define physids_equal(map1, map2)		bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
diff --git a/include/asm-x86_64/mpspec.h b/include/asm-x86_64/mpspec.h
index 896b99f11cec..cbe6058e9270 100644
--- a/include/asm-x86_64/mpspec.h
+++ b/include/asm-x86_64/mpspec.h
@@ -211,7 +211,7 @@ typedef struct physid_mask physid_mask_t;
 
 #define physids_and(dst, src1, src2)		bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
 #define physids_or(dst, src1, src2)		bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
-#define physids_clear(map)			bitmap_clear((map).mask, MAX_APICS)
+#define physids_clear(map)			bitmap_zero((map).mask, MAX_APICS)
 #define physids_complement(map)			bitmap_complement((map).mask, MAX_APICS)
 #define physids_empty(map)			bitmap_empty((map).mask, MAX_APICS)
 #define physids_equal(map1, map2)		bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h
index 2ad5fb97fa26..81e73cdc1a62 100644
--- a/include/linux/bitmap.h
+++ b/include/linux/bitmap.h
@@ -16,9 +16,9 @@ int bitmap_equal(const unsigned long *bitmap1,
 			unsigned long *bitmap2, int bits);
 void bitmap_complement(unsigned long *bitmap, int bits);
 
-static inline void bitmap_clear(unsigned long *bitmap, int bits)
+static inline void bitmap_zero(unsigned long *bitmap, int bits)
 {
-	CLEAR_BITMAP((unsigned long *)bitmap, bits);
+	memset(bitmap, 0, BITS_TO_LONGS(bits)*sizeof(unsigned long));
 }
 
 static inline void bitmap_fill(unsigned long *bitmap, int bits)
diff --git a/include/linux/types.h b/include/linux/types.h
index 93f5f3653561..23c414f11cbe 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -8,8 +8,6 @@
 	(((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
 #define DECLARE_BITMAP(name,bits) \
 	unsigned long name[BITS_TO_LONGS(bits)]
-#define CLEAR_BITMAP(name,bits) \
-	memset(name, 0, BITS_TO_LONGS(bits)*sizeof(unsigned long))
 #endif
 
 #include <linux/posix_types.h>
diff --git a/lib/bitmap.c b/lib/bitmap.c
index 602b919ef551..779d30365e46 100644
--- a/lib/bitmap.c
+++ b/lib/bitmap.c
@@ -273,7 +273,7 @@ int bitmap_parse(const char __user *ubuf, unsigned int ubuflen,
 	int c, old_c, totaldigits, ndigits, nchunks, nbits;
 	u32 chunk;
 
-	bitmap_clear(maskp, nmaskbits);
+	bitmap_zero(maskp, nmaskbits);
 
 	nchunks = nbits = totaldigits = c = 0;
 	do {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6b4d5dc0c930..8d3f6f46105e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1222,7 +1222,7 @@ static void __init build_zonelists(pg_data_t *pgdat)
 	local_node = pgdat->node_id;
 	load = numnodes;
 	prev_node = local_node;
-	CLEAR_BITMAP(used_mask, MAX_NUMNODES);
+	bitmap_zero(used_mask, MAX_NUMNODES);
 	while ((node = find_next_best_node(local_node, used_mask)) >= 0) {
 		/*
 		 * We don't want to pressure a particular node.
-- 
cgit v1.2.3


From d0d15d849f62d00edbc95de914f3bc655d3b8f7b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:06:06 -0700
Subject: [PATCH] Add CONFIG_SYSFS

From: Patrick Mochel <mochel@digitalimplant.org>

Here is a patch to make sysfs optional.  Note that with CONFIG_SYSFS=n you
must specify the boot device's major:minor on the kernel boot command line
with

	root=03:01

For embedded systems, it will save a significant amount of memory during
runtime.  And, it saves 4k from the built kernel image for me.
---
 fs/Kconfig            | 24 +++++++++++++++
 fs/Makefile           |  2 +-
 fs/namespace.c        |  8 +++++
 include/linux/sysfs.h | 83 ++++++++++++++++++++++++++++++++++++++++++++++-----
 init/do_mounts.c      |  2 ++
 5 files changed, 110 insertions(+), 9 deletions(-)

(limited to 'include')

diff --git a/fs/Kconfig b/fs/Kconfig
index c748a2ce35ee..3c0d06f5e359 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -780,6 +780,30 @@ config PROC_KCORE
 	bool
 	default y if !ARM
 
+config SYSFS
+	bool "sysfs file system support" if EMBEDDED
+	default y
+	help
+	The sysfs filesystem is a virtual filesystem that the kernel uses to
+	export internal kernel objects, their attributes, and their
+	relationships to one another.
+
+	Users can use sysfs to ascertain useful information about the running
+	kernel, such as the devices the kernel has discovered on each bus and
+	which driver each is bound to. sysfs can also be used to tune devices
+	and other kernel subsystems.
+
+	Some system agents rely on the information in sysfs to operate.
+	/sbin/hotplug uses device and object attributes in sysfs to assist in
+	delegating policy decisions, like persistently naming devices.
+
+	sysfs is currently used by the block subsystem to mount the root
+	partition.  If sysfs is disabled you must specify the boot device on
+	the kernel boot command line via its major and minor numbers.  For
+	example, "root=03:01" for /dev/hda1.
+
+	Designers of embedded systems may wish to say N here to conserve space.
+
 config DEVFS_FS
 	bool "/dev file system support (OBSOLETE)"
 	depends on EXPERIMENTAL
diff --git a/fs/Makefile b/fs/Makefile
index 9647bebd4895..a288c0cb3645 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_QUOTACTL)		+= quota.o
 
 obj-$(CONFIG_PROC_FS)		+= proc/
 obj-y				+= partitions/
-obj-y				+= sysfs/
+obj-$(CONFIG_SYSFS)		+= sysfs/
 obj-y				+= devpts/
 
 obj-$(CONFIG_PROFILING)		+= dcookies.o
diff --git a/fs/namespace.c b/fs/namespace.c
index 4584a684c685..3bb33614d764 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -24,7 +24,15 @@
 #include <asm/uaccess.h>
 
 extern int __init init_rootfs(void);
+
+#ifdef CONFIG_SYSFS
 extern int __init sysfs_init(void);
+#else
+static inline int sysfs_init(void)
+{
+	return 0;
+}
+#endif
 
 /* spinlock for vfsmount related operations, inplace of dcache_lock */
 spinlock_t vfsmount_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index b34de79dcf3b..de2083939b74 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -18,6 +18,12 @@ struct attribute {
 	mode_t			mode;
 };
 
+struct attribute_group {
+	char			* name;
+	struct attribute	** attrs;
+};
+
+
 struct bin_attribute {
 	struct attribute	attr;
 	size_t			size;
@@ -25,14 +31,13 @@ struct bin_attribute {
 	ssize_t (*write)(struct kobject *, char *, loff_t, size_t);
 };
 
-int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr);
-int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr);
-
 struct sysfs_ops {
 	ssize_t	(*show)(struct kobject *, struct attribute *,char *);
 	ssize_t	(*store)(struct kobject *,struct attribute *,const char *, size_t);
 };
 
+#ifdef CONFIG_SYSFS
+
 extern int
 sysfs_create_dir(struct kobject *);
 
@@ -57,13 +62,75 @@ sysfs_create_link(struct kobject * kobj, struct kobject * target, char * name);
 extern void
 sysfs_remove_link(struct kobject *, char * name);
 
-
-struct attribute_group {
-	char			* name;
-	struct attribute	** attrs;
-};
+int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr);
+int sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr);
 
 int sysfs_create_group(struct kobject *, const struct attribute_group *);
 void sysfs_remove_group(struct kobject *, const struct attribute_group *);
 
+#else /* CONFIG_SYSFS */
+
+static inline int sysfs_create_dir(struct kobject * k)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_dir(struct kobject * k)
+{
+	;
+}
+
+static inline void sysfs_rename_dir(struct kobject * k, const char *new_name)
+{
+	;
+}
+
+static inline int sysfs_create_file(struct kobject * k, const struct attribute * a)
+{
+	return 0;
+}
+
+static inline int sysfs_update_file(struct kobject * k, const struct attribute * a)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_file(struct kobject * k, const struct attribute * a)
+{
+	;
+}
+
+static inline int sysfs_create_link(struct kobject * k, struct kobject * t, char * n)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_link(struct kobject * k, char * name)
+{
+	;
+}
+
+
+static inline int sysfs_create_bin_file(struct kobject * k, struct bin_attribute * a)
+{
+	return 0;
+}
+
+static inline int sysfs_remove_bin_file(struct kobject * k, struct bin_attribute * a)
+{
+	return 0;
+}
+
+static inline int sysfs_create_group(struct kobject * k, const struct attribute_group *g)
+{
+	return 0;
+}
+
+static inline void sysfs_remove_group(struct kobject * k, const struct attribute_group * g)
+{
+	;
+}
+
+#endif /* CONFIG_SYSFS */
+
 #endif /* _SYSFS_H_ */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index fef791e4dcb6..02385f3c7697 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -141,9 +141,11 @@ dev_t __init name_to_dev_t(char *name)
 	dev_t res = 0;
 	int part;
 
+#ifdef CONFIG_SYSFS
 	sys_mkdir("/sys", 0700);
 	if (sys_mount("sysfs", "/sys", "sysfs", 0, NULL) < 0)
 		goto out;
+#endif
 
 	if (strncmp(name, "/dev/", 5) != 0) {
 		unsigned maj, min;
-- 
cgit v1.2.3


From c28abd70dfe80c4806d0b39fa7314aa50754dbf3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Mon, 12 Apr 2004 01:06:19 -0700
Subject: [PATCH] JBD: BH_Revoke cleanup

Use the bh bit test/set infrastructure rather than open-coding everything.
No functional changes.
---
 fs/jbd/revoke.c     | 42 ++++++++++++++++++------------------------
 include/linux/jbd.h |  4 ++++
 2 files changed, 22 insertions(+), 24 deletions(-)

(limited to 'include')

diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index a084064cb741..1564a48163d2 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -358,17 +358,15 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
 		bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
 		if (bh2) {
 			/* ... and it has RevokeValid status... */
-			if ((bh2 != bh) &&
-			    test_bit(BH_RevokeValid, &bh2->b_state))
+			if (bh2 != bh && buffer_revokevalid(bh2))
 				/* ...then it better be revoked too,
 				 * since it's illegal to create a revoke
 				 * record against a buffer_head which is
 				 * not marked revoked --- that would
 				 * risk missing a subsequent revoke
 				 * cancel. */
-				J_ASSERT_BH(bh2, test_bit(BH_Revoked, &
-							  bh2->b_state));
-			__brelse(bh2);
+				J_ASSERT_BH(bh2, buffer_revoked(bh2));
+			put_bh(bh2);
 		}
 	}
 #endif
@@ -377,9 +375,9 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
            first having the revoke cancelled: it's illegal to free a
            block twice without allocating it in between! */
 	if (bh) {
-		J_ASSERT_BH(bh, !test_bit(BH_Revoked, &bh->b_state));
-		set_bit(BH_Revoked, &bh->b_state);
-		set_bit(BH_RevokeValid, &bh->b_state);
+		J_ASSERT_BH(bh, !buffer_revoked(bh));
+		set_buffer_revoked(bh);
+		set_buffer_revokevalid(bh);
 		if (bh_in) {
 			BUFFER_TRACE(bh_in, "call journal_forget");
 			journal_forget(handle, bh_in);
@@ -400,7 +398,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
  * Cancel an outstanding revoke.  For use only internally by the
  * journaling code (called from journal_get_write_access).
  *
- * We trust the BH_Revoked bit on the buffer if the buffer is already
+ * We trust buffer_revoked() on the buffer if the buffer is already
  * being journaled: if there is no revoke pending on the buffer, then we
  * don't do anything here.
  *
@@ -427,11 +425,11 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 	 * only perform the full cancel if the revoke bit is set.  If
 	 * not, we can't trust the revoke bit, and we need to do the
 	 * full search for a revoke record. */
-	if (test_and_set_bit(BH_RevokeValid, &bh->b_state))
-		need_cancel = (test_and_clear_bit(BH_Revoked, &bh->b_state));
-	else {
+	if (test_set_buffer_revokevalid(bh)) {
+		need_cancel = test_clear_buffer_revoked(bh);
+	} else {
 		need_cancel = 1;
-		clear_bit(BH_Revoked, &bh->b_state);
+		clear_buffer_revoked(bh);
 	}
 
 	if (need_cancel) {
@@ -462,7 +460,7 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 		bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
 		if (bh2) {
 			if (bh2 != bh)
-				clear_bit(BH_Revoked, &bh2->b_state);
+				clear_buffer_revoked(bh2);
 			__brelse(bh2);
 		}
 	}
@@ -597,24 +595,20 @@ static void flush_descriptor(journal_t *journal,
 			     int offset)
 {
 	journal_revoke_header_t *header;
+	struct buffer_head *bh = jh2bh(descriptor);
 
 	if (is_journal_aborted(journal)) {
-		JBUFFER_TRACE(descriptor, "brelse");
-		__brelse(jh2bh(descriptor));
+		put_bh(bh);
 		return;
 	}
 
 	header = (journal_revoke_header_t *) jh2bh(descriptor)->b_data;
 	header->r_count = htonl(offset);
-	set_bit(BH_JWrite, &jh2bh(descriptor)->b_state);
-	{
-		struct buffer_head *bh = jh2bh(descriptor);
-		BUFFER_TRACE(bh, "write");
-		set_buffer_uptodate(bh);
-		ll_rw_block (WRITE, 1, &bh);
-	}
+	set_buffer_jwrite(bh);
+	BUFFER_TRACE(bh, "write");
+	set_buffer_uptodate(bh);
+	ll_rw_block(WRITE, 1, &bh);
 }
-
 #endif
 
 /* 
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index e9f6c69f79db..0a625c3cd38b 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -305,6 +305,10 @@ BUFFER_FNS(JBD, jbd)
 BUFFER_FNS(JWrite, jwrite)
 BUFFER_FNS(JBDDirty, jbddirty)
 TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Revoked, revoked)
+TAS_BUFFER_FNS(Revoked, revoked)
+BUFFER_FNS(RevokeValid, revokevalid)
+TAS_BUFFER_FNS(RevokeValid, revokevalid)
 BUFFER_FNS(Freed, freed)
 
 static inline struct buffer_head *jh2bh(struct journal_head *jh)
-- 
cgit v1.2.3