From 736c7efc7075a7b1a3ee31895af7347278d345e8 Mon Sep 17 00:00:00 2001 From: Patrick Mochel Date: Tue, 29 Oct 2002 01:19:51 -0800 Subject: introduce struct kobject: simple, generic object for embedding in other structures. This is not meant to be fancy; just something simple for which we can control the refcount and other common functionality using common code. The basic operations for registration and reference count manipulation are included. --- include/linux/kobject.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 include/linux/kobject.h (limited to 'include/linux') diff --git a/include/linux/kobject.h b/include/linux/kobject.h new file mode 100644 index 000000000000..c78325e8a199 --- /dev/null +++ b/include/linux/kobject.h @@ -0,0 +1,29 @@ +/* + * kobject.h - generic kernel object infrastructure. + * + */ + +#ifndef _KOBJECT_H_ +#define _KOBJECT_H_ + +#include +#include +#include +#include + +struct kobject { + char name[16]; + atomic_t refcount; + struct list_head entry; + struct kobject * parent; +}; + +extern void kobject_init(struct kobject *); + +extern int kobject_register(struct kobject *); +extern void kobject_unregister(struct kobject *); + +extern struct kobject * kobject_get(struct kobject *); +extern void kobject_put(struct kobject *); + +#endif /* _KOBJECT_H_ */ -- cgit v1.2.3 From eda520259938bd3299486f12ab5058c29dcd5326 Mon Sep 17 00:00:00 2001 From: Patrick Mochel Date: Tue, 29 Oct 2002 02:26:01 -0800 Subject: sysfs: marry api with struct kobject. This works on obviating the need for a separate data type to describe a sysfs directory (which was renamed from struct driver_dir_entry to struct sysfs_dir). All sysfs creation and removal functions now take a struct kobject, instead of a struct sysfs_dir. This kobject is embedded in ->d_fsdata of the directory. sysfs_create_dir() takes only 1 parameter now: the object that we're creating the directory for. 
The parent dentry is derived by looking at the object's parent. sysfs_create_file() takes the object as the first parameter, and the attribute as the second, which makes more sense from an API perspective. sysfs_remove_file() now takes an attribute as a second parameter, to be consistent with the creation function. sysfs_remove_link() is created, which is basically the old sysfs_remove_file(). (symlinks don't have an attribute associated with them; only a name, which was prohibiting the previous change). open() and close() look for a kobject now, and do refcounting directly on the object. Because of that, we don't need the ->open() and ->close() callbacks in struct sysfs_ops, so they've been removed. read() and write() also now look for a kobject now. The comments have been updated, too. --- fs/sysfs/inode.c | 278 +++++++++++++++++++++++++++--------------------- include/linux/kobject.h | 1 + include/linux/sysfs.h | 26 +++-- 3 files changed, 167 insertions(+), 138 deletions(-) (limited to 'include/linux') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 926b37af571c..804f3655dc73 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include @@ -154,30 +154,35 @@ static int sysfs_unlink(struct inode *dir, struct dentry *dentry) } /** - * sysfs_read_file - "read" data from a file. - * @file: file pointer - * @buf: buffer to fill - * @count: number of bytes to read - * @ppos: starting offset in file + * sysfs_read_file - read an attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. * - * Userspace wants data from a file. It is up to the creator of the file to - * provide that data. - * There is a struct device_attribute embedded in file->private_data. We - * obtain that and check if the read callback is implemented. If so, we call - * it, passing the data field of the file entry. 
- * Said callback is responsible for filling the buffer and returning the number - * of bytes it put in it. We update @ppos correctly. + * Userspace wants to read an attribute file. The attribute descriptor + * is in the file's ->d_fsdata. The target object is in the directory's + * ->d_fsdata. + * + * We allocate a %PAGE_SIZE buffer, and pass it to the object's ->show() + * method (along with the object). We loop doing this until @count is + * satisfied, or ->show() returns %0. */ + static ssize_t sysfs_read_file(struct file *file, char *buf, size_t count, loff_t *ppos) { struct attribute * attr = file->f_dentry->d_fsdata; - struct driver_dir_entry * dir; + struct sysfs_ops * ops = NULL; + struct kobject * kobj; unsigned char *page; ssize_t retval = 0; - dir = file->f_dentry->d_parent->d_fsdata; - if (!dir->ops->show) + kobj = file->f_dentry->d_parent->d_fsdata; + if (kobj) + ops = kobj->dir.ops; + + if (!ops || !ops->show) return 0; if (count > PAGE_SIZE) @@ -190,7 +195,7 @@ sysfs_read_file(struct file *file, char *buf, size_t count, loff_t *ppos) while (count > 0) { ssize_t len; - len = dir->ops->show(dir,attr,page,count,*ppos); + len = ops->show(kobj,attr,page,count,*ppos); if (len <= 0) { if (len < 0) @@ -214,27 +219,32 @@ sysfs_read_file(struct file *file, char *buf, size_t count, loff_t *ppos) } /** - * sysfs_write_file - "write" to a file - * @file: file pointer - * @buf: data to write - * @count: number of bytes - * @ppos: starting offset + * sysfs_write_file - write an attribute. + * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset * - * Similarly to sysfs_read_file, we act essentially as a bit pipe. - * We check for a "write" callback in file->private_data, and pass - * @buffer, @count, @ppos, and the file entry's data to the callback. - * The number of bytes written is returned, and we handle updating - * @ppos properly. + * Identical to sysfs_read_file(), though going the opposite direction. 
+ * We allocate a %PAGE_SIZE buffer and copy in the userspace buffer. We + * pass that to the object's ->store() method until we reach @count or + * ->store() returns %0 or less. */ + static ssize_t sysfs_write_file(struct file *file, const char *buf, size_t count, loff_t *ppos) { struct attribute * attr = file->f_dentry->d_fsdata; - struct driver_dir_entry * dir; + struct sysfs_ops * ops = NULL; + struct kobject * kobj; ssize_t retval = 0; char * page; - dir = file->f_dentry->d_parent->d_fsdata; + kobj = file->f_dentry->d_parent->d_fsdata; + if (kobj) + ops = kobj->dir.ops; + if (!ops || !ops->store) + return 0; page = (char *)__get_free_page(GFP_KERNEL); if (!page) @@ -249,7 +259,7 @@ sysfs_write_file(struct file *file, const char *buf, size_t count, loff_t *ppos) while (count > 0) { ssize_t len; - len = dir->ops->store(dir,attr,page + retval,count,*ppos); + len = ops->store(kobj,attr,page + retval,count,*ppos); if (len <= 0) { if (len < 0) @@ -268,29 +278,24 @@ sysfs_write_file(struct file *file, const char *buf, size_t count, loff_t *ppos) static int sysfs_open_file(struct inode * inode, struct file * filp) { - struct driver_dir_entry * dir; + struct kobject * kobj; int error = 0; - dir = (struct driver_dir_entry *)filp->f_dentry->d_parent->d_fsdata; - if (dir) { + kobj = filp->f_dentry->d_parent->d_fsdata; + if ((kobj = kobject_get(kobj))) { struct attribute * attr = filp->f_dentry->d_fsdata; - if (attr && dir->ops) { - if (dir->ops->open) - error = dir->ops->open(dir); - goto Done; - } - } - error = -EINVAL; - Done: + if (!attr) + error = -EINVAL; + } else + error = -EINVAL; return error; } static int sysfs_release(struct inode * inode, struct file * filp) { - struct driver_dir_entry * dir; - dir = (struct driver_dir_entry *)filp->f_dentry->d_parent->d_fsdata; - if (dir->ops->close) - dir->ops->close(dir); + struct kobject * kobj = filp->f_dentry->d_parent->d_fsdata; + if (kobj) + kobject_put(kobj); return 0; } @@ -371,6 +376,7 @@ static int __init 
sysfs_init(void) core_initcall(sysfs_init); + static struct dentry * get_dentry(struct dentry * parent, const char * name) { struct qstr qstr; @@ -381,137 +387,160 @@ static struct dentry * get_dentry(struct dentry * parent, const char * name) return lookup_hash(&qstr,parent); } + /** - * sysfs_create_dir - create a directory in the filesystem - * @entry: directory entry - * @parent: parent directory entry + * sysfs_create_dir - create a directory for an object. + * @parent: parent parent object. + * @kobj: object we're creating directory for. */ -int -sysfs_create_dir(struct driver_dir_entry * entry, - struct driver_dir_entry * parent) + +int sysfs_create_dir(struct kobject * kobj) { struct dentry * dentry = NULL; - struct dentry * parent_dentry; + struct dentry * parent; int error = 0; - if (!entry) + if (!kobj) return -EINVAL; - parent_dentry = parent ? parent->dentry : NULL; - - if (!parent_dentry) - if (sysfs_mount && sysfs_mount->mnt_sb) - parent_dentry = sysfs_mount->mnt_sb->s_root; - - if (!parent_dentry) + if (kobj->parent) + parent = kobj->parent->dir.dentry; + else if (sysfs_mount && sysfs_mount->mnt_sb) + parent = sysfs_mount->mnt_sb->s_root; + else return -EFAULT; - down(&parent_dentry->d_inode->i_sem); - dentry = get_dentry(parent_dentry,entry->name); + down(&parent->d_inode->i_sem); + dentry = get_dentry(parent,kobj->name); if (!IS_ERR(dentry)) { - dentry->d_fsdata = (void *) entry; - entry->dentry = dentry; - error = sysfs_mkdir(parent_dentry->d_inode,dentry,entry->mode); + dentry->d_fsdata = (void *)kobj; + kobj->dir.dentry = dentry; + error = sysfs_mkdir(parent->d_inode,dentry, + (S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO)); } else error = PTR_ERR(dentry); - up(&parent_dentry->d_inode->i_sem); + up(&parent->d_inode->i_sem); return error; } + /** - * sysfs_create_file - create a file - * @entry: structure describing the file - * @parent: directory to create it in + * sysfs_create_file - create an attribute file for an object. 
+ * @kobj: object we're creating for. + * @attr: atrribute descriptor. */ -int -sysfs_create_file(struct attribute * entry, - struct driver_dir_entry * parent) + +int sysfs_create_file(struct kobject * kobj, struct attribute * attr) { struct dentry * dentry; + struct dentry * parent; int error = 0; - if (!entry || !parent) + if (!kobj || !attr) return -EINVAL; - if (!parent->dentry) - return -EINVAL; + if (kobj->parent) + parent = kobj->parent->dir.dentry; + else + return -ENOENT; - down(&parent->dentry->d_inode->i_sem); - dentry = get_dentry(parent->dentry,entry->name); + down(&parent->d_inode->i_sem); + dentry = get_dentry(parent,attr->name); if (!IS_ERR(dentry)) { - dentry->d_fsdata = (void *)entry; - error = sysfs_create(parent->dentry->d_inode,dentry,entry->mode); + dentry->d_fsdata = (void *)attr; + error = sysfs_create(parent->d_inode,dentry,attr->mode); } else error = PTR_ERR(dentry); - up(&parent->dentry->d_inode->i_sem); + up(&parent->d_inode->i_sem); return error; } + /** - * sysfs_create_symlink - make a symlink - * @parent: directory we're creating in - * @entry: entry describing link - * @target: place we're symlinking to - * + * sysfs_create_symlink - make a symlink + * @kobj: object who's directory we're creating in. + * @name: name of the symlink. + * @target: path we're pointing to. 
*/ -int sysfs_create_symlink(struct driver_dir_entry * parent, - char * name, char * target) + +int sysfs_create_link(struct kobject * kobj, char * name, char * target) { struct dentry * dentry; - int error = 0; + int error; - if (!parent || !parent->dentry) - return -EINVAL; + if (kobj) { + struct dentry * parent = kobj->dir.dentry; - down(&parent->dentry->d_inode->i_sem); - dentry = get_dentry(parent->dentry,name); - if (!IS_ERR(dentry)) - error = sysfs_symlink(parent->dentry->d_inode,dentry,target); - else - error = PTR_ERR(dentry); - up(&parent->dentry->d_inode->i_sem); + down(&parent->d_inode->i_sem); + dentry = get_dentry(parent,name); + if (!IS_ERR(dentry)) + error = sysfs_symlink(parent->d_inode,dentry,target); + else + error = PTR_ERR(dentry); + up(&parent->d_inode->i_sem); + } else + error = -EINVAL; return error; } + +static void hash_and_remove(struct dentry * dir, const char * name) +{ + struct dentry * victim; + + down(&dir->d_inode->i_sem); + victim = get_dentry(dir,name); + if (!IS_ERR(victim)) { + /* make sure dentry is really there */ + if (victim->d_inode && + (victim->d_parent->d_inode == dir->d_inode)) { + sysfs_unlink(dir->d_inode,victim); + } + } + up(&dir->d_inode->i_sem); +} + + /** - * sysfs_remove_file - exported file removal - * @dir: directory the file supposedly resides in - * @name: name of the file + * sysfs_remove_file - remove an object attribute. + * @kobj: object we're acting for. + * @attr: attribute descriptor. * - * Try and find the file in the dir's list. - * If it's there, call __remove_file() (above) for the dentry. + * Hash the attribute name and kill the victim. 
*/ -void sysfs_remove_file(struct driver_dir_entry * dir, const char * name) + +void sysfs_remove_file(struct kobject * kobj, struct attribute * attr) { - struct dentry * dentry; + hash_and_remove(kobj->dir.dentry,attr->name); +} - if (!dir->dentry) - return; - down(&dir->dentry->d_inode->i_sem); - dentry = get_dentry(dir->dentry,name); - if (!IS_ERR(dentry)) { - /* make sure dentry is really there */ - if (dentry->d_inode && - (dentry->d_parent->d_inode == dir->dentry->d_inode)) { - sysfs_unlink(dir->dentry->d_inode,dentry); - } - } - up(&dir->dentry->d_inode->i_sem); +/** + * sysfs_remove_link - remove symlink in object's directory. + * @kobj: object we're acting for. + * @name: name of the symlink to remove. + */ + +void sysfs_remove_link(struct kobject * kobj, char * name) +{ + hash_and_remove(kobj->dir.dentry,name); } + /** - * sysfs_remove_dir - exportable directory removal - * @dir: directory to remove + * sysfs_remove_dir - remove an object's directory. + * @kobj: object. * - * To make sure we don't orphan anyone, first remove - * all the children in the list, then do clean up the directory. + * The only thing special about this is that we remove any files in + * the directory before we remove the directory, and we've inlined + * what used to be sysfs_rmdir() below, instead of calling separately. 
*/ -void sysfs_remove_dir(struct driver_dir_entry * dir) + +void sysfs_remove_dir(struct kobject * kobj) { struct list_head * node, * next; - struct dentry * dentry = dir->dentry; + struct dentry * dentry = kobj->dir.dentry; struct dentry * parent; if (!dentry) @@ -542,8 +571,9 @@ void sysfs_remove_dir(struct driver_dir_entry * dir) } EXPORT_SYMBOL(sysfs_create_file); -EXPORT_SYMBOL(sysfs_create_symlink); -EXPORT_SYMBOL(sysfs_create_dir); EXPORT_SYMBOL(sysfs_remove_file); +EXPORT_SYMBOL(sysfs_create_link); +EXPORT_SYMBOL(sysfs_remove_link); +EXPORT_SYMBOL(sysfs_create_dir); EXPORT_SYMBOL(sysfs_remove_dir); MODULE_LICENSE("GPL"); diff --git a/include/linux/kobject.h b/include/linux/kobject.h index c78325e8a199..12431a980712 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -16,6 +16,7 @@ struct kobject { atomic_t refcount; struct list_head entry; struct kobject * parent; + struct sysfs_dir dir; }; extern void kobject_init(struct kobject *); diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 6479902e1d20..fe82dff179ce 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -11,18 +11,15 @@ struct driver_dir_entry; struct attribute; +struct kobject; struct sysfs_ops { - int (*open)(struct driver_dir_entry *); - int (*close)(struct driver_dir_entry *); - ssize_t (*show)(struct driver_dir_entry *, struct attribute *,char *, size_t, loff_t); - ssize_t (*store)(struct driver_dir_entry *,struct attribute *,const char *, size_t, loff_t); + ssize_t (*show)(struct kobject *, struct attribute *,char *, size_t, loff_t); + ssize_t (*store)(struct kobject *,struct attribute *,const char *, size_t, loff_t); }; -struct driver_dir_entry { - char * name; +struct sysfs_dir { struct dentry * dentry; - mode_t mode; struct sysfs_ops * ops; }; @@ -32,20 +29,21 @@ struct attribute { }; extern int -sysfs_create_dir(struct driver_dir_entry *, struct driver_dir_entry *); +sysfs_create_dir(struct kobject *); extern void -sysfs_remove_dir(struct 
driver_dir_entry * entry); +sysfs_remove_dir(struct kobject *); extern int -sysfs_create_file(struct attribute * attr, - struct driver_dir_entry * parent); +sysfs_create_file(struct kobject *, struct attribute *); + +extern void +sysfs_remove_file(struct kobject *, struct attribute *); extern int -sysfs_create_symlink(struct driver_dir_entry * parent, - char * name, char * target); +sysfs_create_link(struct kobject * kobj, char * name, char * target); extern void -sysfs_remove_file(struct driver_dir_entry *, const char * name); +sysfs_remove_link(struct kobject *, char * name); #endif /* _SYSFS_H_ */ -- cgit v1.2.3 From 0862416ecefa629a52f3ee0907206e9633ca45c3 Mon Sep 17 00:00:00 2001 From: Patrick Mochel Date: Tue, 29 Oct 2002 09:26:38 -0800 Subject: sysfs: make symlinks easier. It's now int sysfs_create_link(struct kobject * kobj, struct kobject * target, char * name) So, the caller doesn't have to determine the path of the target nor the depth of the object we're creating the symlink for; it's all taken care of. 
--- fs/sysfs/inode.c | 90 +++++++++++++++++++++++++++++++++++++++------------ include/linux/sysfs.h | 2 +- 2 files changed, 71 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 804f3655dc73..a82726460fca 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -457,34 +457,84 @@ int sysfs_create_file(struct kobject * kobj, struct attribute * attr) } +static int object_depth(struct kobject * kobj) +{ + struct kobject * p = kobj; + int depth = 0; + do { depth++; } while ((p = p->parent)); + return depth; +} + +static int object_path_length(struct kobject * kobj) +{ + struct kobject * p = kobj; + int length = 1; + do { + length += strlen(p->name) + 1; + p = p->parent; + } while (p); + return length; +} + +static void fill_object_path(struct kobject * kobj, char * buffer, int length) +{ + struct kobject * p; + + --length; + for (p = kobj; p; p = p->parent) { + int cur = strlen(p->name); + + /* back up enough to print this bus id with '/' */ + length -= cur; + strncpy(buffer + length,p->name,cur); + *(buffer + --length) = '/'; + } +} + /** - * sysfs_create_symlink - make a symlink - * @kobj: object who's directory we're creating in. - * @name: name of the symlink. - * @target: path we're pointing to. + * sysfs_create_link - create symlink between two objects. + * @kobj: object whose directory we're creating the link in. + * @target: object we're pointing to. + * @name: name of the symlink. 
*/ - -int sysfs_create_link(struct kobject * kobj, char * name, char * target) +int sysfs_create_link(struct kobject * kobj, struct kobject * target, char * name) { - struct dentry * dentry; - int error; + struct dentry * dentry = kobj->dir.dentry; + struct dentry * d; + int error = 0; + int size; + int depth; + char * path; + char * s; + + depth = object_depth(kobj); + size = object_path_length(target) + depth * 3 - 1; + if (size > PATH_MAX) + return -ENAMETOOLONG; + pr_debug("%s: depth = %d, size = %d\n",__FUNCTION__,depth,size); + + path = kmalloc(size,GFP_KERNEL); + if (!path) + return -ENOMEM; + memset(path,0,size); - if (kobj) { - struct dentry * parent = kobj->dir.dentry; + for (s = path; depth--; s += 3) + strcpy(s,"../"); - down(&parent->d_inode->i_sem); - dentry = get_dentry(parent,name); - if (!IS_ERR(dentry)) - error = sysfs_symlink(parent->d_inode,dentry,target); - else - error = PTR_ERR(dentry); - up(&parent->d_inode->i_sem); - } else - error = -EINVAL; + fill_object_path(target,path,size); + pr_debug("%s: path = '%s'\n",__FUNCTION__,path); + + down(&dentry->d_inode->i_sem); + d = get_dentry(dentry,name); + if (!IS_ERR(d)) + error = sysfs_symlink(dentry->d_inode,d,path); + else + error = PTR_ERR(d); + up(&dentry->d_inode->i_sem); + kfree(path); return error; } - static void hash_and_remove(struct dentry * dir, const char * name) { struct dentry * victim; diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index fe82dff179ce..066a9ccc0fb4 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -41,7 +41,7 @@ extern void sysfs_remove_file(struct kobject *, struct attribute *); extern int -sysfs_create_link(struct kobject * kobj, char * name, char * target); +sysfs_create_link(struct kobject * kobj, struct kobject * target, char * name); extern void sysfs_remove_link(struct kobject *, char * name); -- cgit v1.2.3 From a6c066de9d449b0bbe2efbf6431b19c270e02060 Mon Sep 17 00:00:00 2001 From: Patrick Mochel Date: Tue, 29 Oct 2002 19:47:41 
-0800 Subject: Introduce struct subsystem. A struct subsystem is basically a collection of objects of a certain type, and some callbacks to operate on objects of that type. subsystems contain embedded kobjects themselves, and have a similar set of library routines that kobjects do, which are mostly just wrappers for the correlating kobject routines. kobjects are inserted in depth-first order into their subsystem's list of objects. Orphan kobjects are also given foster parents that point to their subsystem. This provides a bit more rigidity in the hierarchy, and disallows any orphan kobjects. When an object is unregistered, it is removed from its subsystem's list. When the objects' refcount hits 0, the subsystem's ->release() callback is called. Documentation describing the objects and the interfaces has also been added. --- Documentation/kobject.txt | 144 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/kobject.h | 26 +++++++++ lib/kobject.c | 74 ++++++++++++++++++++++-- 3 files changed, 239 insertions(+), 5 deletions(-) create mode 100644 Documentation/kobject.txt (limited to 'include/linux') diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt new file mode 100644 index 000000000000..d8e2d36b9140 --- /dev/null +++ b/Documentation/kobject.txt @@ -0,0 +1,144 @@ +kobjects - Simple, Generic Kernel Objects + +Patrick Mochel + +30 October 2002 + + +kobjects + +struct kobject introduces a simple, intregral datatype and a simple +set of semantics for operating on the device. kobjects are intended to +be embedded in larger data structures and replace fields it +duplicates. A set of library functions has been developed to assist in +the manipulation of kobjects. 
+ +struct kobject looks like this: + + +struct kobject { + char name[16]; + atomic_t refcount; + struct list_head entry; + struct kobject * parent; + struct subsystem * subsys; + struct sysfs_dir dir; +}; + +void kobject_init(struct kobject *); +int kobject_register(struct kobject *); +void kobject_unregister(struct kobject *); +struct kobject * kobject_get(struct kobject *); +void kobject_put(struct kobject *); + + + +subsystems + +struct subsystem is introduced to describe a collection of objects of +a certain type. subsystems are kobjects themselves, though they +contain lists of kobjects that belong to that subsystem. Objects of a +subsystem (the embedder objects in which kobjects live) are all of the +same type. The interface looks like: + +struct subsystem { + struct kobject kobj; + struct list_head list; + struct rw_semaphore rwsem; + struct subsystem * parent; + struct sysfs_ops * sysfs_ops; + void (*release)(struct kobject *); +}; + +void subsystem_init(struct subsystem *); +int subsystem_register(struct subsystem *); +void subsystem_unregister(struct subsystem *); + +struct subsystem * subsys_get(struct subsystem * s); +void subsys_put(struct subsystem * s); + + + +Familial Relations + +kobjects and subsystems intersect and intertwine in several ways. Each +is well-defined (though maybe they could be made simpler). Each kobject +belongs to a subsystem. Since subsystems are kobjects themselves, they +also belong to a controlling subsystem. This implies that subsystems +are hierarchial. + +Many kobjects are hierarchial in nature, which is represented by +including a pointer to its parent kobject in struct kobject. Many +different types of kobject-embedding objects may all point to the same +parent. + +The ancestral hierarchy of kobjects should not be confused with +membership in a subsystem, or the ancestral relationship of +subsystems. A set of kobjects may all belong to a subsystem, but all +have different parents. 
+ +kobjects may be orphans and have no explicit parent. In that case, the +subsystem to which the object belongs becomes its parent. + + +Sysfs + +These rules force a complete kobject hierarchy, which Suprise! maps +very well onto a filesystem. + +driverfs was recently cloned, and there now exists sysfs. All driverfs +operations operate on a separate data type: struct driver_dir_entry, +which all objects that are represented in driverfs must have. driverfs +also allowed rogue directory creation that had no explicit objects +associated with them. + +struct kobject is intended to be the common data type which sysfs +operates on. This gives the filesystem the ability to directly access +more fields of the object, including the reference count. This also +forces each directory in the filesystem to be tied directly to a +kobject. + + +Directory Placement + +Parental relationships are determined in the kobject/subsystem layer, +and the kobject is then passed off to the sysfs layer. kobjects with +no parent have directories created for them in the sysfs root +directory. Per the rules above, the only kobjects that remain orphans +are subsystems without parent subsystems (since leaf objects either +have an explicit parent, or are assigned their controlling subsystem +as their foster parent). + + +File Callbacks + +Previously, each driverfs directory contained a pointer to a list of file +operations for reading and writing driverfs files. These callbacks +received a struct driver_dir_entry, when they performed a +container_of() transform on to receive the specific object type for +which the call was meant. + +These callbacks have been converted to accept a struct kobject instead +of struct driver_dir_entry. Since all kobjects belong to a subsystem +that contains kobjects all of the same type, the sysfs operations +have been moved to reside in the subsystem, since they are common for +all kobjects. + + +Reference Counting + +All objects contain reference counts. 
All functions accessing objects +should increment the reference count until they are finished, and +decrement the reference count. When an object is initialized, it +receives a reference count of 1. When a device is unregistered, the +reference is decremented. When the reference counts reaches 0, the +subsystem's ->release() callback for that object type (remember +subsystems control only one type of device each) is called; and the +reference counts of the kobject's subsystem and parent are +decremented. + +The ->release() callback is the opportunity for the subsystem to free +memory allocated for the object. It is the notification that +absolutely no one is using the structure any more (and can't acquire a +reference to it), so it is safe to free it. + diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 12431a980712..32dfaaf52d88 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -9,6 +9,7 @@ #include #include #include +#include #include struct kobject { @@ -16,6 +17,7 @@ struct kobject { atomic_t refcount; struct list_head entry; struct kobject * parent; + struct subsystem * subsys; struct sysfs_dir dir; }; @@ -27,4 +29,28 @@ extern void kobject_unregister(struct kobject *); extern struct kobject * kobject_get(struct kobject *); extern void kobject_put(struct kobject *); + +struct subsystem { + struct kobject kobj; + struct list_head list; + struct rw_semaphore rwsem; + struct subsystem * parent; + void (*release)(struct kobject *); + struct sysfs_ops * sysfs_ops; +}; + +extern void subsystem_init(struct subsystem *); +extern int subsystem_register(struct subsystem *); +extern void subsystem_unregister(struct subsystem *); + +static inline struct subsystem * subsys_get(struct subsystem * s) +{ + return container_of(kobject_get(&s->kobj),struct subsystem,kobj); +} + +static inline void subsys_put(struct subsystem * s) +{ + kobject_put(&s->kobj); +} + #endif /* _KOBJECT_H_ */ diff --git a/lib/kobject.c b/lib/kobject.c index 
48e950c9e4ce..89fb79c873fb 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -31,10 +31,21 @@ void kobject_init(struct kobject * kobj) int kobject_register(struct kobject * kobj) { + struct subsystem * s = subsys_get(kobj->subsys); + struct kobject * parent = kobject_get(kobj->parent); + pr_debug("kobject %s: registering\n",kobj->name); - if (kobj->parent) - kobject_get(kobj->parent); - return 0; + if (s) { + down_write(&s->rwsem); + if (parent) + list_add_tail(&kobj->entry,&parent->entry); + else { + list_add_tail(&kobj->entry,&s->list); + kobj->parent = &s->kobj; + } + up_write(&s->rwsem); + } + return sysfs_create_dir(kobj); } /** @@ -53,6 +64,12 @@ int kobject_register(struct kobject * kobj) void kobject_unregister(struct kobject * kobj) { pr_debug("kobject %s: unregistering\n",kobj->name); + sysfs_remove_dir(kobj); + if (kobj->subsys) { + down_write(&kobj->subsys->rwsem); + list_del_init(&kobj->entry); + up_write(&kobj->subsys->rwsem); + } kobject_put(kobj); } @@ -64,7 +81,7 @@ void kobject_unregister(struct kobject * kobj) struct kobject * kobject_get(struct kobject * kobj) { struct kobject * ret = kobj; - if (atomic_read(&kobj->refcount) > 0) + if (kobj && atomic_read(&kobj->refcount) > 0) atomic_inc(&kobj->refcount); else ret = NULL; @@ -78,21 +95,68 @@ struct kobject * kobject_get(struct kobject * kobj) * Decrement the refcount, and check if 0. If it is, then * we're gonna need to clean it up, and decrement the refcount * of its parent. + * + * @kobj->parent could point to its subsystem, which we also + * want to decrement the reference count for. We always dec + * the refcount for the parent, but only do so for the subsystem + * if it points to a different place than the parent. 
*/ void kobject_put(struct kobject * kobj) { struct kobject * parent = kobj->parent; + struct subsystem * s = kobj->subsys; if (!atomic_dec_and_test(&kobj->refcount)) return; + pr_debug("kobject %s: cleaning up\n",kobj->name); - if (parent) + if (s) { + if (s->release) + s->release(kobj); + if (&s->kobj != parent) + subsys_put(s); + } + + if (parent) kobject_put(parent); } + +void subsystem_init(struct subsystem * s) +{ + kobject_init(&s->kobj); + init_rwsem(&s->rwsem); + INIT_LIST_HEAD(&s->list); +} + +/** + * subsystem_register - register a subsystem. + * @s: the subsystem we're registering. + */ + +int subsystem_register(struct subsystem * s) +{ + subsystem_init(s); + if (s->parent) + s->kobj.parent = &s->parent->kobj; + pr_debug("subsystem %s: registering\n",s->kobj.name); + return kobject_register(&s->kobj); +} + +void subsystem_unregister(struct subsystem * s) +{ + pr_debug("subsystem %s: unregistering\n",s->kobj.name); + kobject_unregister(&s->kobj); +} + + EXPORT_SYMBOL(kobject_init); EXPORT_SYMBOL(kobject_register); EXPORT_SYMBOL(kobject_unregister); EXPORT_SYMBOL(kobject_get); EXPORT_SYMBOL(kobject_put); + +EXPORT_SYMBOL(subsystem_init); +EXPORT_SYMBOL(subsystem_register); +EXPORT_SYMBOL(subsystem_unregister); -- cgit v1.2.3 From 332ad69da7aca8bdcd3f873bb2bfeb9d9a6d1f98 Mon Sep 17 00:00:00 2001 From: Patrick Mochel Date: Tue, 29 Oct 2002 20:27:36 -0800 Subject: sysfs: kill struct sysfs_dir. Previously, sysfs read() and write() calls looked for sysfs_ops in the struct sysfs_dir, in the kobject. Since objects belong to a subsystem, and is a member of a group of like devices, the sysfs_ops have been moved to struct subsystem, and are referenced from there. The only remaining member of struct sysfs_dir is the dentry of the object's directory. That is moved out of the dir struct and directly into struct kobject. That saves us 4 bytes/object. All of the sysfs functions that referenced the struct have been changed to just reference the dentry. 
--- fs/sysfs/inode.c | 26 +++++++++++--------------- include/linux/kobject.h | 2 +- include/linux/sysfs.h | 5 ----- 3 files changed, 12 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index a82726460fca..c25cadf77d88 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -179,9 +179,8 @@ sysfs_read_file(struct file *file, char *buf, size_t count, loff_t *ppos) ssize_t retval = 0; kobj = file->f_dentry->d_parent->d_fsdata; - if (kobj) - ops = kobj->dir.ops; - + if (kobj && kobj->subsys) + ops = kobj->subsys->sysfs_ops; if (!ops || !ops->show) return 0; @@ -241,8 +240,8 @@ sysfs_write_file(struct file *file, const char *buf, size_t count, loff_t *ppos) char * page; kobj = file->f_dentry->d_parent->d_fsdata; - if (kobj) - ops = kobj->dir.ops; + if (kobj && kobj->subsys) + ops = kobj->subsys->sysfs_ops; if (!ops || !ops->store) return 0; @@ -404,7 +403,7 @@ int sysfs_create_dir(struct kobject * kobj) return -EINVAL; if (kobj->parent) - parent = kobj->parent->dir.dentry; + parent = kobj->parent->dentry; else if (sysfs_mount && sysfs_mount->mnt_sb) parent = sysfs_mount->mnt_sb->s_root; else @@ -414,7 +413,7 @@ int sysfs_create_dir(struct kobject * kobj) dentry = get_dentry(parent,kobj->name); if (!IS_ERR(dentry)) { dentry->d_fsdata = (void *)kobj; - kobj->dir.dentry = dentry; + kobj->dentry = dentry; error = sysfs_mkdir(parent->d_inode,dentry, (S_IFDIR| S_IRWXU | S_IRUGO | S_IXUGO)); } else @@ -440,10 +439,7 @@ int sysfs_create_file(struct kobject * kobj, struct attribute * attr) if (!kobj || !attr) return -EINVAL; - if (kobj->parent) - parent = kobj->parent->dir.dentry; - else - return -ENOENT; + parent = kobj->dentry; down(&parent->d_inode->i_sem); dentry = get_dentry(parent,attr->name); @@ -499,7 +495,7 @@ static void fill_object_path(struct kobject * kobj, char * buffer, int length) */ int sysfs_create_link(struct kobject * kobj, struct kobject * target, char * name) { - struct dentry * dentry = 
kobj->dir.dentry; + struct dentry * dentry = kobj->dentry; struct dentry * d; int error = 0; int size; @@ -562,7 +558,7 @@ static void hash_and_remove(struct dentry * dir, const char * name) void sysfs_remove_file(struct kobject * kobj, struct attribute * attr) { - hash_and_remove(kobj->dir.dentry,attr->name); + hash_and_remove(kobj->dentry,attr->name); } @@ -574,7 +570,7 @@ void sysfs_remove_file(struct kobject * kobj, struct attribute * attr) void sysfs_remove_link(struct kobject * kobj, char * name) { - hash_and_remove(kobj->dir.dentry,name); + hash_and_remove(kobj->dentry,name); } @@ -590,7 +586,7 @@ void sysfs_remove_link(struct kobject * kobj, char * name) void sysfs_remove_dir(struct kobject * kobj) { struct list_head * node, * next; - struct dentry * dentry = kobj->dir.dentry; + struct dentry * dentry = kobj->dentry; struct dentry * parent; if (!dentry) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 32dfaaf52d88..5b236b6678cb 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -18,7 +18,7 @@ struct kobject { struct list_head entry; struct kobject * parent; struct subsystem * subsys; - struct sysfs_dir dir; + struct dentry * dentry; }; extern void kobject_init(struct kobject *); diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h index 066a9ccc0fb4..7a46c9f0c308 100644 --- a/include/linux/sysfs.h +++ b/include/linux/sysfs.h @@ -18,11 +18,6 @@ struct sysfs_ops { ssize_t (*store)(struct kobject *,struct attribute *,const char *, size_t, loff_t); }; -struct sysfs_dir { - struct dentry * dentry; - struct sysfs_ops * ops; -}; - struct attribute { char * name; mode_t mode; -- cgit v1.2.3 From c3f575f0143002cea266135ab7fb348b9bfad6aa Mon Sep 17 00:00:00 2001 From: Patrick Mochel Date: Tue, 29 Oct 2002 20:41:43 -0800 Subject: kobjects: add array of default attributes to subsystems, and create on registration. 
struct subsystem may now contain a pointer to a NULL-terminated array of default attributes to be exported when an object is registered with the subsystem. kobject registration will check the return values of the directory creation and the creation of each file, and handle it appropriately. The documentation has also been updated. --- Documentation/kobject.txt | 14 ++++++++++++-- include/linux/kobject.h | 1 + lib/kobject.c | 41 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 53 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt index d8e2d36b9140..bc4ddfde8158 100644 --- a/Documentation/kobject.txt +++ b/Documentation/kobject.txt @@ -22,7 +22,7 @@ struct kobject { struct list_head entry; struct kobject * parent; struct subsystem * subsys; - struct sysfs_dir dir; + struct dentry * dentry; }; void kobject_init(struct kobject *); @@ -41,13 +41,15 @@ contain lists of kobjects that belong to that subsystem. Objects of a subsystem (the embedder objects in which kobjects live) are all of the same type. The interface looks like: + struct subsystem { struct kobject kobj; struct list_head list; struct rw_semaphore rwsem; struct subsystem * parent; - struct sysfs_ops * sysfs_ops; void (*release)(struct kobject *); + struct sysfs_ops * sysfs_ops; + struct attribute ** default_attrs; }; void subsystem_init(struct subsystem *); @@ -125,6 +127,14 @@ have been moved to reside in the subsystem, since they are common for all kobjects. +Default Attributes + +Most subsystems have a set of default attributes associated with an +object that registers with them. A subsystem definition may contain a +NULL-terminated array of attributes that will be exported when an +object is registered with the subsystem. + + Reference Counting All objects contain reference counts. 
All functions accessing objects diff --git a/include/linux/kobject.h b/include/linux/kobject.h index 5b236b6678cb..d2f0629a6189 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -37,6 +37,7 @@ struct subsystem { struct subsystem * parent; void (*release)(struct kobject *); struct sysfs_ops * sysfs_ops; + struct attribute ** default_attrs; }; extern void subsystem_init(struct subsystem *); diff --git a/lib/kobject.c b/lib/kobject.c index 89fb79c873fb..f4232029144c 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -8,6 +8,34 @@ #include #include +/** + * kobject_populate_dir - populate directory with attributes. + * @kobj: object we're working on. + * + * Most subsystems have a set of default attributes that + * are associated with an object that registers with them. + * This is a helper called during object registration that + * loops through the default attributes of the subsystem + * and creates attributes files for them in sysfs. + * + */ + +static int kobject_populate_dir(struct kobject * kobj) +{ + struct subsystem * s = kobj->subsys; + struct attribute * attr; + int error = 0; + int i; + + if (s && s->default_attrs) { + for (i = 0; (attr = s->default_attrs[i]); i++) { + if ((error = sysfs_create_file(kobj,attr))) + break; + } + } + return error; +} + /** * kobject_init - initialize object. * @kobj: object in question. 
@@ -31,10 +59,13 @@ void kobject_init(struct kobject * kobj) int kobject_register(struct kobject * kobj) { + int error = 0; struct subsystem * s = subsys_get(kobj->subsys); struct kobject * parent = kobject_get(kobj->parent); pr_debug("kobject %s: registering\n",kobj->name); + if (parent) + pr_debug(" parent is %s\n",parent->name); if (s) { down_write(&s->rwsem); if (parent) @@ -45,7 +76,13 @@ int kobject_register(struct kobject * kobj) } up_write(&s->rwsem); } - return sysfs_create_dir(kobj); + error = sysfs_create_dir(kobj); + if (!error) { + error = kobject_populate_dir(kobj); + if (error) + sysfs_remove_dir(kobj); + } + return error; } /** @@ -141,6 +178,8 @@ int subsystem_register(struct subsystem * s) if (s->parent) s->kobj.parent = &s->parent->kobj; pr_debug("subsystem %s: registering\n",s->kobj.name); + if (s->parent) + pr_debug(" parent is %s\n",s->parent->kobj.name); return kobject_register(&s->kobj); } -- cgit v1.2.3 From f751cfc01cc86a24d6266c7116cdc16f27e6da40 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Tue, 29 Oct 2002 22:35:31 -0800 Subject: [PATCH] sys_epoll 0.15 Latest version of the epoll interfaces. 
--- arch/i386/kernel/entry.S | 4 + drivers/char/Makefile | 4 +- drivers/char/eventpoll.c | 1438 +++++++++++++++++++++++++++++++++++++++++++++ fs/Makefile | 4 +- fs/fcblist.c | 146 +++++ fs/file_table.c | 4 + fs/pipe.c | 36 +- include/asm-i386/poll.h | 1 + include/asm-i386/unistd.h | 3 + include/linux/eventpoll.h | 51 ++ include/linux/fcblist.h | 71 +++ include/linux/fs.h | 4 + include/linux/pipe_fs_i.h | 4 + include/linux/sys.h | 2 +- include/net/sock.h | 12 +- net/ipv4/tcp.c | 4 +- 16 files changed, 1777 insertions(+), 11 deletions(-) create mode 100644 drivers/char/eventpoll.c create mode 100644 fs/fcblist.c create mode 100644 include/linux/eventpoll.h create mode 100644 include/linux/fcblist.h (limited to 'include/linux') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index e873703e0c34..11690c6cf60d 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -737,6 +737,10 @@ ENTRY(sys_call_table) .long sys_free_hugepages .long sys_exit_group .long sys_lookup_dcookie + .long sys_epoll_create + .long sys_epoll_ctl /* 255 */ + .long sys_epoll_wait + .rept NR_syscalls-(.-sys_call_table)/4 .long sys_ni_syscall diff --git a/drivers/char/Makefile b/drivers/char/Makefile index e65360cf6538..281ba60e6396 100644 --- a/drivers/char/Makefile +++ b/drivers/char/Makefile @@ -7,14 +7,14 @@ # FONTMAPFILE = cp437.uni -obj-y += mem.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o random.o +obj-y += mem.o tty_io.o n_tty.o tty_ioctl.o pty.o misc.o random.o eventpoll.o # All of the (potential) objects that export symbols. # This list comes from 'grep -l EXPORT_SYMBOL *.[hc]'. 
export-objs := busmouse.o vt.o generic_serial.o ip2main.o \ ite_gpio.o keyboard.o misc.o nvram.o random.o rtc.o \ - selection.o sonypi.o sysrq.o tty_io.o tty_ioctl.o + selection.o sonypi.o sysrq.o tty_io.o tty_ioctl.o eventpoll.o obj-$(CONFIG_VT) += vt_ioctl.o vc_screen.o consolemap.o consolemap_deftbl.o selection.o keyboard.o obj-$(CONFIG_HW_CONSOLE) += vt.o defkeymap.o diff --git a/drivers/char/eventpoll.c b/drivers/char/eventpoll.c new file mode 100644 index 000000000000..57f2b40019c4 --- /dev/null +++ b/drivers/char/eventpoll.c @@ -0,0 +1,1438 @@ +/* + * drivers/char/eventpoll.c ( Efficent event polling implementation ) + * Copyright (C) 2001,...,2002 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Davide Libenzi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +#define EVENTPOLLFS_MAGIC 0x03111965 /* My birthday should work for this :) */ + +#define DEBUG_EPOLL 0 + +#if DEBUG_EPOLL > 0 +#define DPRINTK(x) printk x +#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0) +#else /* #if DEBUG_EPOLL > 0 */ +#define DPRINTK(x) (void) 0 +#define DNPRINTK(n, x) (void) 0 +#endif /* #if DEBUG_EPOLL > 0 */ + +#define DEBUG_DPI 0 + +#if DEBUG_DPI != 0 +#define DPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */) +#else /* #if DEBUG_DPI != 0 */ +#define DPI_SLAB_DEBUG 0 +#endif /* #if DEBUG_DPI != 0 */ + +#define INITIAL_HASH_BITS 7 +#define MAX_HASH_BITS 18 +#define RESIZE_LENGTH 2 + +#define DPI_MEM_ALLOC() (struct epitem *) kmem_cache_alloc(dpi_cache, 
SLAB_KERNEL) +#define DPI_MEM_FREE(p) kmem_cache_free(dpi_cache, p) +#define IS_FILE_EPOLL(f) ((f)->f_op == &eventpoll_fops) + + +/* + * Type used for versioning events snapshots inside the double buffer. + */ +typedef unsigned long long event_version_t; + +/* + * This structure is stored inside the "private_data" member of the file + * structure and rapresent the main data sructure for the eventpoll + * interface. + */ +struct eventpoll { + /* + * Protect the evenpoll interface from sys_epoll_ctl(2), ioctl(EP_POLL) + * and ->write() concurrency. It basically serialize the add/remove/edit + * of items in the interest set. + */ + struct rw_semaphore acsem; + + /* + * Protect the this structure access. When the "acsem" is acquired + * togheter with this one, "acsem" should be acquired first. Or, + * "lock" nests inside "acsem". + */ + rwlock_t lock; + + /* Wait queue used by sys_epoll_wait() and ioctl(EP_POLL) */ + wait_queue_head_t wq; + + /* Wait queue used by file->poll() */ + wait_queue_head_t poll_wait; + + /* This is the hash used to store the "struct epitem" elements */ + struct list_head *hash; + + unsigned int hbits; + unsigned int hmask; + atomic_t hents; + atomic_t resize; + + /* Number of pages currently allocated in each side of the double buffer */ + int numpages; + + /* + * Current page set pointer, switched from "pages0" and "pages1" each time + * ep_poll() returns events to the caller. + */ + char **pages; + + /* Each one of these contains the pages allocated for each side of + * the double buffer. + */ + char *pages0[MAX_EVENTPOLL_PAGES]; + char *pages1[MAX_EVENTPOLL_PAGES]; + + /* + * Variable containing the vma base address where the double buffer + * pages are mapped onto. + */ + unsigned long vmabase; + + /* + * Certain functions cannot be called if the double buffer pages are + * not allocated and if the memory mapping is not in place. This tells + * us that everything is setup to fully use the interface. 
+ */ + atomic_t mmapped; + + /* Number of events currently available inside the current snapshot */ + int eventcnt; + + /* + * Variable storing the current "version" of the snapshot. It is used + * to validate the validity of the current slot pointed by the "index" + * member of a "struct epitem". + */ + event_version_t ver; +}; + +/* + * Each file descriptor added to the eventpoll interface will + * have an entry of this type linked to the hash. + */ +struct epitem { + /* List header used to link this structure to the eventpoll hash */ + struct list_head llink; + + /* The "container" of this item */ + struct eventpoll *ep; + + /* The file this item refers to */ + struct file *file; + + /* The structure that describe the interested events and the source fd */ + struct pollfd pfd; + + /* + * The index inside the current double buffer that stores the active + * event slot for this item ( file ). + */ + int index; + + /* + * The version that is used to validate if the current slot is still + * valid or if it refers to an old snapshot. It is matches togheter + * with the one inside the eventpoll structure. 
+ */ + event_version_t ver; +}; + + + + +static int ep_getfd(int *efd, struct inode **einode, struct file **efile); +static int ep_alloc_pages(char **pages, int numpages); +static int ep_free_pages(char **pages, int numpages); +static int ep_init(struct eventpoll *ep); +static void ep_free(struct eventpoll *ep); +static struct epitem *ep_find_nl(struct eventpoll *ep, int fd); +static struct epitem *ep_find(struct eventpoll *ep, int fd); +static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags); +static int ep_insert(struct eventpoll *ep, struct pollfd *pfd); +static int ep_remove(struct eventpoll *ep, struct epitem *dpi); +static void notify_proc(struct file *file, void *data, unsigned long *local, + long *event); +static int open_eventpoll(struct inode *inode, struct file *file); +static int close_eventpoll(struct inode *inode, struct file *file); +static unsigned int poll_eventpoll(struct file *file, poll_table *wait); +static int write_eventpoll(struct file *file, const char *buffer, size_t count, + loff_t *ppos); +static int ep_poll(struct eventpoll *ep, struct evpoll *dvp); +static int ep_do_alloc_pages(struct eventpoll *ep, int numpages); +static int ioctl_eventpoll(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +static void eventpoll_mm_open(struct vm_area_struct * vma); +static void eventpoll_mm_close(struct vm_area_struct * vma); +static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma); +static int eventpollfs_delete_dentry(struct dentry *dentry); +static struct inode *get_eventpoll_inode(void); +static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type, + int flags, char *dev_name, void *data); + + + +/* Slab cache used to allocate "struct epitem" */ +static kmem_cache_t *dpi_cache; + +/* Virtual fs used to allocate inodes for eventpoll files */ +static struct vfsmount *eventpoll_mnt; + +/* File callbacks that implement the eventpoll file behaviour */ +static struct 
file_operations eventpoll_fops = { + .write = write_eventpoll, + .ioctl = ioctl_eventpoll, + .mmap = mmap_eventpoll, + .open = open_eventpoll, + .release = close_eventpoll, + .poll = poll_eventpoll +}; + +/* Memory mapping callbacks for the eventpoll file */ +static struct vm_operations_struct eventpoll_mmap_ops = { + .open = eventpoll_mm_open, + .close = eventpoll_mm_close, +}; + +/* + * The "struct miscdevice" is used to register the eventpoll device + * to make it suitable to be openend from a /dev file. + */ +static struct miscdevice eventpoll_miscdev = { + EVENTPOLL_MINOR, "eventpoll", &eventpoll_fops +}; + +/* + * This is used to register the virtual file system from where + * eventpoll inodes are allocated. + */ +static struct file_system_type eventpoll_fs_type = { + .name = "eventpollfs", + .get_sb = eventpollfs_get_sb, + .kill_sb = kill_anon_super, +}; + +/* Very basic directory entry operations for the eventpoll virtual file system */ +static struct dentry_operations eventpollfs_dentry_operations = { + .d_delete = eventpollfs_delete_dentry, +}; + + + +/* + * It opens an eventpoll file descriptor by allocating space for "maxfds" + * file descriptors. It is the kernel part of the userspace epoll_create(2). + */ +asmlinkage int sys_epoll_create(int maxfds) +{ + int error = -EINVAL, fd; + unsigned long addr; + struct inode *inode; + struct file *file; + struct eventpoll *ep; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", + current, maxfds)); + + /* + * It is not possible to store more than MAX_FDS_IN_EVENTPOLL file + * descriptors inside the eventpoll interface. + */ + if (maxfds > MAX_FDS_IN_EVENTPOLL) + goto eexit_1; + + /* + * Creates all the items needed to setup an eventpoll file. That is, + * a file structure, and inode and a free file descriptor. + */ + error = ep_getfd(&fd, &inode, &file); + if (error) + goto eexit_1; + + /* + * Calls the code to initialize the eventpoll file. 
This code is + * the same as the "open" file operation callback because inside + * ep_getfd() we did what the kernel usually does before invoking + * corresponding file "open" callback. + */ + error = open_eventpoll(inode, file); + if (error) + goto eexit_2; + + /* The "private_data" member is setup by open_eventpoll() */ + ep = file->private_data; + + /* Alloc pages for the event double buffer */ + error = ep_do_alloc_pages(ep, EP_FDS_PAGES(maxfds + 1)); + if (error) + goto eexit_2; + + /* + * Create a user space mapping of the event double buffer to + * avoid kernel to user space memory copy when returning events + * to the caller. + */ + down_write(¤t->mm->mmap_sem); + addr = do_mmap_pgoff(file, 0, EP_MAP_SIZE(maxfds + 1), PROT_READ, + MAP_PRIVATE, 0); + up_write(¤t->mm->mmap_sem); + error = PTR_ERR((void *) addr); + if (IS_ERR((void *) addr)) + goto eexit_2; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", + current, maxfds, fd)); + + return fd; + +eexit_2: + sys_close(fd); +eexit_1: + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", + current, maxfds, error)); + return error; +} + + +/* + * The following function implement the controller interface for the eventpoll + * file that enable the insertion/removal/change of file descriptors inside + * the interest set. It rapresents the kernel part of the user spcae epoll_ctl(2). + */ +asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events) +{ + int error = -EBADF; + struct file *file; + struct eventpoll *ep; + struct epitem *dpi; + struct pollfd pfd; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u)\n", + current, epfd, op, fd, events)); + + file = fget(epfd); + if (!file) + goto eexit_1; + + /* + * We have to check that the file structure underneath the file descriptor + * the user passed to us _is_ an eventpoll file. 
+ */ + error = -EINVAL; + if (!IS_FILE_EPOLL(file)) + goto eexit_2; + + /* + * At this point it is safe to assume that the "private_data" contains + * our own data structure. + */ + ep = file->private_data; + + down_write(&ep->acsem); + + pfd.fd = fd; + pfd.events = events | POLLERR | POLLHUP; + pfd.revents = 0; + + dpi = ep_find(ep, fd); + + error = -EINVAL; + switch (op) { + case EP_CTL_ADD: + if (!dpi) + error = ep_insert(ep, &pfd); + else + error = -EEXIST; + break; + case EP_CTL_DEL: + if (dpi) + error = ep_remove(ep, dpi); + else + error = -ENOENT; + break; + case EP_CTL_MOD: + if (dpi) { + dpi->pfd.events = events; + error = 0; + } else + error = -ENOENT; + break; + } + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %u) = %d\n", + current, epfd, op, fd, events, error)); + + up_write(&ep->acsem); + +eexit_2: + fput(file); +eexit_1: + return error; +} + + +/* + * Implement the event wait interface for the eventpoll file. It is the kernel + * part of the user space epoll_wait(2). + */ +asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout) +{ + int error = -EBADF; + void *eaddr; + struct file *file; + struct eventpoll *ep; + struct evpoll dvp; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d)\n", + current, epfd, events, timeout)); + + file = fget(epfd); + if (!file) + goto eexit_1; + + /* + * We have to check that the file structure underneath the file descriptor + * the user passed to us _is_ an eventpoll file. + */ + error = -EINVAL; + if (!IS_FILE_EPOLL(file)) + goto eexit_2; + + /* + * At this point it is safe to assume that the "private_data" contains + * our own data structure. + */ + ep = file->private_data; + + /* + * It is possible that the user created an eventpoll file by open()ing + * the corresponding /dev/ file and he did not perform the correct + * initialization required by the old /dev/epoll interface. This test + * protect us from this scenario. 
+ */ + error = -EINVAL; + if (!atomic_read(&ep->mmapped)) + goto eexit_2; + + dvp.ep_timeout = timeout; + error = ep_poll(ep, &dvp); + if (error > 0) { + eaddr = (void *) (ep->vmabase + dvp.ep_resoff); + if (copy_to_user(events, &eaddr, sizeof(struct pollfd *))) + error = -EFAULT; + } + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d) = %d\n", + current, epfd, events, timeout, error)); + +eexit_2: + fput(file); +eexit_1: + return error; +} + + +/* + * Creates the file descriptor to be used by the epoll interface. + */ +static int ep_getfd(int *efd, struct inode **einode, struct file **efile) +{ + struct qstr this; + char name[32]; + struct dentry *dentry; + struct inode *inode; + struct file *file; + int error, fd; + + /* Get an ready to use file */ + error = -ENFILE; + file = get_empty_filp(); + if (!file) + goto eexit_1; + + /* Allocates an inode from the eventpoll file system */ + inode = get_eventpoll_inode(); + error = PTR_ERR(inode); + if (IS_ERR(inode)) + goto eexit_2; + + /* Allocates a free descriptor to plug the file onto */ + error = get_unused_fd(); + if (error < 0) + goto eexit_3; + fd = error; + + /* + * Link the inode to a directory entry by creating a unique name + * using the inode number. + */ + error = -ENOMEM; + sprintf(name, "[%lu]", inode->i_ino); + this.name = name; + this.len = strlen(name); + this.hash = inode->i_ino; + dentry = d_alloc(eventpoll_mnt->mnt_sb->s_root, &this); + if (!dentry) + goto eexit_4; + dentry->d_op = &eventpollfs_dentry_operations; + d_add(dentry, inode); + file->f_vfsmnt = mntget(eventpoll_mnt); + file->f_dentry = dget(dentry); + + /* + * Initialize the file as read/write because it could be used + * with write() to add/remove/change interest sets. + */ + file->f_pos = 0; + file->f_flags = O_RDWR; + file->f_op = &eventpoll_fops; + file->f_mode = FMODE_READ | FMODE_WRITE; + file->f_version = 0; + file->private_data = NULL; + + /* Install the new setup file into the allocated fd. 
*/ + fd_install(fd, file); + + *efd = fd; + *einode = inode; + *efile = file; + return 0; + +eexit_4: + put_unused_fd(fd); +eexit_3: + iput(inode); +eexit_2: + put_filp(file); +eexit_1: + return error; +} + + +static int ep_alloc_pages(char **pages, int numpages) +{ + int ii; + + for (ii = 0; ii < numpages; ii++) { + pages[ii] = (char *) __get_free_pages(GFP_KERNEL, 0); + if (!pages[ii]) { + for (--ii; ii >= 0; ii--) { + ClearPageReserved(virt_to_page(pages[ii])); + free_pages((unsigned long) pages[ii], 0); + } + return -ENOMEM; + } + SetPageReserved(virt_to_page(pages[ii])); + } + return 0; +} + + +static int ep_free_pages(char **pages, int numpages) +{ + int ii; + + for (ii = 0; ii < numpages; ii++) { + ClearPageReserved(virt_to_page(pages[ii])); + free_pages((unsigned long) pages[ii], 0); + } + return 0; +} + + +static int ep_init(struct eventpoll *ep) +{ + int ii, hentries; + + init_rwsem(&ep->acsem); + rwlock_init(&ep->lock); + init_waitqueue_head(&ep->wq); + init_waitqueue_head(&ep->poll_wait); + ep->hbits = INITIAL_HASH_BITS; + ep->hmask = (1 << ep->hbits) - 1; + atomic_set(&ep->hents, 0); + atomic_set(&ep->resize, 0); + atomic_set(&ep->mmapped, 0); + ep->numpages = 0; + ep->vmabase = 0; + ep->pages = ep->pages0; + ep->eventcnt = 0; + ep->ver = 1; + + hentries = ep->hmask + 1; + if (!(ep->hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) + return -ENOMEM; + + for (ii = 0; ii < hentries; ii++) + INIT_LIST_HEAD(&ep->hash[ii]); + + return 0; +} + + +static void ep_free(struct eventpoll *ep) +{ + int ii; + struct list_head *lsthead; + + /* + * Walks through the whole hash by unregistering file callbacks and + * freeing each "struct epitem". 
+ */ + for (ii = 0; ii <= ep->hmask; ii++) { + lsthead = &ep->hash[ii]; + while (!list_empty(lsthead)) { + struct epitem *dpi = list_entry(lsthead->next, struct epitem, llink); + + file_notify_delcb(dpi->file, notify_proc); + list_del(lsthead->next); + DPI_MEM_FREE(dpi); + } + } + /* + * At this point we can free the hash and the pages used for the event + * double buffer. The ep_free() function is called from the "close" + * file operations callback, and this garanties us that the pages are + * already unmapped. + */ + vfree(ep->hash); + if (ep->numpages > 0) { + ep_free_pages(ep->pages0, ep->numpages); + ep_free_pages(ep->pages1, ep->numpages); + } +} + + +/* + * No lock version of ep_find(), used when the code had to acquire the lock + * before calling the function. + */ +static struct epitem *ep_find_nl(struct eventpoll *ep, int fd) +{ + struct epitem *dpi = NULL; + struct list_head *lsthead, *lnk; + + lsthead = &ep->hash[fd & ep->hmask]; + list_for_each(lnk, lsthead) { + dpi = list_entry(lnk, struct epitem, llink); + + if (dpi->pfd.fd == fd) break; + dpi = NULL; + } + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%d) -> %p\n", + current, fd, dpi)); + + return dpi; +} + + +static struct epitem *ep_find(struct eventpoll *ep, int fd) +{ + struct epitem *dpi; + unsigned long flags; + + read_lock_irqsave(&ep->lock, flags); + + dpi = ep_find_nl(ep, fd); + + read_unlock_irqrestore(&ep->lock, flags); + + return dpi; +} + + +static int ep_hashresize(struct eventpoll *ep, unsigned long *kflags) +{ + struct list_head *hash, *oldhash; + unsigned int hbits = ep->hbits + 1; + unsigned int hmask = (1 << hbits) - 1; + int ii, res, hentries = hmask + 1; + unsigned long flags = *kflags; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_hashresize(%p) bits=%u\n", + current, ep, hbits)); + + write_unlock_irqrestore(&ep->lock, flags); + + res = -ENOMEM; + if (!(hash = (struct list_head *) vmalloc(hentries * sizeof(struct list_head)))) { + write_lock_irqsave(&ep->lock, flags); + 
goto eexit_1; + } + + for (ii = 0; ii < hentries; ii++) + INIT_LIST_HEAD(&hash[ii]); + + write_lock_irqsave(&ep->lock, flags); + + oldhash = ep->hash; + for (ii = 0; ii <= ep->hmask; ii++) { + struct list_head *oldhead = &oldhash[ii], *lnk; + + while (!list_empty(oldhead)) { + struct epitem *dpi = list_entry(lnk = oldhead->next, struct epitem, llink); + + list_del(lnk); + list_add(lnk, &hash[dpi->pfd.fd & hmask]); + } + } + + ep->hash = hash; + ep->hbits = hbits; + ep->hmask = hmask; + + write_unlock_irqrestore(&ep->lock, flags); + vfree(oldhash); + write_lock_irqsave(&ep->lock, flags); + + res = 0; +eexit_1: + *kflags = flags; + atomic_dec(&ep->resize); + return res; +} + + +static int ep_insert(struct eventpoll *ep, struct pollfd *pfd) +{ + int error; + struct epitem *dpi; + struct file *file; + unsigned long flags; + + if (atomic_read(&ep->hents) >= (ep->numpages * POLLFD_X_PAGE)) + return -E2BIG; + + file = fget(pfd->fd); + if (!file) + return -EBADF; + + error = -ENOMEM; + if (!(dpi = DPI_MEM_ALLOC())) + goto eexit_1; + + INIT_LIST_HEAD(&dpi->llink); + dpi->ep = ep; + dpi->file = file; + dpi->pfd = *pfd; + dpi->index = -1; + dpi->ver = ep->ver - 1; + + write_lock_irqsave(&ep->lock, flags); + + list_add(&dpi->llink, &ep->hash[pfd->fd & ep->hmask]); + atomic_inc(&ep->hents); + + if (!atomic_read(&ep->resize) && + (atomic_read(&ep->hents) >> ep->hbits) > RESIZE_LENGTH && + ep->hbits < MAX_HASH_BITS) { + atomic_inc(&ep->resize); + ep_hashresize(ep, &flags); + } + + write_unlock_irqrestore(&ep->lock, flags); + + file_notify_addcb(file, notify_proc, dpi); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %d)\n", + current, ep, pfd->fd)); + + error = 0; +eexit_1: + fput(file); + + return error; +} + + +/* + * Removes a "struct epitem" from the eventpoll hash and deallocates + * all the associated resources. 
+ */ +static int ep_remove(struct eventpoll *ep, struct epitem *dpi) +{ + unsigned long flags; + struct pollfd *pfd, *lpfd; + struct epitem *ldpi; + + /* First, removes the callback from the file callback list */ + file_notify_delcb(dpi->file, notify_proc); + + write_lock_irqsave(&ep->lock, flags); + + list_del(&dpi->llink); + atomic_dec(&ep->hents); + + /* + * This is to remove stale events. We don't want that the removed file + * has a pending event that might be associated with a file inserted + * at a later time inside the eventpoll interface. this code checks + * if the currently removed file has a valid pending event and, if it does, + * manages things to remove it and decrement the currently available + * event count. + */ + if (dpi->index >= 0 && dpi->ver == ep->ver && dpi->index < ep->eventcnt) { + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + if (pfd->fd == dpi->pfd.fd && dpi->index < --ep->eventcnt) { + lpfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(ep->eventcnt)] + + EVENT_PAGE_OFFSET(ep->eventcnt)); + *pfd = *lpfd; + + if ((ldpi = ep_find_nl(ep, pfd->fd))) ldpi->index = dpi->index; + } + } + + write_unlock_irqrestore(&ep->lock, flags); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %d)\n", + current, ep, dpi->pfd.fd)); + + /* At this point it is safe to free the eventpoll item */ + DPI_MEM_FREE(dpi); + + return 0; +} + + +/* + * This is the event notify callback that is called from fs/fcblist.c because + * of the registration ( file_notify_addcb() ) done in ep_insert(). 
+ */ +static void notify_proc(struct file *file, void *data, unsigned long *local, + long *event) +{ + struct epitem *dpi = data; + struct eventpoll *ep = dpi->ep; + struct pollfd *pfd; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: notify(%p, %p, %ld, %ld) ep=%p\n", + current, file, data, event[0], event[1], ep)); + + /* + * We don't need to disable IRQs here because the callback dispatch + * routine inside fs/fcblist.c already call us with disabled IRQ. + */ + write_lock(&ep->lock); + + /* We're not expecting any of those events. Jump out soon ... */ + if (!(dpi->pfd.events & event[1])) + goto out; + + /* + * This logic determins if an active even slot is available for the + * currently signaled file, or if we have to make space for a new one + * and increment the number of ready file descriptors ( ep->eventcnt ). + */ + if (dpi->index < 0 || dpi->ver != ep->ver) { + if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE)) + goto out; + dpi->index = ep->eventcnt++; + dpi->ver = ep->ver; + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + *pfd = dpi->pfd; + } else { + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + if (pfd->fd != dpi->pfd.fd) { + if (ep->eventcnt >= (ep->numpages * POLLFD_X_PAGE)) + goto out; + dpi->index = ep->eventcnt++; + pfd = (struct pollfd *) (ep->pages[EVENT_PAGE_INDEX(dpi->index)] + + EVENT_PAGE_OFFSET(dpi->index)); + *pfd = dpi->pfd; + } + } + + /* + * Merge event bits into the corresponding event slot inside the + * double buffer. + */ + pfd->revents |= (pfd->events & event[1]); + + /* + * Wake up ( if active ) both the eventpoll wait list and the ->poll() + * wait list. 
+ */ + if (waitqueue_active(&ep->wq)) + wake_up(&ep->wq); + if (waitqueue_active(&ep->poll_wait)) + wake_up(&ep->poll_wait); +out: + write_unlock(&ep->lock); +} + + +static int open_eventpoll(struct inode *inode, struct file *file) +{ + int res; + struct eventpoll *ep; + + if (!(ep = kmalloc(sizeof(struct eventpoll), GFP_KERNEL))) + return -ENOMEM; + + memset(ep, 0, sizeof(*ep)); + if ((res = ep_init(ep))) { + kfree(ep); + return res; + } + + file->private_data = ep; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: open() ep=%p\n", current, ep)); + return 0; +} + + +static int close_eventpoll(struct inode *inode, struct file *file) +{ + struct eventpoll *ep = file->private_data; + + if (ep) { + ep_free(ep); + kfree(ep); + } + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep)); + return 0; +} + + +static unsigned int poll_eventpoll(struct file *file, poll_table *wait) +{ + struct eventpoll *ep = file->private_data; + + poll_wait(file, &ep->poll_wait, wait); + if (ep->eventcnt) + return POLLIN | POLLRDNORM; + + return 0; +} + + +static int write_eventpoll(struct file *file, const char *buffer, size_t count, + loff_t *ppos) +{ + int rcount; + struct eventpoll *ep = file->private_data; + struct epitem *dpi; + struct pollfd pfd; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d)\n", current, ep, count)); + + /* The size of the write must be a multiple of sizeof(struct pollfd) */ + rcount = -EINVAL; + if (count % sizeof(struct pollfd)) + goto eexit_1; + + /* + * And we have also to verify that that area is correctly accessible + * for the user. 
+ */ + if ((rcount = verify_area(VERIFY_READ, buffer, count))) + goto eexit_1; + + down_write(&ep->acsem); + + rcount = 0; + + while (count > 0) { + if (__copy_from_user(&pfd, buffer, sizeof(pfd))) { + rcount = -EFAULT; + goto eexit_2; + } + + dpi = ep_find(ep, pfd.fd); + + if (pfd.fd >= current->files->max_fds || !current->files->fd[pfd.fd]) + pfd.events = POLLREMOVE; + if (pfd.events & POLLREMOVE) { + if (dpi) { + ep_remove(ep, dpi); + rcount += sizeof(pfd); + } + } + else if (dpi) { + dpi->pfd.events = pfd.events; + rcount += sizeof(pfd); + } else { + pfd.revents = 0; + if (!ep_insert(ep, &pfd)) + rcount += sizeof(pfd); + } + + buffer += sizeof(pfd); + count -= sizeof(pfd); + } + +eexit_2: + up_write(&ep->acsem); +eexit_1: + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: write(%p, %d) = %d\n", + current, ep, count, rcount)); + + return rcount; +} + + +static int ep_poll(struct eventpoll *ep, struct evpoll *dvp) +{ + int res = 0; + long timeout; + unsigned long flags; + wait_queue_t wait; + + /* + * We don't want ep_poll() to be called if the correct sequence + * of operations are performed to initialize it. This won't happen + * for the system call interface but it could happen using the + * old /dev/epoll interface, that is maintained for compatibility. + */ + if (!atomic_read(&ep->mmapped)) + return -EINVAL; + + write_lock_irqsave(&ep->lock, flags); + + res = 0; + if (!ep->eventcnt) { + /* + * We don't have any available event to return to the caller. + * We need to sleep here, and we will be wake up by + * notify_proc() when events will become available. + */ + init_waitqueue_entry(&wait, current); + add_wait_queue(&ep->wq, &wait); + + /* + * Calculate the timeout by checking for the "infinite" value ( -1 ) + * and the overflow condition ( > MAX_SCHEDULE_TIMEOUT / HZ ). The + * passed timeout is in milliseconds, that why (t * HZ) / 1000. + */ + timeout = dvp->ep_timeout == -1 || dvp->ep_timeout > MAX_SCHEDULE_TIMEOUT / HZ ? 
+ MAX_SCHEDULE_TIMEOUT: (dvp->ep_timeout * HZ) / 1000; + + for (;;) { + /* + * We don't want to sleep if the notify_proc() sends us + * a wakeup in between. That's why we set the task state + * to TASK_INTERRUPTIBLE before doing the checks. + */ + set_current_state(TASK_INTERRUPTIBLE); + if (ep->eventcnt || !timeout) + break; + if (signal_pending(current)) { + res = -EINTR; + break; + } + + write_unlock_irqrestore(&ep->lock, flags); + timeout = schedule_timeout(timeout); + write_lock_irqsave(&ep->lock, flags); + } + remove_wait_queue(&ep->wq, &wait); + + set_current_state(TASK_RUNNING); + } + + /* + * If we've been wake up because of events became available, we need to: + * + * 1) null the number of available ready file descriptors + * 2) increment the version of the current ( next ) snapshot + * 3) swap the double buffer to return the current one to the caller + * 4) set the current ( for the user, previous for the interface ) offset + */ + if (!res && ep->eventcnt) { + res = ep->eventcnt; + ep->eventcnt = 0; + ++ep->ver; + if (ep->pages == ep->pages0) { + ep->pages = ep->pages1; + dvp->ep_resoff = 0; + } else { + ep->pages = ep->pages0; + dvp->ep_resoff = ep->numpages * PAGE_SIZE; + } + } + + write_unlock_irqrestore(&ep->lock, flags); + + return res; +} + + +static int ep_do_alloc_pages(struct eventpoll *ep, int numpages) +{ + int res, pgalloc, pgcpy; + unsigned long flags; + char **pages, **pages0, **pages1; + + if (atomic_read(&ep->mmapped)) + return -EBUSY; + if (numpages > MAX_EVENTPOLL_PAGES) + return -EINVAL; + + pgalloc = numpages - ep->numpages; + if ((pages = (char **) vmalloc(2 * (pgalloc + 1) * sizeof(char *))) == NULL) + return -ENOMEM; + pages0 = &pages[0]; + pages1 = &pages[pgalloc + 1]; + + if ((res = ep_alloc_pages(pages0, pgalloc))) + goto eexit_1; + + if ((res = ep_alloc_pages(pages1, pgalloc))) { + ep_free_pages(pages0, pgalloc); + goto eexit_1; + } + + write_lock_irqsave(&ep->lock, flags); + pgcpy = (ep->numpages + pgalloc) > numpages ? 
numpages - ep->numpages: pgalloc; + if (pgcpy > 0) { + memcpy(&ep->pages0[ep->numpages], pages0, pgcpy * sizeof(char *)); + memcpy(&ep->pages1[ep->numpages], pages1, pgcpy * sizeof(char *)); + ep->numpages += pgcpy; + } + write_unlock_irqrestore(&ep->lock, flags); + + if (pgcpy < pgalloc) { + if (pgcpy < 0) + pgcpy = 0; + ep_free_pages(&pages0[pgcpy], pgalloc - pgcpy); + ep_free_pages(&pages1[pgcpy], pgalloc - pgcpy); + } + +eexit_1: + vfree(pages); + return res; +} + + +static int ioctl_eventpoll(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int res; + struct eventpoll *ep = file->private_data; + struct epitem *dpi; + unsigned long flags; + struct pollfd pfd; + struct evpoll dvp; + + switch (cmd) { + case EP_ALLOC: + res = ep_do_alloc_pages(ep, EP_FDS_PAGES(arg)); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ALLOC, %lu) == %d\n", + current, ep, arg, res)); + return res; + + case EP_FREE: + if (atomic_read(&ep->mmapped)) + return -EBUSY; + + res = -EINVAL; + write_lock_irqsave(&ep->lock, flags); + if (ep->numpages > 0) { + ep_free_pages(ep->pages0, ep->numpages); + ep_free_pages(ep->pages1, ep->numpages); + ep->numpages = 0; + ep->pages = ep->pages0; + res = 0; + } + write_unlock_irqrestore(&ep->lock, flags); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_FREE) == %d\n", + current, ep, res)); + return res; + + case EP_POLL: + if (copy_from_user(&dvp, (void *) arg, sizeof(struct evpoll))) + return -EFAULT; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d)\n", + current, ep, dvp.ep_timeout)); + + res = ep_poll(ep, &dvp); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_POLL, %d) == %d\n", + current, ep, dvp.ep_timeout, res)); + + if (res > 0 && copy_to_user((void *) arg, &dvp, sizeof(struct evpoll))) + res = -EFAULT; + + return res; + + case EP_ISPOLLED: + if (copy_from_user(&pfd, (void *) arg, sizeof(struct pollfd))) + return 0; + + read_lock_irqsave(&ep->lock, flags); + + res = 0; 
+ if (!(dpi = ep_find_nl(ep, pfd.fd))) + goto is_not_polled; + + pfd = dpi->pfd; + res = 1; + + is_not_polled: + read_unlock_irqrestore(&ep->lock, flags); + + if (res) + copy_to_user((void *) arg, &pfd, sizeof(struct pollfd)); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ioctl(%p, EP_ISPOLLED, %d) == %d\n", + current, ep, pfd.fd, res)); + return res; + } + + return -EINVAL; +} + + +static void eventpoll_mm_open(struct vm_area_struct * vma) +{ + struct file *file = vma->vm_file; + struct eventpoll *ep = file->private_data; + + if (ep) atomic_inc(&ep->mmapped); + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_open(%p)\n", current, ep)); +} + + +static void eventpoll_mm_close(struct vm_area_struct * vma) +{ + struct file *file = vma->vm_file; + struct eventpoll *ep = file->private_data; + + if (ep && atomic_dec_and_test(&ep->mmapped)) + ep->vmabase = 0; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mm_close(%p)\n", current, ep)); +} + + +static int mmap_eventpoll(struct file *file, struct vm_area_struct *vma) +{ + struct eventpoll *ep = file->private_data; + unsigned long start; + int ii, res, numpages; + size_t mapsize; + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx)\n", + current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT)); + + /* + * We need the eventpoll file to be RW but we don't want it to be + * mapped RW. This test perform the test and reject RW mmaping. + */ + if (vma->vm_flags & VM_WRITE) + return -EACCES; + + if ((vma->vm_pgoff << PAGE_SHIFT) != 0) + return -EINVAL; + + /* + * We need to verify that the mapped area covers all the allocated + * double buffer. + */ + mapsize = PAGE_ALIGN(vma->vm_end - vma->vm_start); + numpages = mapsize >> PAGE_SHIFT; + + res = -EINVAL; + if (numpages != (2 * ep->numpages)) + goto eexit_1; + + /* + * Map the double buffer starting from "vma->vm_start" up to + * "vma->vm_start + ep->numpages * PAGE_SIZE". 
+ */ + start = vma->vm_start; + for (ii = 0; ii < ep->numpages; ii++) { + if ((res = remap_page_range(vma, start, __pa(ep->pages0[ii]), + PAGE_SIZE, vma->vm_page_prot))) + goto eexit_1; + start += PAGE_SIZE; + } + for (ii = 0; ii < ep->numpages; ii++) { + if ((res = remap_page_range(vma, start, __pa(ep->pages1[ii]), + PAGE_SIZE, vma->vm_page_prot))) + goto eexit_1; + start += PAGE_SIZE; + } + vma->vm_ops = &eventpoll_mmap_ops; + + /* Saves the base mapping address for later use in sys_epoll_wait(2) */ + ep->vmabase = vma->vm_start; + + /* + * Ok, mapping has been done. We can open the door to functions that + * requires the mapping to be in place. + */ + atomic_set(&ep->mmapped, 1); + + res = 0; +eexit_1: + + DNPRINTK(3, (KERN_INFO "[%p] eventpoll: mmap(%p, %lx, %lx) == %d\n", + current, ep, vma->vm_start, vma->vm_pgoff << PAGE_SHIFT, res)); + return res; +} + + +static int eventpollfs_delete_dentry(struct dentry *dentry) +{ + + return 1; +} + + +static struct inode *get_eventpoll_inode(void) +{ + int error = -ENOMEM; + struct inode *inode = new_inode(eventpoll_mnt->mnt_sb); + + if (!inode) + goto eexit_1; + + inode->i_fop = &eventpoll_fops; + + /* + * Mark the inode dirty from the very beginning, + * that way it will never be moved to the dirty + * list because "mark_inode_dirty()" will think + * that it already _is_ on the dirty list. 
+ */ + inode->i_state = I_DIRTY; + inode->i_mode = S_IRUSR | S_IWUSR; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_blksize = PAGE_SIZE; + return inode; + +eexit_1: + return ERR_PTR(error); +} + + +static struct super_block *eventpollfs_get_sb(struct file_system_type *fs_type, + int flags, char *dev_name, void *data) +{ + + return get_sb_pseudo(fs_type, "eventpoll:", NULL, EVENTPOLLFS_MAGIC); +} + + +static int __init eventpoll_init(void) +{ + int error; + + /* Allocates slab cache used to allocate "struct epitem" items */ + error = -ENOMEM; + dpi_cache = kmem_cache_create("eventpoll", + sizeof(struct epitem), + __alignof__(struct epitem), + DPI_SLAB_DEBUG, NULL, NULL); + if (!dpi_cache) + goto eexit_1; + + /* + * Register the virtual file system that will be the source of inodes + * for the eventpoll files + */ + error = register_filesystem(&eventpoll_fs_type); + if (error) + goto eexit_2; + + /* Mount the above commented virtual file system */ + eventpoll_mnt = kern_mount(&eventpoll_fs_type); + error = PTR_ERR(eventpoll_mnt); + if (IS_ERR(eventpoll_mnt)) + goto eexit_3; + + /* + * This is to maintain compatibility with the old /dev/epoll interface. + * We need to register a misc device so that the caller can open(2) it + * through a file inside /dev. 
+ */ + error = misc_register(&eventpoll_miscdev); + if (error) + goto eexit_4; + + printk(KERN_INFO "[%p] eventpoll: driver installed.\n", current); + + return error; + +eexit_4: + mntput(eventpoll_mnt); +eexit_3: + unregister_filesystem(&eventpoll_fs_type); +eexit_2: + kmem_cache_destroy(dpi_cache); +eexit_1: + + return error; +} + + +static void __exit eventpoll_exit(void) +{ + /* Undo all operations done inside eventpoll_init() */ + unregister_filesystem(&eventpoll_fs_type); + mntput(eventpoll_mnt); + misc_deregister(&eventpoll_miscdev); + kmem_cache_destroy(dpi_cache); +} + +module_init(eventpoll_init); +module_exit(eventpoll_exit); + +MODULE_LICENSE("GPL"); + diff --git a/fs/Makefile b/fs/Makefile index 68d7074b5ae2..d52ba79eb0df 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -6,14 +6,14 @@ # export-objs := open.o dcache.o buffer.o bio.o inode.o dquot.o mpage.o aio.o \ - fcntl.o read_write.o dcookies.o + fcntl.o read_write.o dcookies.o fcblist.o obj-y := open.o read_write.o devices.o file_table.o buffer.o \ bio.o super.o block_dev.o char_dev.o stat.o exec.o pipe.o \ namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \ dcache.o inode.o attr.o bad_inode.o file.o dnotify.o \ filesystems.o namespace.o seq_file.o xattr.o libfs.o \ - fs-writeback.o mpage.o direct-io.o aio.o + fs-writeback.o mpage.o direct-io.o aio.o fcblist.o ifneq ($(CONFIG_NFSD),n) ifneq ($(CONFIG_NFSD),) diff --git a/fs/fcblist.c b/fs/fcblist.c new file mode 100644 index 000000000000..d1e823b42ba8 --- /dev/null +++ b/fs/fcblist.c @@ -0,0 +1,146 @@ +/* + * linux/fs/fcblist.c ( File event callbacks handling ) + * Copyright (C) 2001,...,2002 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Davide Libenzi + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +long ion_band_table[NSIGPOLL] = { + ION_IN, /* POLL_IN */ + ION_OUT, /* POLL_OUT */ + ION_IN, /* POLL_MSG */ + ION_ERR, /* POLL_ERR */ + 0, /* POLL_PRI */ + ION_HUP /* POLL_HUP */ +}; + +long poll_band_table[NSIGPOLL] = { + POLLIN | POLLRDNORM, /* POLL_IN */ + POLLOUT | POLLWRNORM | POLLWRBAND, /* POLL_OUT */ + POLLIN | POLLRDNORM | POLLMSG, /* POLL_MSG */ + POLLERR, /* POLL_ERR */ + POLLPRI | POLLRDBAND, /* POLL_PRI */ + POLLHUP | POLLERR /* POLL_HUP */ +}; + + + +/* + * Walk through the file callback list by calling each registered callback + * with the event that happened on the "filep" file. Callbacks are called + * by holding a read lock on the callback list lock, and also by keeping + * local IRQs disabled. + */ +void file_notify_event(struct file *filep, long *event) +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + read_lock_irqsave(&filep->f_cblock, flags); + + lsthead = &filep->f_cblist; + list_for_each(lnk, lsthead) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + fcbp->cbproc(filep, fcbp->data, fcbp->local, event); + } + + read_unlock_irqrestore(&filep->f_cblock, flags); +} + + +/* + * Add a new callback to the list of file callbacks. + */ +int file_notify_addcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *), + void *data) +{ + unsigned long flags; + struct fcb_struct *fcbp; + + if (!(fcbp = (struct fcb_struct *) kmalloc(sizeof(struct fcb_struct), GFP_KERNEL))) + return -ENOMEM; + + memset(fcbp, 0, sizeof(struct fcb_struct)); + fcbp->cbproc = cbproc; + fcbp->data = data; + + write_lock_irqsave(&filep->f_cblock, flags); + list_add_tail(&fcbp->llink, &filep->f_cblist); + write_unlock_irqrestore(&filep->f_cblock, flags); + + return 0; +} + + +/* + * Removes the callback "cbproc" from the file callback list. 
+ */ +int file_notify_delcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *)) +{ + unsigned long flags; + struct list_head *lnk, *lsthead; + + write_lock_irqsave(&filep->f_cblock, flags); + + lsthead = &filep->f_cblist; + list_for_each(lnk, lsthead) { + struct fcb_struct *fcbp = list_entry(lnk, struct fcb_struct, llink); + + if (fcbp->cbproc == cbproc) { + list_del(lnk); + write_unlock_irqrestore(&filep->f_cblock, flags); + kfree(fcbp); + return 0; + } + } + + write_unlock_irqrestore(&filep->f_cblock, flags); + + return -ENOENT; +} + + +/* + * It is called at file cleanup time and removes all the registered callbacks. + */ +void file_notify_cleanup(struct file *filep) +{ + unsigned long flags; + struct list_head *lsthead; + + write_lock_irqsave(&filep->f_cblock, flags); + + lsthead = &filep->f_cblist; + while (!list_empty(lsthead)) { + struct fcb_struct *fcbp = list_entry(lsthead->next, struct fcb_struct, llink); + + list_del(lsthead->next); + write_unlock_irqrestore(&filep->f_cblock, flags); + kfree(fcbp); + write_lock_irqsave(&filep->f_cblock, flags); + } + + write_unlock_irqrestore(&filep->f_cblock, flags); +} + diff --git a/fs/file_table.c b/fs/file_table.c index fe6c048c2bab..7a6285b72b4b 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,7 @@ struct file * get_empty_filp(void) f->f_gid = current->fsgid; f->f_owner.lock = RW_LOCK_UNLOCKED; list_add(&f->f_list, &anon_list); + file_notify_init(f); file_list_unlock(); return f; } @@ -102,6 +104,7 @@ int init_private_file(struct file *filp, struct dentry *dentry, int mode) filp->f_uid = current->fsuid; filp->f_gid = current->fsgid; filp->f_op = dentry->d_inode->i_fop; + file_notify_init(filp); if (filp->f_op->open) return filp->f_op->open(dentry->d_inode, filp); else @@ -123,6 +126,7 @@ void __fput(struct file * file) struct vfsmount * mnt = file->f_vfsmnt; struct inode * inode = 
dentry->d_inode; + file_notify_cleanup(file); locks_remove_flock(file); if (file->f_op && file->f_op->release) diff --git a/fs/pipe.c b/fs/pipe.c index 30d14b9dcd42..572ef04527d3 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -47,7 +48,7 @@ static ssize_t pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - int do_wakeup; + int do_wakeup, pfull; ssize_t ret; /* pread is not allowed on pipes. */ @@ -63,6 +64,7 @@ pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) down(PIPE_SEM(*inode)); for (;;) { int size = PIPE_LEN(*inode); + pfull = PIPE_FULL(*inode); if (size) { char *pipebuf = PIPE_BASE(*inode) + PIPE_START(*inode); ssize_t chars = PIPE_MAX_RCHUNK(*inode); @@ -108,12 +110,18 @@ pipe_read(struct file *filp, char *buf, size_t count, loff_t *ppos) if (!ret) ret = -ERESTARTSYS; break; } + /* Send notification message */ + if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode)) + file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND); if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } pipe_wait(inode); } + /* Send notification message */ + if (pfull && !PIPE_FULL(*inode) && PIPE_WRITEFILE(*inode)) + file_send_notify(PIPE_WRITEFILE(*inode), ION_OUT, POLLOUT | POLLWRNORM | POLLWRBAND); up(PIPE_SEM(*inode)); /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { @@ -131,7 +139,7 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) struct inode *inode = filp->f_dentry->d_inode; ssize_t ret; size_t min; - int do_wakeup; + int do_wakeup, pempty; /* pwrite is not allowed on pipes. 
*/ if (unlikely(ppos != &filp->f_pos)) @@ -149,6 +157,7 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) down(PIPE_SEM(*inode)); for (;;) { int free; + pempty = PIPE_EMPTY(*inode); if (!PIPE_READERS(*inode)) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; @@ -194,6 +203,9 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) if (!ret) ret = -ERESTARTSYS; break; } + /* Send notification message */ + if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode)) + file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM); if (do_wakeup) { wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); @@ -203,6 +215,9 @@ pipe_write(struct file *filp, const char *buf, size_t count, loff_t *ppos) pipe_wait(inode); PIPE_WAITING_WRITERS(*inode)--; } + /* Send notification message */ + if (pempty && !PIPE_EMPTY(*inode) && PIPE_READFILE(*inode)) + file_send_notify(PIPE_READFILE(*inode), ION_IN, POLLIN | POLLRDNORM); up(PIPE_SEM(*inode)); if (do_wakeup) { wake_up_interruptible(PIPE_WAIT(*inode)); @@ -266,9 +281,22 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { + struct file *rdfile, *wrfile; down(PIPE_SEM(*inode)); PIPE_READERS(*inode) -= decr; PIPE_WRITERS(*inode) -= decw; + rdfile = PIPE_READFILE(*inode); + wrfile = PIPE_WRITEFILE(*inode); + if (decr && !PIPE_READERS(*inode)) { + PIPE_READFILE(*inode) = NULL; + if (wrfile) + file_send_notify(wrfile, ION_HUP, POLLHUP); + } + if (decw && !PIPE_WRITERS(*inode)) { + PIPE_WRITEFILE(*inode) = NULL; + if (rdfile) + file_send_notify(rdfile, ION_HUP, POLLHUP); + } if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { struct pipe_inode_info *info = inode->i_pipe; inode->i_pipe = NULL; @@ -488,6 +516,7 @@ struct inode* pipe_new(struct inode* inode) PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 0; PIPE_WAITING_WRITERS(*inode) = 0; PIPE_RCOUNTER(*inode) = 
PIPE_WCOUNTER(*inode) = 1; + PIPE_READFILE(*inode) = PIPE_WRITEFILE(*inode) = NULL; *PIPE_FASYNC_READERS(*inode) = *PIPE_FASYNC_WRITERS(*inode) = NULL; return inode; @@ -596,6 +625,9 @@ int do_pipe(int *fd) f2->f_mode = 2; f2->f_version = 0; + PIPE_READFILE(*inode) = f1; + PIPE_WRITEFILE(*inode) = f2; + fd_install(i, f1); fd_install(j, f2); fd[0] = i; diff --git a/include/asm-i386/poll.h b/include/asm-i386/poll.h index e5feda71b356..aecc80a15d36 100644 --- a/include/asm-i386/poll.h +++ b/include/asm-i386/poll.h @@ -15,6 +15,7 @@ #define POLLWRNORM 0x0100 #define POLLWRBAND 0x0200 #define POLLMSG 0x0400 +#define POLLREMOVE 0x1000 struct pollfd { int fd; diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index 159dfa7fefe1..902054f38279 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -258,6 +258,9 @@ #define __NR_free_hugepages 251 #define __NR_exit_group 252 #define __NR_lookup_dcookie 253 +#define __NR_sys_epoll_create 254 +#define __NR_sys_epoll_ctl 255 +#define __NR_sys_epoll_wait 256 /* user-visible error numbers are in the range -1 - -124: see */ diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h new file mode 100644 index 000000000000..c028d5b7576b --- /dev/null +++ b/include/linux/eventpoll.h @@ -0,0 +1,51 @@ +/* + * include/linux/eventpoll.h ( Efficent event polling implementation ) + * Copyright (C) 2001,...,2002 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Davide Libenzi + * + */ + +#ifndef _LINUX_EVENTPOLL_H +#define _LINUX_EVENTPOLL_H + + +#define EVENTPOLL_MINOR 124 +#define POLLFD_X_PAGE (PAGE_SIZE / sizeof(struct pollfd)) +#define MAX_FDS_IN_EVENTPOLL (1024 * 128) +#define MAX_EVENTPOLL_PAGES (MAX_FDS_IN_EVENTPOLL / POLLFD_X_PAGE) +#define EVENT_PAGE_INDEX(n) ((n) / POLLFD_X_PAGE) +#define EVENT_PAGE_REM(n) ((n) % POLLFD_X_PAGE) +#define EVENT_PAGE_OFFSET(n) (((n) % POLLFD_X_PAGE) * sizeof(struct pollfd)) +#define EP_FDS_PAGES(n) (((n) + POLLFD_X_PAGE - 1) / POLLFD_X_PAGE) +#define EP_MAP_SIZE(n) (EP_FDS_PAGES(n) * PAGE_SIZE * 2) + + +struct evpoll { + int ep_timeout; + unsigned long ep_resoff; +}; + +#define EP_ALLOC _IOR('P', 1, int) +#define EP_POLL _IOWR('P', 2, struct evpoll) +#define EP_FREE _IO('P', 3) +#define EP_ISPOLLED _IOWR('P', 4, struct pollfd) + +#define EP_CTL_ADD 1 +#define EP_CTL_DEL 2 +#define EP_CTL_MOD 3 + + +asmlinkage int sys_epoll_create(int maxfds); +asmlinkage int sys_epoll_ctl(int epfd, int op, int fd, unsigned int events); +asmlinkage int sys_epoll_wait(int epfd, struct pollfd const **events, int timeout); + + + +#endif + diff --git a/include/linux/fcblist.h b/include/linux/fcblist.h new file mode 100644 index 000000000000..85be93ae40fd --- /dev/null +++ b/include/linux/fcblist.h @@ -0,0 +1,71 @@ +/* + * include/linux/fcblist.h ( File event callbacks handling ) + * Copyright (C) 2001,...,2002 Davide Libenzi + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Davide Libenzi + * + */ + +#ifndef __LINUX_FCBLIST_H +#define __LINUX_FCBLIST_H + +#include +#include +#include +#include +#include + + + +/* file callback notification events */ +#define ION_IN 1 +#define ION_OUT 2 +#define ION_HUP 3 +#define ION_ERR 4 + +#define FCB_LOCAL_SIZE 4 + + +struct fcb_struct { + struct list_head llink; + void (*cbproc)(struct file *, void *, unsigned long *, long *); + void *data; + unsigned long local[FCB_LOCAL_SIZE]; +}; + + +extern long ion_band_table[]; +extern long poll_band_table[]; + + +void file_notify_event(struct file *filep, long *event); + +int file_notify_addcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *), + void *data); + +int file_notify_delcb(struct file *filep, + void (*cbproc)(struct file *, void *, unsigned long *, long *)); + +void file_notify_cleanup(struct file *filep); + + +static inline void file_notify_init(struct file *filep) +{ + rwlock_init(&filep->f_cblock); + INIT_LIST_HEAD(&filep->f_cblist); +} + +static inline void file_send_notify(struct file *filep, long ioevt, long plevt) +{ + long event[] = { ioevt, plevt, -1 }; + + file_notify_event(filep, event); +} + +#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index c2e39a247227..906f221f4b26 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -506,6 +506,10 @@ struct file { /* needed for tty driver, and maybe others */ void *private_data; + + /* file callback list */ + rwlock_t f_cblock; + struct list_head f_cblist; }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 407c0e0b3e84..bdf0a3686916 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -12,6 +12,8 @@ struct pipe_inode_info { unsigned int waiting_writers; unsigned int r_counter; unsigned int w_counter; + struct file *rdfile; + struct file *wrfile; struct fasync_struct *fasync_readers; struct fasync_struct 
*fasync_writers; }; @@ -30,6 +32,8 @@ struct pipe_inode_info { #define PIPE_WAITING_WRITERS(inode) ((inode).i_pipe->waiting_writers) #define PIPE_RCOUNTER(inode) ((inode).i_pipe->r_counter) #define PIPE_WCOUNTER(inode) ((inode).i_pipe->w_counter) +#define PIPE_READFILE(inode) ((inode).i_pipe->rdfile) +#define PIPE_WRITEFILE(inode) ((inode).i_pipe->wrfile) #define PIPE_FASYNC_READERS(inode) (&((inode).i_pipe->fasync_readers)) #define PIPE_FASYNC_WRITERS(inode) (&((inode).i_pipe->fasync_writers)) diff --git a/include/linux/sys.h b/include/linux/sys.h index dcd3256684cf..95b431dbebff 100644 --- a/include/linux/sys.h +++ b/include/linux/sys.h @@ -4,7 +4,7 @@ /* * system call entry points ... but not all are defined */ -#define NR_syscalls 256 +#define NR_syscalls 260 /* * These are system calls that will be removed at some time diff --git a/include/net/sock.h b/include/net/sock.h index d2790e2ca00a..4a4094b93d07 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -52,6 +52,9 @@ #include #include #include +#include +#include +#include /* * This structure really needs to be cleaned up. 
@@ -766,8 +769,13 @@ static inline unsigned long sock_wspace(struct sock *sk) static inline void sk_wake_async(struct sock *sk, int how, int band) { - if (sk->socket && sk->socket->fasync_list) - sock_wake_async(sk->socket, how, band); + if (sk->socket) { + if (sk->socket->file) + file_send_notify(sk->socket->file, ion_band_table[band - POLL_IN], + poll_band_table[band - POLL_IN]); + if (sk->socket->fasync_list) + sock_wake_async(sk->socket, how, band); + } } #define SOCK_MIN_SNDBUF 2048 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e7c122e30969..c737b6f94414 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -476,8 +476,8 @@ void tcp_write_space(struct sock *sk) if (sk->sleep && waitqueue_active(sk->sleep)) wake_up_interruptible(sk->sleep); - if (sock->fasync_list && !(sk->shutdown & SEND_SHUTDOWN)) - sock_wake_async(sock, 2, POLL_OUT); + if (!(sk->shutdown & SEND_SHUTDOWN)) + sk_wake_async(sk, 2, POLL_OUT); } } -- cgit v1.2.3 From af4d0bf616ac9b79fa52911edb4da9a1ef574550 Mon Sep 17 00:00:00 2001 From: Stelian Pop Date: Tue, 29 Oct 2002 22:59:36 -0800 Subject: [PATCH] sonypi driver update This patch adds some new events to the sonypi driver (Fn key pressed alone, jogdial turned fast or very fast) and cleans up the code a little bit. Thanks to Christian Gennerat for this contribution. 
--- drivers/char/sonypi.c | 16 ++++++++-------- drivers/char/sonypi.h | 28 ++++++++++++++++++++++++++-- include/linux/sonypi.h | 10 ++++++++++ 3 files changed, 44 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/char/sonypi.c b/drivers/char/sonypi.c index 80198eb4dfdc..e7b563ebd399 100644 --- a/drivers/char/sonypi.c +++ b/drivers/char/sonypi.c @@ -1,4 +1,4 @@ -/* +/* * Sony Programmable I/O Control Device driver for VAIO * * Copyright (C) 2001 Stelian Pop , Alcôve @@ -542,7 +542,7 @@ static int sonypi_misc_ioctl(struct inode *ip, struct file *fp, down(&sonypi_device.lock); switch (cmd) { case SONYPI_IOCGBRT: - val8 = sonypi_ecrget(0x96); + val8 = sonypi_ecrget(SONYPI_LCD_LIGHT); if (copy_to_user((u8 *)arg, &val8, sizeof(val8))) { ret = -EFAULT; goto out; @@ -553,38 +553,38 @@ static int sonypi_misc_ioctl(struct inode *ip, struct file *fp, ret = -EFAULT; goto out; } - sonypi_ecrset(0x96, val8); + sonypi_ecrset(SONYPI_LCD_LIGHT, val8); break; case SONYPI_IOCGBAT1CAP: - val16 = sonypi_ecrget16(0xb2); + val16 = sonypi_ecrget16(SONYPI_BAT1_FULL); if (copy_to_user((u16 *)arg, &val16, sizeof(val16))) { ret = -EFAULT; goto out; } break; case SONYPI_IOCGBAT1REM: - val16 = sonypi_ecrget16(0xa2); + val16 = sonypi_ecrget16(SONYPI_BAT1_LEFT); if (copy_to_user((u16 *)arg, &val16, sizeof(val16))) { ret = -EFAULT; goto out; } break; case SONYPI_IOCGBAT2CAP: - val16 = sonypi_ecrget16(0xba); + val16 = sonypi_ecrget16(SONYPI_BAT2_FULL); if (copy_to_user((u16 *)arg, &val16, sizeof(val16))) { ret = -EFAULT; goto out; } break; case SONYPI_IOCGBAT2REM: - val16 = sonypi_ecrget16(0xaa); + val16 = sonypi_ecrget16(SONYPI_BAT2_LEFT); if (copy_to_user((u16 *)arg, &val16, sizeof(val16))) { ret = -EFAULT; goto out; } break; case SONYPI_IOCGBATFLAGS: - val8 = sonypi_ecrget(0x81) & 0x07; + val8 = sonypi_ecrget(SONYPI_BAT_FLAGS) & 0x07; if (copy_to_user((u8 *)arg, &val8, sizeof(val8))) { ret = -EFAULT; goto out; diff --git a/drivers/char/sonypi.h 
b/drivers/char/sonypi.h index 5b64e7c533dd..25acec82eb0f 100644 --- a/drivers/char/sonypi.h +++ b/drivers/char/sonypi.h @@ -35,7 +35,7 @@ #ifdef __KERNEL__ #define SONYPI_DRIVER_MAJORVERSION 1 -#define SONYPI_DRIVER_MINORVERSION 13 +#define SONYPI_DRIVER_MINORVERSION 14 #include #include @@ -54,6 +54,20 @@ #define SONYPI_SHIB 0x9d #define SONYPI_TYPE2_REGION_SIZE 0x20 +/* battery / brightness addresses */ +#define SONYPI_BAT_FLAGS 0x81 +#define SONYPI_LCD_LIGHT 0x96 +#define SONYPI_BAT1_PCTRM 0xa0 +#define SONYPI_BAT1_LEFT 0xa2 +#define SONYPI_BAT1_MAXRT 0xa4 +#define SONYPI_BAT2_PCTRM 0xa8 +#define SONYPI_BAT2_LEFT 0xaa +#define SONYPI_BAT2_MAXRT 0xac +#define SONYPI_BAT1_MAXTK 0xb0 +#define SONYPI_BAT1_FULL 0xb2 +#define SONYPI_BAT2_MAXTK 0xb8 +#define SONYPI_BAT2_FULL 0xba + /* ioports used for brightness and type2 events */ #define SONYPI_DATA_IOPORT 0x62 #define SONYPI_CST_IOPORT 0x66 @@ -156,6 +170,14 @@ static struct sonypi_event sonypi_joggerev[] = { { 0x01, SONYPI_EVENT_JOGDIAL_DOWN }, { 0x5f, SONYPI_EVENT_JOGDIAL_UP_PRESSED }, { 0x41, SONYPI_EVENT_JOGDIAL_DOWN_PRESSED }, + { 0x1e, SONYPI_EVENT_JOGDIAL_FAST_UP }, + { 0x02, SONYPI_EVENT_JOGDIAL_FAST_DOWN }, + { 0x5e, SONYPI_EVENT_JOGDIAL_FAST_UP_PRESSED }, + { 0x42, SONYPI_EVENT_JOGDIAL_FAST_DOWN_PRESSED }, + { 0x1d, SONYPI_EVENT_JOGDIAL_VFAST_UP }, + { 0x03, SONYPI_EVENT_JOGDIAL_VFAST_DOWN }, + { 0x5d, SONYPI_EVENT_JOGDIAL_VFAST_UP_PRESSED }, + { 0x43, SONYPI_EVENT_JOGDIAL_VFAST_DOWN_PRESSED }, { 0x40, SONYPI_EVENT_JOGDIAL_PRESSED }, { 0x00, SONYPI_EVENT_JOGDIAL_RELEASED }, { 0x00, 0x00 } @@ -192,6 +214,7 @@ static struct sonypi_event sonypi_fnkeyev[] = { { 0x33, SONYPI_EVENT_FNKEY_F }, { 0x34, SONYPI_EVENT_FNKEY_S }, { 0x35, SONYPI_EVENT_FNKEY_B }, + { 0x36, SONYPI_EVENT_FNKEY_ONLY }, { 0x00, 0x00 } }; @@ -214,6 +237,7 @@ static struct sonypi_event sonypi_blueev[] = { /* The set of possible back button events */ static struct sonypi_event sonypi_backev[] = { { 0x20, SONYPI_EVENT_BACK_PRESSED }, + { 0x3b, 
SONYPI_EVENT_HELP_PRESSED }, { 0x00, 0x00 } }; @@ -258,7 +282,7 @@ struct sonypi_device { while (--n && (command)) \ udelay(1); \ if (!n && (verbose || !quiet)) \ - printk(KERN_WARNING "sonypi command failed at %s : %s(line %d)\n", __FILE__, __FUNCTION__, __LINE__); \ + printk(KERN_WARNING "sonypi command failed at %s : %s (line %d)\n", __FILE__, __FUNCTION__, __LINE__); \ } #endif /* __KERNEL__ */ diff --git a/include/linux/sonypi.h b/include/linux/sonypi.h index 4a53f5b8852a..8828b98b3029 100644 --- a/include/linux/sonypi.h +++ b/include/linux/sonypi.h @@ -75,6 +75,16 @@ #define SONYPI_EVENT_LID_OPENED 37 #define SONYPI_EVENT_BLUETOOTH_ON 38 #define SONYPI_EVENT_BLUETOOTH_OFF 39 +#define SONYPI_EVENT_HELP_PRESSED 40 +#define SONYPI_EVENT_FNKEY_ONLY 41 +#define SONYPI_EVENT_JOGDIAL_FAST_DOWN 42 +#define SONYPI_EVENT_JOGDIAL_FAST_UP 43 +#define SONYPI_EVENT_JOGDIAL_FAST_DOWN_PRESSED 44 +#define SONYPI_EVENT_JOGDIAL_FAST_UP_PRESSED 45 +#define SONYPI_EVENT_JOGDIAL_VFAST_DOWN 46 +#define SONYPI_EVENT_JOGDIAL_VFAST_UP 47 +#define SONYPI_EVENT_JOGDIAL_VFAST_DOWN_PRESSED 48 +#define SONYPI_EVENT_JOGDIAL_VFAST_UP_PRESSED 49 /* get/set brightness */ #define SONYPI_IOCGBRT _IOR('v', 0, __u8) -- cgit v1.2.3 From 706489d8a6f1187875ebdb86a751b6d77d1911e4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:23:34 -0800 Subject: [PATCH] slab: extended cpu notifiers Patch from Dipankar Sarma This is Manfred's patch which provides a CPU_UP_PREPARE cpu notifier to allow initialization of per_cpu data just before the cpu becomes fully functional. It also provides a facility for the CPU_UP_PREPARE handler to return NOTIFY_BAD to signify that the CPU is not permitted to come up. If that happens, a CPU_UP_CANCELLED message is passed to all the handlers. The patch also fixes a bogus NOFITY_BAD return from the softirq setup code. Patch has been acked by Rusty. 
We need this mechanism in slab for starting per-cpu timers and for allocating the per-cpu slab head arrays *before* the CPU has come up and started using slab. --- include/linux/notifier.h | 6 +++++- kernel/cpu.c | 20 ++++++++++++++++---- kernel/softirq.c | 3 +-- 3 files changed, 22 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 05d2e7968646..f9638ff66bb9 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -60,7 +60,11 @@ extern int notifier_call_chain(struct notifier_block **n, unsigned long val, voi #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */ -#define CPU_ONLINE 0x0002 /* CPU (unsigned)v coming up */ +#define CPU_ONLINE 0x0002 /* CPU (unsigned)v is up */ +#define CPU_UP_PREPARE 0x0003 /* CPU (unsigned)v coming up */ +#define CPU_UP_CANCELED 0x0004 /* CPU (unsigned)v NOT coming up */ +#define CPU_OFFLINE 0x0005 /* CPU (unsigned)v offline (still scheduling) */ +#define CPU_DEAD 0x0006 /* CPU (unsigned)v dead */ #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */ diff --git a/kernel/cpu.c b/kernel/cpu.c index a155998dbe3e..4c0ada2b99ae 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -29,26 +29,38 @@ void unregister_cpu_notifier(struct notifier_block *nb) int __devinit cpu_up(unsigned int cpu) { int ret; + void *hcpu = (void *)(long)cpu; - if ((ret = down_interruptible(&cpucontrol)) != 0) + if ((ret = down_interruptible(&cpucontrol)) != 0) return ret; if (cpu_online(cpu)) { ret = -EINVAL; goto out; } + ret = notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); + if (ret == NOTIFY_BAD) { + printk("%s: attempt to bring up CPU %u failed\n", + __FUNCTION__, cpu); + ret = -EINVAL; + goto out_notify; + } /* Arch-specific enabling code. */ ret = __cpu_up(cpu); - if (ret != 0) goto out; + if (ret != 0) + goto out_notify; if (!cpu_online(cpu)) BUG(); /* Now call notifier in preparation. 
*/ printk("CPU %u IS NOW UP!\n", cpu); - notifier_call_chain(&cpu_chain, CPU_ONLINE, (void *)(long)cpu); + notifier_call_chain(&cpu_chain, CPU_ONLINE, hcpu); - out: +out_notify: + if (ret != 0) + notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); +out: up(&cpucontrol); return ret; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 45e8712f9f70..1c0f1c4e39e0 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -316,9 +316,8 @@ static int __devinit cpu_callback(struct notifier_block *nfb, while (!ksoftirqd_task(hotcpu)) yield(); - return NOTIFY_OK; } - return NOTIFY_BAD; + return NOTIFY_OK; } static struct notifier_block cpu_nfb = { &cpu_callback, NULL, 0 }; -- cgit v1.2.3 From 22331dad4687ac6f91428884e2b9a02cb4d8a6df Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:23:44 -0800 Subject: [PATCH] slab: add_timer_on: add a timer on a particular CPU add_timer_on is like add_timer, except it takes a target CPU on which to add the timer. The slab code needs per-cpu timers for shrinking the per-cpu caches. 
--- include/linux/timer.h | 1 + kernel/timer.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/timer.h b/include/linux/timer.h index cfedb5e8bb07..d8ed753c8caa 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -44,6 +44,7 @@ static inline int timer_pending(const struct timer_list * timer) } extern void add_timer(struct timer_list * timer); +extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); extern int mod_timer(struct timer_list *timer, unsigned long expires); diff --git a/kernel/timer.c b/kernel/timer.c index 2d30f7fd0ecb..58c80293060b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -134,6 +134,26 @@ void add_timer(timer_t *timer) put_cpu(); } +/*** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + tvec_base_t *base = tvec_bases+ cpu; + unsigned long flags; + + BUG_ON(timer_pending(timer) || !timer->function); + + spin_lock_irqsave(&base->lock, flags); + internal_add_timer(base, timer); + timer->base = base; + spin_unlock_irqrestore(&base->lock, flags); +} + /*** + * mod_timer - modify a timer's timeout + * @timer: the timer to be modified -- cgit v1.2.3 From c12e16e28b4cf576840cff509caf0c06ff4dc299 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:31:27 -0800 Subject: [PATCH] percpu: convert RCU Patch from Dipankar Sarma This patch converts RCU per_cpu data to use per_cpu data area and makes it safe for cpu_possible allocation by using CPU notifiers. 
--- include/linux/rcupdate.h | 15 ++++++++------- kernel/rcupdate.c | 43 ++++++++++++++++++++++++++++++++++--------- 2 files changed, 42 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index a5ffb7bb5743..e9e2287e1e1c 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -39,6 +39,7 @@ #include #include #include +#include /** * struct rcu_head - callback structure for use with RCU @@ -94,16 +95,16 @@ struct rcu_data { long batch; /* Batch # for current RCU batch */ struct list_head nxtlist; struct list_head curlist; -} ____cacheline_aligned_in_smp; +}; -extern struct rcu_data rcu_data[NR_CPUS]; +DECLARE_PER_CPU(struct rcu_data, rcu_data); extern struct rcu_ctrlblk rcu_ctrlblk; -#define RCU_qsctr(cpu) (rcu_data[(cpu)].qsctr) -#define RCU_last_qsctr(cpu) (rcu_data[(cpu)].last_qsctr) -#define RCU_batch(cpu) (rcu_data[(cpu)].batch) -#define RCU_nxtlist(cpu) (rcu_data[(cpu)].nxtlist) -#define RCU_curlist(cpu) (rcu_data[(cpu)].curlist) +#define RCU_qsctr(cpu) (per_cpu(rcu_data, (cpu)).qsctr) +#define RCU_last_qsctr(cpu) (per_cpu(rcu_data, (cpu)).last_qsctr) +#define RCU_batch(cpu) (per_cpu(rcu_data, (cpu)).batch) +#define RCU_nxtlist(cpu) (per_cpu(rcu_data, (cpu)).nxtlist) +#define RCU_curlist(cpu) (per_cpu(rcu_data, (cpu)).curlist) #define RCU_QSCTR_INVALID 0 diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 1a149dff7832..91483119714c 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -41,13 +41,14 @@ #include #include #include +#include #include /* Definition for rcupdate control block. 
*/ struct rcu_ctrlblk rcu_ctrlblk = { .mutex = SPIN_LOCK_UNLOCKED, .curbatch = 1, .maxbatch = 1, .rcu_cpu_mask = 0 }; -struct rcu_data rcu_data[NR_CPUS] __cacheline_aligned; +DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; /* Fake initialization required by compiler */ static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; @@ -198,6 +199,33 @@ void rcu_check_callbacks(int cpu, int user) tasklet_schedule(&RCU_tasklet(cpu)); } +static void __devinit rcu_online_cpu(int cpu) +{ + memset(&per_cpu(rcu_data, cpu), 0, sizeof(struct rcu_data)); + tasklet_init(&RCU_tasklet(cpu), rcu_process_callbacks, 0UL); + INIT_LIST_HEAD(&RCU_nxtlist(cpu)); + INIT_LIST_HEAD(&RCU_curlist(cpu)); +} + +static int __devinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + rcu_online_cpu(cpu); + break; + /* Space reserved for CPU_OFFLINE :) */ + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata rcu_nb = { + .notifier_call = rcu_cpu_notify, +}; + /* * Initializes rcu mechanism. Assumed to be called early. * That is before local timer(SMP) or jiffie timer (uniproc) is setup. 
@@ -206,16 +234,13 @@ void rcu_check_callbacks(int cpu, int user) */ void __init rcu_init(void) { - int i; - - memset(&rcu_data[0], 0, sizeof(rcu_data)); - for (i = 0; i < NR_CPUS; i++) { - tasklet_init(&RCU_tasklet(i), rcu_process_callbacks, 0UL); - INIT_LIST_HEAD(&RCU_nxtlist(i)); - INIT_LIST_HEAD(&RCU_curlist(i)); - } + rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + /* Register notifier for non-boot CPUs */ + register_cpu_notifier(&rcu_nb); } + /* Because of FASTCALL declaration of complete, we use this wrapper */ static void wakeme_after_rcu(void *completion) { -- cgit v1.2.3 From afce7191a73f632a138f5511cbe245d39c526331 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:32:18 -0800 Subject: [PATCH] percpu: convert global page accounting Convert global page state accounting to use per-cpu storage (I think this code remains a little buggy, btw. Note how I do per_cpu(page_states, cpu).member += (delta); This gets done at interrupt time and hence is assuming that the "+=" operation on a ulong is atomic wrt interrupts on all architectures. How do we feel about that assumption?) 
--- include/linux/gfp.h | 2 ++ include/linux/page-flags.h | 10 +++++++--- init/main.c | 2 ++ mm/page_alloc.c | 37 ++++++++++++++++++++++++++++++++++--- 4 files changed, 45 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 939f16910233..c340b447a963 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -86,4 +86,6 @@ extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) +void page_alloc_init(void); + #endif /* __LINUX_GFP_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5c770f49787a..282902bb9816 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -5,6 +5,8 @@ #ifndef PAGE_FLAGS_H #define PAGE_FLAGS_H +#include + /* * Various page->flags bits: * @@ -73,7 +75,7 @@ * Global page accounting. One instance per CPU. Only unsigned longs are * allowed. 
*/ -extern struct page_state { +struct page_state { unsigned long nr_dirty; unsigned long nr_writeback; unsigned long nr_pagecache; @@ -103,7 +105,9 @@ extern struct page_state { unsigned long kswapd_steal; unsigned long pageoutrun; unsigned long allocstall; -} ____cacheline_aligned_in_smp page_states[NR_CPUS]; +}; + +DECLARE_PER_CPU(struct page_state, page_states); extern void get_page_state(struct page_state *ret); extern void get_full_page_state(struct page_state *ret); @@ -111,7 +115,7 @@ extern void get_full_page_state(struct page_state *ret); #define mod_page_state(member, delta) \ do { \ int cpu = get_cpu(); \ - page_states[cpu].member += (delta); \ + per_cpu(page_states, cpu).member += (delta); \ put_cpu(); \ } while (0) diff --git a/init/main.c b/init/main.c index 97d88c50366b..d75bf0acb1eb 100644 --- a/init/main.c +++ b/init/main.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -388,6 +389,7 @@ asmlinkage void __init start_kernel(void) setup_arch(&command_line); setup_per_cpu_areas(); build_all_zonelists(); + page_alloc_init(); printk("Kernel command line: %s\n", saved_command_line); parse_options(command_line); trap_init(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 848b1ed8f001..0e7425d56652 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -25,6 +25,7 @@ #include #include #include +#include struct pglist_data *pgdat_list; unsigned long totalram_pages; @@ -573,8 +574,8 @@ unsigned int nr_free_highpages (void) * The result is unavoidably approximate - it can change * during and after execution of this function. 
*/ -struct page_state page_states[NR_CPUS] __cacheline_aligned; -EXPORT_SYMBOL(page_states); +DEFINE_PER_CPU(struct page_state, page_states) = {0}; +EXPORT_PER_CPU_SYMBOL(page_states); void __get_page_state(struct page_state *ret, int nr) { @@ -587,7 +588,7 @@ void __get_page_state(struct page_state *ret, int nr) if (!cpu_online(cpu)) continue; - in = (unsigned long *)(page_states + cpu); + in = (unsigned long *)&per_cpu(page_states, cpu); out = (unsigned long *)ret; for (off = 0; off < nr; off++) *out++ += *in++; @@ -1197,3 +1198,33 @@ struct seq_operations vmstat_op = { }; #endif /* CONFIG_PROC_FS */ + +static void __devinit init_page_alloc_cpu(int cpu) +{ + struct page_state *ps = &per_cpu(page_states, cpu); + memset(ps, 0, sizeof(*ps)); +} + +static int __devinit page_alloc_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + switch(action) { + case CPU_UP_PREPARE: + init_page_alloc_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata page_alloc_nb = { + .notifier_call = page_alloc_cpu_notify, +}; + +void __init page_alloc_init(void) +{ + init_page_alloc_cpu(smp_processor_id()); + register_cpu_notifier(&page_alloc_nb); +} -- cgit v1.2.3 From 1d2652dd2c3e942e75dc3137b3cb1774b43ae377 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:35:44 -0800 Subject: [PATCH] hot-n-cold pages: bulk page freeing Patch from Martin Bligh. Implements __free_pages_bulk(). Release multiple pages of a given order into the buddy all within a single acquisition of the zone lock. This also removes current->local_pages. The per-task list of pages which only ever contained one page. To prevent other tasks from stealing pages which this task has just freed up. Given that we're freeing into the per-cpu caches, and that those are multipage caches, and the cpu-stickiness of the scheduler, I think current->local_pages is no longer needed. 
--- include/linux/sched.h | 22 +++---- kernel/fork.c | 2 - mm/page_alloc.c | 156 ++++++++++++++++++++------------------------------ 3 files changed, 70 insertions(+), 110 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index b5e63d8ade25..65f9799aa896 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -293,9 +293,6 @@ struct task_struct { struct list_head ptrace_list; struct mm_struct *mm, *active_mm; - struct list_head local_pages; - - unsigned int allocation_order, nr_local_pages; /* task state */ struct linux_binfmt *binfmt; @@ -411,16 +408,15 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_SIGNALED 0x00000400 /* killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ -#define PF_FREE_PAGES 0x00002000 /* per process page freeing */ -#define PF_FLUSHER 0x00004000 /* responsible for disk writeback */ -#define PF_NOWARN 0x00008000 /* debug: don't warn if alloc fails */ - -#define PF_FREEZE 0x00010000 /* this task should be frozen for suspend */ -#define PF_IOTHREAD 0x00020000 /* this thread is needed for doing I/O to swap */ -#define PF_FROZEN 0x00040000 /* frozen for system suspend */ -#define PF_SYNC 0x00080000 /* performing fsync(), etc */ -#define PF_FSTRANS 0x00100000 /* inside a filesystem transaction */ -#define PF_KSWAPD 0x00200000 /* I am kswapd */ +#define PF_FLUSHER 0x00002000 /* responsible for disk writeback */ +#define PF_NOWARN 0x00004000 /* debug: don't warn if alloc fails */ + +#define PF_FREEZE 0x00008000 /* this task should be frozen for suspend */ +#define PF_IOTHREAD 0x00010000 /* this thread is needed for doing I/O to swap */ +#define PF_FROZEN 0x00020000 /* frozen for system suspend */ +#define PF_SYNC 0x00040000 /* performing fsync(), etc */ +#define PF_FSTRANS 0x00080000 /* inside a filesystem transaction */ +#define PF_KSWAPD 0x00100000 /* I am 
kswapd */ /* * Ptrace flags diff --git a/kernel/fork.c b/kernel/fork.c index 2f5f00301182..4a33d682dfaa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -769,8 +769,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->start_time = jiffies; p->security = NULL; - INIT_LIST_HEAD(&p->local_pages); - retval = -ENOMEM; if (security_ops->task_alloc_security(p)) goto bad_fork_cleanup; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d4fca60114ea..dd35f4d7ac49 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -91,49 +91,17 @@ static void bad_page(const char *function, struct page *page) * -- wli */ -void __free_pages_ok (struct page *page, unsigned int order) +static inline void __free_pages_bulk (struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) { - unsigned long index, page_idx, mask, flags; - struct free_area *area; - struct page *base; - struct zone *zone; - - mod_page_state(pgfree, 1<mapping != NULL || - page_count(page) != 0 || - (page->flags & ( - 1 << PG_lru | - 1 << PG_private | - 1 << PG_locked | - 1 << PG_active | - 1 << PG_writeback ))) - bad_page(__FUNCTION__, page); - - if (PageDirty(page)) - ClearPageDirty(page); + unsigned long page_idx, index; - if (unlikely(current->flags & PF_FREE_PAGES)) { - if (!current->nr_local_pages && !in_interrupt()) { - list_add(&page->list, ¤t->local_pages); - page->index = order; - current->nr_local_pages++; - goto out; - } - } - - zone = page_zone(page); - - mask = (~0UL) << order; - base = zone->zone_mem_map; page_idx = page - base; if (page_idx & ~mask) BUG(); index = page_idx >> (1 + order); - area = zone->free_area + order; - spin_lock_irqsave(&zone->lock, flags); zone->free_pages -= mask; while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -160,9 +128,58 @@ void __free_pages_ok (struct page *page, unsigned int order) page_idx &= mask; } list_add(&(base + page_idx)->list, &area->free_list); +} + 
+static inline void free_pages_check(const char *function, struct page *page) +{ + if ( page_mapped(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | + 1 << PG_private | + 1 << PG_locked | + 1 << PG_active | + 1 << PG_writeback ))) + bad_page(function, page); + if (PageDirty(page)) + ClearPageDirty(page); +} + +/* + * Frees a list of pages. + * Assumes all pages on list are in same zone, and of same order. + * count is the number of pages to free, or 0 for all on the list. + */ +static void +free_pages_bulk(struct zone *zone, int count, + struct list_head *list, unsigned int order) +{ + unsigned long mask, flags; + struct free_area *area; + struct page *base, *page = NULL; + + mask = (~0UL) << order; + base = zone->zone_mem_map; + area = zone->free_area + order; + spin_lock_irqsave(&zone->lock, flags); + while (!list_empty(list) && count--) { + page = list_entry(list->prev, struct page, list); + /* have to delete it as __free_pages_bulk list manipulates */ + list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + mod_page_state(pgfree, count<lock, flags); -out: - return; +} + +void __free_pages_ok(struct page *page, unsigned int order) +{ + LIST_HEAD(list); + + free_pages_check(__FUNCTION__, page); + list_add(&page->list, &list); + free_pages_bulk(page_zone(page), 1, &list, order); } #define MARK_USED(index, order, area) \ @@ -323,59 +340,6 @@ int is_head_of_free_region(struct page *page) } #endif /* CONFIG_SOFTWARE_SUSPEND */ -static /* inline */ struct page * -balance_classzone(struct zone* classzone, unsigned int gfp_mask, - unsigned int order, int * freed) -{ - struct page * page = NULL; - int __freed = 0; - - BUG_ON(in_interrupt()); - - current->allocation_order = order; - current->flags |= PF_MEMALLOC | PF_FREE_PAGES; - - __freed = try_to_free_pages(classzone, gfp_mask, order); - - current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES); - - if (current->nr_local_pages) { - struct list_head * 
entry, * local_pages; - struct page * tmp; - int nr_pages; - - local_pages = ¤t->local_pages; - - if (likely(__freed)) { - /* pick from the last inserted so we're lifo */ - entry = local_pages->next; - do { - tmp = list_entry(entry, struct page, list); - if (tmp->index == order && memclass(page_zone(tmp), classzone)) { - list_del(entry); - page = tmp; - current->nr_local_pages--; - prep_new_page(page); - break; - } - } while ((entry = entry->next) != local_pages); - } - - nr_pages = current->nr_local_pages; - /* free in reverse order so that the global order will be lifo */ - while ((entry = local_pages->prev) != local_pages) { - list_del(entry); - tmp = list_entry(entry, struct page, list); - __free_pages_ok(tmp, tmp->index); - if (!nr_pages--) - BUG(); - } - current->nr_local_pages = 0; - } - *freed = __freed; - return page; -} - /* * This is the 'heart' of the zoned buddy allocator: */ @@ -386,7 +350,8 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, unsigned long min; struct zone **zones, *classzone; struct page * page; - int freed, i; + int cflags; + int i; if (gfp_mask & __GFP_WAIT) might_sleep(); @@ -463,9 +428,10 @@ nopage: goto nopage; inc_page_state(allocstall); - page = balance_classzone(classzone, gfp_mask, order, &freed); - if (page) - return page; + cflags = current->flags; + current->flags |= PF_MEMALLOC; + try_to_free_pages(classzone, gfp_mask, order); + current->flags = cflags; /* go through the zonelist yet one more time */ min = 1UL << order; -- cgit v1.2.3 From a206231bbe6ffb988cdf9fcbdfd98e49abaf4819 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:35:53 -0800 Subject: [PATCH] hot-n-cold pages: page allocator core Hot/Cold pages and zone->lock amortisation --- include/linux/gfp.h | 7 ++- include/linux/mm.h | 1 - include/linux/mmzone.h | 17 ++++++ mm/page_alloc.c | 160 +++++++++++++++++++++++++++++++++++++++---------- mm/swap.c | 5 +- 5 files changed, 151 insertions(+), 39 deletions(-) (limited to 
'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c340b447a963..8e093813e4f7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -17,6 +17,7 @@ #define __GFP_IO 0x40 /* Can start low memory physical IO? */ #define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? */ #define __GFP_FS 0x100 /* Can call down to low-level FS? */ +#define __GFP_COLD 0x200 /* Cache-cold page required */ #define GFP_NOHIGHIO ( __GFP_WAIT | __GFP_IO) #define GFP_NOIO ( __GFP_WAIT) @@ -32,6 +33,7 @@ #define GFP_DMA __GFP_DMA + /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such @@ -77,11 +79,10 @@ extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); #define __get_dma_pages(gfp_mask, order) \ __get_free_pages((gfp_mask) | GFP_DMA,(order)) -/* - * There is only one 'core' page-freeing function. - */ extern void FASTCALL(__free_pages(struct page *page, unsigned int order)); extern void FASTCALL(free_pages(unsigned long addr, unsigned int order)); +extern void FASTCALL(free_hot_page(struct page *page)); +extern void FASTCALL(free_cold_page(struct page *page)); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr),0) diff --git a/include/linux/mm.h b/include/linux/mm.h index cab2c4342047..d9d2f20732d4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -211,7 +211,6 @@ struct page { #define set_page_count(p,v) atomic_set(&(p)->count, v) extern void FASTCALL(__page_cache_release(struct page *)); -void FASTCALL(__free_pages_ok(struct page *page, unsigned int order)); static inline void put_page(struct page *page) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 10c4ee968020..d80490b1265c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #ifdef CONFIG_DISCONTIGMEM #include @@ -46,6 +47,18 @@ struct zone_padding 
{ #define ZONE_PADDING(name) #endif +struct per_cpu_pages { + int count; /* number of pages in the list */ + int low; /* low watermark, refill needed */ + int high; /* high watermark, emptying needed */ + int batch; /* chunk size for buddy add/remove */ + struct list_head list; /* the list of pages */ +}; + +struct per_cpu_pageset { + struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ +} ____cacheline_aligned_in_smp; + /* * On machines where it is needed (eg PCs) we divide physical memory * into multiple physical zones. On a PC we have 3 zones: @@ -107,6 +120,10 @@ struct zone { unsigned long wait_table_size; unsigned long wait_table_bits; + ZONE_PADDING(_pad3_) + + struct per_cpu_pageset pageset[NR_CPUS]; + /* * Discontig memory support fields. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dd35f4d7ac49..f46471b25586 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -10,6 +10,8 @@ * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 + * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 + * (lots of bits borrowed from Ingo Molnar & Andrew Morton) */ #include @@ -151,13 +153,14 @@ static inline void free_pages_check(const char *function, struct page *page) * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free, or 0 for all on the list. 
*/ -static void +static int free_pages_bulk(struct zone *zone, int count, struct list_head *list, unsigned int order) { unsigned long mask, flags; struct free_area *area; struct page *base, *page = NULL; + int ret = 0; mask = (~0UL) << order; base = zone->zone_mem_map; @@ -169,8 +172,10 @@ free_pages_bulk(struct zone *zone, int count, list_del(&page->list); __free_pages_bulk(page, base, zone, area, mask, order); mod_page_state(pgfree, count<lock, flags); + return ret; } void __free_pages_ok(struct page *page, unsigned int order) @@ -201,14 +206,13 @@ expand(struct zone *zone, struct page *page, index += size; page += size; } - BUG_ON(bad_range(zone, page)); return page; } /* * This page is about to be returned from the page allocator */ -static inline void prep_new_page(struct page *page) +static void prep_new_page(struct page *page) { if ( page->mapping || page_mapped(page) || @@ -248,36 +252,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) continue; page = list_entry(curr, struct page, list); - BUG_ON(bad_range(zone, page)); list_del(curr); index = page - zone->zone_mem_map; if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); zone->free_pages -= 1UL << order; - page = expand(zone, page, index, order, current_order, area); - return page; + return expand(zone, page, index, order, current_order, area); } return NULL; } -/* Obtain a single element from the buddy allocator */ -static struct page *rmqueue(struct zone *zone, unsigned int order) -{ - unsigned long flags; - struct page *page; - - spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); - - if (page != NULL) { - BUG_ON(bad_range(zone, page)); - prep_new_page(page); - } - return page; -} - /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. 
@@ -340,6 +325,72 @@ int is_head_of_free_region(struct page *page) } #endif /* CONFIG_SOFTWARE_SUSPEND */ +/* + * Free a 0-order page + */ +static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); +static void free_hot_cold_page(struct page *page, int cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + unsigned long flags; + + free_pages_check(__FUNCTION__, page); + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + list_add(&page->list, &pcp->list); + pcp->count++; + local_irq_restore(flags); + put_cpu(); +} + +void free_hot_page(struct page *page) +{ + free_hot_cold_page(page, 0); +} + +void free_cold_page(struct page *page) +{ + free_hot_cold_page(page, 1); +} + +static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) +{ + unsigned long flags; + struct page *page = NULL; + + if (order == 0) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[get_cpu()].pcp[cold]; + local_irq_save(flags); + if (pcp->count <= pcp->low) + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, &pcp->list); + if (pcp->count) { + page = list_entry(pcp->list.next, struct page, list); + list_del(&page->list); + pcp->count--; + } + local_irq_restore(flags); + put_cpu(); + } + + if (page == NULL) { + spin_lock_irqsave(&zone->lock, flags); + page = __rmqueue(zone, order); + spin_unlock_irqrestore(&zone->lock, flags); + } + + if (page != NULL) { + BUG_ON(bad_range(zone, page)); + prep_new_page(page); + } + return page; +} + /* * This is the 'heart' of the zoned buddy allocator: */ @@ -349,13 +400,18 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, { unsigned long min; struct zone **zones, *classzone; - struct page * page; + struct page *page; int cflags; int i; + int cold; if (gfp_mask & __GFP_WAIT) might_sleep(); + cold = 0; + if (gfp_mask & __GFP_COLD) + cold = 1; + mod_page_state(pgalloc, 1<zones; /* 
the list of zones suitable for gfp_mask */ @@ -371,7 +427,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, /* the incremental min is allegedly to discourage fallback */ min += z->pages_low; if (z->free_pages > min || z->free_pages >= z->pages_high) { - page = rmqueue(z, order); + page = buffered_rmqueue(z, order, cold); if (page) return page; } @@ -396,7 +452,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, local_min >>= 2; min += local_min; if (z->free_pages > min || z->free_pages >= z->pages_high) { - page = rmqueue(z, order); + page = buffered_rmqueue(z, order, cold); if (page) return page; } @@ -410,7 +466,7 @@ rebalance: for (i = 0; zones[i] != NULL; i++) { struct zone *z = zones[i]; - page = rmqueue(z, order); + page = buffered_rmqueue(z, order, cold); if (page) return page; } @@ -440,7 +496,7 @@ nopage: min += z->pages_min; if (z->free_pages > min || z->free_pages >= z->pages_high) { - page = rmqueue(z, order); + page = buffered_rmqueue(z, order, cold); if (page) return page; } @@ -492,13 +548,17 @@ void __pagevec_free(struct pagevec *pvec) int i = pagevec_count(pvec); while (--i >= 0) - __free_pages_ok(pvec->pages[i], 0); + free_hot_page(pvec->pages[i]); } void __free_pages(struct page *page, unsigned int order) { - if (!PageReserved(page) && put_page_testzero(page)) - __free_pages_ok(page, order); + if (!PageReserved(page) && put_page_testzero(page)) { + if (order == 0) + free_hot_page(page); + else + __free_pages_ok(page, order); + } } void free_pages(unsigned long addr, unsigned int order) @@ -899,7 +959,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long i, j; unsigned long local_offset; const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); - int nid = pgdat->node_id; + int cpu, nid = pgdat->node_id; struct page *lmem_map = pgdat->node_mem_map; unsigned long zone_start_pfn = pgdat->node_start_pfn; @@ -911,13 +971,13 @@ static void __init free_area_init_core(struct pglist_data 
*pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long mask; unsigned long size, realsize; + unsigned long batch; zone_table[nid * MAX_NR_ZONES + j] = zone; realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; - printk(" %s zone: %lu pages\n", zone_names[j], realsize); zone->spanned_pages = size; zone->present_pages = realsize; zone->name = zone_names[j]; @@ -925,6 +985,40 @@ static void __init free_area_init_core(struct pglist_data *pgdat, spin_lock_init(&zone->lru_lock); zone->zone_pgdat = pgdat; zone->free_pages = 0; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pages *pcp; + + pcp = &zone->pageset[cpu].pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + + pcp = &zone->pageset[cpu].pcp[1]; /* cold */ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = 1 * batch; + INIT_LIST_HEAD(&pcp->list); + } + printk(" %s zone: %lu pages, LIFO batch:%lu\n", + zone_names[j], realsize, batch); INIT_LIST_HEAD(&zone->active_list); INIT_LIST_HEAD(&zone->inactive_list); atomic_set(&zone->refill_counter, 0); diff --git a/mm/swap.c b/mm/swap.c index 72f4c9cdd5c4..225e24f1973d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -69,7 +69,8 @@ void lru_add_drain(void) } /* - * This path almost never happens - pages are normally freed via pagevecs. + * This path almost never happens for VM activity - pages are normally + * freed via pagevecs. But it gets used by networking. 
*/ void __page_cache_release(struct page *page) { @@ -83,7 +84,7 @@ void __page_cache_release(struct page *page) page = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); if (page) - __free_pages_ok(page, 0); + free_hot_page(page); } /* -- cgit v1.2.3 From 5019ce29f74a7d7829beb25257f13b5080e2e6d6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:36:03 -0800 Subject: [PATCH] hot-n-cold pages: use cold pages for readahead It is usually the case that pagecache reads use busmastering hardware to transfer the data into pagecache. This invalidates the CPU cache of the pagecache pages. So use cache-cold pages for pagecache reads. To avoid wasting cache-hot pages. --- include/linux/pagemap.h | 5 +++++ mm/filemap.c | 6 +++--- mm/readahead.c | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1fe640eaf601..d2e647ac9623 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -31,6 +31,11 @@ static inline struct page *page_cache_alloc(struct address_space *x) return alloc_pages(x->gfp_mask, 0); } +static inline struct page *page_cache_alloc_cold(struct address_space *x) +{ + return alloc_pages(x->gfp_mask|__GFP_COLD, 0); +} + typedef int filler_t(void *, struct page *); extern struct page * find_get_page(struct address_space *mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 5cc2c080c331..af26d617f922 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -258,7 +258,7 @@ static int page_cache_read(struct file * file, unsigned long offset) struct page *page; int error; - page = page_cache_alloc(mapping); + page = page_cache_alloc_cold(mapping); if (!page) return -ENOMEM; @@ -705,7 +705,7 @@ no_cached_page: * page.. 
*/ if (!cached_page) { - cached_page = page_cache_alloc(mapping); + cached_page = page_cache_alloc_cold(mapping); if (!cached_page) { desc->error = -ENOMEM; break; @@ -1199,7 +1199,7 @@ repeat: page = find_get_page(mapping, index); if (!page) { if (!cached_page) { - cached_page = page_cache_alloc(mapping); + cached_page = page_cache_alloc_cold(mapping); if (!cached_page) return ERR_PTR(-ENOMEM); } diff --git a/mm/readahead.c b/mm/readahead.c index 0ab09031edeb..6174f412e6e8 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -176,7 +176,7 @@ int do_page_cache_readahead(struct address_space *mapping, continue; read_unlock(&mapping->page_lock); - page = page_cache_alloc(mapping); + page = page_cache_alloc_cold(mapping); read_lock(&mapping->page_lock); if (!page) break; -- cgit v1.2.3 From 8d6282a1cf812279f490875cd55cb7a85623ac89 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 29 Oct 2002 23:36:13 -0800 Subject: [PATCH] hot-n-cold pages: free and allocate hints Add a `cold' hint to struct pagevec, and teach truncate and page reclaim to use it. Empirical testing showed that truncate's pages tend to be hot. And page reclaim's are certainly cold. 
--- fs/mpage.c | 4 ++-- include/linux/pagemap.h | 2 +- include/linux/pagevec.h | 10 +++++++++- mm/filemap.c | 2 +- mm/page_alloc.c | 2 +- mm/readahead.c | 2 +- mm/swap.c | 15 ++++++++------- mm/swap_state.c | 2 +- mm/truncate.c | 10 +++++++--- mm/vmscan.c | 6 +++--- 10 files changed, 34 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/fs/mpage.c b/fs/mpage.c index 86302997f797..7d4bc97259ab 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -263,7 +263,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, sector_t last_block_in_bio = 0; struct pagevec lru_pvec; - pagevec_init(&lru_pvec); + pagevec_init(&lru_pvec, 0); for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, list); @@ -560,7 +560,7 @@ mpage_writepages(struct address_space *mapping, if (get_block == NULL) writepage = mapping->a_ops->writepage; - pagevec_init(&pvec); + pagevec_init(&pvec, 0); write_lock(&mapping->page_lock); list_splice_init(&mapping->dirty_pages, &mapping->io_pages); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index d2e647ac9623..04751ceba493 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -24,7 +24,7 @@ #define page_cache_get(page) get_page(page) #define page_cache_release(page) put_page(page) -void release_pages(struct page **pages, int nr); +void release_pages(struct page **pages, int nr, int cold); static inline struct page *page_cache_alloc(struct address_space *x) { diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 0207270b0fe7..d149e0688b1e 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -12,6 +12,7 @@ struct address_space; struct pagevec { unsigned nr; + int cold; struct page *pages[PAGEVEC_SIZE]; }; @@ -25,7 +26,13 @@ void pagevec_strip(struct pagevec *pvec); unsigned int pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, pgoff_t start, unsigned int nr_pages); -static inline void 
pagevec_init(struct pagevec *pvec) +static inline void pagevec_init(struct pagevec *pvec, int cold) +{ + pvec->nr = 0; + pvec->cold = cold; +} + +static inline void pagevec_reinit(struct pagevec *pvec) { pvec->nr = 0; } @@ -49,6 +56,7 @@ static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page) return pagevec_space(pvec); } + static inline void pagevec_release(struct pagevec *pvec) { if (pagevec_count(pvec)) diff --git a/mm/filemap.c b/mm/filemap.c index af26d617f922..24f05d430231 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1449,7 +1449,7 @@ generic_file_write_nolock(struct file *file, const struct iovec *iov, /* We can write back this queue in page reclaim */ current->backing_dev_info = mapping->backing_dev_info; - pagevec_init(&lru_pvec); + pagevec_init(&lru_pvec, 0); if (unlikely(file->f_error)) { err = file->f_error; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f46471b25586..7d5459c91f2e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -548,7 +548,7 @@ void __pagevec_free(struct pagevec *pvec) int i = pagevec_count(pvec); while (--i >= 0) - free_hot_page(pvec->pages[i]); + free_hot_cold_page(pvec->pages[i], pvec->cold); } void __free_pages(struct page *page, unsigned int order) diff --git a/mm/readahead.c b/mm/readahead.c index 6174f412e6e8..61285671c793 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -49,7 +49,7 @@ read_pages(struct address_space *mapping, struct file *filp, unsigned page_idx; struct pagevec lru_pvec; - pagevec_init(&lru_pvec); + pagevec_init(&lru_pvec, 0); if (mapping->a_ops->readpages) return mapping->a_ops->readpages(mapping, pages, nr_pages); diff --git a/mm/swap.c b/mm/swap.c index 225e24f1973d..713acb01b143 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -99,13 +99,13 @@ void __page_cache_release(struct page *page) * page count inside the lock to see whether shrink_cache grabbed the page * via the LRU. If it did, give up: shrink_cache will free it. 
*/ -void release_pages(struct page **pages, int nr) +void release_pages(struct page **pages, int nr, int cold) { int i; struct pagevec pages_to_free; struct zone *zone = NULL; - pagevec_init(&pages_to_free); + pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { struct page *page = pages[i]; struct zone *pagezone; @@ -126,7 +126,7 @@ void release_pages(struct page **pages, int nr) if (!pagevec_add(&pages_to_free, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_free(&pages_to_free); - pagevec_init(&pages_to_free); + pagevec_reinit(&pages_to_free); zone = NULL; /* No lock is held */ } } @@ -139,8 +139,8 @@ void release_pages(struct page **pages, int nr) void __pagevec_release(struct pagevec *pvec) { - release_pages(pvec->pages, pagevec_count(pvec)); - pagevec_init(pvec); + release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); + pagevec_reinit(pvec); } /* @@ -153,7 +153,8 @@ void __pagevec_release_nonlru(struct pagevec *pvec) int i; struct pagevec pages_to_free; - pagevec_init(&pages_to_free); + pagevec_init(&pages_to_free, pvec->cold); + pages_to_free.cold = pvec->cold; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -162,7 +163,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec) pagevec_add(&pages_to_free, page); } pagevec_free(&pages_to_free); - pagevec_init(pvec); + pagevec_reinit(pvec); } /* diff --git a/mm/swap_state.c b/mm/swap_state.c index ba75cf456fcd..19de7f4073f4 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -301,7 +301,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) for (i = 0; i < todo; i++) free_swap_cache(pagep[i]); - release_pages(pagep, todo); + release_pages(pagep, todo, 0); pagep += todo; nr -= todo; } diff --git a/mm/truncate.c b/mm/truncate.c index cdf08bd59f41..884b4e3930c2 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -100,6 +100,10 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) * When looking at page->index outside the page 
lock we need to be careful to * copy it into a local to avoid races (it could change at any time). * + * We pass down the cache-hot hint to the page freeing code. Even if the + * mapping is large, it is probably the case that the final pages are the most + * recently touched, and freeing happens in ascending file offset order. + * * Called under (and serialised by) inode->i_sem. */ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) @@ -110,7 +114,7 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) pgoff_t next; int i; - pagevec_init(&pvec); + pagevec_init(&pvec, 0); next = start; while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { @@ -185,7 +189,7 @@ void invalidate_inode_pages(struct address_space *mapping) pgoff_t next = 0; int i; - pagevec_init(&pvec); + pagevec_init(&pvec, 0); while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -226,7 +230,7 @@ void invalidate_inode_pages2(struct address_space *mapping) pgoff_t next = 0; int i; - pagevec_init(&pvec); + pagevec_init(&pvec, 0); while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; diff --git a/mm/vmscan.c b/mm/vmscan.c index 5d80882c01e6..8119afe12ddd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -203,7 +203,7 @@ shrink_list(struct list_head *page_list, unsigned int gfp_mask, int pgactivate = 0; int ret = 0; - pagevec_init(&freed_pvec); + pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { struct page *page; int may_enter_fs; @@ -433,7 +433,7 @@ shrink_cache(const int nr_pages, struct zone *zone, if (nr_to_process < SWAP_CLUSTER_MAX) nr_to_process = SWAP_CLUSTER_MAX; - pagevec_init(&pvec); + pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); @@ -617,7 +617,7 @@ refill_inactive_zone(struct zone *zone, const 
int nr_pages_in, pgdeactivate++; } - pagevec_init(&pvec); + pagevec_init(&pvec, 1); spin_lock_irq(&zone->lru_lock); while (!list_empty(&l_inactive)) { page = list_entry(l_inactive.prev, struct page, lru); -- cgit v1.2.3 From d571b4837480e5370e90b140ca623b355385da9b Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Wed, 30 Oct 2002 00:04:30 -0800 Subject: [PATCH] md: factor out MD superblock handling code Define an interface for interpreting and updating superblocks so we can more easily define new formats. With this patch, (almost) all superblock layout information is locating in a small set of routines dedicated to superblock handling. This will allow us to provide a similar set for a different format. The two exceptions are: 1/ autostart_array where the devices listed in the superblock are searched for. 2/ raid5 'knows' the maximum number of devices for compute_parity. These will be addressed in a later patch. --- drivers/md/md.c | 710 ++++++++++++++++++++++------------------------ drivers/md/multipath.c | 6 +- include/linux/raid/md_k.h | 4 +- 3 files changed, 352 insertions(+), 368 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/md.c b/drivers/md/md.c index f9f71514541b..40b0d5f5e2d1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -307,8 +307,6 @@ static int alloc_disk_sb(mdk_rdev_t * rdev) printk(OUT_OF_MEM); return -EINVAL; } - rdev->sb = (mdp_super_t *) page_address(rdev->sb_page); - clear_page(rdev->sb); return 0; } @@ -317,7 +315,7 @@ static void free_disk_sb(mdk_rdev_t * rdev) { if (rdev->sb_page) { page_cache_release(rdev->sb_page); - rdev->sb = NULL; + rdev->sb_loaded = 0; rdev->sb_page = NULL; rdev->sb_offset = 0; rdev->size = 0; @@ -365,10 +363,12 @@ static int read_disk_sb(mdk_rdev_t * rdev) { sector_t sb_offset; - if (!rdev->sb) { + if (!rdev->sb_page) { MD_BUG(); return -EINVAL; } + if (rdev->sb_loaded) + return 0; /* * Calculate the position of the superblock, @@ -381,8 +381,7 @@ static int read_disk_sb(mdk_rdev_t * rdev) if 
(!sync_page_io(rdev->bdev, sb_offset<<1, MD_SB_BYTES, rdev->sb_page, READ)) goto fail; - - printk(KERN_INFO " [events: %08lx]\n", (unsigned long)rdev->sb->events_lo); + rdev->sb_loaded = 1; return 0; fail: @@ -390,6 +389,56 @@ fail: return -EINVAL; } +static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + if ( (sb1->set_uuid0 == sb2->set_uuid0) && + (sb1->set_uuid1 == sb2->set_uuid1) && + (sb1->set_uuid2 == sb2->set_uuid2) && + (sb1->set_uuid3 == sb2->set_uuid3)) + + return 1; + + return 0; +} + + +static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) +{ + int ret; + mdp_super_t *tmp1, *tmp2; + + tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); + tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); + + if (!tmp1 || !tmp2) { + ret = 0; + printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); + goto abort; + } + + *tmp1 = *sb1; + *tmp2 = *sb2; + + /* + * nr_disks is not constant + */ + tmp1->nr_disks = 0; + tmp2->nr_disks = 0; + + if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) + ret = 0; + else + ret = 1; + +abort: + if (tmp1) + kfree(tmp1); + if (tmp2) + kfree(tmp2); + + return ret; +} + static unsigned int calc_sb_csum(mdp_super_t * sb) { unsigned int disk_csum, csum; @@ -402,39 +451,284 @@ static unsigned int calc_sb_csum(mdp_super_t * sb) } /* - * Check one RAID superblock for generic plausibility + * Handle superblock details. + * We want to be able to handle multiple superblock formats + * so we have a common interface to them all, and an array of + * different handlers. + * We rely on user-space to write the initial superblock, and support + * reading and updating of superblocks. + * Interface methods are: + * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev) + * loads and validates a superblock on dev. 
+ * if refdev != NULL, compare superblocks on both devices + * Return: + * 0 - dev has a superblock that is compatible with refdev + * 1 - dev has a superblock that is compatible and newer than refdev + * so dev should be used as the refdev in future + * -EINVAL superblock incompatible or invalid + * -othererror e.g. -EIO + * + * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) + * Verify that dev is acceptable into mddev. + * The first time, mddev->raid_disks will be 0, and data from + * dev should be merged in. Subsequent calls check that dev + * is new enough. Return 0 or -EINVAL + * + * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) + * Update the superblock for rdev with data in mddev + * This does not write to disc. + * */ -static int check_disk_sb(mdk_rdev_t * rdev) +struct super_type { + char *name; + struct module *owner; + int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev); + int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); + void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); +}; + +/* + * load_super for 0.90.0 + */ +static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev) { mdp_super_t *sb; - int ret = -EINVAL; + int ret; - sb = rdev->sb; - if (!sb) { - MD_BUG(); - goto abort; - } + ret = read_disk_sb(rdev); + if (ret) return ret; + + ret = -EINVAL; + + sb = (mdp_super_t*)page_address(rdev->sb_page); if (sb->md_magic != MD_SB_MAGIC) { printk(BAD_MAGIC, bdev_partition_name(rdev->bdev)); goto abort; } + if (sb->major_version != 0 || + sb->minor_version != 90) { + printk(KERN_WARNING "Bad version number %d.%d on %s\n", + sb->major_version, sb->minor_version, + bdev_partition_name(rdev->bdev)); + goto abort; + } + if (sb->md_minor >= MAX_MD_DEVS) { printk(BAD_MINOR, bdev_partition_name(rdev->bdev), sb->md_minor); goto abort; } + if (sb->raid_disks <= 0) + goto abort; if (calc_sb_csum(sb) != sb->sb_csum) { printk(BAD_CSUM, bdev_partition_name(rdev->bdev)); goto abort; } - ret = 0; -abort: + + rdev->preferred_minor = sb->md_minor; + 
+ if (refdev == 0) + ret = 1; + else { + __u64 ev1, ev2; + mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); + if (!uuid_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has different UUID to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + if (!sb_equal(refsb, sb)) { + printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n", + bdev_partition_name(rdev->bdev), + bdev_partition_name(refdev->bdev)); + goto abort; + } + ev1 = md_event(sb); + ev2 = md_event(refsb); + if (ev1 > ev2) + ret = 1; + else + ret = 0; + } + + + abort: return ret; } +/* + * validate_super for 0.90.0 + */ +static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdp_disk_t *desc; + mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); + + if (mddev->raid_disks == 0) { + mddev->major_version = sb->major_version; + mddev->minor_version = sb->minor_version; + mddev->patch_version = sb->patch_version; + mddev->persistent = ! 
sb->not_persistent; + mddev->chunk_size = sb->chunk_size; + mddev->ctime = sb->ctime; + mddev->utime = sb->utime; + mddev->level = sb->level; + mddev->layout = sb->layout; + mddev->raid_disks = sb->raid_disks; + mddev->state = sb->state; + mddev->size = sb->size; + mddev->events = md_event(sb); + + memcpy(mddev->uuid+0, &sb->set_uuid0, 4); + memcpy(mddev->uuid+4, &sb->set_uuid1, 4); + memcpy(mddev->uuid+8, &sb->set_uuid2, 4); + memcpy(mddev->uuid+12,&sb->set_uuid3, 4); + + mddev->max_disks = MD_SB_DISKS; + } else { + __u64 ev1; + ev1 = md_event(sb); + ++ev1; + if (ev1 < mddev->events) + return -EINVAL; + } + if (mddev->level != LEVEL_MULTIPATH) { + rdev->desc_nr = sb->this_disk.number; + rdev->raid_disk = -1; + rdev->in_sync = rdev->faulty = 0; + desc = sb->disks + rdev->desc_nr; + + if (desc->state & (1<faulty = 1; + else if (desc->state & (1<raid_disk < mddev->raid_disks) { + rdev->in_sync = 1; + rdev->raid_disk = desc->raid_disk; + } + } + return 0; +} + +/* + * sync_super for 0.90.0 + */ +static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) +{ + mdp_super_t *sb; + struct list_head *tmp; + mdk_rdev_t *rdev2; + int next_spare = mddev->raid_disks; + + /* make rdev->sb match mddev data.. + * + * 1/ zero out disks + * 2/ Add info for each disk, keeping track of highest desc_nr + * 3/ any empty disks < highest become removed + * + * disks[0] gets initialised to REMOVED because + * we cannot be sure from other fields if it has + * been initialised or not. 
+ */ + int highest = 0; + int i; + int active=0, working=0,failed=0,spare=0,nr_disks=0; + + sb = (mdp_super_t*)page_address(rdev->sb_page); + + memset(sb, 0, sizeof(*sb)); + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = mddev->major_version; + sb->minor_version = mddev->minor_version; + sb->patch_version = mddev->patch_version; + sb->gvalid_words = 0; /* ignored */ + memcpy(&sb->set_uuid0, mddev->uuid+0, 4); + memcpy(&sb->set_uuid1, mddev->uuid+4, 4); + memcpy(&sb->set_uuid2, mddev->uuid+8, 4); + memcpy(&sb->set_uuid3, mddev->uuid+12,4); + + sb->ctime = mddev->ctime; + sb->level = mddev->level; + sb->size = mddev->size; + sb->raid_disks = mddev->raid_disks; + sb->md_minor = mddev->__minor; + sb->not_persistent = !mddev->persistent; + sb->utime = mddev->utime; + sb->state = mddev->state; + sb->events_hi = (mddev->events>>32); + sb->events_lo = (u32)mddev->events; + + sb->layout = mddev->layout; + sb->chunk_size = mddev->chunk_size; + + sb->disks[0].state = (1<raid_disk >= 0) + rdev2->desc_nr = rdev2->raid_disk; + else + rdev2->desc_nr = next_spare++; + d = &sb->disks[rdev2->desc_nr]; + nr_disks++; + d->number = rdev2->desc_nr; + d->major = MAJOR(rdev2->bdev->bd_dev); + d->minor = MINOR(rdev2->bdev->bd_dev); + if (rdev2->raid_disk >= 0) + d->raid_disk = rdev2->raid_disk; + else + d->raid_disk = rdev2->desc_nr; /* compatability */ + if (rdev2->faulty) { + d->state = (1<in_sync) { + d->state = (1<state |= (1<state = 0; + spare++; + working++; + } + if (rdev2->desc_nr > highest) + highest = rdev2->desc_nr; + } + + /* now set the "removed" bit on any non-trailing holes */ + for (i=0; idisks[i]; + if (d->state == 0 && d->number == 0) { + d->number = i; + d->raid_disk = i; + d->state = (1<nr_disks = nr_disks; + sb->active_disks = active; + sb->working_disks = working; + sb->failed_disks = failed; + sb->spare_disks = spare; + + sb->this_disk = sb->disks[rdev->desc_nr]; + sb->sb_csum = calc_sb_csum(sb); +} + +struct super_type super_types[] = { + [0] = { + .name = 
"0.90.0", + .owner = THIS_MODULE, + .load_super = super_90_load, + .validate_super = super_90_validate, + .sync_super = super_90_sync, + }, +}; + + + static mdk_rdev_t * match_dev_unit(mddev_t *mddev, mdk_rdev_t *dev) { struct list_head *tmp; @@ -618,9 +912,9 @@ static void print_rdev(mdk_rdev_t *rdev) printk(KERN_INFO "md: rdev %s, SZ:%08llu F:%d S:%d DN:%d ", bdev_partition_name(rdev->bdev), (unsigned long long)rdev->size, rdev->faulty, rdev->in_sync, rdev->desc_nr); - if (rdev->sb) { + if (rdev->sb_loaded) { printk(KERN_INFO "md: rdev superblock:\n"); - print_sb(rdev->sb); + print_sb((mdp_super_t*)page_address(rdev->sb_page)); } else printk(KERN_INFO "md: no rdev superblock!\n"); } @@ -648,61 +942,13 @@ void md_print_devices(void) printk("\n"); } -static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) -{ - int ret; - mdp_super_t *tmp1, *tmp2; - - tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); - tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); - - if (!tmp1 || !tmp2) { - ret = 0; - printk(KERN_INFO "md.c: sb1 is not equal to sb2!\n"); - goto abort; - } - - *tmp1 = *sb1; - *tmp2 = *sb2; - - /* - * nr_disks is not constant - */ - tmp1->nr_disks = 0; - tmp2->nr_disks = 0; - - if (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4)) - ret = 0; - else - ret = 1; - -abort: - if (tmp1) - kfree(tmp1); - if (tmp2) - kfree(tmp2); - - return ret; -} - -static int uuid_equal(mdk_rdev_t *rdev1, mdk_rdev_t *rdev2) -{ - if ( (rdev1->sb->set_uuid0 == rdev2->sb->set_uuid0) && - (rdev1->sb->set_uuid1 == rdev2->sb->set_uuid1) && - (rdev1->sb->set_uuid2 == rdev2->sb->set_uuid2) && - (rdev1->sb->set_uuid3 == rdev2->sb->set_uuid3)) - - return 1; - - return 0; -} static int write_disk_sb(mdk_rdev_t * rdev) { sector_t sb_offset; sector_t size; - if (!rdev->sb) { + if (!rdev->sb_loaded) { MD_BUG(); return 1; } @@ -710,10 +956,6 @@ static int write_disk_sb(mdk_rdev_t * rdev) MD_BUG(); return 1; } - if (rdev->sb->md_magic != MD_SB_MAGIC) { - MD_BUG(); - return 1; - } sb_offset = 
calc_dev_sboffset(rdev->bdev); if (rdev->sb_offset != sb_offset) { @@ -751,116 +993,10 @@ fail: static void sync_sbs(mddev_t * mddev) { mdk_rdev_t *rdev; - mdp_super_t *sb; struct list_head *tmp; - int next_spare = mddev->raid_disks; - - /* make all rdev->sb match mddev data.. - * we setup the data in the first rdev and copy it - * to the others. - * - * 1/ zero out disks - * 2/ Add info for each disk, keeping track of highest desc_nr - * 3/ any empty disks < highest become removed - * - * disks[0] gets initialised to REMOVED because - * we cannot be sure from other fields if it has - * been initialised or not. - */ - int highest = 0; - int i; - int active=0, working=0,failed=0,spare=0,nr_disks=0; - - if (list_empty(&mddev->disks)) { - MD_BUG(); - return; - } - rdev = list_entry(mddev->disks.next, mdk_rdev_t, same_set); - sb = rdev->sb; - - memset(sb, 0, sizeof(*sb)); - - sb->md_magic = MD_SB_MAGIC; - sb->major_version = mddev->major_version; - sb->minor_version = mddev->minor_version; - sb->patch_version = mddev->patch_version; - sb->gvalid_words = 0; /* ignored */ - memcpy(&sb->set_uuid0, mddev->uuid+0, 4); - memcpy(&sb->set_uuid1, mddev->uuid+4, 4); - memcpy(&sb->set_uuid2, mddev->uuid+8, 4); - memcpy(&sb->set_uuid3, mddev->uuid+12,4); - - sb->ctime = mddev->ctime; - sb->level = mddev->level; - sb->size = mddev->size; - sb->raid_disks = mddev->raid_disks; - sb->md_minor = mddev->__minor; - sb->not_persistent = !mddev->persistent; - sb->utime = mddev->utime; - sb->state = mddev->state; - sb->events_hi = (mddev->events>>32); - sb->events_lo = (u32)mddev->events; - sb->layout = mddev->layout; - sb->chunk_size = mddev->chunk_size; - - sb->disks[0].state = (1<raid_disk >= 0) - rdev->desc_nr = rdev->raid_disk; - else - rdev->desc_nr = next_spare++; - d = &sb->disks[rdev->desc_nr]; - nr_disks++; - d->number = rdev->desc_nr; - d->major = MAJOR(rdev->bdev->bd_dev); - d->minor = MINOR(rdev->bdev->bd_dev); - if (rdev->raid_disk >= 0) - d->raid_disk = rdev->raid_disk; - 
else - d->raid_disk = rdev->desc_nr; /* compatability */ - if (rdev->faulty) { - d->state = (1<in_sync) { - d->state = (1<state |= (1<state = 0; - spare++; - working++; - } - if (rdev->desc_nr > highest) - highest = rdev->desc_nr; - } - - /* now set the "removed" bit on any non-trailing holes */ - for (i=0; idisks[i]; - if (d->state == 0 && d->number == 0) { - d->number = i; - d->raid_disk = i; - d->state = (1<nr_disks = nr_disks; - sb->active_disks = active; - sb->working_disks = working; - sb->failed_disks = failed; - sb->spare_disks = spare; - - ITERATE_RDEV(mddev,rdev,tmp) { - mdp_super_t *this_sb; - - this_sb = rdev->sb; - if (this_sb != sb) - *this_sb = *sb; - this_sb->this_disk = this_sb->disks[rdev->desc_nr]; - this_sb->sb_csum = calc_sb_csum(this_sb); - } + ITERATE_RDEV(mddev,rdev,tmp) + super_90_sync(mddev, rdev); } static void md_update_sb(mddev_t * mddev) @@ -903,8 +1039,6 @@ repeat: printk("%s ", bdev_partition_name(rdev->bdev)); if (!rdev->faulty) { - printk("[events: %08lx]", - (unsigned long)rdev->sb->events_lo); err += write_disk_sb(rdev); } else printk(")\n"); @@ -968,13 +1102,14 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk) } if (on_disk) { - if ((err = read_disk_sb(rdev))) { - printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", + err = super_90_load(rdev, NULL); + if (err == -EINVAL) { + printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", bdev_partition_name(rdev->bdev)); goto abort_free; } - if ((err = check_disk_sb(rdev))) { - printk(KERN_WARNING "md: %s has invalid sb, not importing!\n", + if (err < 0) { + printk(KERN_WARNING "md: could not read %s's sb, not importing!\n", bdev_partition_name(rdev->bdev)); goto abort_free; } @@ -984,7 +1119,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int on_disk) return rdev; abort_free: - if (rdev->sb) { + if (rdev->sb_page) { if (rdev->bdev) unlock_rdev(rdev); free_disk_sb(rdev); @@ -1014,155 +1149,39 @@ abort_free: static int analyze_sbs(mddev_t * 
mddev) { - int out_of_date = 0, i; + int i; struct list_head *tmp; mdk_rdev_t *rdev, *freshest; - mdp_super_t *sb; - - /* - * Verify the RAID superblock on each real device - */ - ITERATE_RDEV(mddev,rdev,tmp) { - if (rdev->faulty) { - MD_BUG(); - goto abort; - } - if (!rdev->sb) { - MD_BUG(); - goto abort; - } - if (check_disk_sb(rdev)) - goto abort; - } - /* - * The superblock constant part has to be the same - * for all disks in the array. - */ - sb = NULL; - - ITERATE_RDEV(mddev,rdev,tmp) { - if (!sb) { - sb = rdev->sb; - continue; - } - if (!sb_equal(sb, rdev->sb)) { + freshest = NULL; + ITERATE_RDEV(mddev,rdev,tmp) + switch (super_90_load(rdev, freshest)) { + case 1: + freshest = rdev; + break; + case 0: + break; + default: printk(INCONSISTENT, bdev_partition_name(rdev->bdev)); kick_rdev_from_array(rdev); - continue; } - } - /* - * OK, we have all disks and the array is ready to run. Let's - * find the freshest superblock, that one will be the superblock - * that represents the whole array. - */ - freshest = NULL; - ITERATE_RDEV(mddev,rdev,tmp) { - __u64 ev1, ev2; - /* - * if the checksum is invalid, use the superblock - * only as a last resort. 
(decrease it's age by - * one event) - */ - if (calc_sb_csum(rdev->sb) != rdev->sb->sb_csum) { - if (rdev->sb->events_lo || rdev->sb->events_hi) - if (!(rdev->sb->events_lo--)) - rdev->sb->events_hi--; - } + super_90_validate(mddev, freshest); - printk(KERN_INFO "md: %s's event counter: %08lx\n", - bdev_partition_name(rdev->bdev), - (unsigned long)rdev->sb->events_lo); - if (!freshest) { - freshest = rdev; - continue; - } - /* - * Find the newest superblock version - */ - ev1 = md_event(rdev->sb); - ev2 = md_event(freshest->sb); - if (ev1 != ev2) { - out_of_date = 1; - if (ev1 > ev2) - freshest = rdev; - } - } - if (out_of_date) { - printk(OUT_OF_DATE); - printk(KERN_INFO "md: freshest: %s\n", bdev_partition_name(freshest->bdev)); - } - - sb = freshest->sb; - - mddev->major_version = sb->major_version; - mddev->minor_version = sb->minor_version; - mddev->patch_version = sb->patch_version; - mddev->persistent = ! sb->not_persistent; - mddev->chunk_size = sb->chunk_size; - mddev->ctime = sb->ctime; - mddev->utime = sb->utime; - mddev->level = sb->level; - mddev->layout = sb->layout; - mddev->raid_disks = sb->raid_disks; - mddev->state = sb->state; - mddev->size = sb->size; - mddev->events = md_event(sb); - - memcpy(mddev->uuid+0, &sb->set_uuid0, 4); - memcpy(mddev->uuid+4, &sb->set_uuid1, 4); - memcpy(mddev->uuid+8, &sb->set_uuid2, 4); - memcpy(mddev->uuid+12,&sb->set_uuid3, 4); - - /* - * at this point we have picked the 'best' superblock - * from all available superblocks. - * now we validate this superblock and kick out possibly - * failed disks. - */ - ITERATE_RDEV(mddev,rdev,tmp) { - /* - * Kick all non-fresh devices - */ - __u64 ev1; - ev1 = md_event(rdev->sb); - ++ev1; - if (ev1 < mddev->events) { - printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", - bdev_partition_name(rdev->bdev)); - kick_rdev_from_array(rdev); - continue; - } - } - - /* set rdev->desc_nr for each device. 
- * for MULTIPATH, we just us sequential number as - * nothing else is meaningful - */ i = 0; ITERATE_RDEV(mddev,rdev,tmp) { + if (rdev != freshest) + if (super_90_validate(mddev, rdev)) { + printk(KERN_WARNING "md: kicking non-fresh %s from array!\n", + bdev_partition_name(rdev->bdev)); + kick_rdev_from_array(rdev); + continue; + } if (mddev->level == LEVEL_MULTIPATH) { rdev->desc_nr = i++; rdev->raid_disk = rdev->desc_nr; rdev->in_sync = 1; - } else { - mdp_disk_t *desc; - rdev->desc_nr = rdev->sb->this_disk.number; - desc = sb->disks + rdev->desc_nr; - rdev->raid_disk = -1; - rdev->in_sync = rdev->faulty = 0; - - if (desc->state & (1<faulty = 1; - kick_rdev_from_array(rdev); - } else if (desc->state & (1<raid_disk < mddev->raid_disks) { - rdev->in_sync = 1; - rdev->raid_disk = desc->raid_disk; - } } } @@ -1579,20 +1598,6 @@ out: return err; } -/* - * We have to safely support old arrays too. - */ -int detect_old_array(mdp_super_t *sb) -{ - if (sb->major_version > 0) - return 0; - if (sb->minor_version >= 90) - return 0; - - return -EINVAL; -} - - static void autorun_array(mddev_t *mddev) { mdk_rdev_t *rdev; @@ -1648,25 +1653,18 @@ static void autorun_devices(void) printk(KERN_INFO "md: considering %s ...\n", bdev_partition_name(rdev0->bdev)); INIT_LIST_HEAD(&candidates); - ITERATE_RDEV_PENDING(rdev,tmp) { - if (uuid_equal(rdev0, rdev)) { - if (!sb_equal(rdev0->sb, rdev->sb)) { - printk(KERN_WARNING - "md: %s has same UUID as %s, but superblocks differ ...\n", - bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev)); - continue; - } + ITERATE_RDEV_PENDING(rdev,tmp) + if (super_90_load(rdev, rdev0) >= 0) { printk(KERN_INFO "md: adding %s ...\n", bdev_partition_name(rdev->bdev)); list_move(&rdev->same_set, &candidates); } - } /* * now we have a set of devices, with all of them having * mostly sane superblocks. It's time to allocate the * mddev. 
*/ - mddev = mddev_find(rdev0->sb->md_minor); + mddev = mddev_find(rdev0->preferred_minor); if (!mddev) { printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); break; @@ -1748,15 +1746,6 @@ static int autostart_array(dev_t startdev) } list_add(&start_rdev->same_set, &pending_raid_disks); - sb = start_rdev->sb; - - err = detect_old_array(sb); - if (err) { - printk(KERN_WARNING "md: array version is too old to be autostarted ," - "use raidtools 0.90 mkraid --upgrade to upgrade the array " - "without data loss!\n"); - goto abort; - } for (i = 0; i < MD_SB_DISKS; i++) { mdp_disk_t *desc; @@ -1875,8 +1864,6 @@ static int get_disk_info(mddev_t * mddev, void * arg) return -EFAULT; nr = info.number; - if (nr >= MD_SB_DISKS) - return -EINVAL; rdev = find_rdev_nr(mddev, nr); if (rdev) { @@ -1918,18 +1905,13 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) if (!list_empty(&mddev->disks)) { mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, mdk_rdev_t, same_set); - if (!uuid_equal(rdev0, rdev)) { + int err = super_90_load(rdev, NULL); + if (err < 0) { printk(KERN_WARNING "md: %s has different UUID to %s\n", bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev)); export_rdev(rdev); return -EINVAL; } - if (!sb_equal(rdev0->sb, rdev->sb)) { - printk(KERN_WARNING "md: %s has same UUID but different superblock to %s\n", - bdev_partition_name(rdev->bdev), bdev_partition_name(rdev0->bdev)); - export_rdev(rdev); - return -EINVAL; - } } bind_rdev_to_array(rdev, mddev); return 0; @@ -2080,11 +2062,11 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev) rdev->size = size; rdev->sb_offset = calc_dev_sboffset(rdev->bdev); - for (i = mddev->raid_disks; i < MD_SB_DISKS; i++) + for (i = mddev->raid_disks; i < mddev->max_disks; i++) if (find_rdev_nr(mddev,i)==NULL) break; - if (i == MD_SB_DISKS) { + if (i == mddev->max_disks) { printk(KERN_WARNING "md%d: can not hot-add to full array!\n", mdidx(mddev)); err = -EBUSY; diff --git a/drivers/md/multipath.c 
b/drivers/md/multipath.c index a14b75456e5f..137e23ff22e4 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -59,7 +59,7 @@ static void mp_pool_free(void *mpb, void *data) static int multipath_map (mddev_t *mddev, mdk_rdev_t **rdevp) { multipath_conf_t *conf = mddev_to_conf(mddev); - int i, disks = MD_SB_DISKS; + int i, disks = mddev->max_disks; /* * Later we do read balancing on the read side @@ -147,7 +147,7 @@ static int multipath_read_balance (multipath_conf_t *conf) { int disk; - for (disk = 0; disk < MD_SB_DISKS; disk++) { + for (disk = 0; disk < conf->mddev->max_disks; disk++) { mdk_rdev_t *rdev = conf->multipaths[disk].rdev; if (rdev && rdev->in_sync) return disk; @@ -259,7 +259,7 @@ static void print_multipath_conf (multipath_conf_t *conf) printk(" --- wd:%d rd:%d\n", conf->working_disks, conf->raid_disks); - for (i = 0; i < MD_SB_DISKS; i++) { + for (i = 0; i < conf->mddev->max_disks; i++) { tmp = conf->multipaths + i; if (tmp->rdev) printk(" disk%d, o:%d, dev:%s\n", diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index 453324c18bdd..f658735d28b2 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -151,8 +151,9 @@ struct mdk_rdev_s struct block_device *bdev; /* block device handle */ struct page *sb_page; - mdp_super_t *sb; + int sb_loaded; sector_t sb_offset; + int preferred_minor; /* autorun support */ /* A device can be in one of three states based on two flags: * Not working: faulty==1 in_sync==0 @@ -196,6 +197,7 @@ struct mddev_s time_t ctime, utime; int level, layout; int raid_disks; + int max_disks; unsigned long state; sector_t size; /* used size of component devices */ __u64 events; -- cgit v1.2.3 From b9d189e5d1f53709f1bc091a02819ad101bbbc75 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Wed, 30 Oct 2002 00:24:12 -0800 Subject: [PATCH] kNFSd: Fix nfs shutdown problem. 
The 'unexport everything' that happens when the last nfsd thread dies was shutting down too much - things that should only be shut down on module unload. --- fs/nfsd/export.c | 31 ++++++++++++------------------- fs/nfsd/nfssvc.c | 2 +- include/linux/nfsd/export.h | 1 + net/sunrpc/sunrpc_syms.c | 1 + 4 files changed, 15 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 43228c2a2aca..bad996892e61 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -738,23 +738,6 @@ exp_do_unexport(svc_export *unexp) exp_fsid_unhash(unexp); } -/* - * Revoke all exports for a given client. - */ -static void -exp_unexport_all(svc_client *clp) -{ - struct svc_export *exp; - int index; - - dprintk("unexporting all fs's for clnt %p\n", clp); - - cache_for_each(exp, &svc_export_cache, index, h) - if (exp->ex_client == clp) - exp_do_unexport(exp); - cache_flush(); - -} /* * unexport syscall. @@ -1108,6 +1091,18 @@ nfsd_export_init(void) } +/* + * Flush exports table - called when last nfsd thread is killed + */ +void +nfsd_export_flush(void) +{ + exp_writelock(); + cache_purge(&svc_expkey_cache); + cache_purge(&svc_export_cache); + exp_writeunlock(); +} + /* * Shutdown the exports module. 
*/ @@ -1119,8 +1114,6 @@ nfsd_export_shutdown(void) exp_writelock(); - exp_unexport_all(NULL); - if (cache_unregister(&svc_expkey_cache)) printk(KERN_ERR "nfsd: failed to unregister expkey cache\n"); if (cache_unregister(&svc_export_cache)) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 0f8be15737a3..a12f2c980865 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -238,7 +238,7 @@ nfsd(struct svc_rqst *rqstp) printk(KERN_WARNING "nfsd: last server has exited\n"); if (err != SIG_NOCLEAN) { printk(KERN_WARNING "nfsd: unexporting all filesystems\n"); - nfsd_export_shutdown(); + nfsd_export_flush(); } nfsd_serv = NULL; nfsd_racache_shutdown(); /* release read-ahead cache */ diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h index 4d692bb2797f..864e3b801f0f 100644 --- a/include/linux/nfsd/export.h +++ b/include/linux/nfsd/export.h @@ -83,6 +83,7 @@ struct svc_expkey { */ void nfsd_export_init(void); void nfsd_export_shutdown(void); +void nfsd_export_flush(void); void exp_readlock(void); void exp_readunlock(void); struct svc_expkey * exp_find_key(struct auth_domain *clp, diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index a738a191c675..209219f998ac 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -101,6 +101,7 @@ EXPORT_SYMBOL(auth_unix_lookup); EXPORT_SYMBOL(cache_check); EXPORT_SYMBOL(cache_clean); EXPORT_SYMBOL(cache_flush); +EXPORT_SYMBOL(cache_purge); EXPORT_SYMBOL(cache_fresh); EXPORT_SYMBOL(cache_init); EXPORT_SYMBOL(cache_register); -- cgit v1.2.3 From 335c5fc746de61c7ef278eda451162e388b57d49 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Wed, 30 Oct 2002 00:24:44 -0800 Subject: [PATCH] kNFSd: nfsd_readdir changes. nfsd_readdir - the common readdir code for all version of nfsd, contains a number of version-specific things with appropriate checks, and also does some xdr-encoding which rightly belongs elsewhere. 
This patch simplifies nfsd_readdir to do just the core stuff, and moves the version specifics into version specific files, and the xdr encoding into xdr encoding files. --- fs/nfsd/nfs3proc.c | 31 +++++++++++++++------ fs/nfsd/nfs3xdr.c | 22 +++++++-------- fs/nfsd/nfs4xdr.c | 55 +++++++++++++++++++++---------------- fs/nfsd/nfsproc.c | 16 ++++++++--- fs/nfsd/nfsxdr.c | 15 +++++++--- fs/nfsd/vfs.c | 70 ++++++++++------------------------------------- include/linux/nfsd/nfsd.h | 22 +++++---------- include/linux/nfsd/xdr.h | 5 ++++ include/linux/nfsd/xdr3.h | 7 +++++ include/linux/nfsd/xdr4.h | 5 ++++ 10 files changed, 128 insertions(+), 120 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 2a60a38ce2ba..61be6186bacf 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -457,11 +457,18 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t) argp->cookie, - nfs3svc_encode_entry, - buffer, &count, argp->verf, NULL); + + resp->buflen = count; + resp->common.err = nfs_ok; + resp->buffer = buffer; + resp->offset = NULL; + resp->rqstp = rqstp; + nfserr = nfsd_readdir(rqstp, &resp->fh, (loff_t*) &argp->cookie, + &resp->common, nfs3svc_encode_entry); memcpy(resp->verf, argp->verf, 8); - resp->count = count; + resp->count = resp->buffer - buffer; + if (resp->offset) + xdr_encode_hyper(resp->offset, argp->cookie); RETURN_STATUS(nfserr); } @@ -476,6 +483,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, { u32 * buffer; int nfserr, count, want; + loff_t offset; dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -492,11 +500,18 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, /* Read directory and encode entries on the fly */ fh_copy(&resp->fh, &argp->fh); - nfserr = 
nfsd_readdir(rqstp, &resp->fh, (loff_t) argp->cookie, - nfs3svc_encode_entry_plus, - buffer, &count, argp->verf, NULL); + + resp->buflen = count; + resp->common.err = nfs_ok; + resp->buffer = buffer; + resp->rqstp = rqstp; + offset = argp->cookie; + nfserr = nfsd_readdir(rqstp, &resp->fh, &offset, + &resp->common, nfs3svc_encode_entry_plus); memcpy(resp->verf, argp->verf, 8); - resp->count = count; + resp->count = resp->buffer - buffer; + if (resp->offset) + xdr_encode_hyper(resp->offset, offset); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 1cfe66a163ee..9eeba9f3291d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -644,10 +644,13 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p, struct nfsd3_readdirres *resp) { p = encode_post_op_attr(rqstp, p, &resp->fh); + if (resp->status == 0) { /* stupid readdir cookie */ memcpy(p, resp->verf, 8); p += 2; - p += XDR_QUADLEN(resp->count); + p = resp->buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl(resp->common.err == nfserr_eof); } return xdr_ressize_check(rqstp, p); @@ -666,20 +669,16 @@ nfs3svc_encode_readdirres(struct svc_rqst *rqstp, u32 *p, #define NFS3_ENTRY_BAGGAGE (2 + 1 + 2 + 1) #define NFS3_ENTRYPLUS_BAGGAGE (1 + 21 + 1 + (NFS3_FHSIZE >> 2)) static int -encode_entry(struct readdir_cd *cd, const char *name, +encode_entry(struct readdir_cd *ccd, const char *name, int namlen, off_t offset, ino_t ino, unsigned int d_type, int plus) { + struct nfsd3_readdirres *cd = container_of(ccd, struct nfsd3_readdirres, common); u32 *p = cd->buffer; int buflen, slen, elen; if (cd->offset) xdr_encode_hyper(cd->offset, (u64) offset); - /* nfsd_readdir calls us with name == 0 when it wants us to - * set the last offset entry. */ - if (name == 0) - return 0; - /* dprintk("encode_entry(%.*s @%ld%s)\n", namlen, name, (long) offset, plus? " plus" : ""); @@ -693,7 +692,7 @@ encode_entry(struct readdir_cd *cd, const char *name, elen = slen + NFS3_ENTRY_BAGGAGE + (plus? 
NFS3_ENTRYPLUS_BAGGAGE : 0); if ((buflen = cd->buflen - elen) < 0) { - cd->eob = 1; + cd->common.err = nfserr_readdir_nospc; return -EINVAL; } *p++ = xdr_one; /* mark entry present */ @@ -709,8 +708,8 @@ encode_entry(struct readdir_cd *cd, const char *name, struct svc_export *exp; struct dentry *dparent, *dchild; - dparent = cd->dirfh->fh_dentry; - exp = cd->dirfh->fh_export; + dparent = cd->fh.fh_dentry; + exp = cd->fh.fh_export; fh_init(&fh, NFS3_FHSIZE); if (isdotent(name, namlen)) { @@ -724,7 +723,7 @@ encode_entry(struct readdir_cd *cd, const char *name, dchild = lookup_one_len(name, dparent,namlen); if (IS_ERR(dchild)) goto noexec; - if (fh_compose(&fh, exp, dchild, cd->dirfh) != 0 || !dchild->d_inode) + if (fh_compose(&fh, exp, dchild, &cd->fh) != 0 || !dchild->d_inode) goto noexec; p = encode_post_op_attr(cd->rqstp, p, &fh); *p++ = xdr_one; /* yes, a file handle follows */ @@ -735,6 +734,7 @@ encode_entry(struct readdir_cd *cd, const char *name, out: cd->buflen = buflen; cd->buffer = p; + cd->common.err = nfs_ok; return 0; noexec: diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 3eae696c545a..8e32b75e67eb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1313,9 +1313,10 @@ out_serverfault: } static int -nfsd4_encode_dirent(struct readdir_cd *cd, const char *name, int namlen, +nfsd4_encode_dirent(struct readdir_cd *ccd, const char *name, int namlen, loff_t offset, ino_t ino, unsigned int d_type) { + struct nfsd4_readdir *cd = container_of(ccd, struct nfsd4_readdir, common); int buflen; u32 *p = cd->buffer; u32 *attrlenp; @@ -1324,17 +1325,14 @@ nfsd4_encode_dirent(struct readdir_cd *cd, const char *name, int namlen, int nfserr = 0; /* In nfsv4, "." and ".." never make it onto the wire.. 
*/ - if (name && isdotent(name, namlen)) + if (name && isdotent(name, namlen)) { + cd->common.err = nfs_ok; return 0; + } if (cd->offset) xdr_encode_hyper(cd->offset, (u64) offset); - /* nfsd_readdir calls us with name == 0 when it wants us to - * set the last offset entry. */ - if (name == 0) - return 0; - buflen = cd->buflen - 4 - XDR_QUADLEN(namlen); if (buflen < 0) goto nospc; @@ -1347,8 +1345,8 @@ nfsd4_encode_dirent(struct readdir_cd *cd, const char *name, int namlen, /* * Now we come to the ugly part: writing the fattr for this entry. */ - bmval0 = cd->bmval[0]; - bmval1 = cd->bmval[1]; + bmval0 = cd->rd_bmval[0]; + bmval1 = cd->rd_bmval[1]; if ((bmval0 & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_FILEID)) || bmval1) { /* * "Heavyweight" case: we have no choice except to @@ -1356,14 +1354,14 @@ nfsd4_encode_dirent(struct readdir_cd *cd, const char *name, int namlen, * only Windows clients will trigger this code * path. */ - dentry = lookup_one_len(name, cd->dirfh->fh_dentry, namlen); + dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) { nfserr = nfserrno(PTR_ERR(dentry)); goto error; } - nfserr = nfsd4_encode_fattr(NULL, cd->dirfh->fh_export, - dentry, p, &buflen, cd->bmval); + nfserr = nfsd4_encode_fattr(NULL, cd->rd_fhp->fh_export, + dentry, p, &buflen, cd->rd_bmval); dput(dentry); if (!nfserr) { @@ -1384,7 +1382,7 @@ error: * entire READDIR operation(!) 
*/ if (!(bmval0 & FATTR4_WORD0_RDATTR_ERROR)) { - cd->nfserr = nfserr; + cd->common.err = nfserr; return -EINVAL; } @@ -1414,10 +1412,11 @@ error: out: cd->buflen -= (p - cd->buffer); cd->buffer = p; + cd->common.err = nfs_ok; return 0; nospc: - cd->eob = 1; + cd->common.err = nfserr_readdir_nospc; return -EINVAL; } @@ -1643,6 +1642,7 @@ static int nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_readdir *readdir) { int maxcount; + loff_t offset; ENCODE_HEAD; if (nfserr) @@ -1667,17 +1667,26 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re WRITE32(0); WRITE32(0); + readdir->common.err = 0; + readdir->buflen = maxcount; + readdir->buffer = p; + readdir->offset = NULL; + + offset = readdir->rd_cookie; nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, - readdir->rd_cookie, nfsd4_encode_dirent, - p, &maxcount, NULL, readdir->rd_bmval); + &offset, + &readdir->common, nfsd4_encode_dirent); + if (nfserr == nfs_ok && + readdir->common.err == nfserr_readdir_nospc && + readdir->buffer == p) + nfserr = nfserr_readdir_nospc; if (!nfserr) { - /* - * nfsd_readdir() expects 'maxcount' to be a count of - * words, but fills it in with a count of bytes at the - * end! 
- */ - BUG_ON(maxcount & 3); - p += (maxcount >> 2); + if (readdir->offset) + xdr_encode_hyper(readdir->offset, offset); + + p = readdir->buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl(readdir->common.err == nfserr_eof); ADJUST_ARGS(); } return nfserr; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 62c06a6873ee..06c4326e469b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -471,6 +471,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp, { u32 * buffer; int nfserr, count; + loff_t offset; dprintk("nfsd: READDIR %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -488,11 +489,18 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp, if (count < 0) count = 0; + resp->buffer = buffer; + resp->offset = NULL; + resp->buflen = count; + resp->common.err = nfs_ok; /* Read directory and encode entries on the fly */ - nfserr = nfsd_readdir(rqstp, &argp->fh, (loff_t) argp->cookie, - nfssvc_encode_entry, - buffer, &count, NULL, NULL); - resp->count = count; + offset = argp->cookie; + nfserr = nfsd_readdir(rqstp, &argp->fh, &offset, + &resp->common, nfssvc_encode_entry); + + resp->count = resp->buffer - buffer; + if (resp->offset) + *resp->offset = (u32)offset; fh_put(&argp->fh); return nfserr; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 074819aa6c8b..15f1c7a16031 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -380,7 +380,10 @@ int nfssvc_encode_readdirres(struct svc_rqst *rqstp, u32 *p, struct nfsd_readdirres *resp) { - p += XDR_QUADLEN(resp->count); + p = resp->buffer; + *p++ = 0; /* no more entries */ + *p++ = htonl((resp->common.err == nfserr_eof)); + return xdr_ressize_check(rqstp, p); } @@ -399,9 +402,10 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, u32 *p, } int -nfssvc_encode_entry(struct readdir_cd *cd, const char *name, +nfssvc_encode_entry(struct readdir_cd *ccd, const char *name, int namlen, loff_t offset, ino_t ino, unsigned int d_type) { + struct nfsd_readdirres *cd = 
container_of(ccd, struct nfsd_readdirres, common); u32 *p = cd->buffer; int buflen, slen; @@ -410,8 +414,10 @@ nfssvc_encode_entry(struct readdir_cd *cd, const char *name, namlen, name, offset, ino); */ - if (offset > ~((u32) 0)) + if (offset > ~((u32) 0)) { + cd->common.err = nfserr_fbig; return -EINVAL; + } if (cd->offset) *cd->offset = htonl(offset); if (namlen > NFS2_MAXNAMLEN) @@ -419,7 +425,7 @@ nfssvc_encode_entry(struct readdir_cd *cd, const char *name, slen = XDR_QUADLEN(namlen); if ((buflen = cd->buflen - slen - 4) < 0) { - cd->eob = 1; + cd->common.err = nfserr_readdir_nospc; return -EINVAL; } *p++ = xdr_one; /* mark entry present */ @@ -430,6 +436,7 @@ nfssvc_encode_entry(struct readdir_cd *cd, const char *name, cd->buflen = buflen; cd->buffer = p; + cd->common.err = nfs_ok; return 0; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 996d8994d808..cb06e914d548 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1386,16 +1386,15 @@ out_nfserr: /* * Read entries from a directory. - * The verifier is an NFSv3 thing we ignore for now. + * The NFSv3/4 verifier we ignore for now. */ int -nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - encode_dent_fn func, u32 *buffer, int *countp, u32 *verf, u32 *bmval) +nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, + struct readdir_cd *cdp, encode_dent_fn func) { - u32 *p; - int oldlen, eof, err; + int err; struct file file; - struct readdir_cd cd; + loff_t offset = *offsetp; err = nfsd_open(rqstp, fhp, S_IFDIR, MAY_READ, &file); if (err) @@ -1405,17 +1404,6 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, file.f_pos = offset; - /* Set up the readdir context */ - memset(&cd, 0, sizeof(cd)); - cd.rqstp = rqstp; - cd.buffer = buffer; - cd.buflen = *countp; /* count of words */ - cd.dirfh = fhp; - if (bmval) { - cd.bmval[0] = bmval[0]; - cd.bmval[1] = bmval[1]; - } - /* * Read the directory entries. 
This silly loop is necessary because * readdir() is not guaranteed to fill up the entire buffer, but @@ -1423,49 +1411,21 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, */ do { - oldlen = cd.buflen; - - err = vfs_readdir(&file, (filldir_t) func, &cd); - - if (err < 0) - goto out_nfserr; - - err = cd.nfserr; - if (err) - goto out_close; - } while (oldlen != cd.buflen && !cd.eob); - - err = nfserr_readdir_nospc; - if (rqstp->rq_vers == 4 && cd.eob && cd.buffer == buffer) - goto out_close; - - /* If we didn't fill the buffer completely, we're at EOF */ - eof = !cd.eob; - - if (cd.offset) { - if (rqstp->rq_vers > 2) - (void)xdr_encode_hyper(cd.offset, file.f_pos); - else - *cd.offset = htonl(file.f_pos); - } - - p = cd.buffer; - *p++ = 0; /* no more entries */ - *p++ = htonl(eof); /* end of directory */ - *countp = (caddr_t) p - (caddr_t) buffer; + cdp->err = nfserr_eof; /* will be cleared on successful read */ + err = vfs_readdir(&file, (filldir_t) func, cdp); + } while (err >=0 && cdp->err == nfs_ok); + if (err) + err = nfserrno(err); + else + err = cdp->err; + *offsetp = file.f_pos; - dprintk("nfsd: readdir result %d bytes, eof %d offset %d\n", - *countp, eof, - cd.offset? 
ntohl(*cd.offset) : -1); - err = 0; + if (err == nfserr_eof || err == nfserr_readdir_nospc) + err = nfs_ok; /* can still be found in ->err */ out_close: nfsd_close(&file); out: return err; - -out_nfserr: - err = nfserrno(err); - goto out_close; } /* diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index c72354852e2b..94fc6231004a 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -48,16 +48,7 @@ * Callback function for readdir */ struct readdir_cd { - struct svc_rqst * rqstp; - struct svc_fh * dirfh; - u32 * buffer; - int buflen; - u32 * offset; /* previous dirent->d_next */ - char plus; /* readdirplus */ - char eob; /* end of buffer */ - char dotonly; - int nfserr; /* v4 only */ - u32 bmval[2]; /* v4 only */ + int err; /* 0, nfserr, or nfserr_eof */ }; typedef int (*encode_dent_fn)(struct readdir_cd *, const char *, int, loff_t, ino_t, unsigned int); @@ -117,9 +108,7 @@ int nfsd_unlink(struct svc_rqst *, struct svc_fh *, int type, int nfsd_truncate(struct svc_rqst *, struct svc_fh *, unsigned long size); int nfsd_readdir(struct svc_rqst *, struct svc_fh *, - loff_t, encode_dent_fn, - u32 *buffer, int *countp, u32 *verf, - u32 *bmval); + loff_t *, struct readdir_cd *, encode_dent_fn); int nfsd_statfs(struct svc_rqst *, struct svc_fh *, struct statfs *); @@ -180,10 +169,13 @@ void nfsd_lockd_shutdown(void); #define nfserr_readdir_nospc __constant_htonl(NFSERR_READDIR_NOSPC) #define nfserr_bad_xdr __constant_htonl(NFSERR_BAD_XDR) -/* error code for internal use - if a request fails due to - * kmalloc failure, it gets dropped. Client should resend eventually +/* error codes for internal use */ +/* if a request fails due to kmalloc failure, it gets dropped. + * Client should resend eventually */ #define nfserr_dropit __constant_htonl(30000) +/* end-of-file indicator in readdir */ +#define nfserr_eof __constant_htonl(30001) /* Check for dir entries '.' and '..' */ #define isdotent(n, l) (l < 3 && n[0] == '.' 
&& (l == 1 || n[1] == '.')) diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h index d81b71fefe6d..dc6f850f3622 100644 --- a/include/linux/nfsd/xdr.h +++ b/include/linux/nfsd/xdr.h @@ -98,6 +98,11 @@ struct nfsd_readres { struct nfsd_readdirres { int count; + + struct readdir_cd common; + u32 * buffer; + int buflen; + u32 * offset; }; struct nfsd_statfsres { diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h index 35d167ad6cd2..83ec5bc2b542 100644 --- a/include/linux/nfsd/xdr3.h +++ b/include/linux/nfsd/xdr3.h @@ -156,6 +156,13 @@ struct nfsd3_readdirres { struct svc_fh fh; int count; __u32 verf[2]; + + struct readdir_cd common; + u32 * buffer; + int buflen; + u32 * offset; + struct svc_rqst * rqstp; + }; struct nfsd3_fsstatres { diff --git a/include/linux/nfsd/xdr4.h b/include/linux/nfsd/xdr4.h index 2bf2c5d3b24e..4238cb04ad90 100644 --- a/include/linux/nfsd/xdr4.h +++ b/include/linux/nfsd/xdr4.h @@ -185,6 +185,11 @@ struct nfsd4_readdir { u32 rd_bmval[2]; /* request */ struct svc_rqst *rd_rqstp; /* response */ struct svc_fh * rd_fhp; /* response */ + + struct readdir_cd common; + u32 * buffer; + int buflen; + u32 * offset; }; struct nfsd4_readlink { -- cgit v1.2.3 From a0e7d495df35797364092fedff52ec488ec702eb Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Wed, 30 Oct 2002 00:24:57 -0800 Subject: [PATCH] kNFSd: Convert nfsd to use a list of pages instead of one big buffer This means: 1/ We don't need an order-4 allocation for each nfsd that starts 2/ We don't need an order-4 allocation in skb_linearize when we receive a 32K write request 3/ It will be easier to incorporate the zero-copy read changes The pages are handed around using an xdr_buf (instead of svc_buf) much like the NFS client so future crypto code can use the same data structure for both client and server. The code assumes that most requests and replies fit in a single page. 
The exceptions are assumed to have some largish 'data' bit, and the rest must fit in a single page. The 'data' bits are file data, readdir data, and symlinks. There must be only one 'data' bit per request. This is all fine for nfs/nlm. This isn't complete: 1/ NFSv4 hasn't been converted yet (it won't compile) 2/ NFSv3 allows symlinks upto 4096, but the code will only support upto about 3800 at the moment 3/ readdir responses are limited to about 3800. but I thought that patch was big enough, and the rest can come later. This patch introduces vfs_readv and vfs_writev as parallels to vfs_read and vfs_write. This means there is a fair bit of duplication in read_write.c that should probably be tidied up... --- fs/lockd/xdr.c | 19 ---- fs/lockd/xdr4.c | 20 ----- fs/nfsd/nfs3proc.c | 30 +++---- fs/nfsd/nfs3xdr.c | 86 ++++++++++++------- fs/nfsd/nfscache.c | 42 ++++----- fs/nfsd/nfsproc.c | 24 +++--- fs/nfsd/nfssvc.c | 8 +- fs/nfsd/nfsxdr.c | 86 ++++++++++++------- fs/nfsd/vfs.c | 9 +- fs/read_write.c | 94 ++++++++++++++++++++ include/linux/fs.h | 2 + include/linux/nfsd/cache.h | 4 +- include/linux/nfsd/nfsd.h | 4 +- include/linux/nfsd/xdr.h | 8 +- include/linux/nfsd/xdr3.h | 5 +- include/linux/sunrpc/svc.h | 110 +++++++++++++++++------- kernel/ksyms.c | 2 + net/sunrpc/svc.c | 136 +++++++++++++++++------------ net/sunrpc/svcauth.c | 3 +- net/sunrpc/svcauth_unix.c | 65 +++++++------- net/sunrpc/svcsock.c | 210 ++++++++++++++++++++++++++++----------------- net/sunrpc/xprt.c | 2 +- 22 files changed, 602 insertions(+), 367 deletions(-) (limited to 'include/linux') diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 5e87dd2fa59f..3d604168ebf9 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -216,25 +216,6 @@ nlm_encode_testres(u32 *p, struct nlm_res *resp) return p; } -/* - * Check buffer bounds after decoding arguments - */ -static inline int -xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_argbuf; - - return p - buf->base <= 
buf->buflen; -} - -static inline int -xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_resbuf; - - buf->len = p - buf->base; - return (buf->len <= buf->buflen); -} /* * First, the server side XDR functions diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 479bcdf73c2d..1f11211cbeb2 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -222,26 +222,6 @@ nlm4_encode_testres(u32 *p, struct nlm_res *resp) } -/* - * Check buffer bounds after decoding arguments - */ -static int -xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_argbuf; - - return p - buf->base <= buf->buflen; -} - -static int -xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_resbuf; - - buf->len = p - buf->base; - return (buf->len <= buf->buflen); -} - /* * First, the server side XDR functions */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 61be6186bacf..ede78ddd05ae 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -43,11 +43,11 @@ static int nfs3_ftypes[] = { /* * Reserve room in the send buffer */ -static void -svcbuf_reserve(struct svc_buf *buf, u32 **ptr, int *len, int nr) +static inline void +svcbuf_reserve(struct xdr_buf *buf, u32 **ptr, int *len, int nr) { - *ptr = buf->buf + nr; - *len = buf->buflen - buf->len - nr; + *ptr = (u32*)(buf->head[0].iov_base+buf->head[0].iov_len) + nr; + *len = ((PAGE_SIZE-buf->head[0].iov_len)>>2) - nr; } /* @@ -150,7 +150,7 @@ nfsd3_proc_readlink(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: READLINK(3) %s\n", SVCFH_fmt(&argp->fh)); /* Reserve room for status, post_op_attr, and path length */ - svcbuf_reserve(&rqstp->rq_resbuf, &path, &dummy, + svcbuf_reserve(&rqstp->rq_res, &path, &dummy, 1 + NFS3_POST_OP_ATTR_WORDS + 1); /* Read the symlink. 
*/ @@ -167,8 +167,7 @@ static int nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, struct nfsd3_readres *resp) { - u32 * buffer; - int nfserr, avail; + int nfserr; dprintk("nfsd: READ(3) %s %lu bytes at %lu\n", SVCFH_fmt(&argp->fh), @@ -179,18 +178,17 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, * 1 (status) + 22 (post_op_attr) + 1 (count) + 1 (eof) * + 1 (xdr opaque byte count) = 26 */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &avail, - 1 + NFS3_POST_OP_ATTR_WORDS + 3); + resp->count = argp->count; - if ((avail << 2) < resp->count) - resp->count = avail << 2; + if (NFSSVC_MAXBLKSIZE < resp->count) + resp->count = NFSSVC_MAXBLKSIZE; - svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + argp->count +4); + svc_reserve(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); nfserr = nfsd_read(rqstp, &resp->fh, argp->offset, - (char *) buffer, + argp->vec, argp->vlen, &resp->count); if (nfserr == 0) { struct inode *inode = resp->fh.fh_dentry->d_inode; @@ -220,7 +218,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp, resp->committed = argp->stable; nfserr = nfsd_write(rqstp, &resp->fh, argp->offset, - argp->data, + argp->vec, argp->vlen, argp->len, &resp->committed); resp->count = argp->count; @@ -447,7 +445,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, argp->count, (u32) argp->cookie); /* Reserve buffer space for status, attributes and verifier */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &count, + svcbuf_reserve(&rqstp->rq_res, &buffer, &count, 1 + NFS3_POST_OP_ATTR_WORDS + 2); /* Make sure we've room for the NULL ptr & eof flag, and shrink to @@ -490,7 +488,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, argp->count, (u32) argp->cookie); /* Reserve buffer space for status, attributes and verifier */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &count, + 
svcbuf_reserve(&rqstp->rq_res, &buffer, &count, 1 + NFS3_POST_OP_ATTR_WORDS + 2); /* Make sure we've room for the NULL ptr & eof flag, and shrink to diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 9eeba9f3291d..963bf3c7bf1d 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -269,27 +270,6 @@ encode_wcc_data(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) return encode_post_op_attr(rqstp, p, fhp); } -/* - * Check buffer bounds after decoding arguments - */ -static inline int -xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_argbuf; - - return p - buf->base <= buf->buflen; -} - -static inline int -xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_resbuf; - - buf->len = p - buf->base; - dprintk("nfsd: ressize_check p %p base %p len %d\n", - p, buf->base, buf->buflen); - return (buf->len <= buf->buflen); -} /* * XDR decode functions @@ -342,11 +322,29 @@ int nfs3svc_decode_readargs(struct svc_rqst *rqstp, u32 *p, struct nfsd3_readargs *args) { + int len; + int v,pn; + if (!(p = decode_fh(p, &args->fh)) || !(p = xdr_decode_hyper(p, &args->offset))) return 0; - args->count = ntohl(*p++); + len = args->count = ntohl(*p++); + + if (len > NFSSVC_MAXBLKSIZE) + len = NFSSVC_MAXBLKSIZE; + + /* set up the iovec */ + v=0; + while (len > 0) { + pn = rqstp->rq_resused; + take_page(rqstp); + args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + args->vec[v].iov_len = len < PAGE_SIZE? 
len : PAGE_SIZE; + v++; + len -= PAGE_SIZE; + } + args->vlen = v; return xdr_argsize_check(rqstp, p); } @@ -354,17 +352,33 @@ int nfs3svc_decode_writeargs(struct svc_rqst *rqstp, u32 *p, struct nfsd3_writeargs *args) { + int len, v; + if (!(p = decode_fh(p, &args->fh)) || !(p = xdr_decode_hyper(p, &args->offset))) return 0; args->count = ntohl(*p++); args->stable = ntohl(*p++); - args->len = ntohl(*p++); - args->data = (char *) p; - p += XDR_QUADLEN(args->len); + len = args->len = ntohl(*p++); + + args->vec[0].iov_base = (void*)p; + args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - + (((void*)p) - rqstp->rq_arg.head[0].iov_base); + + if (len > NFSSVC_MAXBLKSIZE) + len = NFSSVC_MAXBLKSIZE; + v= 0; + while (len > args->vec[v].iov_len) { + len -= args->vec[v].iov_len; + v++; + args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]); + args->vec[v].iov_len = PAGE_SIZE; + } + args->vec[v].iov_len = len; + args->vlen = v+1; - return xdr_argsize_check(rqstp, p); + return args->count == args->len && args->vec[0].iov_len > 0; } int @@ -584,9 +598,23 @@ nfs3svc_encode_readres(struct svc_rqst *rqstp, u32 *p, *p++ = htonl(resp->count); *p++ = htonl(resp->eof); *p++ = htonl(resp->count); /* xdr opaque count */ - p += XDR_QUADLEN(resp->count); - } - return xdr_ressize_check(rqstp, p); + xdr_ressize_check(rqstp, p); + /* now update rqstp->rq_res to reflect data aswell */ + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to page with tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count & 3); + } + rqstp->rq_res.len = + rqstp->rq_res.head[0].iov_len+ + rqstp->rq_res.page_len+ + rqstp->rq_res.tail[0].iov_len; + return 1; + } else + return xdr_ressize_check(rqstp, p); } /* WRITE */ diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index ab52b4b100f5..b1ae27ee05ba 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -41,7 +41,7 @@ static struct svc_cacherep 
* lru_tail; static struct svc_cacherep * nfscache; static int cache_disabled = 1; -static int nfsd_cache_append(struct svc_rqst *rqstp, struct svc_buf *data); +static int nfsd_cache_append(struct svc_rqst *rqstp, struct iovec *vec); /* * locking for the reply cache: @@ -107,7 +107,7 @@ nfsd_cache_shutdown(void) for (rp = lru_head; rp; rp = rp->c_lru_next) { if (rp->c_state == RC_DONE && rp->c_type == RC_REPLBUFF) - kfree(rp->c_replbuf.buf); + kfree(rp->c_replvec.iov_base); } cache_disabled = 1; @@ -242,8 +242,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp, int type) /* release any buffer */ if (rp->c_type == RC_REPLBUFF) { - kfree(rp->c_replbuf.buf); - rp->c_replbuf.buf = NULL; + kfree(rp->c_replvec.iov_base); + rp->c_replvec.iov_base = NULL; } rp->c_type = RC_NOCACHE; out: @@ -272,11 +272,11 @@ found_entry: case RC_NOCACHE: break; case RC_REPLSTAT: - svc_putu32(&rqstp->rq_resbuf, rp->c_replstat); + svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat); rtn = RC_REPLY; break; case RC_REPLBUFF: - if (!nfsd_cache_append(rqstp, &rp->c_replbuf)) + if (!nfsd_cache_append(rqstp, &rp->c_replvec)) goto out; /* should not happen */ rtn = RC_REPLY; break; @@ -308,13 +308,14 @@ void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp) { struct svc_cacherep *rp; - struct svc_buf *resp = &rqstp->rq_resbuf, *cachp; + struct iovec *resv = &rqstp->rq_res.head[0], *cachv; int len; if (!(rp = rqstp->rq_cacherep) || cache_disabled) return; - len = resp->len - (statp - resp->base); + len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); + len >>= 2; /* Don't cache excessive amounts of data and XDR failures */ if (!statp || len > (256 >> 2)) { @@ -329,16 +330,16 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp) rp->c_replstat = *statp; break; case RC_REPLBUFF: - cachp = &rp->c_replbuf; - cachp->buf = (u32 *) kmalloc(len << 2, GFP_KERNEL); - if (!cachp->buf) { + cachv = &rp->c_replvec; + cachv->iov_base = kmalloc(len << 2, GFP_KERNEL); + if 
(!cachv->iov_base) { spin_lock(&cache_lock); rp->c_state = RC_UNUSED; spin_unlock(&cache_lock); return; } - cachp->len = len; - memcpy(cachp->buf, statp, len << 2); + cachv->iov_len = len << 2; + memcpy(cachv->iov_base, statp, len << 2); break; } spin_lock(&cache_lock); @@ -353,19 +354,20 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, u32 *statp) /* * Copy cached reply to current reply buffer. Should always fit. + * FIXME as reply is in a page, we should just attach the page, and + * keep a refcount.... */ static int -nfsd_cache_append(struct svc_rqst *rqstp, struct svc_buf *data) +nfsd_cache_append(struct svc_rqst *rqstp, struct iovec *data) { - struct svc_buf *resp = &rqstp->rq_resbuf; + struct iovec *vec = &rqstp->rq_res.head[0]; - if (resp->len + data->len > resp->buflen) { + if (vec->iov_len + data->iov_len > PAGE_SIZE) { printk(KERN_WARNING "nfsd: cached reply too large (%d).\n", - data->len); + data->iov_len); return 0; } - memcpy(resp->buf, data->buf, data->len << 2); - resp->buf += data->len; - resp->len += data->len; + memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len); + vec->iov_len += data->iov_len; return 1; } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 06c4326e469b..997400e1105a 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -30,11 +30,11 @@ typedef struct svc_buf svc_buf; #define NFSDDBG_FACILITY NFSDDBG_PROC -static void -svcbuf_reserve(struct svc_buf *buf, u32 **ptr, int *len, int nr) +static inline void +svcbuf_reserve(struct xdr_buf *buf, u32 **ptr, int *len, int nr) { - *ptr = buf->buf + nr; - *len = buf->buflen - buf->len - nr; + *ptr = (u32*)(buf->head[0].iov_base+buf->head[0].iov_len) + nr; + *len = ((PAGE_SIZE-buf->head[0].iov_len)>>2) - nr; } static int @@ -109,7 +109,7 @@ nfsd_proc_readlink(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: READLINK %s\n", SVCFH_fmt(&argp->fh)); /* Reserve room for status and path length */ - svcbuf_reserve(&rqstp->rq_resbuf, 
&path, &dummy, 2); + svcbuf_reserve(&rqstp->rq_res, &path, &dummy, 2); /* Read the symlink. */ resp->len = NFS_MAXPATHLEN; @@ -127,8 +127,7 @@ static int nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, struct nfsd_readres *resp) { - u32 * buffer; - int nfserr, avail; + int nfserr; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -137,22 +136,21 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, /* Obtain buffer pointer for payload. 19 is 1 word for * status, 17 words for fattr, and 1 word for the byte count. */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &avail, 19); - if ((avail << 2) < argp->count) { + if (NFSSVC_MAXBLKSIZE < argp->count) { printk(KERN_NOTICE "oversized read request from %08x:%d (%d bytes)\n", ntohl(rqstp->rq_addr.sin_addr.s_addr), ntohs(rqstp->rq_addr.sin_port), argp->count); - argp->count = avail << 2; + argp->count = NFSSVC_MAXBLKSIZE; } svc_reserve(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, - (char *) buffer, + argp->vec, argp->vlen, &resp->count); return nfserr; @@ -175,7 +173,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp, nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, - argp->data, + argp->vec, argp->vlen, argp->len, &stable); return nfserr; @@ -478,7 +476,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp, struct nfsd_readdirargs *argp, argp->count, argp->cookie); /* Reserve buffer space for status */ - svcbuf_reserve(&rqstp->rq_resbuf, &buffer, &count, 1); + svcbuf_reserve(&rqstp->rq_res, &buffer, &count, 1); /* Shrink to the client read size */ if (count > (argp->count >> 2)) diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a12f2c980865..da4271183ef7 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -277,7 +277,8 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp) /* Decode arguments */ xdr = proc->pc_decode; - if (xdr && !xdr(rqstp, 
rqstp->rq_argbuf.buf, rqstp->rq_argp)) { + if (xdr && !xdr(rqstp, (u32*)rqstp->rq_arg.head[0].iov_base, + rqstp->rq_argp)) { dprintk("nfsd: failed to decode arguments!\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); *statp = rpc_garbage_args; @@ -293,14 +294,15 @@ nfsd_dispatch(struct svc_rqst *rqstp, u32 *statp) } if (rqstp->rq_proc != 0) - svc_putu32(&rqstp->rq_resbuf, nfserr); + svc_putu32(&rqstp->rq_res.head[0], nfserr); /* Encode result. * For NFSv2, additional info is never returned in case of an error. */ if (!(nfserr && rqstp->rq_vers == 2)) { xdr = proc->pc_encode; - if (xdr && !xdr(rqstp, rqstp->rq_resbuf.buf, rqstp->rq_resp)) { + if (xdr && !xdr(rqstp, (u32*)(rqstp->rq_res.head[0].iov_base+rqstp->rq_res.head[0].iov_len), + rqstp->rq_resp)) { /* Failed to encode result. Release cache entry */ dprintk("nfsd: failed to encode result!\n"); nfsd_cache_update(rqstp, RC_NOCACHE, NULL); diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 15f1c7a16031..d0895793efb1 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -14,6 +14,7 @@ #include #include #include +#include #define NFSDDBG_FACILITY NFSDDBG_XDR @@ -176,27 +177,6 @@ encode_fattr(struct svc_rqst *rqstp, u32 *p, struct svc_fh *fhp) return p; } -/* - * Check buffer bounds after decoding arguments - */ -static inline int -xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_argbuf; - - return p - buf->base <= buf->buflen; -} - -static inline int -xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) -{ - struct svc_buf *buf = &rqstp->rq_resbuf; - - buf->len = p - buf->base; - dprintk("nfsd: ressize_check p %p base %p len %d\n", - p, buf->base, buf->buflen); - return (buf->len <= buf->buflen); -} /* * XDR decode functions @@ -241,13 +221,31 @@ int nfssvc_decode_readargs(struct svc_rqst *rqstp, u32 *p, struct nfsd_readargs *args) { + int len; + int v,pn; if (!(p = decode_fh(p, &args->fh))) return 0; args->offset = ntohl(*p++); - args->count = ntohl(*p++); - args->totalsize 
= ntohl(*p++); + len = args->count = ntohl(*p++); + p++; /* totalcount - unused */ + + if (len > NFSSVC_MAXBLKSIZE) + len = NFSSVC_MAXBLKSIZE; + /* set up somewhere to store response. + * We take pages, put them on reslist and include in iovec + */ + v=0; + while (len > 0) { + pn=rqstp->rq_resused; + take_page(rqstp); + args->vec[v].iov_base = page_address(rqstp->rq_respages[pn]); + args->vec[v].iov_len = len < PAGE_SIZE?len:PAGE_SIZE; + v++; + len -= PAGE_SIZE; + } + args->vlen = v; return xdr_argsize_check(rqstp, p); } @@ -255,17 +253,30 @@ int nfssvc_decode_writeargs(struct svc_rqst *rqstp, u32 *p, struct nfsd_writeargs *args) { + int len; + int v; if (!(p = decode_fh(p, &args->fh))) return 0; p++; /* beginoffset */ args->offset = ntohl(*p++); /* offset */ p++; /* totalcount */ - args->len = ntohl(*p++); - args->data = (char *) p; - p += XDR_QUADLEN(args->len); - - return xdr_argsize_check(rqstp, p); + len = args->len = ntohl(*p++); + args->vec[0].iov_base = (void*)p; + args->vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - + (((void*)p) - rqstp->rq_arg.head[0].iov_base); + if (len > NFSSVC_MAXBLKSIZE) + len = NFSSVC_MAXBLKSIZE; + v = 0; + while (len > args->vec[v].iov_len) { + len -= args->vec[v].iov_len; + v++; + args->vec[v].iov_base = page_address(rqstp->rq_argpages[v]); + args->vec[v].iov_len = PAGE_SIZE; + } + args->vec[v].iov_len = len; + args->vlen = v+1; + return args->vec[0].iov_len > 0; } int @@ -371,9 +382,22 @@ nfssvc_encode_readres(struct svc_rqst *rqstp, u32 *p, { p = encode_fattr(rqstp, p, &resp->fh); *p++ = htonl(resp->count); - p += XDR_QUADLEN(resp->count); - - return xdr_ressize_check(rqstp, p); + xdr_ressize_check(rqstp, p); + + /* now update rqstp->rq_res to reflect data as well */ + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = resp->count; + if (resp->count & 3) { + /* need to pad with tail */ + rqstp->rq_res.tail[0].iov_base = p; + *p = 0; + rqstp->rq_res.tail[0].iov_len = 4 - (resp->count&3); + } + rqstp->rq_res.len = + 
rqstp->rq_res.head[0].iov_len+ + rqstp->rq_res.page_len+ + rqstp->rq_res.tail[0].iov_len; + return 1; } int diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index cb06e914d548..76ad1349e5b9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -577,7 +577,7 @@ found: */ int nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - char *buf, unsigned long *count) + struct iovec *vec, int vlen, unsigned long *count) { struct raparms *ra; mm_segment_t oldfs; @@ -603,7 +603,7 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, oldfs = get_fs(); set_fs(KERNEL_DS); - err = vfs_read(&file, buf, *count, &offset); + err = vfs_readv(&file, vec, vlen, *count, &offset); set_fs(oldfs); /* Write back readahead params */ @@ -629,7 +629,8 @@ out: */ int nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - char *buf, unsigned long cnt, int *stablep) + struct iovec *vec, int vlen, + unsigned long cnt, int *stablep) { struct svc_export *exp; struct file file; @@ -677,7 +678,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, /* Write the data. 
*/ oldfs = get_fs(); set_fs(KERNEL_DS); - err = vfs_write(&file, buf, cnt, &offset); + err = vfs_writev(&file, vec, vlen, cnt, &offset); if (err >= 0) nfsdstats.io_write += cnt; set_fs(oldfs); diff --git a/fs/read_write.c b/fs/read_write.c index a8b23e6367ee..a773421cb6f7 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -207,6 +207,53 @@ ssize_t vfs_read(struct file *file, char *buf, size_t count, loff_t *pos) return ret; } +ssize_t vfs_readv(struct file *file, struct iovec *vec, int vlen, size_t count, loff_t *pos) +{ + struct inode *inode = file->f_dentry->d_inode; + ssize_t ret; + + if (!(file->f_mode & FMODE_READ)) + return -EBADF; + if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) + return -EINVAL; + + ret = locks_verify_area(FLOCK_VERIFY_READ, inode, file, *pos, count); + if (!ret) { + ret = security_ops->file_permission (file, MAY_READ); + if (!ret) { + if (file->f_op->readv) + ret = file->f_op->readv(file, vec, vlen, pos); + else { + /* do it by hand */ + struct iovec *vector = vec; + ret = 0; + while (vlen > 0) { + void * base = vector->iov_base; + size_t len = vector->iov_len; + ssize_t nr; + vector++; + vlen--; + if (file->f_op->read) + nr = file->f_op->read(file, base, len, pos); + else + nr = do_sync_read(file, base, len, pos); + if (nr < 0) { + if (!ret) ret = nr; + break; + } + ret += nr; + if (nr != len) + break; + } + } + if (ret > 0) + dnotify_parent(file->f_dentry, DN_ACCESS); + } + } + + return ret; +} + ssize_t do_sync_write(struct file *filp, const char *buf, size_t len, loff_t *ppos) { struct kiocb kiocb; @@ -247,6 +294,53 @@ ssize_t vfs_write(struct file *file, const char *buf, size_t count, loff_t *pos) return ret; } +ssize_t vfs_writev(struct file *file, const struct iovec *vec, int vlen, size_t count, loff_t *pos) +{ + struct inode *inode = file->f_dentry->d_inode; + ssize_t ret; + + if (!(file->f_mode & FMODE_WRITE)) + return -EBADF; + if (!file->f_op || (!file->f_op->write && !file->f_op->aio_write)) + return 
-EINVAL; + + ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file, *pos, count); + if (!ret) { + ret = security_ops->file_permission (file, MAY_WRITE); + if (!ret) { + if (file->f_op->writev) + ret = file->f_op->writev(file, vec, vlen, pos); + else { + /* do it by hand */ + const struct iovec *vector = vec; + ret = 0; + while (vlen > 0) { + void * base = vector->iov_base; + size_t len = vector->iov_len; + ssize_t nr; + vector++; + vlen--; + if (file->f_op->write) + nr = file->f_op->write(file, base, len, pos); + else + nr = do_sync_write(file, base, len, pos); + if (nr < 0) { + if (!ret) ret = nr; + break; + } + ret += nr; + if (nr != len) + break; + } + } + if (ret > 0) + dnotify_parent(file->f_dentry, DN_MODIFY); + } + } + + return ret; +} + asmlinkage ssize_t sys_read(unsigned int fd, char * buf, size_t count) { struct file *file; diff --git a/include/linux/fs.h b/include/linux/fs.h index 0adb782c7ac7..9a3e78ba7592 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -795,6 +795,8 @@ struct seq_file; extern ssize_t vfs_read(struct file *, char *, size_t, loff_t *); extern ssize_t vfs_write(struct file *, const char *, size_t, loff_t *); +extern ssize_t vfs_readv(struct file *, struct iovec *, int, size_t, loff_t *); +extern ssize_t vfs_writev(struct file *, const struct iovec *, int, size_t, loff_t *); /* * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called diff --git a/include/linux/nfsd/cache.h b/include/linux/nfsd/cache.h index ae2da13bed23..b780f9635930 100644 --- a/include/linux/nfsd/cache.h +++ b/include/linux/nfsd/cache.h @@ -32,12 +32,12 @@ struct svc_cacherep { u32 c_vers; unsigned long c_timestamp; union { - struct svc_buf u_buffer; + struct iovec u_vec; u32 u_status; } c_u; }; -#define c_replbuf c_u.u_buffer +#define c_replvec c_u.u_vec #define c_replstat c_u.u_status /* cache entry states */ diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 94fc6231004a..1b8b01067391 100644 --- 
a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -88,9 +88,9 @@ int nfsd_open(struct svc_rqst *, struct svc_fh *, int, int, struct file *); void nfsd_close(struct file *); int nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, char *, unsigned long *); + loff_t, struct iovec *,int, unsigned long *); int nfsd_write(struct svc_rqst *, struct svc_fh *, - loff_t, char *, unsigned long, int *); + loff_t, struct iovec *,int, unsigned long, int *); int nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); int nfsd_symlink(struct svc_rqst *, struct svc_fh *, diff --git a/include/linux/nfsd/xdr.h b/include/linux/nfsd/xdr.h index dc6f850f3622..97078834e430 100644 --- a/include/linux/nfsd/xdr.h +++ b/include/linux/nfsd/xdr.h @@ -29,16 +29,16 @@ struct nfsd_readargs { struct svc_fh fh; __u32 offset; __u32 count; - __u32 totalsize; + struct iovec vec[RPCSVC_MAXPAGES]; + int vlen; }; struct nfsd_writeargs { svc_fh fh; - __u32 beginoffset; __u32 offset; - __u32 totalcount; - __u8 * data; int len; + struct iovec vec[RPCSVC_MAXPAGES]; + int vlen; }; struct nfsd_createargs { diff --git a/include/linux/nfsd/xdr3.h b/include/linux/nfsd/xdr3.h index 83ec5bc2b542..1576a6db4a17 100644 --- a/include/linux/nfsd/xdr3.h +++ b/include/linux/nfsd/xdr3.h @@ -33,6 +33,8 @@ struct nfsd3_readargs { struct svc_fh fh; __u64 offset; __u32 count; + struct iovec vec[RPCSVC_MAXPAGES]; + int vlen; }; struct nfsd3_writeargs { @@ -40,8 +42,9 @@ struct nfsd3_writeargs { __u64 offset; __u32 count; int stable; - __u8 * data; int len; + struct iovec vec[RPCSVC_MAXPAGES]; + int vlen; }; struct nfsd3_createargs { diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index 9ad879d9bea7..24464d66411a 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -48,43 +48,49 @@ struct svc_serv { * This is use to determine the max number of pages nfsd is * willing to return in a single READ operation. 
*/ -#define RPCSVC_MAXPAYLOAD 16384u +#define RPCSVC_MAXPAYLOAD (64*1024u) /* - * Buffer to store RPC requests or replies in. - * Each server thread has one of these beasts. + * RPC Requests and replies are stored in one or more pages. + * We maintain an array of pages for each server thread. + * Requests are copied into these pages as they arrive. Remaining + * pages are available to write the reply into. * - * Area points to the allocated memory chunk currently owned by the - * buffer. Base points to the buffer containing the request, which is - * different from area when directly reading from an sk_buff. buf is - * the current read/write position while processing an RPC request. + * Currently pages are all re-used by the same server. Later we + * will use ->sendpage to transmit pages with reduced copying. In + * that case we will need to give away the page and allocate new ones. + * In preparation for this, we explicitly move pages off the recv + * list onto the transmit list, and back. * - * The array of iovecs can hold additional data that the server process - * may not want to copy into the RPC reply buffer, but pass to the - * network sendmsg routines directly. The prime candidate for this - * will of course be NFS READ operations, but one might also want to - * do something about READLINK and READDIR. It might be worthwhile - * to implement some generic readdir cache in the VFS layer... + * We use xdr_buf for holding responses as it fits well with NFS + * read responses (that have a header, and some data pages, and possibly + * a tail) and means we can share some client side routines. * - * On the receiving end of the RPC server, the iovec may be used to hold - * the list of IP fragments once we get to process fragmented UDP - * datagrams directly. + * The xdr_buf.head iovec always points to the first page in the rq_*pages + * list. The xdr_buf.pages pointer points to the second page on that + * list. xdr_buf.tail points to the end of the first page. 
+ * This assumes that the non-page part of an rpc reply will fit + * in a page - NFSd ensures this. lockd also has no trouble. */ -#define RPCSVC_MAXIOV ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE + 1) -struct svc_buf { - u32 * area; /* allocated memory */ - u32 * base; /* base of RPC datagram */ - int buflen; /* total length of buffer */ - u32 * buf; /* read/write pointer */ - int len; /* current end of buffer */ - - /* iovec for zero-copy NFS READs */ - struct iovec iov[RPCSVC_MAXIOV]; - int nriov; -}; -#define svc_getu32(argp, val) { (val) = *(argp)->buf++; (argp)->len--; } -#define svc_putu32(resp, val) { *(resp)->buf++ = (val); (resp)->len++; } +#define RPCSVC_MAXPAGES ((RPCSVC_MAXPAYLOAD+PAGE_SIZE-1)/PAGE_SIZE + 1) + +static inline u32 svc_getu32(struct iovec *iov) +{ + u32 val, *vp; + vp = iov->iov_base; + val = *vp++; + iov->iov_base = (void*)vp; + iov->iov_len -= sizeof(u32); + return val; +} +static inline void svc_putu32(struct iovec *iov, u32 val) +{ + u32 *vp = iov->iov_base + iov->iov_len; + *vp = val; + iov->iov_len += sizeof(u32); +} + /* * The context of a single thread, including the request currently being * processed. 
@@ -102,9 +108,15 @@ struct svc_rqst { struct svc_cred rq_cred; /* auth info */ struct sk_buff * rq_skbuff; /* fast recv inet buffer */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ - struct svc_buf rq_defbuf; /* default buffer */ - struct svc_buf rq_argbuf; /* argument buffer */ - struct svc_buf rq_resbuf; /* result buffer */ + + struct xdr_buf rq_arg; + struct xdr_buf rq_res; + struct page * rq_argpages[RPCSVC_MAXPAGES]; + struct page * rq_respages[RPCSVC_MAXPAGES]; + short rq_argused; /* pages used for argument */ + short rq_arghi; /* pages available in argument page list */ + short rq_resused; /* pages used for result */ + u32 rq_xid; /* transmission id */ u32 rq_prog; /* program number */ u32 rq_vers; /* program version */ @@ -136,6 +148,38 @@ struct svc_rqst { wait_queue_head_t rq_wait; /* synchronization */ }; +/* + * Check buffer bounds after decoding arguments + */ +static inline int +xdr_argsize_check(struct svc_rqst *rqstp, u32 *p) +{ + char *cp = (char *)p; + struct iovec *vec = &rqstp->rq_arg.head[0]; + return cp - (char*)vec->iov_base <= vec->iov_len; +} + +static inline int +xdr_ressize_check(struct svc_rqst *rqstp, u32 *p) +{ + struct iovec *vec = &rqstp->rq_res.head[0]; + char *cp = (char*)p; + + vec->iov_len = cp - (char*)vec->iov_base; + rqstp->rq_res.len = vec->iov_len; + + return vec->iov_len <= PAGE_SIZE; +} + +static int inline take_page(struct svc_rqst *rqstp) +{ + if (rqstp->rq_arghi <= rqstp->rq_argused) + return -ENOMEM; + rqstp->rq_respages[rqstp->rq_resused++] = + rqstp->rq_argpages[--rqstp->rq_arghi]; + return 0; +} + struct svc_deferred_req { struct svc_serv *serv; u32 prot; /* protocol (UDP or TCP) */ diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 7ecffcd552d1..6628b24ba827 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -253,7 +253,9 @@ EXPORT_SYMBOL(find_inode_number); EXPORT_SYMBOL(is_subdir); EXPORT_SYMBOL(get_unused_fd); EXPORT_SYMBOL(vfs_read); +EXPORT_SYMBOL(vfs_readv); 
EXPORT_SYMBOL(vfs_write); +EXPORT_SYMBOL(vfs_writev); EXPORT_SYMBOL(vfs_create); EXPORT_SYMBOL(vfs_mkdir); EXPORT_SYMBOL(vfs_mknod); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 0296128c3cbc..60cdc3cdb300 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,6 @@ svc_create(struct svc_program *prog, unsigned int bufsize) if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL))) return NULL; - memset(serv, 0, sizeof(*serv)); serv->sv_program = prog; serv->sv_nrthreads = 1; @@ -105,35 +105,42 @@ svc_destroy(struct svc_serv *serv) } /* - * Allocate an RPC server buffer - * Later versions may do nifty things by allocating multiple pages - * of memory directly and putting them into the bufp->iov. + * Allocate an RPC server's buffer space. + * We allocate pages and place them in rq_argpages. */ -int -svc_init_buffer(struct svc_buf *bufp, unsigned int size) +static int +svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) { - if (!(bufp->area = (u32 *) kmalloc(size, GFP_KERNEL))) - return 0; - bufp->base = bufp->area; - bufp->buf = bufp->area; - bufp->len = 0; - bufp->buflen = size >> 2; - - bufp->iov[0].iov_base = bufp->area; - bufp->iov[0].iov_len = size; - bufp->nriov = 1; - - return 1; + int pages = 2 + (size+ PAGE_SIZE -1) / PAGE_SIZE; + int arghi; + + rqstp->rq_argused = 0; + rqstp->rq_resused = 0; + arghi = 0; + if (pages > RPCSVC_MAXPAGES) + BUG(); + while (pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + break; + rqstp->rq_argpages[arghi++] = p; + pages--; + } + rqstp->rq_arghi = arghi; + return ! 
pages; } /* * Release an RPC server buffer */ -void -svc_release_buffer(struct svc_buf *bufp) +static void +svc_release_buffer(struct svc_rqst *rqstp) { - kfree(bufp->area); - bufp->area = 0; + while (rqstp->rq_arghi) + put_page(rqstp->rq_argpages[--rqstp->rq_arghi]); + while (rqstp->rq_resused) + put_page(rqstp->rq_respages[--rqstp->rq_resused]); + rqstp->rq_argused = 0; } /* @@ -154,7 +161,7 @@ svc_create_thread(svc_thread_fn func, struct svc_serv *serv) if (!(rqstp->rq_argp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) || !(rqstp->rq_resp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) - || !svc_init_buffer(&rqstp->rq_defbuf, serv->sv_bufsz)) + || !svc_init_buffer(rqstp, serv->sv_bufsz)) goto out_thread; serv->sv_nrthreads++; @@ -180,7 +187,7 @@ svc_exit_thread(struct svc_rqst *rqstp) { struct svc_serv *serv = rqstp->rq_server; - svc_release_buffer(&rqstp->rq_defbuf); + svc_release_buffer(rqstp); if (rqstp->rq_resp) kfree(rqstp->rq_resp); if (rqstp->rq_argp) @@ -242,37 +249,51 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) struct svc_program *progp; struct svc_version *versp = NULL; /* compiler food */ struct svc_procedure *procp = NULL; - struct svc_buf * argp = &rqstp->rq_argbuf; - struct svc_buf * resp = &rqstp->rq_resbuf; + struct iovec * argv = &rqstp->rq_arg.head[0]; + struct iovec * resv = &rqstp->rq_res.head[0]; kxdrproc_t xdr; - u32 *bufp, *statp; + u32 *statp; u32 dir, prog, vers, proc, auth_stat, rpc_stat; rpc_stat = rpc_success; - bufp = argp->buf; - if (argp->len < 5) + if (argv->iov_len < 6*4) goto err_short_len; - dir = ntohl(*bufp++); - vers = ntohl(*bufp++); + /* setup response xdr_buf. + * Initially it has just one page + */ + take_page(rqstp); /* must succeed */ + resv->iov_base = page_address(rqstp->rq_respages[0]); + resv->iov_len = 0; + rqstp->rq_res.pages = rqstp->rq_respages+1; + rqstp->rq_res.len = 0; + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = 0; + /* tcp needs a space for the record length... 
*/ + if (rqstp->rq_prot == IPPROTO_TCP) + svc_putu32(resv, 0); + + rqstp->rq_xid = svc_getu32(argv); + svc_putu32(resv, rqstp->rq_xid); + + dir = ntohl(svc_getu32(argv)); + vers = ntohl(svc_getu32(argv)); /* First words of reply: */ - svc_putu32(resp, xdr_one); /* REPLY */ - svc_putu32(resp, xdr_zero); /* ACCEPT */ + svc_putu32(resv, xdr_one); /* REPLY */ if (dir != 0) /* direction != CALL */ goto err_bad_dir; if (vers != 2) /* RPC version number */ goto err_bad_rpc; - rqstp->rq_prog = prog = ntohl(*bufp++); /* program number */ - rqstp->rq_vers = vers = ntohl(*bufp++); /* version number */ - rqstp->rq_proc = proc = ntohl(*bufp++); /* procedure number */ + svc_putu32(resv, xdr_zero); /* ACCEPT */ - argp->buf += 5; - argp->len -= 5; + rqstp->rq_prog = prog = ntohl(svc_getu32(argv)); /* program number */ + rqstp->rq_vers = vers = ntohl(svc_getu32(argv)); /* version number */ + rqstp->rq_proc = proc = ntohl(svc_getu32(argv)); /* procedure number */ /* * Decode auth data, and add verifier to reply buffer. @@ -307,8 +328,8 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) serv->sv_stats->rpccnt++; /* Build the reply header. 
*/ - statp = resp->buf; - svc_putu32(resp, rpc_success); /* RPC_SUCCESS */ + statp = resv->iov_base +resv->iov_len; + svc_putu32(resv, rpc_success); /* RPC_SUCCESS */ /* Bump per-procedure stats counter */ procp->pc_count++; @@ -327,14 +348,14 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) if (!versp->vs_dispatch) { /* Decode arguments */ xdr = procp->pc_decode; - if (xdr && !xdr(rqstp, rqstp->rq_argbuf.buf, rqstp->rq_argp)) + if (xdr && !xdr(rqstp, argv->iov_base, rqstp->rq_argp)) goto err_garbage; *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); /* Encode reply */ if (*statp == rpc_success && (xdr = procp->pc_encode) - && !xdr(rqstp, rqstp->rq_resbuf.buf, rqstp->rq_resp)) { + && !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { dprintk("svc: failed to encode reply\n"); /* serv->sv_stats->rpcsystemerr++; */ *statp = rpc_system_err; @@ -347,7 +368,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) /* Check RPC status result */ if (*statp != rpc_success) - resp->len = statp + 1 - resp->base; + resv->iov_len = ((void*)statp) - resv->iov_base + 4; /* Release reply info */ if (procp->pc_release) @@ -369,7 +390,7 @@ svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) err_short_len: #ifdef RPC_PARANOIA - printk("svc: short len %d, dropping request\n", argp->len); + printk("svc: short len %d, dropping request\n", argv->iov_len); #endif goto dropit; /* drop request */ @@ -382,18 +403,19 @@ err_bad_dir: err_bad_rpc: serv->sv_stats->rpcbadfmt++; - resp->buf[-1] = xdr_one; /* REJECT */ - svc_putu32(resp, xdr_zero); /* RPC_MISMATCH */ - svc_putu32(resp, xdr_two); /* Only RPCv2 supported */ - svc_putu32(resp, xdr_two); + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_zero); /* RPC_MISMATCH */ + svc_putu32(resv, xdr_two); /* Only RPCv2 supported */ + svc_putu32(resv, xdr_two); goto sendit; err_bad_auth: dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); serv->sv_stats->rpcbadauth++; - 
resp->buf[-1] = xdr_one; /* REJECT */ - svc_putu32(resp, xdr_one); /* AUTH_ERROR */ - svc_putu32(resp, auth_stat); /* status */ + resv->iov_len -= 4; + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_one); /* AUTH_ERROR */ + svc_putu32(resv, auth_stat); /* status */ goto sendit; err_bad_prog: @@ -403,7 +425,7 @@ err_bad_prog: /* else it is just a Solaris client seeing if ACLs are supported */ #endif serv->sv_stats->rpcbadfmt++; - svc_putu32(resp, rpc_prog_unavail); + svc_putu32(resv, rpc_prog_unavail); goto sendit; err_bad_vers: @@ -411,9 +433,9 @@ err_bad_vers: printk("svc: unknown version (%d)\n", vers); #endif serv->sv_stats->rpcbadfmt++; - svc_putu32(resp, rpc_prog_mismatch); - svc_putu32(resp, htonl(progp->pg_lovers)); - svc_putu32(resp, htonl(progp->pg_hivers)); + svc_putu32(resv, rpc_prog_mismatch); + svc_putu32(resv, htonl(progp->pg_lovers)); + svc_putu32(resv, htonl(progp->pg_hivers)); goto sendit; err_bad_proc: @@ -421,7 +443,7 @@ err_bad_proc: printk("svc: unknown procedure (%d)\n", proc); #endif serv->sv_stats->rpcbadfmt++; - svc_putu32(resp, rpc_proc_unavail); + svc_putu32(resv, rpc_proc_unavail); goto sendit; err_garbage: @@ -429,6 +451,6 @@ err_garbage: printk("svc: failed to decode args\n"); #endif serv->sv_stats->rpcbadfmt++; - svc_putu32(resp, rpc_garbage_args); + svc_putu32(resv, rpc_garbage_args); goto sendit; } diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c index 5e68c0c270ca..39a46f7a12f5 100644 --- a/net/sunrpc/svcauth.c +++ b/net/sunrpc/svcauth.c @@ -40,8 +40,7 @@ svc_authenticate(struct svc_rqst *rqstp, u32 *statp, u32 *authp, int proc) *statp = rpc_success; *authp = rpc_auth_ok; - svc_getu32(&rqstp->rq_argbuf, flavor); - flavor = ntohl(flavor); + flavor = ntohl(svc_getu32(&rqstp->rq_arg.head[0])); dprintk("svc: svc_authenticate (%d)\n", flavor); if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor])) { diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c index 7fabce411088..37e74850f362 100644 
--- a/net/sunrpc/svcauth_unix.c +++ b/net/sunrpc/svcauth_unix.c @@ -295,20 +295,20 @@ void svcauth_unix_purge(void) static int svcauth_null_accept(struct svc_rqst *rqstp, u32 *authp, int proc) { - struct svc_buf *argp = &rqstp->rq_argbuf; - struct svc_buf *resp = &rqstp->rq_resbuf; + struct iovec *argv = &rqstp->rq_arg.head[0]; + struct iovec *resv = &rqstp->rq_res.head[0]; int rv=0; struct ip_map key, *ipm; - if ((argp->len -= 3) < 0) { + if (argv->iov_len < 3*4) return SVC_GARBAGE; - } - if (*(argp->buf)++ != 0) { /* we already skipped the flavor */ + + if (svc_getu32(argv) != 0) { dprintk("svc: bad null cred\n"); *authp = rpc_autherr_badcred; return SVC_DENIED; } - if (*(argp->buf)++ != RPC_AUTH_NULL || *(argp->buf)++ != 0) { + if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { dprintk("svc: bad null verf\n"); *authp = rpc_autherr_badverf; return SVC_DENIED; @@ -320,8 +320,8 @@ svcauth_null_accept(struct svc_rqst *rqstp, u32 *authp, int proc) rqstp->rq_cred.cr_groups[0] = NOGROUP; /* Put NULL verifier */ - svc_putu32(resp, RPC_AUTH_NULL); - svc_putu32(resp, 0); + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); key.m_class = rqstp->rq_server->sv_program->pg_class; key.m_addr = rqstp->rq_addr.sin_addr; @@ -376,55 +376,54 @@ struct auth_ops svcauth_null = { int svcauth_unix_accept(struct svc_rqst *rqstp, u32 *authp, int proc) { - struct svc_buf *argp = &rqstp->rq_argbuf; - struct svc_buf *resp = &rqstp->rq_resbuf; + struct iovec *argv = &rqstp->rq_arg.head[0]; + struct iovec *resv = &rqstp->rq_res.head[0]; struct svc_cred *cred = &rqstp->rq_cred; - u32 *bufp = argp->buf, slen, i; - int len = argp->len; + u32 slen, i; + int len = argv->iov_len; int rv=0; struct ip_map key, *ipm; - if ((len -= 3) < 0) + if ((len -= 3*4) < 0) return SVC_GARBAGE; - bufp++; /* length */ - bufp++; /* time stamp */ - slen = XDR_QUADLEN(ntohl(*bufp++)); /* machname length */ - if (slen > 64 || (len -= slen + 3) < 0) + svc_getu32(argv); /* length */ + 
svc_getu32(argv); /* time stamp */ + slen = XDR_QUADLEN(ntohl(svc_getu32(argv))); /* machname length */ + if (slen > 64 || (len -= (slen + 3)*4) < 0) goto badcred; - bufp += slen; /* skip machname */ - - cred->cr_uid = ntohl(*bufp++); /* uid */ - cred->cr_gid = ntohl(*bufp++); /* gid */ + argv->iov_base = (void*)((u32*)argv->iov_base + slen); /* skip machname */ + argv->iov_len -= slen*4; - slen = ntohl(*bufp++); /* gids length */ - if (slen > 16 || (len -= slen + 2) < 0) + cred->cr_uid = ntohl(svc_getu32(argv)); /* uid */ + cred->cr_gid = ntohl(svc_getu32(argv)); /* gid */ + slen = ntohl(svc_getu32(argv)); /* gids length */ + if (slen > 16 || (len -= (slen + 2)*4) < 0) goto badcred; - for (i = 0; i < NGROUPS && i < slen; i++) - cred->cr_groups[i] = ntohl(*bufp++); + for (i = 0; i < slen; i++) + if (i < NGROUPS) + cred->cr_groups[i] = ntohl(svc_getu32(argv)); + else + svc_getu32(argv); if (i < NGROUPS) cred->cr_groups[i] = NOGROUP; - bufp += (slen - i); - if (*bufp++ != RPC_AUTH_NULL || *bufp++ != 0) { + if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { *authp = rpc_autherr_badverf; return SVC_DENIED; } - argp->buf = bufp; - argp->len = len; - /* Put NULL verifier */ - svc_putu32(resp, RPC_AUTH_NULL); - svc_putu32(resp, 0); + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); key.m_class = rqstp->rq_server->sv_program->pg_class; key.m_addr = rqstp->rq_addr.sin_addr; + ipm = ip_map_lookup(&key, 0); rqstp->rq_client = NULL; - if (ipm) switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { case -EAGAIN: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 6c2b97c5d18d..4894ce957549 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -234,7 +234,7 @@ svc_sock_received(struct svc_sock *svsk) */ void svc_reserve(struct svc_rqst *rqstp, int space) { - space += rqstp->rq_resbuf.len<<2; + space += rqstp->rq_res.head[0].iov_len; if (space < rqstp->rq_reserved) { struct svc_sock *svsk = rqstp->rq_sock; @@ -278,13 +278,12 
@@ svc_sock_release(struct svc_rqst *rqstp) * But first, check that enough space was reserved * for the reply, otherwise we have a bug! */ - if ((rqstp->rq_resbuf.len<<2) > rqstp->rq_reserved) + if ((rqstp->rq_res.len) > rqstp->rq_reserved) printk(KERN_ERR "RPC request reserved %d but used %d\n", rqstp->rq_reserved, - rqstp->rq_resbuf.len<<2); + rqstp->rq_res.len); - rqstp->rq_resbuf.buf = rqstp->rq_resbuf.base; - rqstp->rq_resbuf.len = 0; + rqstp->rq_res.head[0].iov_len = 0; svc_reserve(rqstp, 0); rqstp->rq_sock = NULL; @@ -348,8 +347,9 @@ svc_sendto(struct svc_rqst *rqstp, struct iovec *iov, int nr) len = sock_sendmsg(sock, &msg, buflen); set_fs(oldfs); - dprintk("svc: socket %p sendto([%p %Zu... ], %d, %d) = %d\n", - rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, nr, buflen, len); + dprintk("svc: socket %p sendto([%p %Zu... ], %d, %d) = %d (addr %x)\n", + rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, nr, buflen, len, + rqstp->rq_addr.sin_addr.s_addr); return len; } @@ -480,13 +480,15 @@ svc_write_space(struct sock *sk) /* * Receive a datagram from a UDP socket. */ +extern int +csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); + static int svc_udp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; struct svc_serv *serv = svsk->sk_server; struct sk_buff *skb; - u32 *data; int err, len; if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) @@ -512,33 +514,19 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) } set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ - /* Sorry. 
*/ - if (skb_is_nonlinear(skb)) { - if (skb_linearize(skb, GFP_KERNEL) != 0) { - kfree_skb(skb); - svc_sock_received(svsk); - return 0; - } - } + len = skb->len - sizeof(struct udphdr); - if (skb->ip_summed != CHECKSUM_UNNECESSARY) { - if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { - skb_free_datagram(svsk->sk_sk, skb); - svc_sock_received(svsk); - return 0; - } + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { + /* checksum error */ + skb_free_datagram(svsk->sk_sk, skb); + svc_sock_received(svsk); + return 0; } - len = skb->len - sizeof(struct udphdr); - data = (u32 *) (skb->data + sizeof(struct udphdr)); - - rqstp->rq_skbuff = skb; - rqstp->rq_argbuf.base = data; - rqstp->rq_argbuf.buf = data; - rqstp->rq_argbuf.len = (len >> 2); - rqstp->rq_argbuf.buflen = (len >> 2); - /* rqstp->rq_resbuf = rqstp->rq_defbuf; */ + rqstp->rq_arg.len = len; + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; rqstp->rq_prot = IPPROTO_UDP; /* Get sender address */ @@ -546,6 +534,8 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) rqstp->rq_addr.sin_port = skb->h.uh->source; rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr; + skb_free_datagram(svsk->sk_sk, skb); + if (serv->sv_stats) serv->sv_stats->netudpcnt++; @@ -559,21 +549,36 @@ svc_udp_recvfrom(struct svc_rqst *rqstp) static int svc_udp_sendto(struct svc_rqst *rqstp) { - struct svc_buf *bufp = &rqstp->rq_resbuf; int error; + struct iovec vec[RPCSVC_MAXPAGES]; + int v; + int base, len; /* Set up the first element of the reply iovec. * Any other iovecs that may be in use have been taken * care of by the server implementation itself. 
*/ - /* bufp->base = bufp->area; */ - bufp->iov[0].iov_base = bufp->base; - bufp->iov[0].iov_len = bufp->len << 2; - - error = svc_sendto(rqstp, bufp->iov, bufp->nriov); + vec[0] = rqstp->rq_res.head[0]; + v=1; + base=rqstp->rq_res.page_base; + len = rqstp->rq_res.page_len; + while (len) { + vec[v].iov_base = page_address(rqstp->rq_res.pages[v-1]) + base; + vec[v].iov_len = PAGE_SIZE-base; + if (len <= vec[v].iov_len) + vec[v].iov_len = len; + len -= vec[v].iov_len; + base = 0; + v++; + } + if (rqstp->rq_res.tail[0].iov_len) { + vec[v] = rqstp->rq_res.tail[0]; + v++; + } + error = svc_sendto(rqstp, vec, v); if (error == -ECONNREFUSED) /* ICMP error on earlier request. */ - error = svc_sendto(rqstp, bufp->iov, bufp->nriov); + error = svc_sendto(rqstp, vec, v); return error; } @@ -785,8 +790,9 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) { struct svc_sock *svsk = rqstp->rq_sock; struct svc_serv *serv = svsk->sk_server; - struct svc_buf *bufp = &rqstp->rq_argbuf; int len; + struct iovec vec[RPCSVC_MAXPAGES]; + int pnum, vlen; dprintk("svc: tcp_recv %p data %d conn %d close %d\n", svsk, test_bit(SK_DATA, &svsk->sk_flags), @@ -851,7 +857,7 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) } svsk->sk_reclen &= 0x7fffffff; dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen); - if (svsk->sk_reclen > (bufp->buflen<<2)) { + if (svsk->sk_reclen > serv->sv_bufsz) { printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n", (unsigned long) svsk->sk_reclen); goto err_delete; @@ -869,30 +875,35 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) svc_sock_received(svsk); return -EAGAIN; /* record not complete */ } + len = svsk->sk_reclen; set_bit(SK_DATA, &svsk->sk_flags); - /* Frob argbuf */ - bufp->iov[0].iov_base += 4; - bufp->iov[0].iov_len -= 4; + vec[0] = rqstp->rq_arg.head[0]; + vlen = PAGE_SIZE; + pnum = 1; + while (vlen < len) { + vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]); + vec[pnum].iov_len = PAGE_SIZE; + pnum++; + vlen += PAGE_SIZE; + } /* Now 
receive data */ - len = svc_recvfrom(rqstp, bufp->iov, bufp->nriov, svsk->sk_reclen); + len = svc_recvfrom(rqstp, vec, pnum, len); if (len < 0) goto error; dprintk("svc: TCP complete record (%d bytes)\n", len); - - /* Position reply write pointer immediately after args, - * allowing for record length */ - rqstp->rq_resbuf.base = rqstp->rq_argbuf.base + 1 + (len>>2); - rqstp->rq_resbuf.buf = rqstp->rq_resbuf.base + 1; - rqstp->rq_resbuf.len = 1; - rqstp->rq_resbuf.buflen= rqstp->rq_argbuf.buflen - (len>>2) - 1; + rqstp->rq_arg.len = len; + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + } rqstp->rq_skbuff = 0; - rqstp->rq_argbuf.buf += 1; - rqstp->rq_argbuf.len = (len >> 2); - rqstp->rq_argbuf.buflen = (len >> 2) +1; rqstp->rq_prot = IPPROTO_TCP; /* Reset TCP read info */ @@ -928,23 +939,44 @@ svc_tcp_recvfrom(struct svc_rqst *rqstp) static int svc_tcp_sendto(struct svc_rqst *rqstp) { - struct svc_buf *bufp = &rqstp->rq_resbuf; + struct xdr_buf *xbufp = &rqstp->rq_res; + struct iovec vec[RPCSVC_MAXPAGES]; + int v; + int base, len; int sent; + u32 reclen; /* Set up the first element of the reply iovec. * Any other iovecs that may be in use have been taken * care of by the server implementation itself. 
*/ - bufp->iov[0].iov_base = bufp->base; - bufp->iov[0].iov_len = bufp->len << 2; - bufp->base[0] = htonl(0x80000000|((bufp->len << 2) - 4)); + reclen = htonl(0x80000000|((xbufp->len ) - 4)); + memcpy(xbufp->head[0].iov_base, &reclen, 4); + + vec[0] = rqstp->rq_res.head[0]; + v=1; + base= xbufp->page_base; + len = xbufp->page_len; + while (len) { + vec[v].iov_base = page_address(xbufp->pages[v-1]) + base; + vec[v].iov_len = PAGE_SIZE-base; + if (len <= vec[v].iov_len) + vec[v].iov_len = len; + len -= vec[v].iov_len; + base = 0; + v++; + } + if (xbufp->tail[0].iov_len) { + vec[v] = xbufp->tail[0]; + v++; + } - sent = svc_sendto(rqstp, bufp->iov, bufp->nriov); - if (sent != bufp->len<<2) { + sent = svc_sendto(rqstp, vec, v); + if (sent != xbufp->len) { printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", rqstp->rq_sock->sk_server->sv_name, (sent<0)?"got error":"sent only", - sent, bufp->len << 2); + sent, xbufp->len); svc_delete_socket(rqstp->rq_sock); sent = -EAGAIN; } @@ -1016,6 +1048,8 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) { struct svc_sock *svsk =NULL; int len; + int pages; + struct xdr_buf *arg; DECLARE_WAITQUEUE(wait, current); dprintk("svc: server %p waiting for data (to = %ld)\n", @@ -1031,9 +1065,35 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) rqstp); /* Initialize the buffers */ - rqstp->rq_argbuf = rqstp->rq_defbuf; - rqstp->rq_resbuf = rqstp->rq_defbuf; + /* first reclaim pages that were moved to response list */ + while (rqstp->rq_resused) + rqstp->rq_argpages[rqstp->rq_arghi++] = + rqstp->rq_respages[--rqstp->rq_resused]; + /* now allocate needed pages. 
If we get a failure, sleep briefly */ + pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE; + while (rqstp->rq_arghi < pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ/2); + current->state = TASK_RUNNING; + continue; + } + rqstp->rq_argpages[rqstp->rq_arghi++] = p; + } + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]); + arg->head[0].iov_len = PAGE_SIZE; + rqstp->rq_argused = 1; + arg->pages = rqstp->rq_argpages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + if (signalled()) return -EINTR; @@ -1109,12 +1169,6 @@ svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) rqstp->rq_userset = 0; rqstp->rq_chandle.defer = svc_defer; - svc_getu32(&rqstp->rq_argbuf, rqstp->rq_xid); - svc_putu32(&rqstp->rq_resbuf, rqstp->rq_xid); - - /* Assume that the reply consists of a single buffer. 
*/ - rqstp->rq_resbuf.nriov = 1; - if (serv->sv_stats) serv->sv_stats->netcnt++; return len; @@ -1354,23 +1408,25 @@ static struct cache_deferred_req * svc_defer(struct cache_req *req) { struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); - int size = sizeof(struct svc_deferred_req) + (rqstp->rq_argbuf.buflen << 2); + int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.head[0].iov_len); struct svc_deferred_req *dr; + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ if (rqstp->rq_deferred) { dr = rqstp->rq_deferred; rqstp->rq_deferred = NULL; } else { /* FIXME maybe discard if size too large */ - dr = kmalloc(size<<2, GFP_KERNEL); + dr = kmalloc(size, GFP_KERNEL); if (dr == NULL) return NULL; dr->serv = rqstp->rq_server; dr->prot = rqstp->rq_prot; dr->addr = rqstp->rq_addr; - dr->argslen = rqstp->rq_argbuf.buflen; - memcpy(dr->args, rqstp->rq_argbuf.base, dr->argslen<<2); + dr->argslen = rqstp->rq_arg.head[0].iov_len >> 2; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base, dr->argslen<<2); } spin_lock(&rqstp->rq_server->sv_lock); rqstp->rq_sock->sk_inuse++; @@ -1388,10 +1444,10 @@ static int svc_deferred_recv(struct svc_rqst *rqstp) { struct svc_deferred_req *dr = rqstp->rq_deferred; - rqstp->rq_argbuf.base = dr->args; - rqstp->rq_argbuf.buf = dr->args; - rqstp->rq_argbuf.len = dr->argslen; - rqstp->rq_argbuf.buflen = dr->argslen; + rqstp->rq_arg.head[0].iov_base = dr->args; + rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + rqstp->rq_arg.page_len = 0; + rqstp->rq_arg.len = dr->argslen<<2; rqstp->rq_prot = dr->prot; rqstp->rq_addr = dr->addr; return dr->argslen<<2; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 390d2b13543c..3fc0e22521ce 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -655,7 +655,7 @@ skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) * We have set things up such that we perform the checksum of the UDP * packet in parallel with the copies into the RPC 
client iovec. -DaveM */ -static int +int csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) { skb_reader_t desc; -- cgit v1.2.3