From de331827e17e3897ff09a2a46dcc7b2a2c754466 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Tue, 9 Apr 2002 21:25:21 -0700
Subject: [PATCH] 2.5.8-pre3 set_bit cleanup II

This changes over some bogus casts, and converts the ext2, hfs and minix
set-bit macros. Also changes pte and open_fds to hand the actual bitfield
rather than the whole structure. No object code changes.
---
 include/linux/hfs_sysdep.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/hfs_sysdep.h b/include/linux/hfs_sysdep.h
index ab6c834d0f31..62fcf2ea311f 100644
--- a/include/linux/hfs_sysdep.h
+++ b/include/linux/hfs_sysdep.h
@@ -200,16 +200,16 @@ static inline void *hfs_buffer_data(const hfs_buffer buffer) {
 #endif
 
 static inline int hfs_clear_bit(int bitnr, hfs_u32 *lword) {
-	return test_and_clear_bit(BITNR(bitnr), lword);
+	return test_and_clear_bit(BITNR(bitnr), (unsigned long *)lword);
 }
 
 static inline int hfs_set_bit(int bitnr, hfs_u32 *lword) {
-	return test_and_set_bit(BITNR(bitnr), lword);
+	return test_and_set_bit(BITNR(bitnr), (unsigned long *)lword);
 }
 
 static inline int hfs_test_bit(int bitnr, const hfs_u32 *lword) {
 	/* the kernel should declare the second arg of test_bit as const */
-	return test_bit(BITNR(bitnr), (void *)lword);
+	return test_bit(BITNR(bitnr), (unsigned long *)lword);
 }
 
 #undef BITNR
--
cgit v1.2.3


From 3d30a6cc3af49ca0b668a2cbbc9d43def619567c Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Tue, 9 Apr 2002 21:29:24 -0700
Subject: [PATCH] Velikov/Hellwig radix-tree pagecache

Before the mempool was added, the VM was getting many, many 0-order
allocation failures due to the atomic ratnode allocations inside swap_out.
That monster mempool is doing its job - drove a 256meg machine a gigabyte
into swap with no ratnode allocation failures at all.

So we do need to trim that pool a bit, and also handle the case where
swap_out fails, and not just keep pointlessly calling it.
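For readers unfamiliar with the new interface, here is a minimal sketch of
the radix-tree calls declared below in include/linux/radix-tree.h. The
wrapper names and the standalone tree are illustrative only, not code from
this patch; the real page-cache call sites also hold mapping->page_lock
around every tree operation.

	#include <linux/errno.h>
	#include <linux/mm.h>		/* GFP_ATOMIC */
	#include <linux/radix-tree.h>

	/* A tree whose node allocations must not sleep (illustrative). */
	static RADIX_TREE(demo_tree, GFP_ATOMIC);

	static int demo_store(unsigned long index, void *item)
	{
		/*
		 * Returns 0 on success, -EEXIST if the slot is already
		 * occupied, or -ENOMEM if a tree node cannot be allocated.
		 */
		return radix_tree_insert(&demo_tree, index, item);
	}

	static void *demo_remove(unsigned long index)
	{
		void *item = radix_tree_lookup(&demo_tree, index);

		if (item)
			radix_tree_delete(&demo_tree, index);
		return item;
	}

The mempool discussed above backs the node allocations, so an atomic insert
can still return -ENOMEM once the pool is drained; callers in this patch
such as add_to_page_cache() check for and propagate that failure.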
--- drivers/block/rd.c | 4 +- fs/inode.c | 2 + include/linux/fs.h | 3 + include/linux/mm.h | 8 +- include/linux/pagemap.h | 71 +++---- include/linux/radix-tree.h | 49 +++++ include/linux/swap.h | 5 +- init/main.c | 3 +- kernel/ksyms.c | 6 +- lib/Makefile | 6 +- lib/radix-tree.c | 296 ++++++++++++++++++++++++++++ mm/filemap.c | 468 +++++++++++++++++++-------------------------- mm/mincore.c | 4 +- mm/shmem.c | 78 ++++---- mm/swap_state.c | 113 +++++++++-- mm/swapfile.c | 8 +- mm/vmscan.c | 57 +++--- 17 files changed, 769 insertions(+), 412 deletions(-) create mode 100644 include/linux/radix-tree.h create mode 100644 lib/radix-tree.c (limited to 'include/linux') diff --git a/drivers/block/rd.c b/drivers/block/rd.c index de6c45c64f13..3330b415468d 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -156,7 +156,6 @@ static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, do { int count; - struct page ** hash; struct page * page; char * src, * dst; int unlock = 0; @@ -166,8 +165,7 @@ static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, count = size; size -= count; - hash = page_hash(mapping, index); - page = __find_get_page(mapping, index, hash); + page = find_get_page(mapping, index); if (!page) { page = grab_cache_page(mapping, index); err = -ENOMEM; diff --git a/fs/inode.c b/fs/inode.c index cbe7d2eeb349..29ba87e29414 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -143,6 +143,8 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_dirty_data_buffers); INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); + INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); + rwlock_init(&inode->i_data.page_lock); spin_lock_init(&inode->i_data.i_shared_lock); INIT_LIST_HEAD(&inode->i_data.i_mmap); INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); diff --git a/include/linux/fs.h b/include/linux/fs.h index 20928d804327..676ddd28ebe2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -370,6 +371,8 @@ struct address_space_operations { }; struct address_space { + struct radix_tree_root page_tree; /* radix tree of all pages */ + rwlock_t page_lock; /* and rwlock protecting it */ struct list_head clean_pages; /* list of clean pages */ struct list_head dirty_pages; /* list of dirty pages */ struct list_head locked_pages; /* list of locked pages */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 35ea7c71b53e..05293f0ab136 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -149,14 +149,11 @@ typedef struct page { struct list_head list; /* ->mapping has some page lists. */ struct address_space *mapping; /* The inode (or ...) we belong to. */ unsigned long index; /* Our offset within mapping. */ - struct page *next_hash; /* Next page sharing our hash bucket in - the pagecache hash table. */ atomic_t count; /* Usage count, see below. */ unsigned long flags; /* atomic flags, some possibly updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ - struct page **pprev_hash; /* Complement to *next_hash. */ struct buffer_head * buffers; /* Buffer maps us to a disk block. */ /* @@ -236,9 +233,8 @@ typedef struct page { * using the page->list list_head. These fields are also used for * freelist managemet (when page->count==0). * - * There is also a hash table mapping (mapping,index) to the page - * in memory if present. The lists for this hash table use the fields - * page->next_hash and page->pprev_hash. 
+ * There is also a per-mapping radix tree mapping index to the page + * in memory if present. The tree is rooted at mapping->root. * * All process pages can do I/O: * - inode pages may need to be read from disk, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 242a576ea934..df7f5bd64367 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -41,53 +41,39 @@ static inline struct page *page_cache_alloc(struct address_space *x) */ #define page_cache_entry(x) virt_to_page(x) -extern unsigned int page_hash_bits; -#define PAGE_HASH_BITS (page_hash_bits) -#define PAGE_HASH_SIZE (1 << PAGE_HASH_BITS) - -extern atomic_t page_cache_size; /* # of pages currently in the hash table */ -extern struct page **page_hash_table; - -extern void page_cache_init(unsigned long); +extern atomic_t page_cache_size; /* # of pages currently in the page cache */ + +extern struct page * find_get_page(struct address_space *mapping, + unsigned long index); +extern struct page * find_lock_page(struct address_space *mapping, + unsigned long index); +extern struct page * find_trylock_page(struct address_space *mapping, + unsigned long index); +extern struct page * find_or_create_page(struct address_space *mapping, + unsigned long index, unsigned int gfp_mask); -/* - * We use a power-of-two hash table to avoid a modulus, - * and get a reasonable hash by knowing roughly how the - * inode pointer and indexes are distributed (ie, we - * roughly know which bits are "significant") - * - * For the time being it will work for struct address_space too (most of - * them sitting inside the inodes). We might want to change it later. - */ -static inline unsigned long _page_hashfn(struct address_space * mapping, unsigned long index) +extern struct page * grab_cache_page(struct address_space *mapping, + unsigned long index); +extern struct page * grab_cache_page_nowait(struct address_space *mapping, + unsigned long index); + +extern int add_to_page_cache(struct page *page, + struct address_space *mapping, unsigned long index); +extern int add_to_page_cache_unique(struct page *page, + struct address_space *mapping, unsigned long index); +static inline void ___add_to_page_cache(struct page *page, + struct address_space *mapping, unsigned long index) { -#define i (((unsigned long) mapping)/(sizeof(struct inode) & ~ (sizeof(struct inode) - 1))) -#define s(x) ((x)+((x)>>PAGE_HASH_BITS)) - return s(i+index) & (PAGE_HASH_SIZE-1); -#undef i -#undef s -} - -#define page_hash(mapping,index) (page_hash_table+_page_hashfn(mapping,index)) + list_add(&page->list, &mapping->clean_pages); + page->mapping = mapping; + page->index = index; -extern struct page * __find_get_page(struct address_space *mapping, - unsigned long index, struct page **hash); -#define find_get_page(mapping, index) \ - __find_get_page(mapping, index, page_hash(mapping, index)) -extern struct page * __find_lock_page (struct address_space * mapping, - unsigned long index, struct page **hash); -extern struct page * find_or_create_page(struct address_space *mapping, - unsigned long index, unsigned int gfp_mask); + mapping->nrpages++; + atomic_inc(&page_cache_size); +} extern void FASTCALL(lock_page(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); -#define find_lock_page(mapping, index) \ - __find_lock_page(mapping, index, page_hash(mapping, index)) -extern struct page *find_trylock_page(struct address_space *, unsigned long); - -extern void add_to_page_cache(struct page * page, struct address_space *mapping, unsigned long 
index); -extern void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index); -extern int add_to_page_cache_unique(struct page * page, struct address_space *mapping, unsigned long index, struct page **hash); extern void ___wait_on_page(struct page *); @@ -99,9 +85,6 @@ static inline void wait_on_page(struct page * page) extern void wake_up_page(struct page *); -extern struct page * grab_cache_page (struct address_space *, unsigned long); -extern struct page * grab_cache_page_nowait (struct address_space *, unsigned long); - typedef int filler_t(void *, struct page*); extern struct page *read_cache_page(struct address_space *, unsigned long, diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h new file mode 100644 index 000000000000..fb2e3f3350d3 --- /dev/null +++ b/include/linux/radix-tree.h @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ +#ifndef _LINUX_RADIX_TREE_H +#define _LINUX_RADIX_TREE_H + +struct radix_tree_node; + +#define RADIX_TREE_SLOT_RESERVED ((void *)~0UL) + +struct radix_tree_root { + unsigned int height; + int gfp_mask; + struct radix_tree_node *rnode; +}; + +#define RADIX_TREE_INIT(mask) {0, (mask), NULL} + +#define RADIX_TREE(name, mask) \ + struct radix_tree_root name = RADIX_TREE_INIT(mask) + +#define INIT_RADIX_TREE(root, mask) \ +do { \ + (root)->height = 0; \ + (root)->gfp_mask = (mask); \ + (root)->rnode = NULL; \ +} while (0) + +extern int radix_tree_reserve(struct radix_tree_root *, unsigned long, void ***); +extern int radix_tree_insert(struct radix_tree_root *, unsigned long, void *); +extern void *radix_tree_lookup(struct radix_tree_root *, unsigned long); +extern int radix_tree_delete(struct radix_tree_root *, unsigned long); + +#endif /* _LINUX_RADIX_TREE_H */ diff --git a/include/linux/swap.h b/include/linux/swap.h index 824a928d5c37..a7a64ba55816 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -109,7 +109,7 @@ extern void __remove_inode_page(struct page *); struct task_struct; struct vm_area_struct; struct sysinfo; - +struct address_space; struct zone_t; /* linux/mm/swap.c */ @@ -139,6 +139,9 @@ extern void show_swap_cache_info(void); extern int add_to_swap_cache(struct page *, swp_entry_t); extern void __delete_from_swap_cache(struct page *page); extern void delete_from_swap_cache(struct page *page); +extern int move_to_swap_cache(struct page *page, swp_entry_t entry); +extern int move_from_swap_cache(struct page *page, unsigned long index, + struct address_space *mapping); extern void free_page_and_swap_cache(struct page *page); extern struct page * lookup_swap_cache(swp_entry_t); extern struct page * read_swap_cache_async(swp_entry_t); diff --git a/init/main.c b/init/main.c index e4f0d3223095..fc8fbd3fbaad 100644 --- a/init/main.c 
+++ b/init/main.c @@ -69,6 +69,7 @@ extern void sbus_init(void); extern void sysctl_init(void); extern void signals_init(void); +extern void radix_tree_init(void); extern void free_initmem(void); #ifdef CONFIG_TC @@ -392,7 +393,7 @@ asmlinkage void __init start_kernel(void) proc_caches_init(); vfs_caches_init(mempages); buffer_init(mempages); - page_cache_init(mempages); + radix_tree_init(); #if defined(CONFIG_ARCH_S390) ccwcache_init(); #endif diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 519a500fb547..3d66f7d8e1f1 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -224,8 +224,6 @@ EXPORT_SYMBOL(generic_file_write); EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_ro_fops); EXPORT_SYMBOL(generic_buffer_fdatasync); -EXPORT_SYMBOL(page_hash_bits); -EXPORT_SYMBOL(page_hash_table); EXPORT_SYMBOL(file_lock_list); EXPORT_SYMBOL(locks_init_lock); EXPORT_SYMBOL(locks_copy_lock); @@ -266,8 +264,8 @@ EXPORT_SYMBOL(no_llseek); EXPORT_SYMBOL(__pollwait); EXPORT_SYMBOL(poll_freewait); EXPORT_SYMBOL(ROOT_DEV); -EXPORT_SYMBOL(__find_get_page); -EXPORT_SYMBOL(__find_lock_page); +EXPORT_SYMBOL(find_get_page); +EXPORT_SYMBOL(find_lock_page); EXPORT_SYMBOL(grab_cache_page); EXPORT_SYMBOL(grab_cache_page_nowait); EXPORT_SYMBOL(read_cache_page); diff --git a/lib/Makefile b/lib/Makefile index b5e669b2b4dc..bc7df822e1cc 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -8,9 +8,11 @@ L_TARGET := lib.a -export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o crc32.o rbtree.o +export-objs := cmdline.o dec_and_lock.o rwsem-spinlock.o rwsem.o \ + crc32.o rbtree.o radix-tree.o -obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o bust_spinlocks.o rbtree.o +obj-y := errno.o ctype.o string.o vsprintf.o brlock.o cmdline.o \ + bust_spinlocks.o rbtree.o radix-tree.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o diff --git a/lib/radix-tree.c b/lib/radix-tree.c new file mode 100644 index 000000000000..aa33c677df7e --- /dev/null +++ b/lib/radix-tree.c @@ -0,0 +1,296 @@ +/* + * Copyright (C) 2001 Momchil Velikov + * Portions Copyright (C) 2001 Christoph Hellwig + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Radix tree node definition. + */ +#define RADIX_TREE_MAP_SHIFT 7 +#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) +#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) + +struct radix_tree_node { + unsigned int count; + void *slots[RADIX_TREE_MAP_SIZE]; +}; + +struct radix_tree_path { + struct radix_tree_node *node, **slot; +}; + +#define RADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) + +/* + * Radix tree node cache. 
+ */ +static kmem_cache_t *radix_tree_node_cachep; +static mempool_t *radix_tree_node_pool; + +#define radix_tree_node_alloc(root) \ + mempool_alloc(radix_tree_node_pool, (root)->gfp_mask) +#define radix_tree_node_free(node) \ + mempool_free((node), radix_tree_node_pool); + + +/* + * Return the maximum key which can be store into a + * radix tree with height HEIGHT. + */ +static inline unsigned long radix_tree_maxindex(unsigned int height) +{ + unsigned int tmp = height * RADIX_TREE_MAP_SHIFT; + unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1; + + if (tmp >= RADIX_TREE_INDEX_BITS) + index = ~0UL; + return index; +} + + +/* + * Extend a radix tree so it can store key @index. + */ +static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) +{ + struct radix_tree_node *node; + unsigned int height; + + /* Figure out what the height should be. */ + height = root->height + 1; + while (index > radix_tree_maxindex(height)) + height++; + + if (root->rnode) { + do { + if (!(node = radix_tree_node_alloc(root))) + return -ENOMEM; + + /* Increase the height. */ + node->slots[0] = root->rnode; + if (root->rnode) + node->count = 1; + root->rnode = node; + root->height++; + } while (height > root->height); + } else + root->height = height; + + return 0; +} + + +/** + * radix_tree_reserve - reserve space in a radix tree + * @root: radix tree root + * @index: index key + * @pslot: pointer to reserved slot + * + * Reserve a slot in a radix tree for the key @index. + */ +int radix_tree_reserve(struct radix_tree_root *root, unsigned long index, void ***pslot) +{ + struct radix_tree_node *node = NULL, *tmp, **slot; + unsigned int height, shift; + int error; + + /* Make sure the tree is high enough. */ + if (index > radix_tree_maxindex(root->height)) { + error = radix_tree_extend(root, index); + if (error) + return error; + } + + slot = &root->rnode; + height = root->height; + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + + while (height > 0) { + if (*slot == NULL) { + /* Have to add a child node. */ + if (!(tmp = radix_tree_node_alloc(root))) + return -ENOMEM; + *slot = tmp; + if (node) + node->count++; + } + + /* Go a level down. */ + node = *slot; + slot = (struct radix_tree_node **) + (node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + if (*slot != NULL) + return -EEXIST; + if (node) + node->count++; + + *pslot = (void **)slot; + **pslot = RADIX_TREE_SLOT_RESERVED; + return 0; +} + +EXPORT_SYMBOL(radix_tree_reserve); + + +/** + * radix_tree_insert - insert into a radix tree + * @root: radix tree root + * @index: index key + * @item: item to insert + * + * Insert an item into the radix tree at position @index. + */ +int radix_tree_insert(struct radix_tree_root *root, unsigned long index, void *item) +{ + void **slot; + int error; + + error = radix_tree_reserve(root, index, &slot); + if (!error) + *slot = item; + return error; +} + +EXPORT_SYMBOL(radix_tree_insert); + + +/** + * radix_tree_lookup - perform lookup operation on a radix tree + * @root: radix tree root + * @index: index key + * + * Lookup them item at the position @index in the radix tree @root. 
+ */ +void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) +{ + unsigned int height, shift; + struct radix_tree_node **slot; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return NULL; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + slot = &root->rnode; + + while (height > 0) { + if (*slot == NULL) + return NULL; + + slot = (struct radix_tree_node **) + ((*slot)->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + return (void *) *slot; +} + +EXPORT_SYMBOL(radix_tree_lookup); + + +/** + * radix_tree_delete - delete an item from a radix tree + * @root: radix tree root + * @index: index key + * + * Remove the item at @index from the radix tree rooted at @root. + */ +int radix_tree_delete(struct radix_tree_root *root, unsigned long index) +{ + struct radix_tree_path path[RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2], *pathp = path; + unsigned int height, shift; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return -ENOENT; + + shift = (height-1) * RADIX_TREE_MAP_SHIFT; + pathp->node = NULL; + pathp->slot = &root->rnode; + + while (height > 0) { + if (*pathp->slot == NULL) + return -ENOENT; + + pathp[1].node = *pathp[0].slot; + pathp[1].slot = (struct radix_tree_node **) + (pathp[1].node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK)); + pathp++; + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + + if (*pathp[0].slot == NULL) + return -ENOENT; + + *pathp[0].slot = NULL; + while (pathp[0].node && --pathp[0].node->count == 0) { + pathp--; + *pathp[0].slot = NULL; + radix_tree_node_free(pathp[1].node); + } + + return 0; +} + +EXPORT_SYMBOL(radix_tree_delete); + +static void radix_tree_node_ctor(void *node, kmem_cache_t *cachep, unsigned long flags) +{ + memset(node, 0, sizeof(struct radix_tree_node)); +} + +static void *radix_tree_node_pool_alloc(int gfp_mask, void *data) +{ + return kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); +} + +static void radix_tree_node_pool_free(void *node, void *data) +{ + kmem_cache_free(radix_tree_node_cachep, node); +} + +/* + * FIXME! 512 nodes is 200-300k of memory. This needs to be + * scaled by the amount of available memory, and hopefully + * reduced also. + */ +void __init radix_tree_init(void) +{ + radix_tree_node_cachep = kmem_cache_create("radix_tree_node", + sizeof(struct radix_tree_node), 0, + SLAB_HWCACHE_ALIGN, radix_tree_node_ctor, NULL); + if (!radix_tree_node_cachep) + panic ("Failed to create radix_tree_node cache\n"); + radix_tree_node_pool = mempool_create(512, radix_tree_node_pool_alloc, + radix_tree_node_pool_free, NULL); + if (!radix_tree_node_pool) + panic ("Failed to create radix_tree_node pool\n"); +} diff --git a/mm/filemap.c b/mm/filemap.c index ceb7df5fbe51..de48c026a9a4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -46,89 +46,46 @@ */ atomic_t page_cache_size = ATOMIC_INIT(0); -unsigned int page_hash_bits; -struct page **page_hash_table; -spinlock_t pagecache_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; /* - * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock - * with the pagecache_lock held. 
- * - * Ordering: - * swap_lock -> - * pagemap_lru_lock -> - * pagecache_lock + * Lock ordering: + * pagemap_lru_lock ==> page_lock ==> i_shared_lock */ spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; #define CLUSTER_PAGES (1 << page_cluster) #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster) -static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p)); -static void add_page_to_hash_queue(struct page * page, struct page **p) -{ - struct page *next = *p; - - *p = page; - page->next_hash = next; - page->pprev_hash = p; - if (next) - next->pprev_hash = &page->next_hash; - if (page->buffers) - PAGE_BUG(page); - atomic_inc(&page_cache_size); -} - -static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page) +/* + * Remove a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. The caller must hold a write_lock on the mapping's page_lock. + */ +void __remove_inode_page(struct page *page) { - struct list_head *head = &mapping->clean_pages; - - mapping->nrpages++; - list_add(&page->list, head); - page->mapping = mapping; -} + struct address_space *mapping = page->mapping; -static inline void remove_page_from_inode_queue(struct page * page) -{ - struct address_space * mapping = page->mapping; + if (unlikely(PageDirty(page))) + BUG(); - mapping->nrpages--; + radix_tree_delete(&page->mapping->page_tree, page->index); list_del(&page->list); page->mapping = NULL; -} - -static inline void remove_page_from_hash_queue(struct page * page) -{ - struct page *next = page->next_hash; - struct page **pprev = page->pprev_hash; - if (next) - next->pprev_hash = pprev; - *pprev = next; - page->pprev_hash = NULL; + mapping->nrpages--; atomic_dec(&page_cache_size); } -/* - * Remove a page from the page cache and free it. Caller has to make - * sure the page is locked and that nobody else uses it - or that usage - * is safe. 
- */ -void __remove_inode_page(struct page *page) -{ - if (PageDirty(page)) BUG(); - remove_page_from_inode_queue(page); - remove_page_from_hash_queue(page); -} - void remove_inode_page(struct page *page) { - if (!PageLocked(page)) + struct address_space *mapping = page->mapping; + + if (unlikely(!PageLocked(page))) PAGE_BUG(page); - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); __remove_inode_page(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); } static inline int sync_page(struct page *page) @@ -149,10 +106,10 @@ void set_page_dirty(struct page *page) struct address_space *mapping = page->mapping; if (mapping) { - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); if (mapping->host) mark_inode_dirty_pages(mapping->host); @@ -172,11 +129,12 @@ void invalidate_inode_pages(struct inode * inode) { struct list_head *head, *curr; struct page * page; + struct address_space *mapping = inode->i_mapping; - head = &inode->i_mapping->clean_pages; + head = &mapping->clean_pages; spin_lock(&pagemap_lru_lock); - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); curr = head->next; while (curr != head) { @@ -207,7 +165,7 @@ unlock: continue; } - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); spin_unlock(&pagemap_lru_lock); } @@ -246,8 +204,8 @@ static void truncate_complete_page(struct page *page) page_cache_release(page); } -static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); -static int truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) +static int truncate_list_pages(struct address_space *mapping, + struct list_head *head, unsigned long start, unsigned *partial) { struct list_head *curr; struct page * page; @@ -276,7 +234,7 @@ static int truncate_list_pages(struct list_head *head, unsigned long start, unsi /* Restart on this page */ list_add(head, curr); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); unlocked = 1; if (!failed) { @@ -297,7 +255,7 @@ static int truncate_list_pages(struct list_head *head, unsigned long start, unsi schedule(); } - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); goto restart; } curr = curr->prev; @@ -321,24 +279,28 @@ void truncate_inode_pages(struct address_space * mapping, loff_t lstart) unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); int unlocked; - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); do { - unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial); - unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial); - unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial); + unlocked = truncate_list_pages(mapping, + &mapping->clean_pages, start, &partial); + unlocked |= truncate_list_pages(mapping, + &mapping->dirty_pages, start, &partial); + unlocked |= truncate_list_pages(mapping, + &mapping->locked_pages, start, &partial); } while (unlocked); /* Traversed all three lists without dropping the lock */ - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); } -static inline int invalidate_this_page2(struct page * page, +static inline int invalidate_this_page2(struct address_space * mapping, + struct page * page, struct list_head * curr, struct list_head * head) { int unlocked = 1; /* - * The page is locked and we hold the pagecache_lock as well + * The page is 
locked and we hold the mapping lock as well * so both page_count(page) and page->buffers stays constant here. */ if (page_count(page) == 1 + !!page->buffers) { @@ -347,7 +309,7 @@ static inline int invalidate_this_page2(struct page * page, list_add_tail(head, curr); page_cache_get(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); truncate_complete_page(page); } else { if (page->buffers) { @@ -356,7 +318,7 @@ static inline int invalidate_this_page2(struct page * page, list_add_tail(head, curr); page_cache_get(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); block_invalidate_page(page); } else unlocked = 0; @@ -368,8 +330,8 @@ static inline int invalidate_this_page2(struct page * page, return unlocked; } -static int FASTCALL(invalidate_list_pages2(struct list_head *)); -static int invalidate_list_pages2(struct list_head *head) +static int invalidate_list_pages2(struct address_space * mapping, + struct list_head * head) { struct list_head *curr; struct page * page; @@ -383,7 +345,7 @@ static int invalidate_list_pages2(struct list_head *head) if (!TryLockPage(page)) { int __unlocked; - __unlocked = invalidate_this_page2(page, curr, head); + __unlocked = invalidate_this_page2(mapping, page, curr, head); UnlockPage(page); unlocked |= __unlocked; if (!__unlocked) { @@ -396,7 +358,7 @@ static int invalidate_list_pages2(struct list_head *head) list_add(head, curr); page_cache_get(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); unlocked = 1; wait_on_page(page); } @@ -407,7 +369,7 @@ static int invalidate_list_pages2(struct list_head *head) schedule(); } - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); goto restart; } return unlocked; @@ -422,41 +384,27 @@ void invalidate_inode_pages2(struct address_space * mapping) { int unlocked; - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); do { - unlocked = invalidate_list_pages2(&mapping->clean_pages); - unlocked |= invalidate_list_pages2(&mapping->dirty_pages); - unlocked |= invalidate_list_pages2(&mapping->locked_pages); + unlocked = invalidate_list_pages2(mapping, + &mapping->clean_pages); + unlocked |= invalidate_list_pages2(mapping, + &mapping->dirty_pages); + unlocked |= invalidate_list_pages2(mapping, + &mapping->locked_pages); } while (unlocked); - spin_unlock(&pagecache_lock); -} - -static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page) -{ - goto inside; - - for (;;) { - page = page->next_hash; -inside: - if (!page) - goto not_found; - if (page->mapping != mapping) - continue; - if (page->index == offset) - break; - } - -not_found: - return page; + write_unlock(&mapping->page_lock); } -static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *)) +static int do_buffer_fdatasync(struct address_space *mapping, + struct list_head *head, unsigned long start, + unsigned long end, int (*fn)(struct page *)) { struct list_head *curr; struct page *page; int retval = 0; - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); curr = head->next; while (curr != head) { page = list_entry(curr, struct page, list); @@ -469,7 +417,7 @@ static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsi continue; page_cache_get(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); lock_page(page); /* The buffers could have been free'd while we waited for the page lock */ @@ -477,11 +425,11 @@ 
static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsi retval |= fn(page); UnlockPage(page); - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); curr = page->list.next; page_cache_release(page); } - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); return retval; } @@ -492,17 +440,24 @@ static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsi */ int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx) { + struct address_space *mapping = inode->i_mapping; int retval; /* writeout dirty buffers on pages from both clean and dirty lists */ - retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page); - retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page); - retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page); + retval = do_buffer_fdatasync(mapping, &mapping->dirty_pages, + start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages, + start_idx, end_idx, writeout_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages, + start_idx, end_idx, writeout_one_page); /* now wait for locked buffers on pages from both clean and dirty lists */ - retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page); - retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page); - retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->dirty_pages, + start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->clean_pages, + start_idx, end_idx, waitfor_one_page); + retval |= do_buffer_fdatasync(mapping, &mapping->locked_pages, + start_idx, end_idx, waitfor_one_page); return retval; } @@ -548,7 +503,7 @@ int filemap_fdatasync(struct address_space * mapping) int ret = 0; int (*writepage)(struct page *) = mapping->a_ops->writepage; - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); while (!list_empty(&mapping->dirty_pages)) { struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list); @@ -560,7 +515,7 @@ int filemap_fdatasync(struct address_space * mapping) continue; page_cache_get(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); lock_page(page); @@ -574,9 +529,9 @@ int filemap_fdatasync(struct address_space * mapping) UnlockPage(page); page_cache_release(page); - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); } - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); return ret; } @@ -591,7 +546,7 @@ int filemap_fdatawait(struct address_space * mapping) { int ret = 0; - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); while (!list_empty(&mapping->locked_pages)) { struct page *page = list_entry(mapping->locked_pages.next, struct page, list); @@ -603,86 +558,69 @@ int filemap_fdatawait(struct address_space * mapping) continue; page_cache_get(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); ___wait_on_page(page); if (PageError(page)) ret = -EIO; page_cache_release(page); - spin_lock(&pagecache_lock); + write_lock(&mapping->page_lock); } - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); return ret; } -/* - * Add a page to the inode page cache. 
- * - * The caller must have locked the page and - * set all the page flags correctly.. - */ -void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index) -{ - if (!PageLocked(page)) - BUG(); - - page->index = index; - page_cache_get(page); - spin_lock(&pagecache_lock); - add_page_to_inode_queue(mapping, page); - add_page_to_hash_queue(page, page_hash(mapping, index)); - spin_unlock(&pagecache_lock); - - lru_cache_add(page); -} - /* * This adds a page to the page cache, starting out as locked, * owned by us, but unreferenced, not uptodate and with no errors. + * The caller must hold a write_lock on the mapping->page_lock. */ -static inline void __add_to_page_cache(struct page * page, - struct address_space *mapping, unsigned long offset, - struct page **hash) +static int __add_to_page_cache(struct page *page, + struct address_space *mapping, unsigned long offset) { unsigned long flags; + page_cache_get(page); + if (radix_tree_insert(&mapping->page_tree, offset, page) < 0) + goto nomem; flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_dirty | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked); page->flags = flags | (1 << PG_locked); - page_cache_get(page); - page->index = offset; - add_page_to_inode_queue(mapping, page); - add_page_to_hash_queue(page, hash); + ___add_to_page_cache(page, mapping, offset); + return 0; + nomem: + page_cache_release(page); + return -ENOMEM; } -void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset) +int add_to_page_cache(struct page *page, + struct address_space *mapping, unsigned long offset) { - spin_lock(&pagecache_lock); - __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset)); - spin_unlock(&pagecache_lock); + write_lock(&mapping->page_lock); + if (__add_to_page_cache(page, mapping, offset) < 0) + goto nomem; + write_unlock(&mapping->page_lock); lru_cache_add(page); + return 0; +nomem: + write_unlock(&mapping->page_lock); + return -ENOMEM; } -int add_to_page_cache_unique(struct page * page, - struct address_space *mapping, unsigned long offset, - struct page **hash) +int add_to_page_cache_unique(struct page *page, + struct address_space *mapping, unsigned long offset) { - int err; struct page *alias; + int error = -EEXIST; - spin_lock(&pagecache_lock); - alias = __find_page_nolock(mapping, offset, *hash); + write_lock(&mapping->page_lock); + if (!(alias = radix_tree_lookup(&mapping->page_tree, offset))) + error = __add_to_page_cache(page, mapping, offset); + write_unlock(&mapping->page_lock); - err = 1; - if (!alias) { - __add_to_page_cache(page,mapping,offset,hash); - err = 0; - } - - spin_unlock(&pagecache_lock); - if (!err) + if (!error) lru_cache_add(page); - return err; + return error; } /* @@ -693,12 +631,12 @@ static int FASTCALL(page_cache_read(struct file * file, unsigned long offset)); static int page_cache_read(struct file * file, unsigned long offset) { struct address_space *mapping = file->f_dentry->d_inode->i_mapping; - struct page **hash = page_hash(mapping, offset); struct page *page; + int error; - spin_lock(&pagecache_lock); - page = __find_page_nolock(mapping, offset, *hash); - spin_unlock(&pagecache_lock); + read_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + read_unlock(&mapping->page_lock); if (page) return 0; @@ -706,17 +644,20 @@ static int page_cache_read(struct file * file, unsigned long offset) if (!page) return -ENOMEM; - if (!add_to_page_cache_unique(page, mapping, 
offset, hash)) { - int error = mapping->a_ops->readpage(file, page); + error = add_to_page_cache_unique(page, mapping, offset); + if (!error) { + error = mapping->a_ops->readpage(file, page); page_cache_release(page); return error; } + /* * We arrive here in the unlikely event that someone - * raced with us and added our page to the cache first. + * raced with us and added our page to the cache first + * or we are out of memory for radix-tree nodes. */ page_cache_release(page); - return 0; + return error == -EEXIST ? 0 : error; } /* @@ -842,8 +783,7 @@ void lock_page(struct page *page) * a rather lightweight function, finding and getting a reference to a * hashed page atomically. */ -struct page * __find_get_page(struct address_space *mapping, - unsigned long offset, struct page **hash) +struct page * find_get_page(struct address_space *mapping, unsigned long offset) { struct page *page; @@ -851,11 +791,11 @@ struct page * __find_get_page(struct address_space *mapping, * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. */ - spin_lock(&pagecache_lock); - page = __find_page_nolock(mapping, offset, *hash); + read_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&pagecache_lock); + read_unlock(&mapping->page_lock); return page; } @@ -865,15 +805,12 @@ struct page * __find_get_page(struct address_space *mapping, struct page *find_trylock_page(struct address_space *mapping, unsigned long offset) { struct page *page; - struct page **hash = page_hash(mapping, offset); - spin_lock(&pagecache_lock); - page = __find_page_nolock(mapping, offset, *hash); - if (page) { - if (TryLockPage(page)) - page = NULL; - } - spin_unlock(&pagecache_lock); + read_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, offset); + if (page && TryLockPage(page)) + page = NULL; + read_unlock(&mapping->page_lock); return page; } @@ -882,9 +819,8 @@ struct page *find_trylock_page(struct address_space *mapping, unsigned long offs * will return with it held (but it may be dropped * during blocking operations.. */ -static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *)); -static struct page * __find_lock_page_helper(struct address_space *mapping, - unsigned long offset, struct page *hash) +static struct page *__find_lock_page(struct address_space *mapping, + unsigned long offset) { struct page *page; @@ -893,13 +829,13 @@ static struct page * __find_lock_page_helper(struct address_space *mapping, * the hash-list needs a held write-lock. */ repeat: - page = __find_page_nolock(mapping, offset, hash); + page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TryLockPage(page)) { - spin_unlock(&pagecache_lock); + read_unlock(&mapping->page_lock); lock_page(page); - spin_lock(&pagecache_lock); + read_lock(&mapping->page_lock); /* Has the page been re-allocated while we slept? */ if (page->mapping != mapping || page->index != offset) { @@ -916,46 +852,50 @@ repeat: * Same as the above, but lock the page too, verifying that * it's still valid once we own it. 
*/ -struct page * __find_lock_page (struct address_space *mapping, - unsigned long offset, struct page **hash) +struct page * find_lock_page(struct address_space *mapping, unsigned long offset) { struct page *page; - spin_lock(&pagecache_lock); - page = __find_lock_page_helper(mapping, offset, *hash); - spin_unlock(&pagecache_lock); + read_lock(&mapping->page_lock); + page = __find_lock_page(mapping, offset); + read_unlock(&mapping->page_lock); + return page; } /* * Same as above, but create the page if required.. */ -struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask) +struct page * find_or_create_page(struct address_space *mapping, + unsigned long index, unsigned int gfp_mask) { struct page *page; - struct page **hash = page_hash(mapping, index); - spin_lock(&pagecache_lock); - page = __find_lock_page_helper(mapping, index, *hash); - spin_unlock(&pagecache_lock); + page = find_lock_page(mapping, index); if (!page) { struct page *newpage = alloc_page(gfp_mask); if (newpage) { - spin_lock(&pagecache_lock); - page = __find_lock_page_helper(mapping, index, *hash); + write_lock(&mapping->page_lock); + page = __find_lock_page(mapping, index); if (likely(!page)) { page = newpage; - __add_to_page_cache(page, mapping, index, hash); + if (__add_to_page_cache(page, mapping, index)) { + write_unlock(&mapping->page_lock); + page_cache_release(page); + page = NULL; + goto out; + } newpage = NULL; } - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); if (newpage == NULL) lru_cache_add(page); else page_cache_release(newpage); } } - return page; +out: + return page; } /* @@ -975,10 +915,9 @@ struct page *grab_cache_page(struct address_space *mapping, unsigned long index) */ struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index) { - struct page *page, **hash; + struct page *page; - hash = page_hash(mapping, index); - page = __find_get_page(mapping, index, hash); + page = find_get_page(mapping, index); if ( page ) { if ( !TryLockPage(page) ) { @@ -1000,11 +939,14 @@ struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long } page = page_cache_alloc(mapping); - if ( unlikely(!page) ) + if (unlikely(!page)) return NULL; /* Failed to allocate a page */ - if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) { - /* Someone else grabbed the page already. */ + if (unlikely(add_to_page_cache_unique(page, mapping, index))) { + /* + * Someone else grabbed the page already, or + * failed to allocate a radix-tree node + */ page_cache_release(page); return NULL; } @@ -1319,7 +1261,7 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * } for (;;) { - struct page *page, **hash; + struct page *page; unsigned long end_index, nr, ret; end_index = inode->i_size >> PAGE_CACHE_SHIFT; @@ -1338,15 +1280,14 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * /* * Try to find the data in the page cache.. */ - hash = page_hash(mapping, index); - spin_lock(&pagecache_lock); - page = __find_page_nolock(mapping, index, *hash); + write_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, index); if (!page) goto no_cached_page; found_page: page_cache_get(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); if (!Page_Uptodate(page)) goto page_not_up_to_date; @@ -1440,7 +1381,7 @@ no_cached_page: * We get here with the page cache lock held. 
*/ if (!cached_page) { - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); cached_page = page_cache_alloc(mapping); if (!cached_page) { desc->error = -ENOMEM; @@ -1451,8 +1392,8 @@ no_cached_page: * Somebody may have added the page while we * dropped the page cache lock. Check for that. */ - spin_lock(&pagecache_lock); - page = __find_page_nolock(mapping, index, *hash); + write_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, index); if (page) goto found_page; } @@ -1460,9 +1401,13 @@ no_cached_page: /* * Ok, add the new page to the hash-queues... */ + if (__add_to_page_cache(cached_page, mapping, index) < 0) { + write_unlock(&mapping->page_lock); + desc->error = -ENOMEM; + break; + } page = cached_page; - __add_to_page_cache(page, mapping, index, hash); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); lru_cache_add(page); cached_page = NULL; @@ -1902,7 +1847,7 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address struct file *file = area->vm_file; struct address_space *mapping = file->f_dentry->d_inode->i_mapping; struct inode *inode = mapping->host; - struct page *page, **hash; + struct page *page; unsigned long size, pgoff, endoff; pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff; @@ -1924,9 +1869,8 @@ retry_all: /* * Do we have something in the page cache already? */ - hash = page_hash(mapping, pgoff); retry_find: - page = __find_get_page(mapping, pgoff, hash); + page = find_get_page(mapping, pgoff); if (!page) goto no_cached_page; @@ -2418,20 +2362,25 @@ struct page *__read_cache_page(struct address_space *mapping, int (*filler)(void *,struct page*), void *data) { - struct page **hash = page_hash(mapping, index); struct page *page, *cached_page = NULL; int err; repeat: - page = __find_get_page(mapping, index, hash); + page = find_get_page(mapping, index); if (!page) { if (!cached_page) { cached_page = page_cache_alloc(mapping); if (!cached_page) return ERR_PTR(-ENOMEM); } - page = cached_page; - if (add_to_page_cache_unique(page, mapping, index, hash)) + err = add_to_page_cache_unique(cached_page, mapping, index); + if (err == -EEXIST) goto repeat; + if (err < 0) { + /* Presumably ENOMEM for radix tree node */ + page_cache_release(cached_page); + return ERR_PTR(err); + } + page = cached_page; cached_page = NULL; err = filler(data, page); if (err < 0) { @@ -2486,19 +2435,23 @@ retry: static inline struct page * __grab_cache_page(struct address_space *mapping, unsigned long index, struct page **cached_page) { - struct page *page, **hash = page_hash(mapping, index); + int err; + struct page *page; repeat: - page = __find_lock_page(mapping, index, hash); + page = find_lock_page(mapping, index); if (!page) { if (!*cached_page) { *cached_page = page_cache_alloc(mapping); if (!*cached_page) return NULL; } - page = *cached_page; - if (add_to_page_cache_unique(page, mapping, index, hash)) + err = add_to_page_cache_unique(*cached_page, mapping, index); + if (err == -EEXIST) goto repeat; - *cached_page = NULL; + if (err == 0) { + page = *cached_page; + *cached_page = NULL; + } } return page; } @@ -2772,30 +2725,3 @@ o_direct: status = generic_osync_inode(inode, OSYNC_METADATA); goto out_status; } - -void __init page_cache_init(unsigned long mempages) -{ - unsigned long htable_size, order; - - htable_size = mempages; - htable_size *= sizeof(struct page *); - for(order = 0; (PAGE_SIZE << order) < htable_size; order++) - ; - - do { - unsigned long tmp = (PAGE_SIZE << order) / 
sizeof(struct page *); - - page_hash_bits = 0; - while((tmp >>= 1UL) != 0UL) - page_hash_bits++; - - page_hash_table = (struct page **) - __get_free_pages(GFP_ATOMIC, order); - } while(page_hash_table == NULL && --order > 0); - - printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n", - (1 << page_hash_bits), order, (PAGE_SIZE << order)); - if (!page_hash_table) - panic("Failed to allocate page hash table\n"); - memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *)); -} diff --git a/mm/mincore.c b/mm/mincore.c index 5209b8aa7df8..21ce8614da80 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -27,9 +27,9 @@ static unsigned char mincore_page(struct vm_area_struct * vma, { unsigned char present = 0; struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping; - struct page * page, ** hash = page_hash(as, pgoff); + struct page * page; - page = __find_get_page(as, pgoff, hash); + page = find_get_page(as, pgoff); if (page) { present = Page_Uptodate(page); page_cache_release(page); diff --git a/mm/shmem.c b/mm/shmem.c index a6e7093312da..31a19c4584be 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -370,9 +370,10 @@ static int shmem_unuse_inode (struct shmem_inode_info *info, swp_entry_t entry, swp_entry_t *ptr; unsigned long idx; int offset; - - idx = 0; + spin_lock (&info->lock); +repeat: + idx = 0; offset = shmem_clear_swp (entry, info->i_direct, SHMEM_NR_DIRECT); if (offset >= 0) goto found; @@ -389,13 +390,16 @@ static int shmem_unuse_inode (struct shmem_inode_info *info, swp_entry_t entry, spin_unlock (&info->lock); return 0; found: - delete_from_swap_cache(page); - add_to_page_cache(page, info->vfs_inode.i_mapping, offset + idx); - SetPageDirty(page); - SetPageUptodate(page); - info->swapped--; - spin_unlock(&info->lock); - return 1; + if (!move_from_swap_cache (page, offset+idx, info->vfs_inode.i_mapping)) { + info->swapped--; + SetPageUptodate (page); + spin_unlock (&info->lock); + return 1; + } + + /* Yield for kswapd, and try again */ + yield(); + goto repeat; } /* @@ -425,6 +429,7 @@ void shmem_unuse(swp_entry_t entry, struct page *page) */ static int shmem_writepage(struct page * page) { + int err; struct shmem_inode_info *info; swp_entry_t *entry, swap; struct address_space *mapping; @@ -442,7 +447,6 @@ static int shmem_writepage(struct page * page) info = SHMEM_I(inode); if (info->locked) return fail_writepage(page); -getswap: swap = get_swap_page(); if (!swap.val) return fail_writepage(page); @@ -455,29 +459,20 @@ getswap: if (entry->val) BUG(); - /* Remove it from the page cache */ - remove_inode_page(page); - page_cache_release(page); - - /* Add it to the swap cache */ - if (add_to_swap_cache(page, swap) != 0) { - /* - * Raced with "speculative" read_swap_cache_async. - * Add page back to page cache, unref swap, try again. 
- */ - add_to_page_cache_locked(page, mapping, index); + err = move_to_swap_cache(page, swap); + if (!err) { + *entry = swap; + info->swapped++; spin_unlock(&info->lock); - swap_free(swap); - goto getswap; + SetPageUptodate(page); + set_page_dirty(page); + UnlockPage(page); + return 0; } - *entry = swap; - info->swapped++; spin_unlock(&info->lock); - SetPageUptodate(page); - set_page_dirty(page); - UnlockPage(page); - return 0; + swap_free(swap); + return fail_writepage(page); } /* @@ -493,10 +488,11 @@ getswap: */ static struct page * shmem_getpage_locked(struct shmem_inode_info *info, struct inode * inode, unsigned long idx) { - struct address_space * mapping = inode->i_mapping; + struct address_space *mapping = inode->i_mapping; struct shmem_sb_info *sbinfo; - struct page * page; + struct page *page; swp_entry_t *entry; + int error; repeat: page = find_lock_page(mapping, idx); @@ -524,8 +520,6 @@ repeat: shmem_recalc_inode(inode); if (entry->val) { - unsigned long flags; - /* Look it up and read it in.. */ page = find_get_page(&swapper_space, entry->val); if (!page) { @@ -550,16 +544,18 @@ repeat: goto repeat; } - /* We have to this with page locked to prevent races */ + /* We have to do this with page locked to prevent races */ if (TryLockPage(page)) goto wait_retry; + error = move_from_swap_cache(page, idx, mapping); + if (error < 0) { + UnlockPage(page); + return ERR_PTR(error); + } + swap_free(*entry); *entry = (swp_entry_t) {0}; - delete_from_swap_cache(page); - flags = page->flags & ~((1 << PG_uptodate) | (1 << PG_error) | (1 << PG_referenced) | (1 << PG_arch_1)); - page->flags = flags | (1 << PG_dirty); - add_to_page_cache_locked(page, mapping, idx); info->swapped--; spin_unlock (&info->lock); } else { @@ -581,9 +577,13 @@ repeat: page = page_cache_alloc(mapping); if (!page) return ERR_PTR(-ENOMEM); + error = add_to_page_cache(page, mapping, idx); + if (error < 0) { + page_cache_release(page); + return ERR_PTR(-ENOMEM); + } clear_highpage(page); inode->i_blocks += BLOCKS_PER_PAGE; - add_to_page_cache (page, mapping, idx); } /* We have the page */ diff --git a/mm/swap_state.c b/mm/swap_state.c index 0868f839f7a3..43c28a5072f7 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -37,11 +37,13 @@ static struct address_space_operations swap_aops = { }; struct address_space swapper_space = { - LIST_HEAD_INIT(swapper_space.clean_pages), - LIST_HEAD_INIT(swapper_space.dirty_pages), - LIST_HEAD_INIT(swapper_space.locked_pages), - 0, /* nrpages */ - &swap_aops, + page_tree: RADIX_TREE_INIT(GFP_ATOMIC), + page_lock: RW_LOCK_UNLOCKED, + clean_pages: LIST_HEAD_INIT(swapper_space.clean_pages), + dirty_pages: LIST_HEAD_INIT(swapper_space.dirty_pages), + locked_pages: LIST_HEAD_INIT(swapper_space.locked_pages), + a_ops: &swap_aops, + i_shared_lock: SPIN_LOCK_UNLOCKED, }; #ifdef SWAP_CACHE_INFO @@ -69,17 +71,21 @@ void show_swap_cache_info(void) int add_to_swap_cache(struct page *page, swp_entry_t entry) { + int error; + if (page->mapping) BUG(); if (!swap_duplicate(entry)) { INC_CACHE_INFO(noent_race); return -ENOENT; } - if (add_to_page_cache_unique(page, &swapper_space, entry.val, - page_hash(&swapper_space, entry.val)) != 0) { + + error = add_to_page_cache_unique(page, &swapper_space, entry.val); + if (error != 0) { swap_free(entry); - INC_CACHE_INFO(exist_race); - return -EEXIST; + if (error == -EEXIST) + INC_CACHE_INFO(exist_race); + return error; } if (!PageLocked(page)) BUG(); @@ -121,14 +127,96 @@ void delete_from_swap_cache(struct page *page) entry.val = page->index; - 
spin_lock(&pagecache_lock); + write_lock(&swapper_space.page_lock); __delete_from_swap_cache(page); - spin_unlock(&pagecache_lock); + write_unlock(&swapper_space.page_lock); swap_free(entry); page_cache_release(page); } +int move_to_swap_cache(struct page *page, swp_entry_t entry) +{ + struct address_space *mapping = page->mapping; + void **pslot; + int err; + + if (!mapping) + BUG(); + + if (!swap_duplicate(entry)) { + INC_CACHE_INFO(noent_race); + return -ENOENT; + } + + write_lock(&swapper_space.page_lock); + write_lock(&mapping->page_lock); + + err = radix_tree_reserve(&swapper_space.page_tree, entry.val, &pslot); + if (!err) { + /* Remove it from the page cache */ + __remove_inode_page (page); + + /* Add it to the swap cache */ + *pslot = page; + page->flags = ((page->flags & ~(1 << PG_uptodate | 1 << PG_error + | 1 << PG_dirty | 1 << PG_referenced + | 1 << PG_arch_1 | 1 << PG_checked)) + | (1 << PG_locked)); + ___add_to_page_cache(page, &swapper_space, entry.val); + } + + write_unlock(&mapping->page_lock); + write_unlock(&swapper_space.page_lock); + + if (!err) { + INC_CACHE_INFO(add_total); + return 0; + } + + swap_free(entry); + + if (err == -EEXIST) + INC_CACHE_INFO(exist_race); + + return err; +} + +int move_from_swap_cache(struct page *page, unsigned long index, + struct address_space *mapping) +{ + void **pslot; + int err; + + if (!PageLocked(page)) + BUG(); + + write_lock(&swapper_space.page_lock); + write_lock(&mapping->page_lock); + + err = radix_tree_reserve(&mapping->page_tree, index, &pslot); + if (!err) { + swp_entry_t entry; + + block_flushpage(page, 0); + entry.val = page->index; + __delete_from_swap_cache(page); + swap_free(entry); + + *pslot = page; + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | + 1 << PG_referenced | 1 << PG_arch_1 | + 1 << PG_checked); + page->flags |= (1 << PG_dirty); + ___add_to_page_cache(page, mapping, index); + } + + write_unlock(&mapping->page_lock); + write_unlock(&swapper_space.page_lock); + + return err; +} + /* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. Can not do a lock_page, @@ -213,6 +301,7 @@ struct page * read_swap_cache_async(swp_entry_t entry) * swap cache: added by a racing read_swap_cache_async, * or by try_to_swap_out (or shmem_writepage) re-using * the just freed swap entry for an existing page. + * May fail (-ENOMEM) if radix-tree node allocation failed. */ err = add_to_swap_cache(new_page, entry); if (!err) { @@ -222,7 +311,7 @@ struct page * read_swap_cache_async(swp_entry_t entry) rw_swap_page(READ, new_page); return new_page; } - } while (err != -ENOENT); + } while (err != -ENOENT && err != -ENOMEM); if (new_page) page_cache_release(new_page); diff --git a/mm/swapfile.c b/mm/swapfile.c index 840aeee01df1..32c740c01213 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -239,10 +239,10 @@ static int exclusive_swap_page(struct page *page) /* Is the only swap cache user the cache itself? */ if (p->swap_map[SWP_OFFSET(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&pagecache_lock); + read_lock(&swapper_space.page_lock); if (page_count(page) - !!page->buffers == 2) retval = 1; - spin_unlock(&pagecache_lock); + read_unlock(&swapper_space.page_lock); } swap_info_put(p); } @@ -307,13 +307,13 @@ int remove_exclusive_swap_page(struct page *page) retval = 0; if (p->swap_map[SWP_OFFSET(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. 
*/ - spin_lock(&pagecache_lock); + read_lock(&swapper_space.page_lock); if (page_count(page) - !!page->buffers == 2) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; } - spin_unlock(&pagecache_lock); + read_unlock(&swapper_space.page_lock); } swap_info_put(p); diff --git a/mm/vmscan.c b/mm/vmscan.c index f00425ad7bf8..fe363d2f3050 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -138,10 +138,16 @@ drop_pte: * (adding to the page cache will clear the dirty * and uptodate bits, so we need to do it again) */ - if (add_to_swap_cache(page, entry) == 0) { + switch (add_to_swap_cache(page, entry)) { + case 0: /* Success */ SetPageUptodate(page); set_page_dirty(page); goto set_swap_pte; + case -ENOMEM: /* radix-tree allocation */ + swap_free(entry); + goto preserve; + default: /* ENOENT: raced */ + break; } /* Raced with "speculative" read_swap_cache_async */ swap_free(entry); @@ -341,6 +347,7 @@ static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int priority) { struct list_head * entry; + struct address_space *mapping; int max_scan = nr_inactive_pages / priority; int max_mapped = nr_pages << (9 - priority); @@ -395,7 +402,9 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, continue; } - if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) { + mapping = page->mapping; + + if (PageDirty(page) && is_page_cache_freeable(page) && mapping) { /* * It is not critical here to write it only if * the page is unmapped beause any direct writer @@ -406,7 +415,7 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, */ int (*writepage)(struct page *); - writepage = page->mapping->a_ops->writepage; + writepage = mapping->a_ops->writepage; if ((gfp_mask & __GFP_FS) && writepage) { ClearPageDirty(page); SetPageLaunder(page); @@ -433,7 +442,7 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, page_cache_get(page); if (try_to_release_page(page, gfp_mask)) { - if (!page->mapping) { + if (!mapping) { /* * We must not allow an anon page * with no buffers to be visible on @@ -470,33 +479,35 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, } } - spin_lock(&pagecache_lock); - /* - * this is the non-racy check for busy page. + * This is the non-racy check for busy page. */ - if (!page->mapping || !is_page_cache_freeable(page)) { - spin_unlock(&pagecache_lock); - UnlockPage(page); + if (mapping) { + write_lock(&mapping->page_lock); + if (is_page_cache_freeable(page)) + goto page_freeable; + write_unlock(&mapping->page_lock); + } + UnlockPage(page); page_mapped: - if (--max_mapped >= 0) - continue; + if (--max_mapped >= 0) + continue; - /* - * Alert! We've found too many mapped pages on the - * inactive list, so we start swapping out now! - */ - spin_unlock(&pagemap_lru_lock); - swap_out(priority, gfp_mask, classzone); - return nr_pages; - } + /* + * Alert! We've found too many mapped pages on the + * inactive list, so we start swapping out now! + */ + spin_unlock(&pagemap_lru_lock); + swap_out(priority, gfp_mask, classzone); + return nr_pages; +page_freeable: /* * It is critical to check PageDirty _after_ we made sure * the page is freeable* so not in use by anybody. 
*/ if (PageDirty(page)) { - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); UnlockPage(page); continue; } @@ -504,12 +515,12 @@ page_mapped: /* point of no return */ if (likely(!PageSwapCache(page))) { __remove_inode_page(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); } else { swp_entry_t swap; swap.val = page->index; __delete_from_swap_cache(page); - spin_unlock(&pagecache_lock); + write_unlock(&mapping->page_lock); swap_free(swap); } -- cgit v1.2.3 From 8fa498462272fec2c16a92a9a7f67d005225b640 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 9 Apr 2002 21:29:32 -0700 Subject: [PATCH] readahead I'd like to be able to claim amazing speedups, but the best benchmark I could find was diffing two 256 megabyte files, which is about 10% quicker. And that is probably due to the window size being effectively 50% larger. Fact is, any disk worth owning nowadays has a segmented 2-megabyte cache, and OS-level readahead mainly seems to save on CPU cycles rather than overall throughput. Once you start reading more streams than there are segments in the disk cache we start to win. Still. The main motivation for this work is to clean the code up, and to create a central point at which many pages are marshalled together so that they can all be encapsulated into the smallest possible number of BIOs, and injected into the request layer. A number of filesystems were poking around inside the readahead state variables. I'm not really sure what they were up to, but I took all that out. The readahead code manages its own state autonomously and should not need any hints. - Unifies the current three readahead functions (mmap reads, read(2) and sys_readahead) into a single implementation. - More aggressive in building up the readahead windows. - More conservative in tearing them down. - Special start-of-file heuristics. - Preallocates the readahead pages, to avoid the (never demonstrated, but potentially catastrophic) scenario where allocation of readahead pages causes the allocator to perform VM writeout. - Gets all the readahead pages gathered together in one spot, so they can be marshalled into big BIOs. - Reinstates the readahead ioctls, so hdparm(8) and blockdev(8) are working again. The readahead settings are now per-request-queue, and the drivers never have to know about it. I use blockdev(8). It works in units of 512 bytes. - Identifies readahead thrashing. Also attempts to handle it. Certainly the changes here delay the onset of catastrophic readahead thrashing by quite a lot, and decrease its seriousness as we get more deeply into it, but it's still pretty bad.
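For illustration only (not part of the patch): a minimal userspace sketch of driving the reinstated ioctls. The device path is a placeholder, BLKRASET requires CAP_SYS_ADMIN, and all values are in 512-byte sectors, matching blockdev(8).

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>			/* BLKRAGET, BLKRASET */

int main(void)
{
	long ra = 0;
	int fd = open("/dev/hda", O_RDONLY);	/* placeholder device */

	if (fd < 0)
		return 1;
	if (ioctl(fd, BLKRAGET, &ra) == 0)	/* readahead, in 512-byte sectors */
		printf("readahead: %ld sectors (%ld KB)\n", ra, ra / 2);
	if (ioctl(fd, BLKRASET, 256UL) < 0)	/* ask for 128 KB; needs CAP_SYS_ADMIN */
		perror("BLKRASET");
	close(fd);
	return 0;
}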
--- drivers/block/blkpg.c | 12 ++ drivers/block/ll_rw_blk.c | 44 ++++- drivers/md/md.c | 6 +- fs/block_dev.c | 19 +- fs/hfs/file.c | 1 - fs/hfs/file_cap.c | 1 - fs/hfs/file_hdr.c | 2 - fs/intermezzo/vfs.c | 1 - fs/nfsd/vfs.c | 37 +--- fs/open.c | 1 - fs/read_write.c | 3 - include/linux/blkdev.h | 12 +- include/linux/fs.h | 16 +- include/linux/mm.h | 7 + include/linux/raid/md_k.h | 1 - mm/Makefile | 2 +- mm/filemap.c | 447 ++++------------------------------------------ mm/readahead.c | 345 +++++++++++++++++++++++++++++++++++ 18 files changed, 490 insertions(+), 467 deletions(-) create mode 100644 mm/readahead.c (limited to 'include/linux') diff --git a/drivers/block/blkpg.c b/drivers/block/blkpg.c index 9909c3896ee5..6018ec1990fd 100644 --- a/drivers/block/blkpg.c +++ b/drivers/block/blkpg.c @@ -237,6 +237,18 @@ int blk_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) intval = (is_read_only(dev) != 0); return put_user(intval, (int *)(arg)); + case BLKRASET: + case BLKFRASET: + if(!capable(CAP_SYS_ADMIN)) + return -EACCES; + return blk_set_readahead(dev, arg); + + case BLKRAGET: + case BLKFRAGET: + if (!arg) + return -EINVAL; + return put_user(blk_get_readahead(dev), (long *)arg); + case BLKSECTGET: if ((q = blk_get_queue(dev)) == NULL) return -EINVAL; diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 5dee03f788fb..6cc85ffd0f4e 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -108,6 +108,47 @@ inline request_queue_t *blk_get_queue(kdev_t dev) return &blk_dev[major(dev)].request_queue; } +/** + * blk_set_readahead - set a queue's readahead tunable + * @dev: device + * @sectors: readahead, in 512 byte sectors + * + * Returns zero on success, else negative errno + */ +int blk_set_readahead(kdev_t dev, unsigned sectors) +{ + int ret = -EINVAL; + request_queue_t *q = blk_get_queue(dev); + + if (q) { + q->ra_sectors = sectors; + ret = 0; + } + return ret; +} + +/** + * blk_get_readahead - query a queue's readahead tunable + * @dev: device + * + * Locates the passed device's request queue and returns its + * readahead setting. + * + * The returned value is in units of 512 byte sectors. + * + * Will return zero if the queue has never had its readahead + * setting altered. 
+ */ +unsigned blk_get_readahead(kdev_t dev) +{ + unsigned ret = 0; + request_queue_t *q = blk_get_queue(dev); + + if (q) + ret = q->ra_sectors; + return ret; +} + void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn) { q->prep_rq_fn = pfn; @@ -810,7 +851,8 @@ int blk_init_queue(request_queue_t *q, request_fn_proc *rfn, spinlock_t *lock) q->plug_tq.data = q; q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); q->queue_lock = lock; - + q->ra_sectors = 0; /* Use VM default */ + blk_queue_segment_boundary(q, 0xffffffff); blk_queue_make_request(q, __make_request); diff --git a/drivers/md/md.c b/drivers/md/md.c index 877b114638bf..8fda89927e90 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1577,7 +1577,7 @@ static int device_size_calculation(mddev_t * mddev) if (!md_size[mdidx(mddev)]) md_size[mdidx(mddev)] = sb->size * data_disks; - readahead = MD_READAHEAD; + readahead = (blk_get_readahead(rdev->dev) * 512) / PAGE_SIZE; if (!sb->level || (sb->level == 4) || (sb->level == 5)) { readahead = (mddev->sb->chunk_size>>PAGE_SHIFT) * 4 * data_disks; if (readahead < data_disks * (MAX_SECTORS>>(PAGE_SHIFT-9))*2) @@ -3387,7 +3387,7 @@ recheck: /* * Tune reconstruction: */ - window = MAX_READAHEAD*(PAGE_SIZE/512); + window = 32*(PAGE_SIZE/512); printk(KERN_INFO "md: using %dk window, over a total of %d blocks.\n", window/2,max_sectors/2); @@ -3605,7 +3605,7 @@ static void md_geninit(void) for(i = 0; i < MAX_MD_DEVS; i++) { md_blocksizes[i] = 1024; md_size[i] = 0; - md_maxreadahead[i] = MD_READAHEAD; + md_maxreadahead[i] = 32; } blksize_size[MAJOR_NR] = md_blocksizes; blk_size[MAJOR_NR] = md_size; diff --git a/fs/block_dev.c b/fs/block_dev.c index 85c21bc48c3d..0d23362f4e76 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -19,6 +19,7 @@ #include #include #include +#include #include @@ -172,7 +173,6 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) if (offset >= 0 && offset <= size) { if (offset != file->f_pos) { file->f_pos = offset; - file->f_reada = 0; file->f_version = ++event; } retval = offset; @@ -692,9 +692,20 @@ int blkdev_close(struct inode * inode, struct file * filp) static int blkdev_ioctl(struct inode *inode, struct file *file, unsigned cmd, unsigned long arg) { - if (inode->i_bdev->bd_op->ioctl) - return inode->i_bdev->bd_op->ioctl(inode, file, cmd, arg); - return -EINVAL; + int ret = -EINVAL; + switch (cmd) { + case BLKRAGET: + case BLKFRAGET: + case BLKRASET: + case BLKFRASET: + ret = blk_ioctl(inode->i_bdev, cmd, arg); + break; + default: + if (inode->i_bdev->bd_op->ioctl) + ret =inode->i_bdev->bd_op->ioctl(inode, file, cmd, arg); + break; + } + return ret; } struct address_space_operations def_blk_aops = { diff --git a/fs/hfs/file.c b/fs/hfs/file.c index a5df68e40ec5..9a16442c98d6 100644 --- a/fs/hfs/file.c +++ b/fs/hfs/file.c @@ -166,7 +166,6 @@ static hfs_rwret_t hfs_file_read(struct file * filp, char * buf, } if ((read = hfs_do_read(inode, HFS_I(inode)->fork, pos, buf, left)) > 0) { *ppos += read; - filp->f_reada = 1; } return read; diff --git a/fs/hfs/file_cap.c b/fs/hfs/file_cap.c index 7a02ebaa6154..b7ac5722785d 100644 --- a/fs/hfs/file_cap.c +++ b/fs/hfs/file_cap.c @@ -105,7 +105,6 @@ static loff_t cap_info_llseek(struct file *file, loff_t offset, int origin) if (offset>=0 && offset<=HFS_FORK_MAX) { if (offset != file->f_pos) { file->f_pos = offset; - file->f_reada = 0; } retval = offset; } diff --git a/fs/hfs/file_hdr.c b/fs/hfs/file_hdr.c index db6c6a518c3a..cb5673bbe448 100644 --- a/fs/hfs/file_hdr.c +++ b/fs/hfs/file_hdr.c @@ -361,7 +361,6 @@ 
loff_t hdr_llseek(struct file *file, loff_t offset, int origin) if (offset>=0 && offsetf_dentry->d_inode->i_size) { if (offset != file->f_pos) { file->f_pos = offset; - file->f_reada = 0; } retval = offset; } @@ -594,7 +593,6 @@ hfs_did_done: } else if (fork) { left = hfs_do_read(inode, fork, offset, buf, left); if (left > 0) { - filp->f_reada = 1; } else if (!read) { return left; } else { diff --git a/fs/intermezzo/vfs.c b/fs/intermezzo/vfs.c index 96a392239cf5..613a1c3add4c 100644 --- a/fs/intermezzo/vfs.c +++ b/fs/intermezzo/vfs.c @@ -1884,7 +1884,6 @@ static struct file *presto_filp_dopen(struct dentry *dentry, int flags) f->f_dentry = dentry; f->f_pos = 0; - f->f_reada = 0; f->f_op = NULL; if (inode->i_op) /* XXX should we set to presto ops, or leave at cache ops? */ diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 17a3867aaf26..aecd48f0d130 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -67,11 +67,7 @@ struct raparms { unsigned int p_count; ino_t p_ino; kdev_t p_dev; - unsigned long p_reada, - p_ramax, - p_raend, - p_ralen, - p_rawin; + struct file_ra_state p_ra; }; static struct raparms * raparml; @@ -564,11 +560,7 @@ nfsd_get_raparms(kdev_t dev, ino_t ino) ra = *frap; ra->p_dev = dev; ra->p_ino = ino; - ra->p_reada = 0; - ra->p_ramax = 0; - ra->p_raend = 0; - ra->p_ralen = 0; - ra->p_rawin = 0; + memset(&ra->p_ra, 0, sizeof(ra->p_ra)); found: if (rap != &raparm_cache) { *rap = ra->p_next; @@ -611,31 +603,18 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, /* Get readahead parameters */ ra = nfsd_get_raparms(inode->i_dev, inode->i_ino); - if (ra) { - file.f_reada = ra->p_reada; - file.f_ramax = ra->p_ramax; - file.f_raend = ra->p_raend; - file.f_ralen = ra->p_ralen; - file.f_rawin = ra->p_rawin; - } + if (ra) + file.f_ra = ra->p_ra; file.f_pos = offset; - oldfs = get_fs(); set_fs(KERNEL_DS); + oldfs = get_fs(); + set_fs(KERNEL_DS); err = file.f_op->read(&file, buf, *count, &file.f_pos); set_fs(oldfs); /* Write back readahead params */ - if (ra != NULL) { - dprintk("nfsd: raparms %ld %ld %ld %ld %ld\n", - file.f_reada, file.f_ramax, file.f_raend, - file.f_ralen, file.f_rawin); - ra->p_reada = file.f_reada; - ra->p_ramax = file.f_ramax; - ra->p_raend = file.f_raend; - ra->p_ralen = file.f_ralen; - ra->p_rawin = file.f_rawin; - ra->p_count -= 1; - } + if (ra) + ra->p_ra = file.f_ra; if (err >= 0) { nfsdstats.io_read += err; diff --git a/fs/open.c b/fs/open.c index 84a46e56a11c..58853a70da43 100644 --- a/fs/open.c +++ b/fs/open.c @@ -635,7 +635,6 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags) f->f_dentry = dentry; f->f_vfsmnt = mnt; f->f_pos = 0; - f->f_reada = 0; f->f_op = fops_get(inode->i_fop); file_move(f, &inode->i_sb->s_files); diff --git a/fs/read_write.c b/fs/read_write.c index be8782266d6a..243b2be574f8 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -37,7 +37,6 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) if (offset>=0 && offset<=inode->i_sb->s_maxbytes) { if (offset != file->f_pos) { file->f_pos = offset; - file->f_reada = 0; file->f_version = ++event; } retval = offset; @@ -62,7 +61,6 @@ loff_t remote_llseek(struct file *file, loff_t offset, int origin) if (offset>=0 && offset<=file->f_dentry->d_inode->i_sb->s_maxbytes) { if (offset != file->f_pos) { file->f_pos = offset; - file->f_reada = 0; file->f_version = ++event; } retval = offset; @@ -92,7 +90,6 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) if (offset >= 0) { if (offset != file->f_pos) { 
file->f_pos = offset; - file->f_reada = 0; file->f_version = ++event; } retval = offset; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 7a43ff774fe0..914498e8e4b9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -152,6 +152,12 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; + /* + * The VM-level readahead tunable for this device. In + * units of 512-byte sectors. + */ + unsigned ra_sectors; + /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. @@ -308,6 +314,8 @@ extern void blk_queue_hardsect_size(request_queue_t *q, unsigned short); extern void blk_queue_segment_boundary(request_queue_t *q, unsigned long); extern void blk_queue_assign_lock(request_queue_t *q, spinlock_t *); extern void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn); +extern int blk_set_readahead(kdev_t dev, unsigned sectors); +extern unsigned blk_get_readahead(kdev_t dev); extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); @@ -322,10 +330,6 @@ extern int * blksize_size[MAX_BLKDEV]; #define MAX_SEGMENT_SIZE 65536 -/* read-ahead in pages.. */ -#define MAX_READAHEAD 31 -#define MIN_READAHEAD 3 - #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) extern void drive_stat_acct(struct request *, int, int); diff --git a/include/linux/fs.h b/include/linux/fs.h index 676ddd28ebe2..75c9b5892f38 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -173,12 +173,10 @@ extern int leases_enable, dir_notify_enable, lease_break_time; #define BLKRRPART _IO(0x12,95) /* re-read partition table */ #define BLKGETSIZE _IO(0x12,96) /* return device size /512 (long *arg) */ #define BLKFLSBUF _IO(0x12,97) /* flush buffer cache */ -#if 0 /* Obsolete, these don't do anything. 
*/ #define BLKRASET _IO(0x12,98) /* set read ahead for block device */ #define BLKRAGET _IO(0x12,99) /* get current read ahead setting */ #define BLKFRASET _IO(0x12,100)/* set filesystem (mm/filemap.c) read-ahead */ #define BLKFRAGET _IO(0x12,101)/* get filesystem (mm/filemap.c) read-ahead */ -#endif #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */ #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */ #define BLKSSZGET _IO(0x12,104)/* get block device sector size */ @@ -487,6 +485,18 @@ struct fown_struct { int signum; /* posix.1b rt signal to be delivered on IO */ }; +/* + * Track a single file's readahead state + */ +struct file_ra_state { + unsigned long start; /* Current window */ + unsigned long size; + unsigned long next_size; /* Next window size */ + unsigned long prev_page; /* Cache last read() position */ + unsigned long ahead_start; /* Ahead window */ + unsigned long ahead_size; +}; + struct file { struct list_head f_list; struct dentry *f_dentry; @@ -496,10 +506,10 @@ struct file { unsigned int f_flags; mode_t f_mode; loff_t f_pos; - unsigned long f_reada, f_ramax, f_raend, f_ralen, f_rawin; struct fown_struct f_owner; unsigned int f_uid, f_gid; int f_error; + struct file_ra_state f_ra; unsigned long f_version; diff --git a/include/linux/mm.h b/include/linux/mm.h index 05293f0ab136..9f676c226ce8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -531,6 +531,13 @@ extern void truncate_inode_pages(struct address_space *, loff_t); extern int filemap_sync(struct vm_area_struct *, unsigned long, size_t, unsigned int); extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int); +/* readahead.c */ +void do_page_cache_readahead(struct file *file, + unsigned long offset, unsigned long nr_to_read); +void page_cache_readahead(struct file *file, unsigned long offset); +void page_cache_readaround(struct file *file, unsigned long offset); +void handle_ra_thrashing(struct file *file); + /* vma is the first one with address < vma->vm_end, * and even address < vma->vm_start. Have to extend vma. 
*/ static inline int expand_stack(struct vm_area_struct * vma, unsigned long address) diff --git a/include/linux/raid/md_k.h b/include/linux/raid/md_k.h index c0c21e26f2c3..997d45fa7be7 100644 --- a/include/linux/raid/md_k.h +++ b/include/linux/raid/md_k.h @@ -91,7 +91,6 @@ static inline mddev_t * kdev_to_mddev (kdev_t dev) /* * default readahead */ -#define MD_READAHEAD MAX_READAHEAD static inline int disk_faulty(mdp_disk_t * d) { diff --git a/mm/Makefile b/mm/Makefile index 5ad80b427103..65e9a3717247 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -14,6 +14,6 @@ export-objs := shmem.o filemap.o mempool.o page_alloc.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o highmem.o mempool.o msync.o mincore.o + shmem.o highmem.o mempool.o msync.o mincore.o readahead.o include $(TOPDIR)/Rules.make diff --git a/mm/filemap.c b/mm/filemap.c index de48c026a9a4..099b144be1fa 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -263,7 +264,6 @@ static int truncate_list_pages(struct address_space *mapping, return unlocked; } - /** * truncate_inode_pages - truncate *all* the pages from an offset * @mapping: mapping to truncate @@ -660,28 +660,6 @@ static int page_cache_read(struct file * file, unsigned long offset) return error == -EEXIST ? 0 : error; } -/* - * Read in an entire cluster at once. A cluster is usually a 64k- - * aligned block that includes the page requested in "offset." - */ -static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset, - unsigned long filesize)); -static int read_cluster_nonblocking(struct file * file, unsigned long offset, - unsigned long filesize) -{ - unsigned long pages = CLUSTER_PAGES; - - offset = CLUSTER_OFFSET(offset); - while ((pages-- > 0) && (offset < filesize)) { - int error = page_cache_read(file, offset); - if (error < 0) - return error; - offset ++; - } - - return 0; -} - /* * In order to wait for pages to become available there must be * waitqueues associated with pages. By using a hash table of @@ -954,232 +932,6 @@ struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long return page; } -#if 0 -#define PROFILE_READAHEAD -#define DEBUG_READAHEAD -#endif - -/* - * Read-ahead profiling information - * -------------------------------- - * Every PROFILE_MAXREADCOUNT, the following information is written - * to the syslog: - * Percentage of asynchronous read-ahead. - * Average of read-ahead fields context value. - * If DEBUG_READAHEAD is defined, a snapshot of these fields is written - * to the syslog. 
- */ - -#ifdef PROFILE_READAHEAD - -#define PROFILE_MAXREADCOUNT 1000 - -static unsigned long total_reada; -static unsigned long total_async; -static unsigned long total_ramax; -static unsigned long total_ralen; -static unsigned long total_rawin; - -static void profile_readahead(int async, struct file *filp) -{ - unsigned long flags; - - ++total_reada; - if (async) - ++total_async; - - total_ramax += filp->f_ramax; - total_ralen += filp->f_ralen; - total_rawin += filp->f_rawin; - - if (total_reada > PROFILE_MAXREADCOUNT) { - save_flags(flags); - cli(); - if (!(total_reada > PROFILE_MAXREADCOUNT)) { - restore_flags(flags); - return; - } - - printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n", - total_ramax/total_reada, - total_ralen/total_reada, - total_rawin/total_reada, - (total_async*100)/total_reada); -#ifdef DEBUG_READAHEAD - printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n", - filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend); -#endif - - total_reada = 0; - total_async = 0; - total_ramax = 0; - total_ralen = 0; - total_rawin = 0; - - restore_flags(flags); - } -} -#endif /* defined PROFILE_READAHEAD */ - -/* - * Read-ahead context: - * ------------------- - * The read ahead context fields of the "struct file" are the following: - * - f_raend : position of the first byte after the last page we tried to - * read ahead. - * - f_ramax : current read-ahead maximum size. - * - f_ralen : length of the current IO read block we tried to read-ahead. - * - f_rawin : length of the current read-ahead window. - * if last read-ahead was synchronous then - * f_rawin = f_ralen - * otherwise (was asynchronous) - * f_rawin = previous value of f_ralen + f_ralen - * - * Read-ahead limits: - * ------------------ - * MIN_READAHEAD : minimum read-ahead size when read-ahead. - * MAX_READAHEAD : maximum read-ahead size when read-ahead. - * - * Synchronous read-ahead benefits: - * -------------------------------- - * Using reasonable IO xfer length from peripheral devices increase system - * performances. - * Reasonable means, in this context, not too large but not too small. - * The actual maximum value is: - * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k is CONFIG_READA_SMALL is undefined - * and 32K if defined (4K page size assumed). - * - * Asynchronous read-ahead benefits: - * --------------------------------- - * Overlapping next read request and user process execution increase system - * performance. - * - * Read-ahead risks: - * ----------------- - * We have to guess which further data are needed by the user process. - * If these data are often not really needed, it's bad for system - * performances. - * However, we know that files are often accessed sequentially by - * application programs and it seems that it is possible to have some good - * strategy in that guessing. - * We only try to read-ahead files that seems to be read sequentially. - * - * Asynchronous read-ahead risks: - * ------------------------------ - * In order to maximize overlapping, we must start some asynchronous read - * request from the device, as soon as possible. - * We must be very careful about: - * - The number of effective pending IO read requests. - * ONE seems to be the only reasonable value. - * - The total memory pool usage for the file access stream. - * This maximum memory usage is implicitly 2 IO read chunks: - * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined, - * 64k if defined (4K page size assumed). 
- */ - -static void generic_file_readahead(int reada_ok, - struct file * filp, struct inode * inode, - struct page * page) -{ - unsigned long end_index; - unsigned long index = page->index; - unsigned long max_ahead, ahead; - unsigned long raend; - - end_index = inode->i_size >> PAGE_CACHE_SHIFT; - - raend = filp->f_raend; - max_ahead = 0; - -/* - * The current page is locked. - * If the current position is inside the previous read IO request, do not - * try to reread previously read ahead pages. - * Otherwise decide or not to read ahead some pages synchronously. - * If we are not going to read ahead, set the read ahead context for this - * page only. - */ - if (PageLocked(page)) { - if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) { - raend = index; - if (raend < end_index) - max_ahead = filp->f_ramax; - filp->f_rawin = 0; - filp->f_ralen = 1; - if (!max_ahead) { - filp->f_raend = index + filp->f_ralen; - filp->f_rawin += filp->f_ralen; - } - } - } -/* - * The current page is not locked. - * If we were reading ahead and, - * if the current max read ahead size is not zero and, - * if the current position is inside the last read-ahead IO request, - * it is the moment to try to read ahead asynchronously. - * We will later force unplug device in order to force asynchronous read IO. - */ - else if (reada_ok && filp->f_ramax && raend >= 1 && - index <= raend && index + filp->f_ralen >= raend) { -/* - * Add ONE page to max_ahead in order to try to have about the same IO max size - * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE. - * Compute the position of the last page we have tried to read in order to - * begin to read ahead just at the next page. - */ - raend -= 1; - if (raend < end_index) - max_ahead = filp->f_ramax + 1; - - if (max_ahead) { - filp->f_rawin = filp->f_ralen; - filp->f_ralen = 0; - reada_ok = 2; - } - } -/* - * Try to read ahead pages. - * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the - * scheduler, will work enough for us to avoid too bad actuals IO requests. - */ - ahead = 0; - while (ahead < max_ahead) { - ahead ++; - if ((raend + ahead) >= end_index) - break; - if (page_cache_read(filp, raend + ahead) < 0) - break; - } -/* - * If we tried to read ahead some pages, - * If we tried to read ahead asynchronously, - * Try to force unplug of the device in order to start an asynchronous - * read IO request. - * Update the read-ahead context. - * Store the length of the current read-ahead window. - * Double the current max read ahead size. - * That heuristic avoid to do some large IO for files that are not really - * accessed sequentially. - */ - if (ahead) { - filp->f_ralen += ahead; - filp->f_rawin += filp->f_ralen; - filp->f_raend = raend + ahead + 1; - - filp->f_ramax += filp->f_ramax; - - if (filp->f_ramax > MAX_READAHEAD) - filp->f_ramax = MAX_READAHEAD; - -#ifdef PROFILE_READAHEAD - profile_readahead((reada_ok == 2), filp); -#endif - } - - return; -} - /* * Mark a page as having seen activity. 
* @@ -1214,52 +966,12 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * struct inode *inode = mapping->host; unsigned long index, offset; struct page *cached_page; - int reada_ok; int error; cached_page = NULL; index = *ppos >> PAGE_CACHE_SHIFT; offset = *ppos & ~PAGE_CACHE_MASK; -/* - * If the current position is outside the previous read-ahead window, - * we reset the current read-ahead context and set read ahead max to zero - * (will be set to just needed value later), - * otherwise, we assume that the file accesses are sequential enough to - * continue read-ahead. - */ - if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) { - reada_ok = 0; - filp->f_raend = 0; - filp->f_ralen = 0; - filp->f_ramax = 0; - filp->f_rawin = 0; - } else { - reada_ok = 1; - } -/* - * Adjust the current value of read-ahead max. - * If the read operation stay in the first half page, force no readahead. - * Otherwise try to increase read ahead max just enough to do the read request. - * Then, at least MIN_READAHEAD if read ahead is ok, - * and at most MAX_READAHEAD in all cases. - */ - if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) { - filp->f_ramax = 0; - } else { - unsigned long needed; - - needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1; - - if (filp->f_ramax < needed) - filp->f_ramax = needed; - - if (reada_ok && filp->f_ramax < MIN_READAHEAD) - filp->f_ramax = MIN_READAHEAD; - if (filp->f_ramax > MAX_READAHEAD) - filp->f_ramax = MAX_READAHEAD; - } - for (;;) { struct page *page; unsigned long end_index, nr, ret; @@ -1275,6 +987,8 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * break; } + page_cache_readahead(filp, index); + nr = nr - offset; /* @@ -1283,15 +997,18 @@ void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * write_lock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, index); - if (!page) + if (!page) { + write_unlock(&mapping->page_lock); + handle_ra_thrashing(filp); + write_lock(&mapping->page_lock); goto no_cached_page; + } found_page: page_cache_get(page); write_unlock(&mapping->page_lock); if (!Page_Uptodate(page)) goto page_not_up_to_date; - generic_file_readahead(reada_ok, filp, inode, page); page_ok: /* If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -1301,10 +1018,9 @@ page_ok: flush_dcache_page(page); /* - * Mark the page accessed if we read the - * beginning or we just did an lseek. + * Mark the page accessed if we read the beginning. */ - if (!offset || !filp->f_reada) + if (!offset) mark_page_accessed(page); /* @@ -1327,12 +1043,7 @@ page_ok: continue; break; -/* - * Ok, the page was not immediately readable, so let's try to read ahead while we're at it.. - */ page_not_up_to_date: - generic_file_readahead(reada_ok, filp, inode, page); - if (Page_Uptodate(page)) goto page_ok; @@ -1359,9 +1070,6 @@ readpage: if (!error) { if (Page_Uptodate(page)) goto page_ok; - - /* Again, try some read-ahead while waiting for the page to finish.. 
*/ - generic_file_readahead(reada_ok, filp, inode, page); wait_on_page(page); if (Page_Uptodate(page)) goto page_ok; @@ -1415,7 +1123,6 @@ no_cached_page: } *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; - filp->f_reada = 1; if (cached_page) page_cache_release(cached_page); UPDATE_ATIME(inode); @@ -1740,24 +1447,12 @@ static ssize_t do_readahead(struct file *file, unsigned long index, unsigned lon if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; - /* Limit it to the size of the file.. */ - max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT; - if (index > max) - return 0; - max -= index; - if (nr > max) - nr = max; - - /* And limit it to a sane percentage of the inactive list.. */ + /* Limit it to a sane percentage of the inactive list.. */ max = nr_inactive_pages / 2; if (nr > max) nr = max; - while (nr) { - page_cache_read(file, index); - index++; - nr--; - } + do_page_cache_readahead(file, index, nr); return 0; } @@ -1771,7 +1466,8 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) if (file) { if (file->f_mode & FMODE_READ) { unsigned long start = offset >> PAGE_CACHE_SHIFT; - unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT; + unsigned long end = (offset + count - 1) >> PAGE_CACHE_SHIFT; + unsigned long len = end - start + 1; ret = do_readahead(file, start, len); } fput(file); @@ -1779,60 +1475,6 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) return ret; } -/* - * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are - * sure this is sequential access, we don't need a flexible read-ahead - * window size -- we can always use a large fixed size window. - */ -static void nopage_sequential_readahead(struct vm_area_struct * vma, - unsigned long pgoff, unsigned long filesize) -{ - unsigned long ra_window; - - ra_window = CLUSTER_OFFSET(MAX_READAHEAD + CLUSTER_PAGES - 1); - - /* vm_raend is zero if we haven't read ahead in this area yet. */ - if (vma->vm_raend == 0) - vma->vm_raend = vma->vm_pgoff + ra_window; - - /* - * If we've just faulted the page half-way through our window, - * then schedule reads for the next window, and release the - * pages in the previous window. - */ - if ((pgoff + (ra_window >> 1)) == vma->vm_raend) { - unsigned long start = vma->vm_pgoff + vma->vm_raend; - unsigned long end = start + ra_window; - - if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff)) - end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff; - if (start > end) - return; - - while ((start < end) && (start < filesize)) { - if (read_cluster_nonblocking(vma->vm_file, - start, filesize) < 0) - break; - start += CLUSTER_PAGES; - } - run_task_queue(&tq_disk); - - /* if we're far enough past the beginning of this area, - recycle pages that are in the previous window. */ - if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) { - unsigned long window = ra_window << PAGE_SHIFT; - - end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT); - end -= window + window; - filemap_sync(vma, end - window, window, MS_INVALIDATE); - } - - vma->vm_raend += ra_window; - } - - return; -} - /* * filemap_nopage() is invoked via the vma operations vector for a * mapped memory region to read in file data during a page fault. @@ -1841,6 +1483,7 @@ static void nopage_sequential_readahead(struct vm_area_struct * vma, * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. 
*/ + struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused) { int error; @@ -1866,6 +1509,20 @@ retry_all: if (size > endoff) size = endoff; + /* + * The readahead code wants to be told about each and every page + * so it can build and shrink its windows appropriately + */ + if (VM_SequentialReadHint(area)) + page_cache_readahead(area->vm_file, pgoff); + + /* + * If the offset is outside the mapping size we're off the end + * of a privately mapped file, so we need to map a zero page. + */ + if ((pgoff < size) && !VM_RandomReadHint(area)) + page_cache_readaround(file, pgoff); + /* * Do we have something in the page cache already? */ @@ -1882,12 +1539,6 @@ retry_find: goto page_not_uptodate; success: - /* - * Try read-ahead for sequential areas. - */ - if (VM_SequentialReadHint(area)) - nopage_sequential_readahead(area, pgoff, size); - /* * Found the page and have a reference on it, need to check sharing * and possibly copy it over to another page.. @@ -1898,16 +1549,10 @@ success: no_cached_page: /* - * If the requested offset is within our file, try to read a whole - * cluster of pages at once. - * - * Otherwise, we're off the end of a privately mapped file, - * so we need to map a zero page. + * We're only likely to ever get here if MADV_RANDOM is in + * effect. */ - if ((pgoff < size) && !VM_RandomReadHint(area)) - error = read_cluster_nonblocking(file, pgoff, size); - else - error = page_cache_read(file, pgoff); + error = page_cache_read(file, pgoff); /* * The page we want has now been added to the page cache. @@ -2152,7 +1797,7 @@ static long madvise_behavior(struct vm_area_struct * vma, * to make sure they are started. Do not wait for completion. */ static long madvise_willneed(struct vm_area_struct * vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end) { long error = -EBADF; struct file * file; @@ -2177,30 +1822,8 @@ static long madvise_willneed(struct vm_area_struct * vma, if ((vma->vm_mm->rss + (end - start)) > rlim_rss) return error; - /* round to cluster boundaries if this isn't a "random" area. */ - if (!VM_RandomReadHint(vma)) { - start = CLUSTER_OFFSET(start); - end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1); - - while ((start < end) && (start < size)) { - error = read_cluster_nonblocking(file, start, size); - start += CLUSTER_PAGES; - if (error < 0) - break; - } - } else { - while ((start < end) && (start < size)) { - error = page_cache_read(file, start); - start++; - if (error < 0) - break; - } - } - - /* Don't wait for someone else to push these requests. */ - run_task_queue(&tq_disk); - - return error; + do_page_cache_readahead(file, start, end - start); + return 0; } /* diff --git a/mm/readahead.c b/mm/readahead.c new file mode 100644 index 000000000000..6663da13db2e --- /dev/null +++ b/mm/readahead.c @@ -0,0 +1,345 @@ +/* + * mm/readahead.c - address_space-level file readahead. + * + * Copyright (C) 2002, Linus Torvalds + * + * 09Apr2002 akpm@zip.com.au + * Initial version. + */ + +#include +#include +#include +#include + +/* + * The readahead logic manages two readahead windows. The "current" + * and the "ahead" windows. + * + * VM_MAX_READAHEAD specifies, in kilobytes, the maximum size of + * each of the two windows. So the amount of readahead which is + * in front of the file pointer varies between VM_MAX_READAHEAD and + * VM_MAX_READAHEAD * 2. + * + * VM_MAX_READAHEAD only applies if the underlying request queue + * has a zero value of ra_sectors. 
+ */ + +#define VM_MAX_READAHEAD 128 /* kbytes */ +#define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ + +/* + * Return max readahead size for this inode in number-of-pages. + */ +static int get_max_readahead(struct inode *inode) +{ + unsigned blk_ra_kbytes = 0; + + blk_ra_kbytes = blk_get_readahead(inode->i_dev) / 2; + if (blk_ra_kbytes < VM_MIN_READAHEAD) + blk_ra_kbytes = VM_MAX_READAHEAD; + + return blk_ra_kbytes >> (PAGE_CACHE_SHIFT - 10); +} + +static int get_min_readahead(struct inode *inode) +{ + int ret = VM_MIN_READAHEAD / PAGE_CACHE_SIZE; + + if (ret < 2) + ret = 2; + return ret; +} + +/* + * Readahead design. + * + * The fields in struct file_ra_state represent the most-recently-executed + * readahead attempt: + * + * start: Page index at which we started the readahead + * size: Number of pages in that read + * Together, these form the "current window". + * Together, start and size represent the `readahead window'. + * next_size: The number of pages to read when we get the next readahead miss. + * prev_page: The page which the readahead algorithm most-recently inspected. + * prev_page is mainly an optimisation: if page_cache_readahead sees + * that it is again being called for a page which it just looked at, + * it can return immediately without making any state changes. + * ahead_start, + * ahead_size: Together, these form the "ahead window". + * + * The readahead code manages two windows - the "current" and the "ahead" + * windows. The intent is that while the application is walking the pages + * in the current window, I/O is underway on the ahead window. When the + * current window is fully traversed, it is replaced by the ahead window + * and the ahead window is invalidated. When this copying happens, the + * new current window's pages are probably still locked. When I/O has + * completed, we submit a new batch of I/O, creating a new ahead window. + * + * So: + * + * ----|----------------|----------------|----- + * ^start ^start+size + * ^ahead_start ^ahead_start+ahead_size + * + * ^ When this page is read, we submit I/O for the + * ahead window. + * + * A `readahead hit' occurs when a read request is made against a page which is + * inside the current window. Hits are good, and the window size (next_size) is + * grown aggressively when hits occur. Two pages are added to the next window + * size on each hit, which will end up doubling the next window size by the time + * I/O is submitted for it. + * + * If readahead hits are more sparse (say, the application is only reading every + * second page) then the window will build more slowly. + * + * On a readahead miss (the application seeked away) the readahead window is shrunk + * by 25%. We don't want to drop it too aggressively, because it's a good assumption + * that an application which has built a good readahead window will continue to + * perform linear reads. Either at the new file position, or at the old one after + * another seek. + * + * There is a special-case: if the first page which the application tries to read + * happens to be the first page of the file, it is assumed that a linear read is + * about to happen and the window is immediately set to half of the device maximum. + * + * A page request at (start + size) is not a miss at all - it's just a part of + * sequential file reading. + * + * This function is to be called for every page which is read, rather than when + * it is time to perform readahead. This is so the readahead algorithm can centrally + * work out the access patterns. 
This could be costly with many tiny read()s, so + * we specifically optimise for that case with prev_page. + */ + +/* + * do_page_cache_readahead actually reads a chunk of disk. It allocates all the + * pages first, then submits them all for I/O. This avoids the very bad behaviour + * which would occur if page allocations are causing VM writeback. We really don't + * want to intermingle reads and writes like that. + */ +void do_page_cache_readahead(struct file *file, + unsigned long offset, unsigned long nr_to_read) +{ + struct address_space *mapping = file->f_dentry->d_inode->i_mapping; + struct inode *inode = mapping->host; + struct page *page; + unsigned long end_index; /* The last page we want to read */ + LIST_HEAD(page_pool); + int page_idx; + int nr_to_really_read = 0; + + if (inode->i_size == 0) + return; + + end_index = ((inode->i_size - 1) >> PAGE_CACHE_SHIFT); + + /* + * Preallocate as many pages as we will need. + */ + for (page_idx = 0; page_idx < nr_to_read; page_idx++) { + unsigned long page_offset = offset + page_idx; + + if (page_offset > end_index) + break; + + read_lock(&mapping->page_lock); + page = radix_tree_lookup(&mapping->page_tree, page_offset); + read_unlock(&mapping->page_lock); + if (page) + continue; + + page = page_cache_alloc(mapping); + if (!page) + break; + page->index = page_offset; + list_add(&page->list, &page_pool); + nr_to_really_read++; + } + + /* + * Now start the IO. We ignore I/O errors - if the page is not + * uptodate then the caller will launch readpage again, and + * will then handle the error. + */ + for (page_idx = 0; page_idx < nr_to_really_read; page_idx++) { + if (list_empty(&page_pool)) + BUG(); + page = list_entry(page_pool.prev, struct page, list); + list_del(&page->list); + if (!add_to_page_cache_unique(page, mapping, page->index)) + mapping->a_ops->readpage(file, page); + page_cache_release(page); + } + + /* + * Do this now, rather than at the next wait_on_page(). + */ + run_task_queue(&tq_disk); + + if (!list_empty(&page_pool)) + BUG(); + + return; +} + +/* + * page_cache_readahead is the main function. If performs the adaptive + * readahead window size management and submits the readahead I/O. + */ +void page_cache_readahead(struct file *file, unsigned long offset) +{ + struct inode *inode = file->f_dentry->d_inode->i_mapping->host; + struct file_ra_state *ra = &file->f_ra; + unsigned long max; + unsigned long min; + + /* + * Here we detect the case where the application is performing + * sub-page sized reads. We avoid doing extra work and bogusly + * perturbing the readahead window expansion logic. + * If next_size is zero, this is the very first read for this + * file handle. + */ + if (offset == ra->prev_page) { + if (ra->next_size != 0) + goto out; + } + + min = get_min_readahead(inode); + max = get_max_readahead(inode); + + if (ra->next_size == 0 && offset == 0) { + /* + * Special case - first read from first page. + * We'll assume it's a whole-file read, and + * grow the window fast. + */ + ra->next_size = max / 2; + goto do_io; + } + + ra->prev_page = offset; + + if (offset >= ra->start && offset <= (ra->start + ra->size)) { + /* + * A readahead hit. Either inside the window, or one + * page beyond the end. Expand the next readahead size. + */ + ra->next_size += 2; + } else { + /* + * A miss - lseek, pread, etc. Shrink the readahead window by 25%. 
+ */ + ra->next_size -= ra->next_size / 4; + if (ra->next_size < min) + ra->next_size = min; + } + + if (ra->next_size > max) + ra->next_size = max; + if (ra->next_size < min) + ra->next_size = min; + + /* + * Is this request outside the current window? + */ + if (offset < ra->start || offset >= (ra->start + ra->size)) { + /* + * A miss against the current window. Have we merely + * advanced into the ahead window? + */ + if (offset == ra->ahead_start) { + /* + * Yes, we have. The ahead window now becomes + * the current window. + */ + ra->start = ra->ahead_start; + ra->size = ra->ahead_size; + ra->prev_page = ra->start; + ra->ahead_start = 0; + ra->ahead_size = 0; + /* + * Control now returns, probably to sleep until I/O + * completes against the first ahead page. + * When the second page in the old ahead window is + * requested, control will return here and more I/O + * will be submitted to build the new ahead window. + */ + goto out; + } +do_io: + /* + * This is the "unusual" path. We come here during + * startup or after an lseek. We invalidate the + * ahead window and get some I/O underway for the new + * current window. + */ + ra->start = offset; + ra->size = ra->next_size; + ra->ahead_start = 0; /* Invalidate these */ + ra->ahead_size = 0; + + do_page_cache_readahead(file, offset, ra->size); + } else { + /* + * This read request is within the current window. It + * is time to submit I/O for the ahead window while + * the application is crunching through the current + * window. + */ + if (ra->ahead_start == 0) { + ra->ahead_start = ra->start + ra->size; + ra->ahead_size = ra->next_size; + do_page_cache_readahead(file, + ra->ahead_start, ra->ahead_size); + } + } +out: + return; +} + +/* + * For mmap reads (typically executables) the access pattern is fairly random, + * but somewhat ascending. So readaround favours pages beyond the target one. + * We also boost the window size, as it can easily shrink due to misses. + */ +void page_cache_readaround(struct file *file, unsigned long offset) +{ + unsigned long target; + unsigned long backward; + const int min = get_min_readahead(file->f_dentry->d_inode->i_mapping->host) * 2; + + if (file->f_ra.next_size < min) + file->f_ra.next_size = min; + + target = offset; + backward = file->f_ra.next_size / 4; + + if (backward > target) + target = 0; + else + target -= backward; + page_cache_readahead(file, target); +} + +/* + * handle_ra_thrashing() is called when it is known that a page which should + * have been present (it's inside the readahead window) was in fact evicted by + * the VM. + * + * We shrink the readahead window by three pages. This is because we grow it + * by two pages on a readahead hit. Theory being that the readahead window size + * will stabilise around the maximum level at which there isn't any thrashing. + */ +void handle_ra_thrashing(struct file *file) +{ + struct inode *inode = file->f_dentry->d_inode; + const unsigned long min = get_min_readahead(inode); + + file->f_ra.next_size -= 3; + if (file->f_ra.next_size < min) + file->f_ra.next_size = min; +} -- cgit v1.2.3 From 9855b4a17d61f4c7642c02cb905eec99a6f2c061 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 9 Apr 2002 21:29:40 -0700 Subject: [PATCH] page->buffers abstraction page->buffers is a bit of a layering violation. Not all address_spaces have pages which are backed by buffers. The exclusive use of page->buffers for buffers means that a piece of prime real estate in struct page is unavailable to other forms of address_space. 
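As a sketch of where this is heading (illustration only, not the patch's actual definitions, which appear in the include/linux/fs.h hunk below): the buffer accessors used throughout this diff can be expressed in terms of the new page->private word and PG_private bit; the Set/ClearPagePrivate helpers are assumed here.

/* Illustrative only -- see the include/linux/fs.h hunk below for the real macros. */
#define page_has_buffers(page)	PagePrivate(page)
#define page_buffers(page)	((struct buffer_head *)(page)->private)

static inline void set_page_buffers(struct page *page, struct buffer_head *head)
{
	SetPagePrivate(page);			/* assumed helper for the new PG_private bit */
	page->private = (unsigned long)head;
}

static inline void clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);			/* assumed helper */
	page->private = 0;
}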
This patch turns page->buffers into `unsigned long page->private' and sets in place all the infrastructure which is needed to allow other address_spaces to use this storage. This change allows the multipage-bio-writeout patches to use page->private to cache the results of an earlier get_block(), so repeated calls into the filesystem are not needed in the case of file overwriting. Developers should think carefully before calling try_to_free_buffers() or block_flushpage() or writeout_one_page() or waitfor_one_page() against a page. It's only legal to do this if you *know* that the page is buffer-backed. And only the address_space knows that. Arguably, we need new a_ops for writeout_one_page() and waitfor_one_page(). But I have more patches on the boil which obsolete these functions in favour of ->writepage() and wait_on_page(). The new PG_private page bit is used to indicate that there is something at page->private. The core kernel does not know what that object actually is, just that it's there. The kernel must call a_ops->releasepage() to try to make page->private go away. And a_ops->flushpage() at truncate time. --- fs/buffer.c | 62 +++++++++++++++++++------------------------ fs/ext3/inode.c | 24 ++++++++--------- fs/jbd/transaction.c | 14 ++++++---- fs/reiserfs/inode.c | 10 +++---- fs/reiserfs/tail_conversion.c | 4 +-- include/asm-s390/pgtable.h | 2 +- include/asm-s390x/pgtable.h | 2 +- include/linux/fs.h | 18 +++++++++++++ include/linux/mm.h | 23 ++++++++++------ mm/filemap.c | 20 ++++++++------ mm/page_alloc.c | 4 +-- mm/swapfile.c | 10 +++---- mm/vmscan.c | 9 ++++--- 13 files changed, 114 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/fs/buffer.c b/fs/buffer.c index 8b8f44fd71da..73c04752a4fa 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1327,21 +1327,13 @@ static void discard_buffer(struct buffer_head * bh) int try_to_release_page(struct page * page, int gfp_mask) { + struct address_space * const mapping = page->mapping; + if (!PageLocked(page)) BUG(); - if (!page->mapping) - goto try_to_free; - if (!page->mapping->a_ops->releasepage) - goto try_to_free; - if (page->mapping->a_ops->releasepage(page, gfp_mask)) - goto try_to_free; - /* - * We couldn't release buffer metadata; don't even bother trying - * to release buffers.
- */ - return 0; -try_to_free: + if (mapping && mapping->a_ops->releasepage) + return mapping->a_ops->releasepage(page, gfp_mask); return try_to_free_buffers(page, gfp_mask); } @@ -1359,10 +1351,10 @@ int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache) if (!PageLocked(page)) BUG(); - if (!page->buffers) + if (!page_has_buffers(page)) return 1; - head = page->buffers; + head = page_buffers(page); bh = head; do { unsigned int next_off = curr_off + bh->b_size; @@ -1401,7 +1393,7 @@ void create_empty_buffers(struct page *page, unsigned long blocksize) /* FIXME: create_buffers should fail if there's no enough memory */ head = create_buffers(page, blocksize, 1); - if (page->buffers) + if (page_has_buffers(page)) BUG(); bh = head; @@ -1411,7 +1403,7 @@ void create_empty_buffers(struct page *page, unsigned long blocksize) bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - page->buffers = head; + set_page_buffers(page, head); page_cache_get(page); } EXPORT_SYMBOL(create_empty_buffers); @@ -1467,9 +1459,9 @@ static int __block_write_full_page(struct inode *inode, struct page *page, get_b if (!PageLocked(page)) BUG(); - if (!page->buffers) + if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits); - head = page->buffers; + head = page_buffers(page); block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); @@ -1560,9 +1552,9 @@ static int __block_prepare_write(struct inode *inode, struct page *page, char *kaddr = kmap(page); blocksize = 1 << inode->i_blkbits; - if (!page->buffers) + if (!page_has_buffers(page)) create_empty_buffers(page, blocksize); - head = page->buffers; + head = page_buffers(page); bbits = inode->i_blkbits; block = page->index << (PAGE_CACHE_SHIFT - bbits); @@ -1653,7 +1645,7 @@ static int __block_commit_write(struct inode *inode, struct page *page, blocksize = 1 << inode->i_blkbits; - for(bh = head = page->buffers, block_start = 0; + for(bh = head = page_buffers(page), block_start = 0; bh != head || !block_start; block_start=block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; @@ -1701,9 +1693,9 @@ int block_read_full_page(struct page *page, get_block_t *get_block) if (!PageLocked(page)) PAGE_BUG(page); blocksize = 1 << inode->i_blkbits; - if (!page->buffers) + if (!page_has_buffers(page)) create_empty_buffers(page, blocksize); - head = page->buffers; + head = page_buffers(page); blocks = PAGE_CACHE_SIZE >> inode->i_blkbits; iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); @@ -1953,11 +1945,11 @@ int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t if (!page) goto out; - if (!page->buffers) + if (!page_has_buffers(page)) create_empty_buffers(page, blocksize); /* Find the buffer that contains "offset" */ - bh = page->buffers; + bh = page_buffers(page); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -2044,7 +2036,7 @@ done: */ int writeout_one_page(struct page *page) { - struct buffer_head *bh, *head = page->buffers; + struct buffer_head *bh, *head = page_buffers(page); if (!PageLocked(page)) BUG(); @@ -2067,7 +2059,7 @@ EXPORT_SYMBOL(writeout_one_page); int waitfor_one_page(struct page *page) { int error = 0; - struct buffer_head *bh, *head = page->buffers; + struct buffer_head *bh, *head = page_buffers(page); bh = head; do { @@ -2210,9 +2202,9 @@ int brw_page(int rw, struct page *page, struct block_device *bdev, sector_t b[], if (!PageLocked(page)) panic("brw_page: page not locked for I/O"); - if (!page->buffers) + if 
(!page_has_buffers(page)) create_empty_buffers(page, size); - head = bh = page->buffers; + head = bh = page_buffers(page); /* Stage 1: lock all the buffers */ do { @@ -2280,7 +2272,7 @@ static inline void link_dev_buffers(struct page * page, struct buffer_head *head bh = bh->b_this_page; } while (bh); tail->b_this_page = head; - page->buffers = head; + set_page_buffers(page, head); page_cache_get(page); } @@ -2299,8 +2291,8 @@ static struct page * grow_dev_page(struct block_device *bdev, unsigned long inde if (!PageLocked(page)) BUG(); - bh = page->buffers; - if (bh) { + if (page_has_buffers(page)) { + bh = page_buffers(page); if (bh->b_size == size) return page; if (!try_to_free_buffers(page, GFP_NOFS)) @@ -2321,7 +2313,7 @@ failed: static void hash_page_buffers(struct page *page, struct block_device *bdev, int block, int size) { - struct buffer_head *head = page->buffers; + struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; unsigned int uptodate; @@ -2447,7 +2439,7 @@ static int sync_page_buffers(struct buffer_head *head, unsigned int gfp_mask) */ int try_to_free_buffers(struct page * page, unsigned int gfp_mask) { - struct buffer_head * tmp, * bh = page->buffers; + struct buffer_head * tmp, * bh = page_buffers(page); BUG_ON(!PageLocked(page)); BUG_ON(!bh); @@ -2484,7 +2476,7 @@ cleaned_buffers_try_again: wake_up(&buffer_wait); /* And free the page */ - page->buffers = NULL; + clear_page_buffers(page); page_cache_release(page); write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 86e37917c0fe..19048b5d80d5 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1029,7 +1029,7 @@ static int ext3_prepare_write(struct file *file, struct page *page, goto prepare_write_failed; if (ext3_should_journal_data(inode)) { - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, page_buffers(page), from, to, NULL, do_journal_get_write_access); if (ret) { /* @@ -1102,7 +1102,7 @@ static int ext3_commit_write(struct file *file, struct page *page, int partial = 0; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, page_buffers(page), from, to, &partial, commit_write_fn); if (!partial) SetPageUptodate(page); @@ -1112,7 +1112,7 @@ static int ext3_commit_write(struct file *file, struct page *page, EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; } else { if (ext3_should_order_data(inode)) { - ret = walk_page_buffers(handle, page->buffers, + ret = walk_page_buffers(handle, page_buffers(page), from, to, NULL, journal_dirty_sync_data); } /* Be careful here if generic_commit_write becomes a @@ -1252,7 +1252,7 @@ static int bget_one(handle_t *handle, struct buffer_head *bh) static int ext3_writepage(struct page *page) { struct inode *inode = page->mapping->host; - struct buffer_head *page_buffers; + struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0, err; int needed; @@ -1285,14 +1285,14 @@ static int ext3_writepage(struct page *page) unlock_kernel(); - page_buffers = NULL; /* Purely to prevent compiler warning */ + page_bufs = NULL; /* Purely to prevent compiler warning */ /* bget() all the buffers */ if (order_data) { - if (!page->buffers) + if (!page_has_buffers(page)) create_empty_buffers(page, inode->i_sb->s_blocksize); - page_buffers = page->buffers; - walk_page_buffers(handle, page_buffers, 0, + page_bufs = page_buffers(page); + walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, 
bget_one); } @@ -1301,7 +1301,7 @@ static int ext3_writepage(struct page *page) /* * The page can become unlocked at any point now, and * truncate can then come in and change things. So we - * can't touch *page from now on. But *page_buffers is + * can't touch *page from now on. But *page_bufs is * safe due to elevated refcount. */ @@ -1310,7 +1310,7 @@ static int ext3_writepage(struct page *page) /* And attach them to the current transaction */ if (order_data) { - err = walk_page_buffers(handle, page_buffers, + err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, journal_dirty_async_data); if (!ret) ret = err; @@ -1392,11 +1392,11 @@ static int ext3_block_truncate_page(handle_t *handle, if (!page) goto out; - if (!page->buffers) + if (!page_has_buffers(page)) create_empty_buffers(page, blocksize); /* Find the buffer that contains "offset" */ - bh = page->buffers; + bh = page_buffers(page); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 859c45380ef0..a0244b4f9102 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1661,10 +1661,11 @@ int journal_try_to_free_buffers(journal_t *journal, struct buffer_head *tmp; int locked_or_dirty = 0; int call_ttfb = 1; + int ret; J_ASSERT(PageLocked(page)); - bh = page->buffers; + bh = page_buffers(page); tmp = bh; spin_lock(&journal_datalist_lock); do { @@ -1688,7 +1689,10 @@ int journal_try_to_free_buffers(journal_t *journal, */ call_ttfb = 1; out: - return call_ttfb; + ret = 0; + if (call_ttfb) + ret = try_to_free_buffers(page, gfp_mask); + return ret; } /* @@ -1881,7 +1885,7 @@ int journal_flushpage(journal_t *journal, if (!PageLocked(page)) BUG(); - if (!page->buffers) + if (!page_has_buffers(page)) return 1; /* We will potentially be playing with lists other than just the @@ -1889,7 +1893,7 @@ int journal_flushpage(journal_t *journal, * cautious in our locking. 
*/ lock_journal(journal); - head = bh = page->buffers; + head = bh = page_buffers(page); do { unsigned int next_off = curr_off + bh->b_size; next = bh->b_this_page; @@ -1911,7 +1915,7 @@ int journal_flushpage(journal_t *journal, if (!offset) { if (!may_free || !try_to_free_buffers(page, 0)) return 0; - J_ASSERT(page->buffers == NULL); + J_ASSERT(!page_has_buffers(page)); } return 1; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 4984cd21aba7..e3de811254a4 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -146,8 +146,8 @@ static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) { static inline void fix_tail_page_for_writing(struct page *page) { struct buffer_head *head, *next, *bh ; - if (page && page->buffers) { - head = page->buffers ; + if (page && page_has_buffers(page)) { + head = page_buffers(page) ; bh = head ; do { next = bh->b_this_page ; @@ -1685,7 +1685,7 @@ static int grab_tail_page(struct inode *p_s_inode, kunmap(page) ; /* mapped by block_prepare_write */ - head = page->buffers ; + head = page_buffers(page) ; bh = head; do { if (pos >= start) { @@ -1930,7 +1930,7 @@ static int reiserfs_write_full_page(struct page *page) { struct buffer_head *arr[PAGE_CACHE_SIZE/512] ; int nr = 0 ; - if (!page->buffers) { + if (!page_has_buffers(page)) { block_prepare_write(page, 0, 0, NULL) ; kunmap(page) ; } @@ -1948,7 +1948,7 @@ static int reiserfs_write_full_page(struct page *page) { flush_dcache_page(page) ; kunmap(page) ; } - head = page->buffers ; + head = page_buffers(page) ; bh = head ; block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits) ; do { diff --git a/fs/reiserfs/tail_conversion.c b/fs/reiserfs/tail_conversion.c index d157bcdca900..6a4fb6d60c1c 100644 --- a/fs/reiserfs/tail_conversion.c +++ b/fs/reiserfs/tail_conversion.c @@ -156,10 +156,10 @@ unmap_buffers(struct page *page, loff_t pos) { unsigned long cur_index ; if (page) { - if (page->buffers) { + if (page_has_buffers(page)) { tail_index = pos & (PAGE_CACHE_SIZE - 1) ; cur_index = 0 ; - head = page->buffers ; + head = page_buffers(page) ; bh = head ; do { next = bh->b_this_page ; diff --git a/include/asm-s390/pgtable.h b/include/asm-s390/pgtable.h index c0591a4b535a..026e0ee40a23 100644 --- a/include/asm-s390/pgtable.h +++ b/include/asm-s390/pgtable.h @@ -429,7 +429,7 @@ extern inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) \ if (__page != ZERO_PAGE(__physpage)) { \ int __users = page_count(__page); \ - __users -= !!__page->buffers + !!__page->mapping; \ + __users -= !!PagePrivate(__page) + !!__page->mapping; \ \ if (__users == 1) \ pte_val(__pte) |= _PAGE_MKCLEAR; \ diff --git a/include/asm-s390x/pgtable.h b/include/asm-s390x/pgtable.h index 7d296df217b4..4fee035ea2dc 100644 --- a/include/asm-s390x/pgtable.h +++ b/include/asm-s390x/pgtable.h @@ -448,7 +448,7 @@ extern inline pte_t mk_pte_phys(unsigned long physpage, pgprot_t pgprot) \ if (__page != ZERO_PAGE(__physpage)) { \ int __users = page_count(__page); \ - __users -= !!__page->buffers + !!__page->mapping; \ + __users -= !!PagePrivate(page) + !!__page->mapping; \ \ if (__users == 1) \ pte_val(__pte) |= _PAGE_MKCLEAR; \ diff --git a/include/linux/fs.h b/include/linux/fs.h index 75c9b5892f38..f722d24d2242 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -285,6 +285,24 @@ extern void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long #define touch_buffer(bh) mark_page_accessed(bh->b_page) +/* If we *know* page->private refers to buffer_heads */ +#define 
page_buffers(page) \ + ({ \ + if (!PagePrivate(page)) \ + BUG(); \ + ((struct buffer_head *)(page)->private); \ + }) +#define page_has_buffers(page) PagePrivate(page) +#define set_page_buffers(page, buffers) \ + do { \ + SetPagePrivate(page); \ + page->private = (unsigned long)buffers; \ + } while (0) +#define clear_page_buffers(page) \ + do { \ + ClearPagePrivate(page); \ + page->private = 0; \ + } while (0) #include /* #include */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 9f676c226ce8..b74138e819f1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -154,7 +154,7 @@ typedef struct page { updated asynchronously */ struct list_head lru; /* Pageout list, eg. active_list; protected by pagemap_lru_lock !! */ - struct buffer_head * buffers; /* Buffer maps us to a disk block. */ + unsigned long private; /* fs-private opaque data */ /* * On machines where all RAM is mapped into kernel address space, @@ -177,7 +177,7 @@ typedef struct page { * * What counts for a page usage: * - cache mapping (page->mapping) - * - disk mapping (page->buffers) + * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * @@ -220,13 +220,15 @@ typedef struct page { * page->mapping is the pointer to the inode, and page->index is the * file offset of the page, in units of PAGE_CACHE_SIZE. * - * A page may have buffers allocated to it. In this case, - * page->buffers is a circular list of these buffer heads. Else, - * page->buffers == NULL. + * A page contains an opaque `private' member, which belongs to the + * page's address_space. Usually, this is the address of a circular + * list of the page's disk buffers. * + * The PG_private bitflag is set if page->private contains a valid + * value. * For pages belonging to inodes, the page->count is the number of - * attaches, plus 1 if buffers are allocated to the page, plus one - * for the page cache itself. + * attaches, plus 1 if `private' contains something, plus one for + * the page cache itself. * * All pages belonging to an inode are in these doubly linked lists: * mapping->clean_pages, mapping->dirty_pages and mapping->locked_pages; @@ -290,6 +292,8 @@ typedef struct page { #define PG_reserved 14 #define PG_launder 15 /* written out by VM pressure.. */ +#define PG_private 16 /* Has something at ->private */ + /* Make it prettier to test the above... 
*/ #define UnlockPage(page) unlock_page(page) #define Page_Uptodate(page) test_bit(PG_uptodate, &(page)->flags) @@ -306,6 +310,9 @@ typedef struct page { #define PageLaunder(page) test_bit(PG_launder, &(page)->flags) #define SetPageLaunder(page) set_bit(PG_launder, &(page)->flags) #define __SetPageReserved(page) __set_bit(PG_reserved, &(page)->flags) +#define SetPagePrivate(page) set_bit(PG_private, &(page)->flags) +#define ClearPagePrivate(page) clear_bit(PG_private, &(page)->flags) +#define PagePrivate(page) test_bit(PG_private, &(page)->flags) /* * The zone field is never updated after free_area_init_core() @@ -466,7 +473,7 @@ extern struct address_space swapper_space; static inline int is_page_cache_freeable(struct page * page) { - return page_count(page) - !!page->buffers == 1; + return page_count(page) - !!PagePrivate(page) == 1; } extern int can_share_swap_page(struct page *); diff --git a/mm/filemap.c b/mm/filemap.c index 099b144be1fa..47693b991db7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -150,7 +150,7 @@ void invalidate_inode_pages(struct inode * inode) if (TryLockPage(page)) continue; - if (page->buffers && !try_to_free_buffers(page, 0)) + if (PagePrivate(page) && !try_to_release_page(page, 0)) goto unlock; if (page_count(page) != 1) @@ -182,14 +182,18 @@ static int do_flushpage(struct page *page, unsigned long offset) static inline void truncate_partial_page(struct page *page, unsigned partial) { memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial); - if (page->buffers) + if (PagePrivate(page)) do_flushpage(page, partial); } +/* + * AKPM: the PagePrivate test here seems a bit bogus. It bypasses the + * mapping's ->flushpage, which may still want to be called. + */ static void truncate_complete_page(struct page *page) { /* Leave it on the LRU if it gets converted into anonymous buffers */ - if (!page->buffers || do_flushpage(page, 0)) + if (!PagePrivate(page) || do_flushpage(page, 0)) lru_cache_del(page); /* @@ -301,9 +305,9 @@ static inline int invalidate_this_page2(struct address_space * mapping, /* * The page is locked and we hold the mapping lock as well - * so both page_count(page) and page->buffers stays constant here. + * so both page_count(page) and page_buffers stays constant here. 
*/ - if (page_count(page) == 1 + !!page->buffers) { + if (page_count(page) == 1 + !!page_has_buffers(page)) { /* Restart after this page */ list_del(head); list_add_tail(head, curr); @@ -312,7 +316,7 @@ static inline int invalidate_this_page2(struct address_space * mapping, write_unlock(&mapping->page_lock); truncate_complete_page(page); } else { - if (page->buffers) { + if (page_has_buffers(page)) { /* Restart after this page */ list_del(head); list_add_tail(head, curr); @@ -409,7 +413,7 @@ static int do_buffer_fdatasync(struct address_space *mapping, while (curr != head) { page = list_entry(curr, struct page, list); curr = curr->next; - if (!page->buffers) + if (!page_has_buffers(page)) continue; if (page->index >= end) continue; @@ -421,7 +425,7 @@ static int do_buffer_fdatasync(struct address_space *mapping, lock_page(page); /* The buffers could have been free'd while we waited for the page lock */ - if (page->buffers) + if (page_has_buffers(page)) retval |= fn(page); UnlockPage(page); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 066c2c017022..a183232e28ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -97,7 +97,7 @@ static void __free_pages_ok (struct page *page, unsigned int order) struct page *base; zone_t *zone; - if (page->buffers) + if (PagePrivate(page)) BUG(); if (page->mapping) BUG(); @@ -290,7 +290,7 @@ static struct page * balance_classzone(zone_t * classzone, unsigned int gfp_mask set_page_count(tmp, 1); page = tmp; - if (page->buffers) + if (PagePrivate(page)) BUG(); if (page->mapping) BUG(); diff --git a/mm/swapfile.c b/mm/swapfile.c index 32c740c01213..586f48b8e6f8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -240,7 +240,7 @@ static int exclusive_swap_page(struct page *page) if (p->swap_map[SWP_OFFSET(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ read_lock(&swapper_space.page_lock); - if (page_count(page) - !!page->buffers == 2) + if (page_count(page) - !!PagePrivate(page) == 2) retval = 1; read_unlock(&swapper_space.page_lock); } @@ -265,7 +265,7 @@ int can_share_swap_page(struct page *page) BUG(); switch (page_count(page)) { case 3: - if (!page->buffers) + if (!PagePrivate(page)) break; /* Fallthrough */ case 2: @@ -295,7 +295,7 @@ int remove_exclusive_swap_page(struct page *page) BUG(); if (!PageSwapCache(page)) return 0; - if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ + if (page_count(page) - !!PagePrivate(page) != 2) /* 2: us + cache */ return 0; entry.val = page->index; @@ -308,7 +308,7 @@ int remove_exclusive_swap_page(struct page *page) if (p->swap_map[SWP_OFFSET(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ read_lock(&swapper_space.page_lock); - if (page_count(page) - !!page->buffers == 2) { + if (page_count(page) - !!PagePrivate(page) == 2) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; @@ -344,7 +344,7 @@ void free_swap_and_cache(swp_entry_t entry) if (page) { page_cache_get(page); /* Only cache user (+us), or swap space full? Free it! 
*/ - if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) { + if (page_count(page) - !!PagePrivate(page) == 2 || vm_swap_full()) { delete_from_swap_cache(page); SetPageDirty(page); } diff --git a/mm/vmscan.c b/mm/vmscan.c index fe363d2f3050..0d730d16afb3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -92,7 +92,8 @@ drop_pte: mm->rss--; UnlockPage(page); { - int freeable = page_count(page) - !!page->buffers <= 2; + int freeable = page_count(page) - + !!PagePrivate(page) <= 2; page_cache_release(page); return freeable; } @@ -121,7 +122,7 @@ drop_pte: * Anonymous buffercache pages can be left behind by * concurrent truncate and pagefault. */ - if (page->buffers) + if (PagePrivate(page)) goto preserve; /* @@ -384,7 +385,7 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, continue; /* Racy check to avoid trylocking when not worthwhile */ - if (!page->buffers && (page_count(page) != 1 || !page->mapping)) + if (!PagePrivate(page) && (page_count(page) != 1 || !page->mapping)) goto page_mapped; /* @@ -435,7 +436,7 @@ static int shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, * associated with this page. If we succeed we try to free * the page as well. */ - if (page->buffers) { + if (PagePrivate(page)) { spin_unlock(&pagemap_lru_lock); /* avoid to free a locked page */ -- cgit v1.2.3 From 1ed704e93c0ba1dd930f8a451765f054ba218f1b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 9 Apr 2002 21:29:47 -0700 Subject: [PATCH] writeback daemons This patch implements a gang-of-threads which are designed to be used for dirty data writeback. "pdflush" -> dirty page flush, or something. The number of threads is dynamically managed by a simple demand-driven algorithm. "Oh no, more kernel threads". Don't worry, kupdate and bdflush disappear later. The intent is that no two pdflush threads are ever performing writeback against the same request queue at the same time. It would be wasteful to do that. My current patches don't quite achieve this; I need to move the state into the request queue itself... The driver for implementing the thread pool was to avoid the possibility where bdflush gets stuck on one device's get_request_wait() queue while lots of other disks sit idle. Also generality, abstraction, and the need to have something in place to perform the address_space-based writeback when the buffer_head-based writeback disappears. There is no provision inside the pdflush code itself to prevent many threads from working against the same device. That's the responsibility of the caller. The main API function, `pdflush_operation()' attempts to find a thread to do some work for you. It is not reliable - it may return -1 and say "sorry, I didn't do that". This happens if all threads are busy. One _could_ extend pdflush_operation() to queue the work so that it is guaranteed to happen. If there's a need, that additional minor complexity can be added. 
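To make the calling convention concrete, here is a minimal caller sketch, assuming only the pdflush_operation() prototype added by this patch; the callback name background_sync() and the fall-back strategy are illustrative, not part of the patch:

/* Illustrative caller -- background_sync() is a hypothetical callback. */
extern int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);

static void background_sync(unsigned long nr_pages)
{
	/* ... write back up to nr_pages dirty pages ... */
}

static void kick_writeback(unsigned long nr_pages)
{
	/*
	 * pdflush_operation() returns 0 when an idle worker accepted the
	 * callback, and -1 when every pdflush thread was busy.  The work
	 * is not queued in that case, so this caller falls back to doing
	 * it synchronously (it could also simply drop the request).
	 */
	if (pdflush_operation(background_sync, nr_pages) != 0)
		background_sync(nr_pages);
}

The -1 path is the "sorry, I didn't do that" case described above.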
--- include/linux/mm.h | 3 + include/linux/sched.h | 1 + mm/Makefile | 3 +- mm/pdflush.c | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 222 insertions(+), 1 deletion(-) create mode 100644 mm/pdflush.c (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index b74138e819f1..948e0ca2b480 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -589,6 +589,9 @@ static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * m extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr); +extern int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); +extern int pdflush_flush(unsigned long nr_pages); + extern struct page * vmalloc_to_page(void *addr); #endif /* __KERNEL__ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 024e34706cc9..f5fd7b435bd1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -368,6 +368,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) #define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ #define PF_FREE_PAGES 0x00002000 /* per process page freeing */ #define PF_NOIO 0x00004000 /* avoid generating further I/O */ +#define PF_FLUSHER 0x00008000 /* responsible for disk writeback */ /* * Ptrace flags diff --git a/mm/Makefile b/mm/Makefile index 65e9a3717247..464eb1810ea6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -14,6 +14,7 @@ export-objs := shmem.o filemap.o mempool.o page_alloc.o obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \ vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \ page_alloc.o swap_state.o swapfile.o numa.o oom_kill.o \ - shmem.o highmem.o mempool.o msync.o mincore.o readahead.o + shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \ + pdflush.o include $(TOPDIR)/Rules.make diff --git a/mm/pdflush.c b/mm/pdflush.c new file mode 100644 index 000000000000..8017d920d0a2 --- /dev/null +++ b/mm/pdflush.c @@ -0,0 +1,216 @@ +/* + * mm/pdflush.c - worker threads for writing back filesystem data + * + * Copyright (C) 2002, Linus Torvalds. + * + * 09Apr2002 akpm@zip.com.au + * Initial version + */ + +#include +#include +#include +#include +#include +#include +#include + + +/* + * Minimum and maximum number of pdflush instances + */ +#define MIN_PDFLUSH_THREADS 2 +#define MAX_PDFLUSH_THREADS 8 + +static void start_one_pdflush_thread(void); + + +/* + * The pdflush threads are worker threads for writing back dirty data. + * Ideally, we'd like one thread per active disk spindle. But the disk + * topology is very hard to divine at this level. Instead, we take + * care in various places to prevent more than one pdflush thread from + * performing writeback against a single filesystem. pdflush threads + * have the PF_FLUSHER flag set in current->flags to aid in this. + */ + +/* + * All the pdflush threads. Protected by pdflush_lock + */ +static LIST_HEAD(pdflush_list); +static spinlock_t pdflush_lock = SPIN_LOCK_UNLOCKED; + +/* + * The count of currently-running pdflush threads. Protected + * by pdflush_lock. + */ +static int nr_pdflush_threads = 0; + +/* + * The time at which the pdflush thread pool last went empty + */ +static unsigned long last_empty_jifs; + +/* + * The pdflush thread. + * + * Thread pool management algorithm: + * + * - The minumum and maximum number of pdflush instances are bound + * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. + * + * - If there have been no idle pdflush instances for 1 second, create + * a new one. 
+ * + * - If the least-recently-went-to-sleep pdflush thread has been asleep + * for more than one second, terminate a thread. + */ + +/* + * A structure for passing work to a pdflush thread. Also for passing + * state information between pdflush threads. Protected by pdflush_lock. + */ +struct pdflush_work { + struct task_struct *who; /* The thread */ + void (*fn)(unsigned long); /* A callback function for pdflush to work on */ + unsigned long arg0; /* An argument to the callback function */ + struct list_head list; /* On pdflush_list, when the thread is idle */ + unsigned long when_i_went_to_sleep; +}; + +/* + * preemption is disabled in pdflush. There was a bug in preempt + * which was causing pdflush to get flipped into state TASK_RUNNING + * when it performed a spin_unlock. That bug is probably fixed, + * but play it safe. The preempt-off paths are very short. + */ +static int __pdflush(struct pdflush_work *my_work) +{ + daemonize(); + reparent_to_init(); + strcpy(current->comm, "pdflush"); + + /* interruptible sleep, so block all signals */ + spin_lock_irq(&current->sigmask_lock); + siginitsetinv(&current->blocked, 0); + recalc_sigpending(); + spin_unlock_irq(&current->sigmask_lock); + + current->flags |= PF_FLUSHER; + my_work->fn = NULL; + my_work->who = current; + + preempt_disable(); + spin_lock_irq(&pdflush_lock); + nr_pdflush_threads++; + for ( ; ; ) { + struct pdflush_work *pdf; + + list_add(&my_work->list, &pdflush_list); + my_work->when_i_went_to_sleep = jiffies; + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&pdflush_lock); + + schedule(); + + preempt_enable(); + (*my_work->fn)(my_work->arg0); + preempt_disable(); + + /* + * Thread creation: For how long have there been zero + * available threads? + */ + if (jiffies - last_empty_jifs > 1 * HZ) { + /* unlocked list_empty() test is OK here */ + if (list_empty(&pdflush_list)) { + /* unlocked nr_pdflush_threads test is OK here */ + if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) + start_one_pdflush_thread(); + } + } + + spin_lock_irq(&pdflush_lock); + + /* + * Thread destruction: For how long has the sleepiest + * thread slept? + */ + if (list_empty(&pdflush_list)) + continue; + if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) + continue; + pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); + if (jiffies - pdf->when_i_went_to_sleep > 1 * HZ) { + pdf->when_i_went_to_sleep = jiffies; /* Limit exit rate */ + break; /* exeunt */ + } + } + nr_pdflush_threads--; + spin_unlock_irq(&pdflush_lock); + preempt_enable(); + return 0; +} + +/* + * Of course, my_work wants to be just a local in __pdflush(). It is + * separated out in this manner to hopefully prevent the compiler from + * performing unfortunate optimisations against the auto variables. Because + * they are visible to other tasks and CPUs. (No problem has actually + * been observed. This is just paranoia). + */ +static int pdflush(void *dummy) +{ + struct pdflush_work my_work; + return __pdflush(&my_work); +} + +/* + * Attempt to wake up a pdflush thread, and get it to do some work for you. + * Returns zero if it indeed managed to find a worker thread, and passed your + * payload to it.
+ */ +int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) +{ + unsigned long flags; + int ret = 0; + + if (fn == NULL) + BUG(); /* Hard to diagnose if it's deferred */ + + spin_lock_irqsave(&pdflush_lock, flags); + if (list_empty(&pdflush_list)) { + spin_unlock_irqrestore(&pdflush_lock, flags); + ret = -1; + } else { + struct pdflush_work *pdf; + + pdf = list_entry(pdflush_list.next, struct pdflush_work, list); + list_del_init(&pdf->list); + if (list_empty(&pdflush_list)) + last_empty_jifs = jiffies; + spin_unlock_irqrestore(&pdflush_lock, flags); + pdf->fn = fn; + pdf->arg0 = arg0; + wmb(); /* ? */ + wake_up_process(pdf->who); + } + return ret; +} + +static void start_one_pdflush_thread(void) +{ + kernel_thread(pdflush, NULL, + CLONE_FS | CLONE_FILES | CLONE_SIGNAL); +} + +static int __init pdflush_init(void) +{ + int i; + + for (i = 0; i < MIN_PDFLUSH_THREADS; i++) + start_one_pdflush_thread(); + return 0; +} + +module_init(pdflush_init); -- cgit v1.2.3 From 13c9d416c0758a001b2a18379715ed6128eed0aa Mon Sep 17 00:00:00 2001 From: Alexander Viro Date: Tue, 9 Apr 2002 21:32:22 -0700 Subject: [PATCH] jffs2_get_sb() fixes Fixes races in jffs2_get_sb() - current code has a window when two mounts of the same mtd device can miss each other, resulting in two active instances of jffs2 fighting over the same device. --- fs/jffs2/super.c | 46 +++++++++++++++++++++++++--------------------- fs/super.c | 2 +- include/linux/fs.h | 1 + kernel/ksyms.c | 1 + 4 files changed, 28 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 48f0cf4b7283..3540d160d945 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -94,12 +94,12 @@ static struct super_operations jffs2_super_operations = static int jffs2_sb_compare(struct super_block *sb, void *data) { - struct mtd_info *mtd = data; + struct jffs2_sb_info *p = data; struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); /* The superblocks are considered to be equivalent if the underlying MTD device is the same one */ - if (c->mtd == mtd) { + if (c->mtd == p->mtd) { D1(printk(KERN_DEBUG "jffs2_sb_compare: match on device %d (\"%s\")\n", mtd->index, mtd->name)); return 1; } else { @@ -111,12 +111,14 @@ static int jffs2_sb_compare(struct super_block *sb, void *data) static int jffs2_sb_set(struct super_block *sb, void *data) { - struct mtd_info *mtd = data; + struct jffs2_sb_info *p = data; /* For persistence of NFS exports etc. 
we use the same s_dev each time we mount the device, don't just use an anonymous device */ - sb->s_dev = mk_kdev(MTD_BLOCK_MAJOR, mtd->index); + sb->u.generic_sbp = p; + p->os_priv = sb; + sb->s_dev = mk_kdev(MTD_BLOCK_MAJOR, p->mtd->index); return 0; } @@ -129,7 +131,13 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, struct jffs2_sb_info *c; int ret; - sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, mtd); + c = kmalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return ERR_PTR(-ENOMEM); + memset(c, 0, sizeof(*c)); + c->mtd = mtd; + + sb = sget(fs_type, jffs2_sb_compare, jffs2_sb_set, c); if (IS_ERR(sb)) goto out_put; @@ -144,19 +152,8 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, D1(printk(KERN_DEBUG "jffs2_get_sb_mtd(): New superblock for device %d (\"%s\")\n", mtd->index, mtd->name)); - c = kmalloc(sizeof(*c), GFP_KERNEL); - if (!c) { - sb = ERR_PTR(-ENOMEM); - goto out_put; - } - - sb->u.generic_sbp = c; sb->s_op = &jffs2_super_operations; - memset(c, 0, sizeof(*c)); - c->os_priv = sb; - c->mtd = mtd; - ret = jffs2_do_fill_super(sb, data, (flags&MS_VERBOSE)?1:0); if (ret) { @@ -164,13 +161,15 @@ static struct super_block *jffs2_get_sb_mtd(struct file_system_type *fs_type, up_write(&sb->s_umount); deactivate_super(sb); sb = ERR_PTR(ret); - goto out_put; + goto out_put1; } sb->s_flags |= MS_ACTIVE; return sb; out_put: + kfree(c); + out_put1: put_mtd_device(mtd); return sb; @@ -288,18 +287,23 @@ void jffs2_put_super (struct super_block *sb) kfree(c->blocks); if (c->mtd->sync) c->mtd->sync(c->mtd); - put_mtd_device(c->mtd); - - kfree(c); D1(printk(KERN_DEBUG "jffs2_put_super returning\n")); } + +static void jffs2_kill_sb(struct super_block *sb) +{ + struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); + generic_shutdown_super(sb); + put_mtd_device(c->mtd); + kfree(c); +} static struct file_system_type jffs2_fs_type = { owner: THIS_MODULE, name: "jffs2", get_sb: jffs2_get_sb, - kill_sb: generic_shutdown_super + kill_sb: jffs2_kill_sb, }; diff --git a/fs/super.c b/fs/super.c index 7c259fb46b8e..672a74180307 100644 --- a/fs/super.c +++ b/fs/super.c @@ -184,7 +184,7 @@ static void remove_super(struct super_block *s) up_write(&s->s_umount); } -static void generic_shutdown_super(struct super_block *sb) +void generic_shutdown_super(struct super_block *sb) { struct dentry *root = sb->s_root; struct super_operations *sop = sb->s_op; diff --git a/include/linux/fs.h b/include/linux/fs.h index f722d24d2242..ce6bd67e4cc9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -956,6 +956,7 @@ struct super_block *get_sb_single(struct file_system_type *fs_type, struct super_block *get_sb_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)); +void generic_shutdown_super(struct super_block *sb); void kill_block_super(struct super_block *sb); void kill_anon_super(struct super_block *sb); void kill_litter_super(struct super_block *sb); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 3d66f7d8e1f1..3140dc6e2e2c 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -293,6 +293,7 @@ EXPORT_SYMBOL(get_sb_nodev); EXPORT_SYMBOL(get_sb_single); EXPORT_SYMBOL(kill_anon_super); EXPORT_SYMBOL(kill_litter_super); +EXPORT_SYMBOL(generic_shutdown_super); EXPORT_SYMBOL(deactivate_super); EXPORT_SYMBOL(sget); EXPORT_SYMBOL(set_anon_super); -- cgit v1.2.3
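The jffs2 fix above follows a general pattern for race-free superblock lookup: allocate the per-mount private info before calling sget(), let the compare/set callbacks run against it under sget()'s internal locking, and release it from ->kill_sb() only after generic_shutdown_super(). A condensed sketch of that shape for a hypothetical filesystem follows; the myfs_* names and the myfs_info structure are invented for illustration, and only the sget() and generic_shutdown_super() interfaces shown in the patch above are assumed (the ->get_sb prototype is likewise an assumption):

/* Hypothetical example -- not part of any patch above. */
struct myfs_info {
	void *device;			/* whatever identifies the backing device */
	struct super_block *sb;
};

static int myfs_compare(struct super_block *sb, void *data)
{
	struct myfs_info *p = data;
	struct myfs_info *c = sb->u.generic_sbp;

	return c->device == p->device;	/* same device => reuse this sb */
}

static int myfs_set(struct super_block *sb, void *data)
{
	struct myfs_info *p = data;

	sb->u.generic_sbp = p;		/* published while sget() holds its lock */
	p->sb = sb;
	return 0;
}

static struct super_block *myfs_get_sb(struct file_system_type *fs_type,
				       int flags, char *dev_name, void *data)
{
	struct myfs_info *p;
	struct super_block *sb;

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return ERR_PTR(-ENOMEM);
	memset(p, 0, sizeof(*p));
	/* ... resolve dev_name and fill p->device here ... */

	sb = sget(fs_type, myfs_compare, myfs_set, p);
	if (IS_ERR(sb) || sb->u.generic_sbp != p)
		kfree(p);	/* sget() failed, or an existing sb won the race */
	/* a genuinely new sb would be filled and marked MS_ACTIVE here,
	 * as jffs2_get_sb_mtd() does */
	return sb;
}

static void myfs_kill_sb(struct super_block *sb)
{
	struct myfs_info *p = sb->u.generic_sbp;

	generic_shutdown_super(sb);	/* nothing can reach p through sb now */
	kfree(p);
}

Freeing the private info from ->kill_sb() rather than ->put_super() matters for the same reason the jffs2 change moves put_mtd_device()/kfree() there: they must run only after generic_shutdown_super() has torn the superblock down.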