Merge nathans@oss.sgi.com:/oss/bitkeeper/xfs-linux-2.6

into sgi.com:/source2/xfs-linux-2.6
author: Nathan Scott <nathans@sgi.com> 2005-01-20 11:41:18 +1100
committer: Nathan Scott <nathans@sgi.com> 2005-01-20 11:41:18 +1100
commit: 364b1dccc8b290bc69b440b69dede741c356256d (patch)
tree: c8bab6bf2ed7d51628b1a2f3c81ab4ab7b29ae9a
parent: 8aa11989523554e67b3e19a1f4180b11cdf03a3c (diff)
parent: 2008db7cf8b4d2229658af3ceeae7885c6e2ba3f (diff)
22 files changed, 505 insertions, 356 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 9c5d4cff42af..431bb01a824e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -304,82 +304,7 @@ config FS_POSIX_ACL
 	depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL || NFSD_V4
 	default y
 
-config XFS_FS
-	tristate "XFS filesystem support"
-	help
-	  XFS is a high performance journaling filesystem which originated
-	  on the SGI IRIX platform.  It is completely multi-threaded, can
-	  support large files and large filesystems, extended attributes,
-	  variable block sizes, is extent based, and makes extensive use of
-	  Btrees (directories, extents, free space) to aid both performance
-	  and scalability.
-
-	  Refer to the documentation at <http://oss.sgi.com/projects/xfs/>
-	  for complete details.  This implementation is on-disk compatible
-	  with the IRIX version of XFS.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called xfs.  Be aware, however, that if the file
-	  system of your root partition is compiled as a module, you'll need
-	  to use an initial ramdisk (initrd) to boot.
-
-config XFS_RT
-	bool "Realtime support (EXPERIMENTAL)"
-	depends on XFS_FS && EXPERIMENTAL
-	help
-	  If you say Y here you will be able to mount and use XFS filesystems
-	  which contain a realtime subvolume. The realtime subvolume is a
-	  separate area of disk space where only file data is stored. The
-	  realtime subvolume is designed to provide very deterministic
-	  data rates suitable for media streaming applications.
-
-	  See the xfs man page in section 5 for a bit more information.
-
-	  This feature is unsupported at this time, is not yet fully
-	  functional, and may cause serious problems.
-
-	  If unsure, say N.
-
-config XFS_QUOTA
-	bool "Quota support"
-	depends on XFS_FS
-	help
-	  If you say Y here, you will be able to set limits for disk usage on
-	  a per user and/or a per group basis under XFS.  XFS considers quota
-	  information as filesystem metadata and uses journaling to provide a
-	  higher level guarantee of consistency.  The on-disk data format for
-	  quota is also compatible with the IRIX version of XFS, allowing a
-	  filesystem to be migrated between Linux and IRIX without any need
-	  for conversion.
-
-	  If unsure, say N.  More comprehensive documentation can be found in
-	  README.quota in the xfsprogs package.  XFS quota can be used either
-	  with or without the generic quota support enabled (CONFIG_QUOTA) -
-	  they are completely independent subsystems.
-
-config XFS_SECURITY
-	bool "Security Label support"
-	depends on XFS_FS
-	help
-	  Security labels support alternative access control models
-	  implemented by security modules like SELinux.  This option
-	  enables an extended attribute namespace for inode security
-	  labels in the XFS filesystem.
-
-	  If you are not using a security module that requires using
-	  extended attributes for inode security labels, say N.
-
-config XFS_POSIX_ACL
-	bool "POSIX ACL support"
-	depends on XFS_FS
-	help
-	  POSIX Access Control Lists (ACLs) support permissions for users and
-	  groups beyond the owner/group/world scheme.
-
-	  To learn more about Access Control Lists, visit the POSIX ACLs for
-	  Linux website <http://acl.bestbits.at/>.
-
-	  If you don't know what Access Control Lists are, say N.
+source "fs/xfs/Kconfig"
 
 config MINIX_FS
 	tristate "Minix fs support"
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 1a53ff65dc3f..96c0ea8f60ce 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -305,11 +305,6 @@ void svc_export_request(struct cache_detail *cd,
 
 static struct svc_export *svc_export_lookup(struct svc_export *, int);
 
-extern struct dentry *
-find_exported_dentry(struct super_block *sb, void *obj, void *parent,
-		     int (*acceptable)(void *context, struct dentry *de),
-		     void *context);
-
 static int check_export(struct inode *inode, int flags)
 {
 
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
new file mode 100644
index 000000000000..ab515dc81ec3
--- /dev/null
+++ b/fs/xfs/Kconfig
@@ -0,0 +1,80 @@
+menu "XFS support"
+
+config XFS_FS
+	tristate "XFS filesystem support"
+	help
+	  XFS is a high performance journaling filesystem which originated
+	  on the SGI IRIX platform.  It is completely multi-threaded, can
+	  support large files and large filesystems, extended attributes,
+	  variable block sizes, is extent based, and makes extensive use of
+	  Btrees (directories, extents, free space) to aid both performance
+	  and scalability.
+
+	  Refer to the documentation at <http://oss.sgi.com/projects/xfs/>
+	  for complete details.  This implementation is on-disk compatible
+	  with the IRIX version of XFS.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called xfs.  Be aware, however, that if the file
+	  system of your root partition is compiled as a module, you'll need
+	  to use an initial ramdisk (initrd) to boot.
+
+config XFS_RT
+	bool "Realtime support (EXPERIMENTAL)"
+	depends on XFS_FS && EXPERIMENTAL
+	help
+	  If you say Y here you will be able to mount and use XFS filesystems
+	  which contain a realtime subvolume. The realtime subvolume is a
+	  separate area of disk space where only file data is stored. The
+	  realtime subvolume is designed to provide very deterministic
+	  data rates suitable for media streaming applications.
+
+	  See the xfs man page in section 5 for a bit more information.
+
+	  This feature is unsupported at this time, is not yet fully
+	  functional, and may cause serious problems.
+
+	  If unsure, say N.
+
+config XFS_QUOTA
+	bool "Quota support"
+	depends on XFS_FS
+	help
+	  If you say Y here, you will be able to set limits for disk usage on
+	  a per user and/or a per group basis under XFS.  XFS considers quota
+	  information as filesystem metadata and uses journaling to provide a
+	  higher level guarantee of consistency.  The on-disk data format for
+	  quota is also compatible with the IRIX version of XFS, allowing a
+	  filesystem to be migrated between Linux and IRIX without any need
+	  for conversion.
+
+	  If unsure, say N.  More comprehensive documentation can be found in
+	  README.quota in the xfsprogs package.  XFS quota can be used either
+	  with or without the generic quota support enabled (CONFIG_QUOTA) -
+	  they are completely independent subsystems.
+
+config XFS_SECURITY
+	bool "Security Label support"
+	depends on XFS_FS
+	help
+	  Security labels support alternative access control models
+	  implemented by security modules like SELinux.  This option
+	  enables an extended attribute namespace for inode security
+	  labels in the XFS filesystem.
+
+	  If you are not using a security module that requires using
+	  extended attributes for inode security labels, say N.
+
+config XFS_POSIX_ACL
+	bool "POSIX ACL support"
+	depends on XFS_FS
+	help
+	  POSIX Access Control Lists (ACLs) support permissions for users and
+	  groups beyond the owner/group/world scheme.
+
+	  To learn more about Access Control Lists, visit the POSIX ACLs for
+	  Linux website <http://acl.bestbits.at/>.
+
+	  If you don't know what Access Control Lists are, say N.
+
+endmenu
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index bb714c1dca18..5dac0f724fc6 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -70,6 +70,7 @@ xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= linux-2.6/xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= linux-2.6/xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= linux-2.6/xfs_ioctl32.o
+xfs-$(CONFIG_EXPORTFS)		+= linux-2.6/xfs_export.o
 
 
 xfs-y				+= xfs_alloc.o \
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 29fc5b35c268..d09fa326a3bc 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -71,7 +71,7 @@ xfs_page_trace(
 	bhv_desc_t	*bdp;
 	vnode_t		*vp = LINVFS_GET_VP(inode);
 	loff_t		isize = i_size_read(inode);
-	loff_t		offset = page->index << PAGE_CACHE_SHIFT;
+	loff_t		offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	int		delalloc = -1, unmapped = -1, unwritten = -1;
 
 	if (page_has_buffers(page))
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index d5e75e606d1a..a7cdd5e8d42b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -53,6 +53,7 @@
 #include <linux/workqueue.h>
 #include <linux/percpu.h>
 #include <linux/blkdev.h>
+#include <linux/hash.h>
 
 #include "xfs_linux.h"
 
@@ -127,34 +128,71 @@ ktrace_t *pagebuf_trace_buf;
 	kmem_zone_free(pagebuf_cache, (pb));
 
 /*
- * Pagebuf hashing
+ * Page Region interfaces.
+ *
+ * For pages in filesystems where the blocksize is smaller than the
+ * pagesize, we use the page->private field (long) to hold a bitmap
+ * of uptodate regions within the page.
+ *
+ * Each such region is "bytes per page / bits per long" bytes long.
+ *
+ * NBPPR == number-of-bytes-per-page-region
+ * BTOPR == bytes-to-page-region (rounded up)
+ * BTOPRT == bytes-to-page-region-truncated (rounded down)
  */
+#if (BITS_PER_LONG == 32)
+#define PRSHIFT		(PAGE_CACHE_SHIFT - 5)	/* (32 == 1<<5) */
+#elif (BITS_PER_LONG == 64)
+#define PRSHIFT		(PAGE_CACHE_SHIFT - 6)	/* (64 == 1<<6) */
+#else
+#error BITS_PER_LONG must be 32 or 64
+#endif
+#define NBPPR		(PAGE_CACHE_SIZE/BITS_PER_LONG)
+#define BTOPR(b)	(((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
+#define BTOPRT(b)	(((unsigned int)(b) >> PRSHIFT))
+
+STATIC unsigned long
+page_region_mask(
+	size_t		offset,
+	size_t		length)
+{
+	unsigned long	mask;
+	int		first, final;
 
-#define NBITS	8
-#define NHASH	(1<<NBITS)
+	first = BTOPR(offset);
+	final = BTOPRT(offset + length - 1);
+	first = min(first, final);
 
-typedef struct {
-	struct list_head	pb_hash;
-	spinlock_t		pb_hash_lock;
-} pb_hash_t;
+	mask = ~0UL;
+	mask <<= BITS_PER_LONG - (final - first);
+	mask >>= BITS_PER_LONG - (final);
 
-STATIC pb_hash_t	pbhash[NHASH];
-#define pb_hash(pb)	&pbhash[pb->pb_hash_index]
+	ASSERT(offset + length <= PAGE_CACHE_SIZE);
+	ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
 
-STATIC int
-_bhash(
-	struct block_device *bdev,
-	loff_t		base)
+	return mask;
+}
+
+STATIC inline void
+set_page_region(
+	struct page	*page,
+	size_t		offset,
+	size_t		length)
 {
-	int		bit, hval;
+	page->private |= page_region_mask(offset, length);
+	if (page->private == ~0UL)
+		SetPageUptodate(page);
+}
 
-	base >>= 9;
-	base ^= (unsigned long)bdev / L1_CACHE_BYTES;
-	for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) {
-		hval ^= (int)base & (NHASH-1);
-		base >>= NBITS;
-	}
-	return hval;
+STATIC inline int
+test_page_region(
+	struct page	*page,
+	size_t		offset,
+	size_t		length)
+{
+	unsigned long	mask = page_region_mask(offset, length);
+
+	return (mask && (page->private & mask) == mask);
 }
 
 /*
@@ -340,7 +378,6 @@ _pagebuf_lookup_pages(
 	uint			flags)
 {
 	struct address_space	*mapping = bp->pb_target->pbr_mapping;
-	unsigned int		sectorshift = bp->pb_target->pbr_sshift;
 	size_t			blocksize = bp->pb_target->pbr_bsize;
 	size_t			size = bp->pb_count_desired;
 	size_t			nbytes, offset;
@@ -400,22 +437,11 @@ _pagebuf_lookup_pages(
 
 		if (!PageUptodate(page)) {
 			page_count--;
-			if (blocksize == PAGE_CACHE_SIZE) {
+			if (blocksize >= PAGE_CACHE_SIZE) {
 				if (flags & PBF_READ)
 					bp->pb_locked = 1;
 			} else if (!PagePrivate(page)) {
-				unsigned long	j, range;
-
-				/*
-				 * In this case page->private holds a bitmap
-				 * of uptodate sectors within the page
-				 */
-				ASSERT(blocksize < PAGE_CACHE_SIZE);
-				range = (offset + nbytes) >> sectorshift;
-				for (j = offset >> sectorshift; j < range; j++)
-					if (!test_bit(j, &page->private))
-						break;
-				if (j == range)
+				if (test_page_region(page, offset, nbytes))
 					page_count++;
 			}
 		}
@@ -483,8 +509,8 @@ _pagebuf_map_pages(
  *	are unlocked.  No I/O is implied by this call.
  */
 xfs_buf_t *
-_pagebuf_find(				/* find buffer for block	*/
-	xfs_buftarg_t		*target,/* target for block		*/
+_pagebuf_find(
+	xfs_buftarg_t		*btp,	/* block device target		*/
 	loff_t			ioff,	/* starting offset of range	*/
 	size_t			isize,	/* length of range		*/
 	page_buf_flags_t	flags,	/* PBF_TRYLOCK			*/
@@ -492,59 +518,55 @@ _pagebuf_find(				/* find buffer for block	*/
 {
 	loff_t			range_base;
 	size_t			range_length;
-	int			hval;
-	pb_hash_t		*h;
+	xfs_bufhash_t		*hash;
 	xfs_buf_t		*pb, *n;
-	int			not_locked;
 
 	range_base = (ioff << BBSHIFT);
 	range_length = (isize << BBSHIFT);
 
-	/* Ensure we never do IOs smaller than the sector size */
-	BUG_ON(range_length < (1 << target->pbr_sshift));
+	/* Check for IOs smaller than the sector size / not sector aligned */
+	ASSERT(!(range_length < (1 << btp->pbr_sshift)));
+	ASSERT(!(range_base & (loff_t)btp->pbr_smask));
 
-	/* Ensure we never do IOs that are not sector aligned */
-	BUG_ON(range_base & (loff_t)target->pbr_smask);
+	hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
 
-	hval = _bhash(target->pbr_bdev, range_base);
-	h = &pbhash[hval];
+	spin_lock(&hash->bh_lock);
 
-	spin_lock(&h->pb_hash_lock);
-	list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) {
-		if (pb->pb_target == target &&
-		    pb->pb_file_offset == range_base &&
+	list_for_each_entry_safe(pb, n, &hash->bh_list, pb_hash_list) {
+		ASSERT(btp == pb->pb_target);
+		if (pb->pb_file_offset == range_base &&
 		    pb->pb_buffer_length == range_length) {
-			/* If we look at something bring it to the
-			 * front of the list for next time
+			/*
+			 * If we look at something bring it to the
+			 * front of the list for next time.
 			 */
 			atomic_inc(&pb->pb_hold);
-			list_move(&pb->pb_hash_list, &h->pb_hash);
+			list_move(&pb->pb_hash_list, &hash->bh_list);
 			goto found;
 		}
 	}
 
 	/* No match found */
 	if (new_pb) {
-		_pagebuf_initialize(new_pb, target, range_base,
+		_pagebuf_initialize(new_pb, btp, range_base,
 				range_length, flags);
-		new_pb->pb_hash_index = hval;
-		list_add(&new_pb->pb_hash_list, &h->pb_hash);
+		new_pb->pb_hash = hash;
+		list_add(&new_pb->pb_hash_list, &hash->bh_list);
 	} else {
 		XFS_STATS_INC(pb_miss_locked);
 	}
 
-	spin_unlock(&h->pb_hash_lock);
-	return (new_pb);
+	spin_unlock(&hash->bh_lock);
+	return new_pb;
 
 found:
-	spin_unlock(&h->pb_hash_lock);
+	spin_unlock(&hash->bh_lock);
 
 	/* Attempt to get the semaphore without sleeping,
 	 * if this does not work then we need to drop the
 	 * spinlock and do a hard attempt on the semaphore.
 	 */
-	not_locked = down_trylock(&pb->pb_sema);
-	if (not_locked) {
+	if (down_trylock(&pb->pb_sema)) {
 		if (!(flags & PBF_TRYLOCK)) {
 			/* wait for buffer ownership */
 			PB_TRACE(pb, "get_lock", 0);
@@ -711,8 +733,6 @@ pagebuf_readahead(
 	bdi = target->pbr_mapping->backing_dev_info;
 	if (bdi_read_congested(bdi))
 		return;
-	if (bdi_write_congested(bdi))
-		return;
 
 	flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD);
 	xfs_buf_read_flags(target, ioff, isize, flags);
@@ -866,18 +886,29 @@ void
 pagebuf_rele(
 	xfs_buf_t		*pb)
 {
-	pb_hash_t		*hash = pb_hash(pb);
+	xfs_bufhash_t		*hash = pb->pb_hash;
 
 	PB_TRACE(pb, "rele", pb->pb_relse);
 
-	if (atomic_dec_and_lock(&pb->pb_hold, &hash->pb_hash_lock)) {
+	/*
+	 * pagebuf_lookup buffers are not hashed, not delayed write,
+	 * and don't have their own release routines.  Special case.
+	 */
+	if (unlikely(!hash)) {
+		ASSERT(!pb->pb_relse);
+		if (atomic_dec_and_test(&pb->pb_hold))
+			xfs_buf_free(pb);
+		return;
+	}
+
+	if (atomic_dec_and_lock(&pb->pb_hold, &hash->bh_lock)) {
 		int		do_free = 1;
 
 		if (pb->pb_relse) {
 			atomic_inc(&pb->pb_hold);
-			spin_unlock(&hash->pb_hash_lock);
+			spin_unlock(&hash->bh_lock);
 			(*(pb->pb_relse)) (pb);
-			spin_lock(&hash->pb_hash_lock);
+			spin_lock(&hash->bh_lock);
 			do_free = 0;
 		}
 
@@ -892,10 +923,10 @@ pagebuf_rele(
 
 		if (do_free) {
 			list_del_init(&pb->pb_hash_list);
-			spin_unlock(&hash->pb_hash_lock);
+			spin_unlock(&hash->bh_lock);
 			pagebuf_free(pb);
 		} else {
-			spin_unlock(&hash->pb_hash_lock);
+			spin_unlock(&hash->bh_lock);
 		}
 	}
 }
@@ -935,6 +966,7 @@ pagebuf_cond_lock(			/* lock buffer, if not locked	*/
 	return(locked ? 0 : -EBUSY);
 }
 
+#ifdef DEBUG
 /*
  *	pagebuf_lock_value
  *
@@ -946,6 +978,7 @@ pagebuf_lock_value(
 {
 	return(atomic_read(&pb->pb_sema.count));
 }
+#endif
 
 /*
  *	pagebuf_lock
@@ -1216,7 +1249,6 @@ bio_end_io_pagebuf(
 {
 	xfs_buf_t		*pb = (xfs_buf_t *)bio->bi_private;
 	unsigned int		i, blocksize = pb->pb_target->pbr_bsize;
-	unsigned int		sectorshift = pb->pb_target->pbr_sshift;
 	struct bio_vec		*bvec = bio->bi_io_vec;
 
 	if (bio->bi_size)
@@ -1234,14 +1266,7 @@ bio_end_io_pagebuf(
 			SetPageUptodate(page);
 		} else if (!PagePrivate(page) &&
 				(pb->pb_flags & _PBF_PAGE_CACHE)) {
-			unsigned long	j, range;
-
-			ASSERT(blocksize < PAGE_CACHE_SIZE);
-			range = (bvec->bv_offset + bvec->bv_len) >> sectorshift;
-			for (j = bvec->bv_offset >> sectorshift; j < range; j++)
-				set_bit(j, &page->private);
-			if (page->private == (unsigned long)(PAGE_CACHE_SIZE-1))
-				SetPageUptodate(page);
+			set_page_region(page, bvec->bv_offset, bvec->bv_len);
 		}
 
 		if (_pagebuf_iolocked(pb)) {
@@ -1470,28 +1495,59 @@ pagebuf_iomove(
  */
 void
 xfs_wait_buftarg(
-	xfs_buftarg_t *target)
+	xfs_buftarg_t	*btp)
 {
-	xfs_buf_t	*pb, *n;
-	pb_hash_t	*h;
-	int		i;
+	xfs_buf_t	*bp, *n;
+	xfs_bufhash_t	*hash;
+	uint		i;
 
-	for (i = 0; i < NHASH; i++) {
-		h = &pbhash[i];
+	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
+		hash = &btp->bt_hash[i];
 again:
-		spin_lock(&h->pb_hash_lock);
-		list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) {
-			if (pb->pb_target == target &&
-					!(pb->pb_flags & PBF_FS_MANAGED)) {
-				spin_unlock(&h->pb_hash_lock);
+		spin_lock(&hash->bh_lock);
+		list_for_each_entry_safe(bp, n, &hash->bh_list, pb_hash_list) {
+			ASSERT(btp == bp->pb_target);
+			if (!(bp->pb_flags & PBF_FS_MANAGED)) {
+				spin_unlock(&hash->bh_lock);
 				delay(100);
 				goto again;
 			}
 		}
-		spin_unlock(&h->pb_hash_lock);
+		spin_unlock(&hash->bh_lock);
 	}
 }
 
+/*
+ * Allocate and initialise the buffer hash table for a given target.
+ * Targets flagged external (the log/realtime devices) get 8 buckets;
+ * the device containing metadata gets a much larger table of 256.
+ */
+STATIC void
+xfs_alloc_bufhash(
+	xfs_buftarg_t		*btp,
+	int			external)
+{
+	unsigned int		i;
+
+	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
+	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
+	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
+					sizeof(xfs_bufhash_t), KM_SLEEP);
+	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
+		spin_lock_init(&btp->bt_hash[i].bh_lock);
+		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
+	}
+}
+
+STATIC void
+xfs_free_bufhash(
+	xfs_buftarg_t		*btp)
+{
+	kmem_free(btp->bt_hash,
+			(1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t));
+	btp->bt_hash = NULL;
+}
+
 void
 xfs_free_buftarg(
 	xfs_buftarg_t		*btp,
@@ -1500,6 +1556,7 @@ xfs_free_buftarg(
 	xfs_flush_buftarg(btp, 1);
 	if (external)
 		xfs_blkdev_put(btp->pbr_bdev);
+	xfs_free_bufhash(btp);
 	iput(btp->pbr_mapping->host);
 	kmem_free(btp, sizeof(*btp));
 }
@@ -1514,11 +1571,12 @@ xfs_incore_relse(
 	truncate_inode_pages(btp->pbr_mapping, 0LL);
 }
 
-int
-xfs_setsize_buftarg(
+STATIC int
+xfs_setsize_buftarg_flags(
 	xfs_buftarg_t		*btp,
 	unsigned int		blocksize,
-	unsigned int		sectorsize)
+	unsigned int		sectorsize,
+	int			verbose)
 {
 	btp->pbr_bsize = blocksize;
 	btp->pbr_sshift = ffs(sectorsize) - 1;
@@ -1530,9 +1588,42 @@ xfs_setsize_buftarg(
 			sectorsize, XFS_BUFTARG_NAME(btp));
 		return EINVAL;
 	}
+
+	if (verbose &&
+	    (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
+		printk(KERN_WARNING
+			"XFS: %u byte sectors in use on device %s.  "
+			"This is suboptimal; %u or greater is ideal.\n",
+			sectorsize, XFS_BUFTARG_NAME(btp),
+			(unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
+	}
+
 	return 0;
 }
 
+/*
+ * When allocating the initial buffer target we have not yet read in
+ * the superblock, so we do not know what sector size is in use at this
+ * early stage.  Play safe: use the device's hard sector size, quietly.
+ */
+STATIC int
+xfs_setsize_buftarg_early(
+	xfs_buftarg_t		*btp,
+	struct block_device	*bdev)
+{
+	return xfs_setsize_buftarg_flags(btp,
+			PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
+}
+
+int
+xfs_setsize_buftarg(
+	xfs_buftarg_t		*btp,
+	unsigned int		blocksize,
+	unsigned int		sectorsize)
+{
+	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
+}
+
 STATIC int
 xfs_mapping_buftarg(
 	xfs_buftarg_t		*btp,
@@ -1568,7 +1659,8 @@ xfs_mapping_buftarg(
 
 xfs_buftarg_t *
 xfs_alloc_buftarg(
-	struct block_device	*bdev)
+	struct block_device	*bdev,
+	int			external)
 {
 	xfs_buftarg_t		*btp;
 
@@ -1576,10 +1668,11 @@ xfs_alloc_buftarg(
 
 	btp->pbr_dev =  bdev->bd_dev;
 	btp->pbr_bdev = bdev;
-	if (xfs_setsize_buftarg(btp, PAGE_CACHE_SIZE, bdev_hardsect_size(bdev)))
+	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
 	if (xfs_mapping_buftarg(btp, bdev))
 		goto error;
+	xfs_alloc_bufhash(btp, external);
 	return btp;
 
 error:
@@ -1843,8 +1936,6 @@ pagebuf_daemon_stop(void)
 int __init
 pagebuf_init(void)
 {
-	int			i;
-
 	pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0,
 			SLAB_HWCACHE_ALIGN, NULL, NULL);
 	if (pagebuf_cache == NULL) {
@@ -1865,11 +1956,6 @@ pagebuf_init(void)
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < NHASH; i++) {
-		spin_lock_init(&pbhash[i].pb_hash_lock);
-		INIT_LIST_HEAD(&pbhash[i].pb_hash);
-	}
-
 	return 0;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index dc43517d8e31..74deed8e6d90 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -95,6 +95,11 @@ typedef enum page_buf_flags_e {		/* pb_flags values */
 #define PBF_NOT_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) != 0)
 #define PBF_DONE(pb) (((pb)->pb_flags & (PBF_PARTIAL|PBF_NONE)) == 0)
 
+typedef struct xfs_bufhash {
+	struct list_head	bh_list;	/* buffers hashed to this bucket */
+	spinlock_t		bh_lock;	/* held while walking/changing bh_list */
+} xfs_bufhash_t;
+
 typedef struct xfs_buftarg {
 	dev_t			pbr_dev;
 	struct block_device	*pbr_bdev;
@@ -102,32 +107,35 @@ typedef struct xfs_buftarg {
 	unsigned int		pbr_bsize;
 	unsigned int		pbr_sshift;
 	size_t			pbr_smask;
+
+	/* per-device buffer hash table */
+	uint			bt_hashmask;
+	uint			bt_hashshift;
+	xfs_bufhash_t		*bt_hash;
 } xfs_buftarg_t;
 
 /*
  *	xfs_buf_t:  Buffer structure for page cache-based buffers
  *
  * This buffer structure is used by the page cache buffer management routines
- * to refer to an assembly of pages forming a logical buffer.  The actual
- * I/O is performed with buffer_head or bio structures, as required by drivers,
- * for drivers which do not understand this structure.  The buffer structure is
- * used on temporary basis only, and discarded when released.
- *
- * The real data storage is recorded in the page cache.  Metadata is
- * hashed to the inode for the block device on which the file system resides.
- * File data is hashed to the inode for the file.  Pages which are only
- * partially filled with data have bits set in their block_map entry
- * to indicate which disk blocks in the page are not valid.
+ * to refer to an assembly of pages forming a logical buffer.  The actual I/O
+ * is performed with buffer_head structures, as required by drivers.
+ * 
+ * The buffer structure is used on temporary basis only, and discarded when
+ * released.  The real data storage is recorded in the page cache.  Metadata is
+ * hashed to the block device on which the file system resides.
  */
 
 struct xfs_buf;
+
+/* call-back function on I/O completion */
 typedef void (*page_buf_iodone_t)(struct xfs_buf *);
-			/* call-back function on I/O completion */
+/* call-back function on buffer release */
 typedef void (*page_buf_relse_t)(struct xfs_buf *);
-			/* call-back function on I/O completion */
+/* pre-write function */
 typedef int (*page_buf_bdstrat_t)(struct xfs_buf *);
 
-#define PB_PAGES	4
+#define PB_PAGES	2
 
 typedef struct xfs_buf {
 	struct semaphore	pb_sema;	/* semaphore for lockables  */
@@ -136,8 +144,9 @@ typedef struct xfs_buf {
 	wait_queue_head_t	pb_waiters;	/* unpin waiters	    */
 	struct list_head	pb_list;
 	page_buf_flags_t	pb_flags;	/* status flags */
-	struct list_head	pb_hash_list;
-	xfs_buftarg_t		*pb_target;	/* logical object */
+	struct list_head	pb_hash_list;	/* hash table list */
+	xfs_bufhash_t		*pb_hash;	/* hash table list start */
+	xfs_buftarg_t		*pb_target;	/* buffer target (device) */
 	atomic_t		pb_hold;	/* reference count */
 	xfs_daddr_t		pb_bn;		/* block number for I/O */
 	loff_t			pb_file_offset;	/* offset in file */
@@ -154,10 +163,9 @@ typedef struct xfs_buf {
 	void			*pb_fspriv2;
 	void			*pb_fspriv3;
 	unsigned short		pb_error;	/* error code on I/O */
-	unsigned short		pb_page_count;	/* size of page array */
-	unsigned short		pb_offset;	/* page offset in first page */
-	unsigned char		pb_locked;	/* page array is locked */
-	unsigned char		pb_hash_index;	/* hash table index	*/
+ 	unsigned short		pb_locked;	/* page array is locked */
+ 	unsigned int		pb_page_count;	/* size of page array */
+	unsigned int		pb_offset;	/* page offset in first page */
 	struct page		**pb_pages;	/* array of page pointers */
 	struct page		*pb_page_array[PB_PAGES]; /* inline pages */
 #ifdef PAGEBUF_LOCK_TRACKING
@@ -455,7 +463,7 @@ extern inline xfs_caddr_t xfs_buf_offset(xfs_buf_t *bp, size_t offset)
 				pagebuf_associate_memory(bp, val, count)
 #define XFS_BUF_ADDR(bp)	((bp)->pb_bn)
 #define XFS_BUF_SET_ADDR(bp, blk)		\
-			((bp)->pb_bn = (blk))
+			((bp)->pb_bn = (xfs_daddr_t)(blk))
 #define XFS_BUF_OFFSET(bp)	((bp)->pb_file_offset)
 #define XFS_BUF_SET_OFFSET(bp, off)		\
 			((bp)->pb_file_offset = (off))
@@ -564,7 +572,7 @@ static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp)
  *	Handling of buftargs.
  */
 
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *);
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int);
 extern void xfs_free_buftarg(xfs_buftarg_t *, int);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
new file mode 100644
index 000000000000..772d216d8146
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * Further, this software is distributed without any warranty that it is
+ * free of the rightful claim of any third person regarding infringement
+ * or the like.  Any license provided herein, whether implied or
+ * otherwise, applies only to this software file.  Patent licenses, if
+ * any, provided herein do not apply to combinations of this program with
+ * other software, or any other product whatsoever.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston MA 02111-1307, USA.
+ *
+ * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
+ * Mountain View, CA  94043, or:
+ *
+ * http://www.sgi.com
+ *
+ * For further information regarding this notice, see:
+ *
+ * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
+ */
+
+#include "xfs.h"
+
+
+/* Export op: turn a file handle into a dentry via find_exported_dentry(). */
+STATIC struct dentry *
+linvfs_decode_fh(
+	struct super_block	*sb,
+	__u32			*fh,
+	int			fh_len,
+	int			fileid_type,
+	int (*acceptable)(
+		void		*context,
+		struct dentry	*de),
+	void			*context)
+{
+	__u32			parent[2] = { 0, 0 };	/* zero: no parent given */
+
+	/* Too short, or a fileid type this decoder does not handle. */
+	if (fh_len < 2 || fileid_type > 2)
+		return NULL;
+
+	if (fileid_type == 2 && fh_len > 2) {
+		if (fh_len == 3) {
+			printk(KERN_WARNING
+			       "XFS: detected filehandle without "
+			       "parent inode generation information.\n");
+			return ERR_PTR(-ESTALE);
+		}
+		/* Words 2 and 3 describe the parent; pass them along. */
+		parent[0] = fh[2];
+		parent[1] = fh[3];
+	}
+
+	return find_exported_dentry(sb, fh, parent, acceptable, context);
+}
+
+/* Export op: map the ino (data[0]) / gen (data[1]) pair to a dentry. */
+STATIC struct dentry *
+linvfs_get_dentry(
+	struct super_block	*sb,
+	void			*data)
+{
+	vfs_t			*vfsp = LINVFS_GET_VFS(sb);
+	vnode_t			*vp;
+	struct inode		*ip;
+	struct dentry		*dentry;
+	xfs_fid2_t		xfid;
+	int			error;
+
+	xfid.fid_len = sizeof(xfs_fid2_t) - sizeof(xfid.fid_len);
+	xfid.fid_pad = 0;
+	xfid.fid_ino = ((__u32 *)data)[0];
+	xfid.fid_gen = ((__u32 *)data)[1];
+
+	VFS_VGET(vfsp, &vp, (fid_t *)&xfid, error);
+	if (error || !vp)
+		return ERR_PTR(-ESTALE);
+
+	ip = LINVFS_GET_IP(vp);
+	dentry = d_alloc_anon(ip);
+	if (dentry)
+		return dentry;
+	iput(ip);		/* no dentry: give the inode back */
+	return ERR_PTR(-ENOMEM);
+}
+
+/* Export op: find the parent of @child by looking up ".." in it. */
+STATIC struct dentry *
+linvfs_get_parent(
+	struct dentry		*child)
+{
+	struct dentry		dotdot;		/* carries the ".." name */
+	struct dentry		*parent;
+	vnode_t			*vp = LINVFS_GET_VP(child->d_inode);
+	vnode_t			*cvp = NULL;	/* filled in by the lookup */
+	int			error;
+
+	dotdot.d_inode = NULL;
+	dotdot.d_name.len = 2;
+	dotdot.d_name.name = "..";
+
+	VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
+	if (unlikely(error))
+		return ERR_PTR(-error);
+
+	parent = d_alloc_anon(LINVFS_GET_IP(cvp));
+	if (likely(parent))
+		return parent;
+	/* No dentry could be set up for the parent: release its vnode. */
+	VN_RELE(cvp);
+	return ERR_PTR(-ENOMEM);
+}
+
+struct export_operations linvfs_export_ops = {
+	.decode_fh		= linvfs_decode_fh,	/* file handle -> dentry */
+	.get_dentry		= linvfs_get_dentry,	/* ino/gen -> dentry */
+	.get_parent		= linvfs_get_parent,	/* ".." of a dentry */
+};
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 66462e12597a..0dd97bd7b146 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -81,23 +81,23 @@ __linvfs_read(
 
 
 STATIC ssize_t
-linvfs_read(
+linvfs_aio_read(
 	struct kiocb		*iocb,
 	char			__user *buf,
 	size_t			count,
 	loff_t			pos)
 {
-	return __linvfs_read(iocb, buf, 0, count, pos);
+	return __linvfs_read(iocb, buf, IO_ISAIO, count, pos);
 }
 
 STATIC ssize_t
-linvfs_read_invis(
+linvfs_aio_read_invis(
 	struct kiocb		*iocb,
 	char			__user *buf,
 	size_t			count,
 	loff_t			pos)
 {
-	return __linvfs_read(iocb, buf, IO_INVIS, count, pos);
+	return __linvfs_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
 }
 
 
@@ -125,23 +125,23 @@ __linvfs_write(
 
 
 STATIC ssize_t
-linvfs_write(
+linvfs_aio_write(
 	struct kiocb		*iocb,
 	const char		__user *buf,
 	size_t			count,
 	loff_t			pos)
 {
-	return __linvfs_write(iocb, buf, 0, count, pos);
+	return __linvfs_write(iocb, buf, IO_ISAIO, count, pos);
 }
 
 STATIC ssize_t
-linvfs_write_invis(
+linvfs_aio_write_invis(
 	struct kiocb		*iocb,
 	const char		__user *buf,
 	size_t			count,
 	loff_t			pos)
 {
-	return __linvfs_write(iocb, buf, IO_INVIS, count, pos);
+	return __linvfs_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos);
 }
 
 
@@ -492,8 +492,8 @@ struct file_operations linvfs_file_operations = {
 	.write		= do_sync_write,
 	.readv		= linvfs_readv,
 	.writev		= linvfs_writev,
-	.aio_read	= linvfs_read,
-	.aio_write	= linvfs_write,
+	.aio_read	= linvfs_aio_read,
+	.aio_write	= linvfs_aio_write,
 	.sendfile	= linvfs_sendfile,
 	.ioctl		= linvfs_ioctl,
 	.mmap		= linvfs_file_mmap,
@@ -508,8 +508,8 @@ struct file_operations linvfs_invis_file_operations = {
 	.write		= do_sync_write,
 	.readv		= linvfs_readv_invis,
 	.writev		= linvfs_writev_invis,
-	.aio_read	= linvfs_read_invis,
-	.aio_write	= linvfs_write_invis,
+	.aio_read	= linvfs_aio_read_invis,
+	.aio_write	= linvfs_aio_write_invis,
 	.sendfile	= linvfs_sendfile,
 	.ioctl		= linvfs_ioctl_invis,
 	.mmap		= linvfs_file_mmap,
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 284f2ad68e3a..77c393654537 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -499,7 +499,7 @@ xfs_attrmulti_by_handle(
 	xfs_fsop_attrmulti_handlereq_t am_hreq;
 	struct inode		*inode;
 	vnode_t			*vp;
-	int			i, size;
+	unsigned int		i, size;
 
 	error = xfs_vget_fsop_handlereq(mp, parinode, CAP_SYS_ADMIN, arg,
 					sizeof(xfs_fsop_attrmulti_handlereq_t),
@@ -509,6 +509,11 @@ xfs_attrmulti_by_handle(
 		return -error;
 
 	size = am_hreq.opcount * sizeof(attr_multiop_t);
+	if (!size || size > 16 * PAGE_SIZE) {
+		VN_RELE(vp);
+		return -XFS_ERROR(E2BIG);
+	}
+
 	ops = (xfs_attr_multiop_t *)kmalloc(size, GFP_KERNEL);
 	if (!ops) {
 		VN_RELE(vp);
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 0cd5674cb3c5..407e99359391 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -369,33 +369,6 @@ linvfs_rename(
 	return 0;
 }
 
-STATIC int
-linvfs_readlink(
-	struct dentry	*dentry,
-	char		__user *buf,
-	int		size)
-{
-	vnode_t		*vp = LINVFS_GET_VP(dentry->d_inode);
-	uio_t		uio;
-	iovec_t		iov;
-	int		error;
-
-	iov.iov_base = buf;
-	iov.iov_len = size;
-
-	uio.uio_iov = &iov;
-	uio.uio_offset = 0;
-	uio.uio_segflg = UIO_USERSPACE;
-	uio.uio_resid = size;
-	uio.uio_iovcnt = 1;
-
-	VOP_READLINK(vp, &uio, 0, NULL, error);
-	if (error)
-		return -error;
-
-	return (size - uio.uio_resid);
-}
-
 /*
  * careful here - this function can get called recursively, so
  * we need to be very careful about how much stack we use.
@@ -694,7 +667,7 @@ struct inode_operations linvfs_dir_inode_operations = {
 };
 
 struct inode_operations linvfs_symlink_inode_operations = {
-	.readlink		= linvfs_readlink,
+	.readlink		= generic_readlink,
 	.follow_link		= linvfs_follow_link,
 	.put_link		= linvfs_put_link,
 	.permission		= linvfs_permission,
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 88e2b7c7b73c..0dee823e5314 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -317,7 +317,7 @@ xfs_read(
 	xfs_rw_enter_trace(XFS_READ_ENTER, &ip->i_iocore,
 				(void *)iovp, segs, *offset, ioflags);
 	ret = __generic_file_aio_read(iocb, iovp, segs, offset);
-	if (ret == -EIOCBQUEUED)
+	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
 		ret = wait_on_sync_kiocb(iocb);
 	if (ret > 0)
 		XFS_STATS_ADD(xs_read_bytes, ret);
@@ -854,7 +854,7 @@ retry:
 
 	current->backing_dev_info = NULL;
 
-	if (ret == -EIOCBQUEUED)
+	if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
 		ret = wait_on_sync_kiocb(iocb);
 
 	if ((ret == -ENOSPC) &&
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index b96f6e2db285..8bfba484e40b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -75,7 +75,6 @@
 
 STATIC struct quotactl_ops linvfs_qops;
 STATIC struct super_operations linvfs_sops;
-STATIC struct export_operations linvfs_export_ops;
 STATIC kmem_zone_t *linvfs_inode_zone;
 STATIC kmem_shaker_t xfs_inode_shaker;
 
@@ -659,63 +658,6 @@ linvfs_freeze_fs(
 	VFS_FREEZE(LINVFS_GET_VFS(sb));
 }
 
-STATIC struct dentry *
-linvfs_get_parent(
-	struct dentry		*child)
-{
-	int			error;
-	vnode_t			*vp, *cvp;
-	struct dentry		*parent;
-	struct dentry		dotdot;
-
-	dotdot.d_name.name = "..";
-	dotdot.d_name.len = 2;
-	dotdot.d_inode = NULL;
-
-	cvp = NULL;
-	vp = LINVFS_GET_VP(child->d_inode);
-	VOP_LOOKUP(vp, &dotdot, &cvp, 0, NULL, NULL, error);
-	if (unlikely(error))
-		return ERR_PTR(-error);
-
-	parent = d_alloc_anon(LINVFS_GET_IP(cvp));
-	if (unlikely(!parent)) {
-		VN_RELE(cvp);
-		return ERR_PTR(-ENOMEM);
-	}
-	return parent;
-}
-
-STATIC struct dentry *
-linvfs_get_dentry(
-	struct super_block	*sb,
-	void			*data)
-{
-	vnode_t			*vp;
-	struct inode		*inode;
-	struct dentry		*result;
-	xfs_fid2_t		xfid;
-	vfs_t			*vfsp = LINVFS_GET_VFS(sb);
-	int			error;
-
-	xfid.fid_len = sizeof(xfs_fid2_t) - sizeof(xfid.fid_len);
-	xfid.fid_pad = 0;
-	xfid.fid_gen = ((__u32 *)data)[1];
-	xfid.fid_ino = ((__u32 *)data)[0];
-
-	VFS_VGET(vfsp, &vp, (fid_t *)&xfid, error);
-	if (error || vp == NULL)
-		return ERR_PTR(-ESTALE) ;
-
-	inode = LINVFS_GET_IP(vp);
-	result = d_alloc_anon(inode);
-        if (!result) {
-		iput(inode);
-		return ERR_PTR(-ENOMEM);
-	}
-	return result;
-}
-
 STATIC int
 linvfs_show_options(
 	struct seq_file		*m,
@@ -808,7 +750,9 @@ linvfs_fill_super(
 	}
 
 	sb_min_blocksize(sb, BBSIZE);
+#ifdef CONFIG_EXPORTFS
 	sb->s_export_op = &linvfs_export_ops;
+#endif
 	sb->s_qcop = &linvfs_qops;
 	sb->s_op = &linvfs_sops;
 
@@ -877,12 +821,6 @@ linvfs_get_sb(
 	return get_sb_bdev(fs_type, flags, dev_name, data, linvfs_fill_super);
 }
 
-
-STATIC struct export_operations linvfs_export_ops = {
-	.get_parent		= linvfs_get_parent,
-	.get_dentry		= linvfs_get_dentry,
-};
-
 STATIC struct super_operations linvfs_sops = {
 	.alloc_inode		= linvfs_alloc_inode,
 	.destroy_inode		= linvfs_destroy_inode,
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 866c7ad75f92..ec7e0035c731 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -133,4 +133,6 @@ extern int  xfs_blkdev_get(struct xfs_mount *, const char *,
 				struct block_device **);
 extern void xfs_blkdev_put(struct block_device *);
 
+extern struct export_operations linvfs_export_ops;
+
 #endif	/* __XFS_SUPER_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 3466465fa8d5..da76c1f1e11c 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -379,6 +379,7 @@ typedef struct vnodeops {
 /*
  * Flags for read/write calls - same values as IRIX
  */
+#define IO_ISAIO	0x00001		/* don't wait for completion */
 #define IO_ISDIRECT	0x00004		/* bypass page cache */
 #define IO_INVIS	0x00020		/* don't update inode timestamps */
 
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
index 4a49cde9b857..795a11e2dbe9 100644
--- a/fs/xfs/xfs_clnt.h
+++ b/fs/xfs/xfs_clnt.h
@@ -64,6 +64,7 @@ struct xfs_mount_args {
 	int	sunit;		/* stripe unit (BBs) */
 	int	swidth;		/* stripe width (BBs), multiple of sunit */
 	uchar_t iosizelog;	/* log2 of the preferred I/O size */
+	int	ihashsize;	/* inode hash table size (buckets) */
 };
 
 /*
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index bef91642de88..1bf1cc15bdcd 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -55,22 +55,32 @@
 #include "xfs_inode.h"
 #include "xfs_quota.h"
 #include "xfs_utils.h"
+#include "xfs_bit.h"
 
 /*
  * Initialize the inode hash table for the newly mounted file system.
- *
- * mp -- this is the mount point structure for the file system being
- *       initialized
+ * Choose an initial table size based on user specified value, else
+ * use a simple algorithm using the maximum number of inodes as an
+ * indicator for table size, and cap it at 16 pages (gettin' big).
  */
 void
 xfs_ihash_init(xfs_mount_t *mp)
 {
-	int	i;
+	__uint64_t	icount;
+	uint		i, flags = KM_SLEEP | KM_MAYFAIL;
+
+	if (!mp->m_ihsize) {
+		icount = mp->m_maxicount ? mp->m_maxicount :
+			 (mp->m_sb.sb_dblocks << mp->m_sb.sb_inopblog);
+		mp->m_ihsize = 1 << max_t(uint, xfs_highbit64(icount) / 3, 8);
+		mp->m_ihsize = min_t(uint, mp->m_ihsize, 16 * PAGE_SIZE);
+	}
 
-	mp->m_ihsize = XFS_BUCKETS(mp);
-	mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize
-				      * sizeof(xfs_ihash_t), KM_SLEEP);
-	ASSERT(mp->m_ihash != NULL);
+	while (!(mp->m_ihash = (xfs_ihash_t *)kmem_zalloc(mp->m_ihsize *
+						sizeof(xfs_ihash_t), flags))) {
+		if ((mp->m_ihsize >>= 1) <= NBPP)
+			flags = KM_SLEEP;
+	}
 	for (i = 0; i < mp->m_ihsize; i++) {
 		rwlock_init(&(mp->m_ihash[i].ih_lock));
 	}
@@ -88,29 +98,19 @@ xfs_ihash_free(xfs_mount_t *mp)
 
 /*
  * Initialize the inode cluster hash table for the newly mounted file system.
- *
- * mp -- this is the mount point structure for the file system being
- *       initialized
+ * Its size is derived from the ihash table size.
  */
 void
 xfs_chash_init(xfs_mount_t *mp)
 {
-	int	i;
+	uint	i;
 
-	/*
-	 * m_chash size is based on m_ihash
-	 * with a minimum of 37 entries
-	 */
-	mp->m_chsize = (XFS_BUCKETS(mp)) /
-			 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog);
-	if (mp->m_chsize < 37) {
-		mp->m_chsize = 37;
-	}
+	mp->m_chsize = max_t(uint, 1, mp->m_ihsize /
+			 (XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog));
+	mp->m_chsize = min_t(uint, mp->m_chsize, mp->m_ihsize);
 	mp->m_chash = (xfs_chash_t *)kmem_zalloc(mp->m_chsize
 						 * sizeof(xfs_chash_t),
 						 KM_SLEEP);
-	ASSERT(mp->m_chash != NULL);
-
 	for (i = 0; i < mp->m_chsize; i++) {
 		spinlock_init(&mp->m_chash[i].ch_lock,"xfshash");
 	}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 97bd24ba4198..3d427c2b065c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -182,10 +182,6 @@ typedef struct xfs_ihash {
 	uint			ih_version;
 } xfs_ihash_t;
 
-/*
- * Inode hashing and hash bucket locking.
- */
-#define XFS_BUCKETS(mp) (37*(mp)->m_sb.sb_agcount-1)
 #define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize))
 
 /*
@@ -193,7 +189,6 @@ typedef struct xfs_ihash {
  * find inodes that share a cluster and can be flushed to disk at the same
  * time.
  */
-
 typedef struct xfs_chashlist {
 	struct xfs_chashlist	*chl_next;
 	struct xfs_inode	*chl_ip;
@@ -207,6 +202,8 @@ typedef struct xfs_chash {
 	lock_t			ch_lock;
 } xfs_chash_t;
 
+#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize))
+
 
 /*
  * This is the xfs in-core inode structure.
@@ -450,12 +447,6 @@ xfs_inode_t *xfs_bhvtoi(struct bhv_desc *bhvp);
 #define BHV_IS_XFS(bdp)		(BHV_OPS(bdp) == &xfs_vnodeops)
 
 /*
- * Pick the inode cluster hash bucket
- * (m_chash is the same size as m_ihash)
- */
-#define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize))
-
-/*
  * For multiple groups support: if S_ISGID bit is set in the parent
  * directory, group of new file is set to that of the parent, and
  * new subdirectory gets S_ISGID bit from parent.
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index a37420eb5fc6..ef727508e3a2 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -296,7 +296,7 @@ typedef struct xfs_mount {
 	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
 	lock_t			m_agirotor_lock;/* .. and lock protecting it */
 	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
-	int			m_ihsize;	/* size of next field */
+	uint			m_ihsize;	/* size of next field */
 	struct xfs_ihash	*m_ihash;	/* fs private inode hash table*/
 	struct xfs_inode	*m_inodes;	/* active inode list */
 	struct list_head	m_del_inodes;	/* inodes to reclaim */
@@ -376,7 +376,7 @@ typedef struct xfs_mount {
 	xfs_dablk_t		m_dirdatablk;	/* blockno of dir data v2 */
 	xfs_dablk_t		m_dirleafblk;	/* blockno of dir non-data v2 */
 	xfs_dablk_t		m_dirfreeblk;	/* blockno of dirfreeindex v2 */
-	int			m_chsize;	/* size of next field */
+	uint			m_chsize;	/* size of next field */
 	struct xfs_chash	*m_chash;	/* fs private inode per-cluster
 						 * hash table */
 	struct xfs_dmops	m_dm_ops;	/* vector of DMI ops */
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 863a67c586d9..7745d23eae7f 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -252,6 +252,7 @@ xfs_start_flags(
 			ap->logbufsize);
 		return XFS_ERROR(EINVAL);
 	}
+	mp->m_ihsize = ap->ihashsize;
 	mp->m_logbsize = ap->logbufsize;
 	mp->m_fsname_len = strlen(ap->fsname) + 1;
 	mp->m_fsname = kmem_alloc(mp->m_fsname_len, KM_SLEEP);
@@ -468,19 +469,19 @@ xfs_mount(
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(ddev);
+	mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0);
 	if (!mp->m_ddev_targp) {
 		xfs_blkdev_put(logdev);
 		xfs_blkdev_put(rtdev);
 		return error;
 	}
 	if (rtdev) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev);
+		mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1);
 		if (!mp->m_rtdev_targp)
 			goto error0;
 	}
 	mp->m_logdev_targp = (logdev && logdev != ddev) ?
-				xfs_alloc_buftarg(logdev) : mp->m_ddev_targp;
+				xfs_alloc_buftarg(logdev, 1) : mp->m_ddev_targp;
 	if (!mp->m_logdev_targp)
 		goto error0;
 
@@ -1579,7 +1580,7 @@ xfs_syncsub(
 }
 
 /*
- * xfs_vget - called by DMAPI to get vnode from file handle
+ * xfs_vget - called by DMAPI and NFSD to get vnode from file handle
  */
 STATIC int
 xfs_vget(
@@ -1621,7 +1622,7 @@ xfs_vget(
 		return XFS_ERROR(EIO);
 	}
 
-	if (ip->i_d.di_mode == 0 || (igen && (ip->i_d.di_gen != igen))) {
+	if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) {
 		xfs_iput_new(ip, XFS_ILOCK_SHARED);
 		*vpp = NULL;
 		return XFS_ERROR(ENOENT);
@@ -1646,6 +1647,7 @@ xfs_vget(
 #define MNTOPT_SWIDTH	"swidth"	/* data volume stripe width */
 #define MNTOPT_NOUUID	"nouuid"	/* ignore filesystem UUID */
 #define MNTOPT_MTPT	"mtpt"		/* filesystem mount point */
+#define MNTOPT_IHASHSIZE    "ihashsize"    /* size of inode hash table */
 #define MNTOPT_NORECOVERY   "norecovery"   /* don't run XFS recovery */
 #define MNTOPT_NOLOGFLUSH   "nologflush"   /* don't hard flush on log writes */
 #define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */
@@ -1734,6 +1736,13 @@ xfs_parseargs(
 			iosize = simple_strtoul(value, &eov, 10);
 			args->flags |= XFSMNT_IOSIZE;
 			args->iosizelog = (uint8_t) iosize;
+		} else if (!strcmp(this_char, MNTOPT_IHASHSIZE)) {
+			if (!value || !*value) {
+				printk("XFS: %s option requires an argument\n",
+					this_char); 
+				return EINVAL;
+			}
+			args->ihashsize = simple_strtoul(value, &eov, 10);
 		} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
 			args->flags |= XFSMNT_WSYNC;
 		} else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index da847a19d4a5..c6f6c4ee63ae 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -3900,7 +3900,7 @@ xfs_finish_reclaim(
 	int		error;
 
 	if (vp && VN_BAD(vp))
-		return 0;
+		goto reclaim;
 
 	/* The hash lock here protects a thread in xfs_iget_core from
 	 * racing with us on linking the inode back with a vnode.
@@ -3948,8 +3948,7 @@ xfs_finish_reclaim(
 			 */
 			if (error) {
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				xfs_ireclaim(ip);
-				return (0);
+				goto reclaim;
 			}
 			xfs_iflock(ip); /* synchronize with xfs_iflush_done */
 		}
@@ -3968,6 +3967,7 @@ xfs_finish_reclaim(
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 	}
 
+ reclaim:
 	xfs_ireclaim(ip);
 	return 0;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ca3c328b217..1f69baf46f6e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1149,6 +1149,10 @@ struct export_operations {
 
 };
 
+extern struct dentry *
+find_exported_dentry(struct super_block *sb, void *obj, void *parent,
+		     int (*acceptable)(void *context, struct dentry *de),
+		     void *context);
 
 struct file_system_type {
 	const char *name;
author	Nathan Scott <nathans@sgi.com>	2005-01-20 11:41:18 +1100
committer	Nathan Scott <nathans@sgi.com>	2005-01-20 11:41:18 +1100
commit	364b1dccc8b290bc69b440b69dede741c356256d (patch)
tree	c8bab6bf2ed7d51628b1a2f3c81ab4ab7b29ae9a
parent	8aa11989523554e67b3e19a1f4180b11cdf03a3c (diff)
parent	2008db7cf8b4d2229658af3ceeae7885c6e2ba3f (diff)